From ce7c9ddd80ae37d8bd9e1b2d778d83a4fe3e3b3d Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Thu, 4 Mar 2021 11:07:42 +1300
Subject: [PATCH 01/23] should run ready for test

---
 ...{whole_pop_model_residuals.R => model_residuals_whole_pop.R} | 2 +-
 scripts/run_gcta_whole_pop.sh                                   | 2 +-
 scripts/run_ldak_whole_pop.sh                                   | 2 +-
 scripts/{whole_pop_pipline.sh => whole_pop_pipeline.sh}         | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename scripts/{whole_pop_model_residuals.R => model_residuals_whole_pop.R} (98%)
 rename scripts/{whole_pop_pipline.sh => whole_pop_pipeline.sh} (100%)

diff --git a/scripts/whole_pop_model_residuals.R b/scripts/model_residuals_whole_pop.R
similarity index 98%
rename from scripts/whole_pop_model_residuals.R
rename to scripts/model_residuals_whole_pop.R
index c93b043..e7237d8 100644
--- a/scripts/whole_pop_model_residuals.R
+++ b/scripts/model_residuals_whole_pop.R
@@ -125,7 +125,7 @@ new_dat %>%
   select(SUBJECT, !!trait) %>% # pull out ids and trait column
   slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals
   bind_cols(model_residuals) %>% 
-  write_delim(path = here("tmp",paste0(pop,"_",trait,".residuals.txt")),
+  write_delim(file = here("tmp",paste0(pop,"_",trait,".residuals.txt")),
               delim = " ",
               col_names = FALSE)
 
diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh
index f6ac54a..c40fbc6 100644
--- a/scripts/run_gcta_whole_pop.sh
+++ b/scripts/run_gcta_whole_pop.sh
@@ -59,6 +59,6 @@ software/gcta64 --bfile tmp/$BFILE --blup-snp ${POP}_results/GCTA/${TRAIT}_${MOD
 
 
 
-cat  ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${DIR}/GCTA/${TRAIT}_${MODEL}.hsq.tsv
+cat  ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq.tsv
 
 
diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh
index 78b3455..77d0442 100644
--- a/scripts/run_ldak_whole_pop.sh
+++ b/scripts/run_ldak_whole_pop.sh
@@ -28,7 +28,7 @@ software/LDAK/ldak5.linux --reml ${POP}_results/LDAK/${TRAIT}_${MODEL} --pheno t
 
 ### blups
 
-software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${CV}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO
+software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO
 
 
 
diff --git a/scripts/whole_pop_pipline.sh b/scripts/whole_pop_pipeline.sh
similarity index 100%
rename from scripts/whole_pop_pipline.sh
rename to scripts/whole_pop_pipeline.sh

From bf9ca9463f9edd85b5108dc8ef2bbef718034c93 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Thu, 4 Mar 2021 11:07:55 +1300
Subject: [PATCH 02/23] testing

---
 scripts/whole_pop_pipeline.sh | 49 ++++++++++++-----------------------
 1 file changed, 17 insertions(+), 32 deletions(-)

diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh
index e7826d7..60a4485 100644
--- a/scripts/whole_pop_pipeline.sh
+++ b/scripts/whole_pop_pipeline.sh
@@ -23,44 +23,31 @@ date
 
 
 #### Initial setup #####
-mkdir -p tmp ${RESULTS}/{GCTA,LDAK}
+mkdir -p tmp ${RESULTS}/
 
-software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted
+ software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted
 
 # create lise of independent SNPs from data
 
-software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers
-## sort and keep our population
-
-
-mkdir -p tmp
+ software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers
 
+## sort and keep our population
 software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --geno 0.05 --make-bed --out tmp/${POP}_sorteddata
 
 
-
-
-
-
-
 ######################## GCTA
 
 #grm
 
 # only run this line once per population to save time
-
 software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4
 
 
 
-
-
-
 ######################## PCAs
 
 ### calculate principal components for the population using the independent markers
-
-software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in
+ software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in
 
 
 
@@ -68,29 +55,27 @@ software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/$
 ####################### LDAK
 
 ### calculate the weights and kinships for LDAK prior to GRM
-
-software/LDAK/ldak5.linux --cut-weights $DIR/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
-
+ software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
 
 
+ software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
 
-software/LDAK/ldak5.linux --calc-weights-all $DIR/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
-
-
-
-
-software/LDAK/ldak5.linux --calc-kins-direct $DIR/${POP}_ldak_kinships --bfile  tmp/${POP}_sorteddata --weights $DIR/${POP}_ldak_sections/weights.short --power -0.25
 
+software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile  tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25
 
 
+# example line to use for testing one iteration
+#line="HEIGHT,linear",
 while read line
 do
 	TRAIT=$(echo $line | cut -d',' -f1)
 	REGRESSION_TYPE=$(echo $line | cut -d',' -f2)
-
+	mkdir -p ${RESULTS}/{GCTA,LDAK}
 	##### Generate regression residuals for each model #####
 	# makes tmp/${POP}_${TRAIT}_residuals.txt
-	bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} 
+	
+	#bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} 
+	Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/${TRAIT}
 
 	##### Create phone files with the residuals
 	### prune and sort the data
@@ -101,12 +86,12 @@ do
 	# columns 8-12 are the FIVE residuals
 	# height, egfr, serumurate, diabetes and gout in that order
 	# need to join based on number fo models 2.2 - 2.8  was for 7 models
-	software/plink1.9b6.10 --bfile data/data --keep tmp/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT}
+	software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT}
 	# this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols)
-	join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}.fam) <(sort -k1 tmp/${POP}_${TRAIT}_residuals.txt | cut -f2- )  > tmp/${POP}_${TRAIT}.pheno 
+	join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 tmp/${POP}_${TRAIT}.residuals.txt | cut -f2- )  > tmp/${POP}_${TRAIT}.pheno 
 
 	# calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models)
-	pheno_cols=$( head -1 tmp/${POP}_${TRAIT}_residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u)	
+	pheno_cols=$( head -1 tmp/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u)	
 
 
 	#### GCTA REML for each model of current trait

From df7a669e33caa9687f36d21e08eed1dcf2b529d6 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Tue, 9 Mar 2021 10:40:37 +1300
Subject: [PATCH 03/23] more timestamps

---
 scripts/whole_pop_pipeline.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh
index 60a4485..3db87a8 100644
--- a/scripts/whole_pop_pipeline.sh
+++ b/scripts/whole_pop_pipeline.sh
@@ -54,6 +54,7 @@ software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESU
 
 ####################### LDAK
 
+echo "Start LDAK $(date)"
 ### calculate the weights and kinships for LDAK prior to GRM
  software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
 
@@ -63,13 +64,15 @@ software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESU
 
 software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile  tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25
 
-
+echo "Start traits $(date)"
 # example line to use for testing one iteration
 #line="HEIGHT,linear",
 while read line
 do
 	TRAIT=$(echo $line | cut -d',' -f1)
 	REGRESSION_TYPE=$(echo $line | cut -d',' -f2)
+
+	echo "Start ${TRAIT} $(date)"
 	mkdir -p ${RESULTS}/{GCTA,LDAK}
 	##### Generate regression residuals for each model #####
 	# makes tmp/${POP}_${TRAIT}_residuals.txt

From ec3ce136011adf4115ce16b220928c9429a16afd Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Tue, 9 Mar 2021 11:10:56 +1300
Subject: [PATCH 04/23] comment out testing code that was causing issues

---
 scripts/model_residuals_whole_pop.R | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R
index e7237d8..67a4b01 100644
--- a/scripts/model_residuals_whole_pop.R
+++ b/scripts/model_residuals_whole_pop.R
@@ -58,10 +58,12 @@ if(is.null(opt$pop)){
   pop <- opt$pop
 }
 
-pop <- 'nph'
-model_type <- "linear"
-trait <- "HEIGHT"
-opt <-list(out_dir = "nph_results/")
+
+## Used for testing inside of an interactive session
+#pop <- 'nph'
+#model_type <- "linear"
+#trait <- "HEIGHT"
+#opt <-list(out_dir = "nph_results/")
 
 all_dat <- read_csv(file = here("data/curated_data.csv"), col_names = TRUE) %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI))
 newpca <- read_delim(file = here("results",paste0(pop,"_","pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = ""))
@@ -130,7 +132,7 @@ new_dat %>%
               col_names = FALSE)
 
   
-  
+message(paste("residuals file:", here("tmp",paste0(pop,"_",trait,".residuals.txt")))) 
 
 
 

From e557c32d4d4afc148c5839645e18ef672bec3806 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Tue, 1 Jun 2021 11:33:39 +1200
Subject: [PATCH 05/23] calc pca for individual pops

---
 scripts/model_residuals_whole_pop.R |  6 +++---
 scripts/run_gcta_whole_pop.sh       |  8 ++++----
 scripts/run_ldak_whole_pop.sh       | 10 +++++-----
 scripts/whole_pop_pipeline.sh       | 21 +++++++++++----------
 4 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R
index 67a4b01..3c383ab 100644
--- a/scripts/model_residuals_whole_pop.R
+++ b/scripts/model_residuals_whole_pop.R
@@ -66,7 +66,7 @@ if(is.null(opt$pop)){
 #opt <-list(out_dir = "nph_results/")
 
 all_dat <- read_csv(file = here("data/curated_data.csv"), col_names = TRUE) %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI))
-newpca <- read_delim(file = here("results",paste0(pop,"_","pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = ""))
+newpca <- read_delim(file = here(paste0(pop,"_results"),paste0(pop,"_pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = ""))
 
 if (!trait %in% names(all_dat)){
   stop("Trait argument supplied does not match a column in the phenotypes.", call.=FALSE)
@@ -127,12 +127,12 @@ new_dat %>%
   select(SUBJECT, !!trait) %>% # pull out ids and trait column
   slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals
   bind_cols(model_residuals) %>% 
-  write_delim(file = here("tmp",paste0(pop,"_",trait,".residuals.txt")),
+  write_delim(file = here(opt$out_dir,paste0(pop,"_",trait,".residuals.txt")),
               delim = " ",
               col_names = FALSE)
 
   
-message(paste("residuals file:", here("tmp",paste0(pop,"_",trait,".residuals.txt")))) 
+message(paste("residuals file:", here(opt$out_dir,paste0(pop,"_",trait,".residuals.txt")))) 
 
 
 
diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh
index c40fbc6..44e069e 100644
--- a/scripts/run_gcta_whole_pop.sh
+++ b/scripts/run_gcta_whole_pop.sh
@@ -6,7 +6,7 @@ BFILE=$(basename $1 .fam)
 
 
 POP=$(echo $BFILE | cut -d'_' -f1)
-
+RESULTS=${POP}_results
 TRAIT=$(echo $BFILE | cut -d'_' -f2)
 
 
@@ -45,7 +45,7 @@ echo "****** $REGRESSION ******"
 
 # $(seq 1 to how many trait/residual columns 
 
-software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4
+software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4
 
 
 
@@ -55,10 +55,10 @@ software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --p
 
 #software/plink1.9b6.10 --bfile data/data --keep tmp/$BFILE --maf 0.01 --geno 0.05 --make-bed --out tmp/$BFILE
 
-software/gcta64 --bfile tmp/$BFILE --blup-snp ${POP}_results/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4
+software/gcta64 --bfile tmp/$BFILE --blup-snp ${RESULTS}/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4
 
 
 
-cat  ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq.tsv
+cat  ${RESULTS}/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${RESULTS}/GCTA/${TRAIT}_${MODEL}.hsq.tsv
 
 
diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh
index 77d0442..799a487 100644
--- a/scripts/run_ldak_whole_pop.sh
+++ b/scripts/run_ldak_whole_pop.sh
@@ -6,7 +6,7 @@ BFILE=$(basename $1 .fam)
 
 
 POP=$(echo $BFILE | cut -d'_' -f1)
-
+RESULTS=${POP}_results
 TRAIT=$(echo $BFILE | cut -d'_' -f2)
 
 
@@ -21,14 +21,14 @@ PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID
 
 ### reml
 
-software/LDAK/ldak5.linux --reml ${POP}_results/LDAK/${TRAIT}_${MODEL} --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${POP}_results/${POP}_ldak_kinships
+software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}_${MODEL} --pheno ${RESULTS}/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${RESULTS}/${POP}_ldak_kinships
 
 
 
 
 ### blups
 
-software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO
+software/LDAK/ldak5.linux --calc-blups ${RESULTS}/LDAK/${TRAIT}_${MODEL} --remlfile ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml --grm ${RESULTS}/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO
 
 
 
@@ -38,8 +38,8 @@ software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --r
 
 ## make better results files
 
-grep "^Her\|^Com"  ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml  >  ${POP}_results/LDAK/${TRAIT}_${MODEL}.h2
+grep "^Her\|^Com"  ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml  >  ${RESULTS}/LDAK/${TRAIT}_${MODEL}.h2
 
-grep "^LRT_P"  ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.p 
+grep "^LRT_P"  ${RESULTS}/LDAK/${TRAIT}_${MODEL}.reml > ${RESULTS}/LDAK/${TRAIT}_${MODEL}.p 
 
 
diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh
index 3db87a8..8a1f43b 100644
--- a/scripts/whole_pop_pipeline.sh
+++ b/scripts/whole_pop_pipeline.sh
@@ -25,18 +25,18 @@ date
 #### Initial setup #####
 mkdir -p tmp ${RESULTS}/
 
- software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted
 
-# create lise of independent SNPs from data
+software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted
 
- software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers
 
 ## sort and keep our population
 software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --geno 0.05 --make-bed --out tmp/${POP}_sorteddata
 
 
-######################## GCTA
+# create list of population specific independent SNPs from data
+software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers
 
+######################## GCTA
 #grm
 
 # only run this line once per population to save time
@@ -47,7 +47,8 @@ software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESU
 ######################## PCAs
 
 ### calculate principal components for the population using the independent markers
- software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in
+software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex
+
 
 
 
@@ -56,10 +57,10 @@ software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESU
 
 echo "Start LDAK $(date)"
 ### calculate the weights and kinships for LDAK prior to GRM
- software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
+software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
 
 
- software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
+software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
 
 
 software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile  tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25
@@ -78,7 +79,7 @@ do
 	# makes tmp/${POP}_${TRAIT}_residuals.txt
 	
 	#bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} 
-	Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/${TRAIT}
+	Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/
 
 	##### Create phone files with the residuals
 	### prune and sort the data
@@ -91,10 +92,10 @@ do
 	# need to join based on number fo models 2.2 - 2.8  was for 7 models
 	software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT}
 	# this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols)
-	join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 tmp/${POP}_${TRAIT}.residuals.txt | cut -f2- )  > tmp/${POP}_${TRAIT}.pheno 
+	join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | cut -f2- )  > ${RESULTS}/${POP}_${TRAIT}.pheno 
 
 	# calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models)
-	pheno_cols=$( head -1 tmp/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u)	
+	pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u)	
 
 
 	#### GCTA REML for each model of current trait

From a6566ea6ce3fdc05f208415083688960368cd355 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Tue, 1 Jun 2021 17:20:46 +1200
Subject: [PATCH 06/23] calc pca for individual pops

---
 .../scripts/model_residuals_whole_pop.R       | 140 ++++++++++++++++++
 tanya_data/scripts/run_gcta_whole_pop.sh      |  64 ++++++++
 tanya_data/scripts/run_ldak_whole_pop.sh      |  45 ++++++
 tanya_data/scripts/whole_pop_pipeline.sh      | 116 +++++++++++++++
 4 files changed, 365 insertions(+)
 create mode 100644 tanya_data/scripts/model_residuals_whole_pop.R
 create mode 100644 tanya_data/scripts/run_gcta_whole_pop.sh
 create mode 100644 tanya_data/scripts/run_ldak_whole_pop.sh
 create mode 100644 tanya_data/scripts/whole_pop_pipeline.sh

diff --git a/tanya_data/scripts/model_residuals_whole_pop.R b/tanya_data/scripts/model_residuals_whole_pop.R
new file mode 100644
index 0000000..d186d45
--- /dev/null
+++ b/tanya_data/scripts/model_residuals_whole_pop.R
@@ -0,0 +1,140 @@
+if(!require(here)){
+  install.packages("here", repos = "https://cloud.r-project.org")
+  library("here")
+}
+if(!require(fs)){
+  install.packages("fs", repos = "https://cloud.r-project.org")
+  library(fs)
+}
+if(!require(tidyverse)){
+  install.packages("tidyverse", repos = "https://cloud.r-project.org")
+  library(tidyverse)
+}
+if(!require(optparse)){
+  install.packages("optparse", repos = "https://cloud.r-project.org")
+  library("optparse")
+}
+
+
+option_list = list(
+  make_option(c("--trait"), type="character", default=NULL,
+              help="Name of trait column that matches exactly from the phenotype file", metavar="character"),
+  make_option(c("-r", "--regression"), type="character", default="linear",
+              help="linear|logistic", metavar="character"),
+  make_option(c("-p","--pop"), type = "character", default=NULL,
+              help="Name of the population (no special characters except underscore).", metavar="character"),
+  make_option(c("--out_dir"), type = "character", default = "results",
+              help = "Name of the directory to store the results (no trailing slash needed).")
+  );
+
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);
+
+
+# name of trait column
+#trait <- "HEIGHT"
+if (is.null(opt$trait)){
+  print_help(opt_parser)
+  stop("Trait argument must be supplied.", call.=FALSE)
+} else {
+  trait <- opt$trait
+}
+
+
+# define model type: linear or logistic regression
+#model_type <- "linear"
+if(!(opt$regression == "linear" | opt$regression ==  "logistic")){
+  print_help(opt_parser)
+  stop("Regression argument must be 'linear' or 'logistic'.", call.=FALSE)
+} else {
+  model_type <- opt$regression
+}
+
+# population name
+if(is.null(opt$pop)){
+  print_help(opt_parser)
+  stop("Population (pop) argument must be supplied.", call.=FALSE)
+} else {
+  pop <- opt$pop
+}
+
+
+## Used for testing inside of an interactive session
+#pop <- 'nph'
+#model_type <- "linear"
+#trait <- "HEIGHT"
+#opt <-list(out_dir = "nph_results/")
+
+all_dat <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE)# %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI))
+newpca <- read_delim(file = here("results",paste0(pop,"_","pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = ""))
+
+if (!trait %in% names(all_dat)){
+  stop("Trait argument supplied does not match a column in the phenotypes.", call.=FALSE)
+}
+
+### add the trait information on to the samples in the PCA
+new_dat <- newpca %>% left_join(., all_dat, by = "SUBJECT")
+
+
+
+# Define list of variables for the models ####
+preds <- c("AGECOL", "SEX", paste0("PC",1:10))
+
+# generate models stepwise adding variables
+models <- list()
+for(pred_idx in seq_along(preds)){
+  if(pred_idx ==1){
+    # create the base model
+    models <- paste(trait,"~",preds[[pred_idx]])
+  } else{
+    # add the next covariate to the base model
+    models[[pred_idx]] <- paste(models[[pred_idx-1]], preds[[pred_idx]], sep = " + ")
+  }
+}
+
+# function to use for a given formula run the model as either linear or logistic regression
+# runs the supplied model on the supplied dataset and returns the model results as a list(model, formula)
+run_model <- function(model_formula, dat){
+  model_results <- list()
+  if(model_type == "linear"){
+    model_results[["model"]] <- lm(model_formula, data = dat, na.action = na.exclude)
+  } else if(model_type == "logistic"){
+    model_results[["model"]] <- glm(model_formula, family = binomial(link='logit'), data = dat, na.action = na.exclude)
+  }
+  
+  model_results[["formula"]] <- model_formula
+  return(model_results)
+}
+
+# test example
+#run_model(model_formula = models[[1]], new_dat)
+
+
+
+# run the set of models on a the data
+model_results <- purrr::map(models, run_model, new_dat) %>% set_names(., nm =  paste0("model",seq_along(models)))
+names(model_results) <- paste0("model",seq_along(models))
+
+
+
+# put the residuals from all models into a dataframe for each cv
+model_residuals <- map(model_results, list("model", "residuals"))
+model_remove <-  map(model_results, list("model","na.action"))
+
+
+# write out residuals, columns: SUBJECT TRAIT model1 ... modelx
+new_dat %>% 
+  select(SUBJECT, !!trait) %>% # pull out ids and trait column
+  slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals
+  bind_cols(model_residuals) %>% 
+  write_delim(file = here("tmp",paste0(pop,"_",trait,".residuals.txt")),
+              delim = " ",
+              col_names = FALSE)
+
+  
+message(paste("residuals file:", here("tmp",paste0(pop,"_",trait,".residuals.txt")))) 
+
+
+
+saveRDS(list(model_residuals, model_remove, trait, model_type, models, preds), file = here(opt$out_dir,paste0(pop,"_",trait,".RDS")))
+
diff --git a/tanya_data/scripts/run_gcta_whole_pop.sh b/tanya_data/scripts/run_gcta_whole_pop.sh
new file mode 100644
index 0000000..c40fbc6
--- /dev/null
+++ b/tanya_data/scripts/run_gcta_whole_pop.sh
@@ -0,0 +1,64 @@
+### run_gcta_whole_pop.sh
+
+## run: bash run_gcta_whole_pop.sh BFILE MODEL
+
+BFILE=$(basename $1 .fam)
+
+
+POP=$(echo $BFILE | cut -d'_' -f1)
+
+TRAIT=$(echo $BFILE | cut -d'_' -f2)
+
+
+
+MODEL=$2
+
+PHENOCOL=$(echo $MODEL + 4 | bc)
+
+
+
+
+
+
+
+
+
+echo "****** $REGRESSION ******"
+
+
+
+
+######################## GCTA
+
+#grm only run this line once per population to save time
+
+# transfered to the phenoprep script
+
+
+
+
+# software/gcta64 --bfile tmp/ --autosome --make-grm --out $DIR/GCTA/grm
+
+
+
+#reml
+
+# $(seq 1 to how many trait/residual columns 
+
+software/gcta64 --reml --reml-pred-rand --grm ${POP}_results/${POP}_gcta_grm --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4
+
+
+
+
+
+# BLUP solutions for the SNP effects
+
+#software/plink1.9b6.10 --bfile data/data --keep tmp/$BFILE --maf 0.01 --geno 0.05 --make-bed --out tmp/$BFILE
+
+software/gcta64 --bfile tmp/$BFILE --blup-snp ${POP}_results/GCTA/${TRAIT}_${MODEL}.indi.blp --autosome --out ${POP}_results/GCTA/${TRAIT}_${MODEL} --threads 4
+
+
+
+cat  ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq |tr -s " " | tr " " "\t" > ${POP}_results/GCTA/${TRAIT}_${MODEL}.hsq.tsv
+
+
diff --git a/tanya_data/scripts/run_ldak_whole_pop.sh b/tanya_data/scripts/run_ldak_whole_pop.sh
new file mode 100644
index 0000000..77d0442
--- /dev/null
+++ b/tanya_data/scripts/run_ldak_whole_pop.sh
@@ -0,0 +1,45 @@
+### run_ldak_whole_pop.sh
+
+
+
+BFILE=$(basename $1 .fam)
+
+
+POP=$(echo $BFILE | cut -d'_' -f1)
+
+TRAIT=$(echo $BFILE | cut -d'_' -f2)
+
+
+
+MODEL=$2
+
+PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID
+
+
+
+####################### LDAK 
+
+### reml
+
+software/LDAK/ldak5.linux --reml ${POP}_results/LDAK/${TRAIT}_${MODEL} --pheno tmp/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${POP}_results/${POP}_ldak_kinships
+
+
+
+
+### blups
+
+software/LDAK/ldak5.linux --calc-blups ${POP}_results/LDAK/${TRAIT}_${MODEL} --remlfile ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml --grm results/${POP}_ldak_kinships --bfile tmp/${BFILE} --check-root NO
+
+
+
+
+
+
+
+## make better results files
+
+grep "^Her\|^Com"  ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml  >  ${POP}_results/LDAK/${TRAIT}_${MODEL}.h2
+
+grep "^LRT_P"  ${POP}_results/LDAK/${TRAIT}_${MODEL}.reml > ${POP}_results/LDAK/${TRAIT}_${MODEL}.p 
+
+
diff --git a/tanya_data/scripts/whole_pop_pipeline.sh b/tanya_data/scripts/whole_pop_pipeline.sh
new file mode 100644
index 0000000..3db87a8
--- /dev/null
+++ b/tanya_data/scripts/whole_pop_pipeline.sh
@@ -0,0 +1,116 @@
+### whole_pop_pipeline.sh
+
+## Run:
+
+
+## bash scripts/whole_pop_pipeline.sh data/data {POP_NAME} 
+
+## 
+
+
+
+
+ORIG_DATA=$1
+POP=$2
+RESULTS=${POP}_results/
+
+
+
+
+echo "START"
+
+date
+
+
+#### Initial setup #####
+mkdir -p tmp ${RESULTS}/
+
+ software/plink1.9b6.10 --bfile ${ORIG_DATA} --make-bed --out tmp/data_sorted
+
+# create lise of independent SNPs from data
+
+ software/plink1.9b6.10 --bfile tmp/data_sorted --indep-pairwise 50 5 0.2 --maf 0.1 --out tmp/data_pca_markers
+
+## sort and keep our population
+software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01 --geno 0.05 --make-bed --out tmp/${POP}_sorteddata
+
+
+######################## GCTA
+
+#grm
+
+# only run this line once per population to save time
+software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4
+
+
+
+######################## PCAs
+
+### calculate principal components for the population using the independent markers
+ software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract data/data_pca_markers.prune.in
+
+
+
+
+####################### LDAK
+
+echo "Start LDAK $(date)"
+### calculate the weights and kinships for LDAK prior to GRM
+ software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
+
+
+ software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
+
+
+software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile  tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25
+
+echo "Start traits $(date)"
+# example line to use for testing one iteration
+#line="HEIGHT,linear",
+while read line
+do
+	TRAIT=$(echo $line | cut -d',' -f1)
+	REGRESSION_TYPE=$(echo $line | cut -d',' -f2)
+
+	echo "Start ${TRAIT} $(date)"
+	mkdir -p ${RESULTS}/{GCTA,LDAK}
+	##### Generate regression residuals for each model #####
+	# makes tmp/${POP}_${TRAIT}_residuals.txt
+	
+	#bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} 
+	Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/${TRAIT}
+
+	##### Create phone files with the residuals
+	### prune and sort the data
+	# prune to only those with residuals
+	# residual are population specific with PCAs included in the model outside this script.
+	# combine residuals to fam file
+	# this step need to be repeated for each of the groups at bayesR section
+	# columns 8-12 are the FIVE residuals
+	# height, egfr, serumurate, diabetes and gout in that order
+	# need to join based on number fo models 2.2 - 2.8  was for 7 models
+	software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT}
+	# this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols)
+	join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 tmp/${POP}_${TRAIT}.residuals.txt | cut -f2- )  > tmp/${POP}_${TRAIT}.pheno 
+
+	# calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models)
+	pheno_cols=$( head -1 tmp/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u)	
+
+
+	#### GCTA REML for each model of current trait
+	# run the following with 16 parallel jobs
+	parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam  ::: ${pheno_cols} 
+
+	#### LDAK for each model of current trait
+	parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' :::  tmp/${POP}_${TRAIT}.fam  ::: ${pheno_cols} 
+
+done < data/pop_trait_models.csv
+
+
+
+
+echo "FINISH"
+
+date
+
+

From 39971250c427ede220eb88182464749186818d60 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Fri, 4 Jun 2021 14:08:44 +1200
Subject: [PATCH 07/23] exclude missing data before making models

---
 scripts/model_residuals_whole_pop.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R
index 3c383ab..ba19326 100644
--- a/scripts/model_residuals_whole_pop.R
+++ b/scripts/model_residuals_whole_pop.R
@@ -65,7 +65,7 @@ if(is.null(opt$pop)){
 #trait <- "HEIGHT"
 #opt <-list(out_dir = "nph_results/")
 
-all_dat <- read_csv(file = here("data/curated_data.csv"), col_names = TRUE) %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI))
+all_dat <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) # %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI))
 newpca <- read_delim(file = here(paste0(pop,"_results"),paste0(pop,"_pcafile.eigenvec")), delim = " ", col_names = c("FID","SUBJECT",paste0("PC",1:10)), col_types = paste0(c("c","c",rep("d", 10)),collapse = ""))
 
 if (!trait %in% names(all_dat)){
@@ -80,6 +80,9 @@ new_dat <- newpca %>% left_join(., all_dat, by = "SUBJECT")
 # Define list of variables for the models ####
 preds <- c("AGECOL", "SEX", paste0("PC",1:10))
 
+# subset the data to only include the variables needed to run the models and make it so that only individuals with complete observations are included
+new_dat <- new_dat %>% select(SUBJECT, !!trait, all_of(preds)) %>% drop_na()
+
 # generate models stepwise adding variables
 models <- list()
 for(pred_idx in seq_along(preds)){
@@ -125,7 +128,7 @@ model_remove <-  map(model_results, list("model","na.action"))
 # write out residuals, columns: SUBJECT TRAIT model1 ... modelx
 new_dat %>% 
   select(SUBJECT, !!trait) %>% # pull out ids and trait column
-  slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals
+  #slice(-unlist(model_remove)) %>% # remove rows that didn't have residuals (step no longer needed due to step before model creation that only lets complete obs go into the model)
   bind_cols(model_residuals) %>% 
   write_delim(file = here(opt$out_dir,paste0(pop,"_",trait,".residuals.txt")),
               delim = " ",

From 45d9ac707aa477ac82056ac6531a4a7615446036 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Wed, 23 Jun 2021 10:09:58 +1200
Subject: [PATCH 08/23] Update whole_pop_pipeline.sh

---
 scripts/whole_pop_pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh
index 8a1f43b..738fcea 100644
--- a/scripts/whole_pop_pipeline.sh
+++ b/scripts/whole_pop_pipeline.sh
@@ -92,7 +92,7 @@ do
 	# need to join based on number fo models 2.2 - 2.8  was for 7 models
 	software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT}
 	# this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols)
-	join -1 2 -2 1 -o auto -e "NA" <(sort -k1 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | cut -f2- )  > ${RESULTS}/${POP}_${TRAIT}.pheno 
+	join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt)  > ${RESULTS}/${POP}_${TRAIT}.pheno 
 
 	# calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models)
 	pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u)	

From 4a0c47ee2935b2f3f40546819736853a661108c0 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Wed, 23 Jun 2021 10:28:50 +1200
Subject: [PATCH 09/23] use separate pheno file per model

---
 scripts/run_gcta_whole_pop.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh
index 44e069e..c32ef9a 100644
--- a/scripts/run_gcta_whole_pop.sh
+++ b/scripts/run_gcta_whole_pop.sh
@@ -13,7 +13,8 @@ TRAIT=$(echo $BFILE | cut -d'_' -f2)
 
 MODEL=$2
 
-PHENOCOL=$(echo $MODEL + 4 | bc)
+# pulls out the from the start of the residuals - first 7 cols are: FID IID PID MID SEX AFF TRUEPHENOVALUE
+PHENOCOL=$(echo $MODEL + 7 | bc)
 
 
 
@@ -44,8 +45,8 @@ echo "****** $REGRESSION ******"
 #reml
 
 # $(seq 1 to how many trait/residual columns 
-
-software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno --mpheno ${PHENOCOL} --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4
+cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL}
+software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno${MODEL} --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4
 
 
 

From f985064821627897bd57a8e7ec7d638a7ff420c4 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Wed, 23 Jun 2021 10:32:34 +1200
Subject: [PATCH 10/23] use separate pheno for each model

---
 scripts/run_ldak_whole_pop.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh
index 799a487..3c4ce50 100644
--- a/scripts/run_ldak_whole_pop.sh
+++ b/scripts/run_ldak_whole_pop.sh
@@ -14,14 +14,14 @@ TRAIT=$(echo $BFILE | cut -d'_' -f2)
 MODEL=$2
 
 PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID
-
+cut -d' ' -f1,2,${PHENOCOL} <  ${RESULTS}/${BFILE}.pheno >  ${RESULTS}/${BFILE}.pheno${MODEL}_ldak
 
 
 ####################### LDAK 
 
 ### reml
 
-software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}_${MODEL} --pheno ${RESULTS}/${BFILE}.pheno --mpheno ${PHENOCOL} --grm ${RESULTS}/${POP}_ldak_kinships
+software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}_${MODEL} --pheno ${RESULTS}/${BFILE}.pheno${MODEL}_ldak  --grm ${RESULTS}/${POP}_ldak_kinships
 
 
 

From 3ee50ac03c8a0716199f2af8923e1c45260e1766 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Wed, 23 Jun 2021 10:33:35 +1200
Subject: [PATCH 11/23] use gcta suffix on pheno file

---
 scripts/run_gcta_whole_pop.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/run_gcta_whole_pop.sh b/scripts/run_gcta_whole_pop.sh
index c32ef9a..e847510 100644
--- a/scripts/run_gcta_whole_pop.sh
+++ b/scripts/run_gcta_whole_pop.sh
@@ -15,7 +15,7 @@ MODEL=$2
 
 # pulls out the from the start of the residuals - first 7 cols are: FID IID PID MID SEX AFF TRUEPHENOVALUE
 PHENOCOL=$(echo $MODEL + 7 | bc)
-
+cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL}_gcta
 
 
 
@@ -45,8 +45,8 @@ echo "****** $REGRESSION ******"
 #reml
 
 # $(seq 1 to how many trait/residual columns 
-cut -d' ' -f1,2,${PHENOCOL} < ${RESULTS}/${BFILE}.pheno > ${RESULTS}/${BFILE}.pheno${MODEL}
-software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno${MODEL} --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4
+
+software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno ${RESULTS}/${BFILE}.pheno${MODEL}_gcta --out ${RESULTS}/GCTA/${TRAIT}_${MODEL} --threads 4
 
 
 

From d2c3eb009c50d5a8c2b25726b29d3fbdc53d3158 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Wed, 23 Jun 2021 10:34:56 +1200
Subject: [PATCH 12/23] Update whole_pop_pipeline.sh

---
 scripts/whole_pop_pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh
index 738fcea..662abfc 100644
--- a/scripts/whole_pop_pipeline.sh
+++ b/scripts/whole_pop_pipeline.sh
@@ -90,7 +90,7 @@ do
 	# columns 8-12 are the FIVE residuals
 	# height, egfr, serumurate, diabetes and gout in that order
 	# need to join based on number fo models 2.2 - 2.8  was for 7 models
-	software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --out tmp/${POP}_${TRAIT}
+	software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --maf 0.01 --geno 0.05 --out tmp/${POP}_${TRAIT}
 	# this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols)
 	join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt)  > ${RESULTS}/${POP}_${TRAIT}.pheno 
 

From ffeb7d8c9c5f2b0112dd1ce1e6c5b00b272069e8 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Wed, 23 Jun 2021 10:36:40 +1200
Subject: [PATCH 13/23] extract results at the end

---
 scripts/whole_pop_pipeline.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh
index 8a1f43b..802324c 100644
--- a/scripts/whole_pop_pipeline.sh
+++ b/scripts/whole_pop_pipeline.sh
@@ -104,9 +104,14 @@ do
 
 	#### LDAK for each model of current trait
 	parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' :::  tmp/${POP}_${TRAIT}.fam  ::: ${pheno_cols} 
-
 done < data/pop_trait_models.csv
 
+WD=$(pwd)
+cd ${RESULTS}/LDAK
+# combine all LDAK h2 for pop into a single file
+grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt
+# return to the orignal directory
+cd $WD
 
 
 

From 9d699101a33398705c317b8c22a93c1684444533 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Wed, 23 Jun 2021 10:51:07 +1200
Subject: [PATCH 14/23] extract from actual original data

---
 scripts/whole_pop_pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh
index a05ab3e..2ae8e68 100644
--- a/scripts/whole_pop_pipeline.sh
+++ b/scripts/whole_pop_pipeline.sh
@@ -90,7 +90,7 @@ do
 	# columns 8-12 are the FIVE residuals
 	# height, egfr, serumurate, diabetes and gout in that order
 	# need to join based on number fo models 2.2 - 2.8  was for 7 models
-	software/plink1.9b6.10 --bfile data/data --keep data/${POP}.keep --make-bed --maf 0.01 --geno 0.05 --out tmp/${POP}_${TRAIT}
+	software/plink1.9b6.10 --bfile ${ORIG_DATA} --keep data/${POP}.keep --make-bed --maf 0.01 --geno 0.05 --out tmp/${POP}_${TRAIT}
 	# this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols)
 	join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt)  > ${RESULTS}/${POP}_${TRAIT}.pheno 
 

From 5f6cd93bf1f10b1fec71c0717b34949b641a2da3 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Wed, 14 Jul 2021 11:06:46 +1200
Subject: [PATCH 15/23] new scripts for running models on pca selected
 populations

---
 ...DAK_GCTA_heritability_with_trait_values.sh | 34 ++++++++
 scripts/make_pheno_and_covar_files.R          | 72 +++++++++++++++++
 scripts/pca_select_pops.R                     | 81 +++++++++++++++++++
 3 files changed, 187 insertions(+)
 create mode 100644 scripts/LDAK_GCTA_heritability_with_trait_values.sh
 create mode 100644 scripts/make_pheno_and_covar_files.R
 create mode 100644 scripts/pca_select_pops.R

diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
new file mode 100644
index 0000000..dcfb48f
--- /dev/null
+++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
@@ -0,0 +1,34 @@
+
+POP=nphpca
+RESULTS=${POP}_results
+mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+for covar in ls ${RESULTS}/*.covar
+do
+ TRAIT=gout
+ MODEL=$(basename ${covar} .covar)
+ PREV=0.049
+ mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+ software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
+ software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
+
+done > ${POP}_${TRAIT}.log
+
+for covar in ls ${RESULTS}/*.covar
+do
+ TRAIT=t2d
+ MODEL=$(basename ${covar} .covar)
+ PREV=0.078
+ mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+ software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
+  software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
+done > ${POP}_${TRAIT}.log
+
+
+
+
+#Code for transferring directories to local computer..
+# scp -r benrangihuna@biochemcompute1.uod.otago.ac.nz:/Volumes/scratch/merrimanlab/ben/genomic_predictions/{nph,euro,west,east}pca_results ~/Documents/genomic_prediction_results/
+
+
+
+
diff --git a/scripts/make_pheno_and_covar_files.R b/scripts/make_pheno_and_covar_files.R
new file mode 100644
index 0000000..303c398
--- /dev/null
+++ b/scripts/make_pheno_and_covar_files.R
@@ -0,0 +1,72 @@
+library(tidyverse)
+library(here)
+
+#CREBRF MASTER FILE (FROM TANYA)
+tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>%
+  filter(SUBJECT != "Blank")
+
+
+
+nph <- read_delim(here("nphpca_results/nphpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ')
+east <- read_delim(here("eastpca_results/eastpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ')
+west <- read_delim(here("westpca_results/westpca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ')
+euro <- read_delim(here("europca_results/europca_pcafile.eigenvec"), col_names = c("FID","IID",paste0("PCA",1:10)), delim = ' ')
+
+# pheno files
+
+all_IIDS <- c(nph$IID, east$IID, west$IID, euro$IID)
+
+tanya <- tanya %>% filter(SUBECT %in% all_IIDS)
+
+tanya %>% select(IID = SUBJECT, GOUT) %>%
+  mutate(GOUT_recode = case_when(str_detect(GOUT,"Control") ~ 1,
+                          GOUT %in% c("ACR Gout","GP Gout") ~ 2,
+                          TRUE ~ NA_real_
+                          ),
+         FID = IID) %>% select(FID, IID, GOUT = GOUT_recode) %>%
+  write_tsv(here("data/tanya_gout.pheno"))
+
+
+tanya %>% select(IID = SUBJECT, TYPE2D) %>%
+  mutate(TYPE2D_recode = case_when(TYPE2D == "No" ~ 1,
+                                 TYPE2D == "Yes" ~ 2,
+                                 TRUE ~ NA_real_
+  ),
+  FID = IID) %>% select(FID, IID, TYPE2D = TYPE2D_recode) %>% write_tsv(here("data/tanya_t2d.pheno"))
+
+# covar files
+nph_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>%
+  select(FID,IID, AGECOL, SEX) %>%
+  right_join(nph)
+
+east_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>%
+  select(FID,IID, AGECOL, SEX) %>%
+  right_join(east)
+
+west_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>%
+  select(FID,IID, AGECOL, SEX) %>%
+  right_join(west)
+
+euro_covar <- tanya %>% mutate(FID = SUBJECT, IID = SUBJECT) %>%
+  select(FID,IID, AGECOL, SEX) %>%
+  right_join(euro)
+
+cols <- names(nph_covar)[-1:-2]
+for(n in cols){
+  p <- nph_covar %>% select(FID, IID:!!n)
+  fn <- paste(names(p)[-1:-2], collapse = "_")
+  print(fn)
+  write_tsv(p, here("nphpca_results/",paste0("nphpca_", fn,".covar")))
+
+  east_covar %>% select(FID, IID:!!n) %>%
+    write_tsv( here("eastpca_results/",paste0("eastpca_", fn,".covar")))
+
+  west_covar %>% select(FID, IID:!!n) %>%
+    write_tsv( here("westpca_results/",paste0("westpca_", fn,".covar")))
+
+  euro_covar %>% select(FID, IID:!!n) %>%
+    write_tsv( here("europca_results/",paste0("europca_", fn,".covar")))
+
+}
+
+
diff --git a/scripts/pca_select_pops.R b/scripts/pca_select_pops.R
new file mode 100644
index 0000000..0964374
--- /dev/null
+++ b/scripts/pca_select_pops.R
@@ -0,0 +1,81 @@
+## 12 July 2021.
+
+## PCA based Pop selection.
+library(tidyverse)
+library(here)
+library(patchwork)
+#CREBRF MASTER FILE (FROM TANYA)
+tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) # %>% mutate(GOUT01recode = GOUT -1, T2D01recode = T2D -1, logBMI= log(BMI))
+
+orig <- read_csv(here("data/curated_data.csv"))
+
+east <- read_tsv(here("data/east597btanya.keep"), col_names = FALSE) %>% mutate(pop = "east")
+
+west <- read_tsv(here("data/west597btanya.keep"), col_names = FALSE) %>% mutate(pop = "west")
+
+euro <- read_tsv(here("data/euro597btanya.keep"), col_names = FALSE) %>% mutate(pop = "euro")
+
+nph <- read_tsv(here("data/nphtanya.keep"), col_names = FALSE) %>% mutate(pop ="nph")
+
+pops <- bind_rows(east, west, euro, nph)
+
+pops <- pops %>% left_join(orig %>%
+                             select(SUBJECT, starts_with("PC")),
+                           by = c("X2" = "SUBJECT")) %>%
+  drop_na()
+
+
+# make plots using the original global coreExome PCs
+theme_set(theme_bw())
+p1 <- pops %>% ggplot(aes(x = PCA2, y = PCA3, colour = pop)) + geom_point() +
+  ggtitle("Original") +
+  geom_hline(yintercept = 0.0025, colour = "red", linetype = "dashed") +
+  geom_vline(xintercept = 0, colour = 'red', linetype = "dashed")
+
+
+new_nph <- pops %>% filter(pop == "nph" & PCA2 > 0 & PCA3 > 0.0025) %>%
+  select(X1,X2) %>% write_delim(file = here("data/nphpca.keep"),
+                                delim = "\t",
+                                col_names = FALSE) %>% mutate(pop = "nph")
+
+new_east <- pops %>% filter(pop == "east" & PCA2 > 0 & PCA3 > 0.0025) %>%
+  select(X1,X2) %>% write_delim(file = here("data/eastpca.keep"),
+                                delim = "\t",
+                                col_names = FALSE) %>% mutate(pop ="east")
+
+new_west <- pops %>% filter(pop == "west" & PCA2 > 0 & PCA3 < 0.0025) %>%
+  select(X1,X2) %>% write_delim(file = here("data/westpca.keep"),
+                                delim = "\t",
+                                col_names = FALSE) %>% mutate(pop = "west")
+
+new_euro <- pops %>% filter(pop == "euro" & PCA2 < 0) %>%
+  select(X1,X2) %>% write_delim(file = here("data/europca.keep"),
+                                delim = "\t",
+                                col_names = FALSE) %>% mutate(pop = "euro")
+
+
+
+new_pops <- bind_rows(new_east, new_west, new_euro, new_nph)
+
+new_pops <- new_pops %>% left_join(orig %>%
+                                     select(SUBJECT, starts_with("PC")),
+                                   by = c("X2" = "SUBJECT")) %>%
+  drop_na()
+
+
+# make plots using the original global coreExome PCs
+
+p2 <- new_pops %>% ggplot(aes(x = PCA2, y = PCA3, colour = pop)) +
+  geom_point() +
+  ggtitle("After PCA Thresholds") +
+  geom_hline(yintercept = 0.0025, colour = "red", linetype = "dashed") +
+  geom_vline(xintercept = 0, colour = 'red', linetype = "dashed")
+
+p1 + p2 + patchwork::plot_layout(guides = "collect")
+
+
+# make covar files
+new_pops %>% left_join(tanya %>% select(X1 = SUBJECT, AGECOL, SEX), by = "X1") %>%
+  relocate(AGECOL, SEX, .after = X2) %>% rename(FID = X1, IID = X2) %>%
+  select(-pop) %>% write_tsv("data/tanya_covar.csv")
+

From cb360a5ca705b190099ff527fbb1bc29b7df1791 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Wed, 14 Jul 2021 11:08:41 +1200
Subject: [PATCH 16/23] all pops

---
 ...DAK_GCTA_heritability_with_trait_values.sh | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
index dcfb48f..5fdde54 100644
--- a/scripts/LDAK_GCTA_heritability_with_trait_values.sh
+++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
@@ -24,6 +24,85 @@ do
 done > ${POP}_${TRAIT}.log
 
 
+## East
+
+POP=eastpca
+RESULTS=${POP}_results
+mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+for covar in ls ${RESULTS}/*.covar
+do
+ TRAIT=gout
+ MODEL=$(basename ${covar} .covar)
+ PREV=0.049
+ mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+ software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
+ software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
+
+done > ${POP}_${TRAIT}.log
+
+for covar in ls ${RESULTS}/*.covar
+do
+ TRAIT=t2d
+ MODEL=$(basename ${covar} .covar)
+ PREV=0.078
+ mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+ software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
+  software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
+done > ${POP}_${TRAIT}.log
+
+## West
+
+POP=westpca
+RESULTS=${POP}_results
+mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+for covar in ls ${RESULTS}/*.covar
+do
+ TRAIT=gout
+ MODEL=$(basename ${covar} .covar)
+ PREV=0.049
+ mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+ software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
+ software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
+
+done > ${POP}_${TRAIT}.log
+
+for covar in ls ${RESULTS}/*.covar
+do
+ TRAIT=t2d
+ MODEL=$(basename ${covar} .covar)
+ PREV=0.078
+ mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+ software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
+  software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
+done > ${POP}_${TRAIT}.log
+
+
+## Euro
+
+
+POP=europca
+RESULTS=${POP}_results
+mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+for covar in ls ${RESULTS}/*.covar
+do
+ TRAIT=gout
+ MODEL=$(basename ${covar} .covar)
+ PREV=0.049
+ mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+ software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
+ software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
+
+done > ${POP}_${TRAIT}.log
+
+for covar in ls ${RESULTS}/*.covar
+do
+ TRAIT=t2d
+ MODEL=$(basename ${covar} .covar)
+ PREV=0.078
+ mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
+ software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
+  software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
+done > ${POP}_${TRAIT}.log
 
 
 #Code for transferring directories to local computer..

From f5981dab13b538403a7bc5ddee9edcca6c1e65de Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Wed, 14 Jul 2021 11:18:29 +1200
Subject: [PATCH 17/23] update prevalences

---
 ...DAK_GCTA_heritability_with_trait_values.sh | 34 ++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
index 5fdde54..d476f61 100644
--- a/scripts/LDAK_GCTA_heritability_with_trait_values.sh
+++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
@@ -1,10 +1,11 @@
-
+echo NPH
 POP=nphpca
 RESULTS=${POP}_results
+TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
 for covar in ls ${RESULTS}/*.covar
 do
- TRAIT=gout
+
  MODEL=$(basename ${covar} .covar)
  PREV=0.049
  mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
@@ -13,9 +14,9 @@ do
 
 done > ${POP}_${TRAIT}.log
 
+TRAIT=t2d
 for covar in ls ${RESULTS}/*.covar
 do
- TRAIT=t2d
  MODEL=$(basename ${covar} .covar)
  PREV=0.078
  mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
@@ -25,26 +26,26 @@ done > ${POP}_${TRAIT}.log
 
 
 ## East
-
+echo East
 POP=eastpca
 RESULTS=${POP}_results
+TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
 for covar in ls ${RESULTS}/*.covar
 do
- TRAIT=gout
  MODEL=$(basename ${covar} .covar)
- PREV=0.049
+ PREV=0.043
  mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
  software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
  software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
 
 done > ${POP}_${TRAIT}.log
 
+TRAIT=t2d
 for covar in ls ${RESULTS}/*.covar
 do
- TRAIT=t2d
  MODEL=$(basename ${covar} .covar)
- PREV=0.078
+ PREV=0.084
  mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
  software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
   software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
@@ -54,23 +55,24 @@ done > ${POP}_${TRAIT}.log
 
 POP=westpca
 RESULTS=${POP}_results
+TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
 for covar in ls ${RESULTS}/*.covar
 do
- TRAIT=gout
  MODEL=$(basename ${covar} .covar)
- PREV=0.049
+ PREV=0.051
  mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
  software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
  software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
 
 done > ${POP}_${TRAIT}.log
 
+TRAIT=t2d
 for covar in ls ${RESULTS}/*.covar
 do
- TRAIT=t2d
+
  MODEL=$(basename ${covar} .covar)
- PREV=0.078
+ PREV=0.146
  mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
  software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
   software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
@@ -82,23 +84,23 @@ done > ${POP}_${TRAIT}.log
 
 POP=europca
 RESULTS=${POP}_results
+TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
 for covar in ls ${RESULTS}/*.covar
 do
- TRAIT=gout
  MODEL=$(basename ${covar} .covar)
- PREV=0.049
+ PREV=0.024
  mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
  software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
  software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4
 
 done > ${POP}_${TRAIT}.log
 
+TRAIT=t2d
 for covar in ls ${RESULTS}/*.covar
 do
- TRAIT=t2d
  MODEL=$(basename ${covar} .covar)
- PREV=0.078
+ PREV=0.049
  mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
  software/LDAK/ldak5.linux --reml ${RESULTS}/LDAK/${TRAIT}/${MODEL} --pheno data/tanya_${TRAIT}.pheno  --grm ${RESULTS}/${POP}_ldak_kinships --prevalence ${PREV} --covar ${covar}
   software/gcta64 --reml --reml-pred-rand --grm ${RESULTS}/${POP}_gcta_grm --pheno data/tanya_${TRAIT}.pheno --prevalence ${PREV} --out ${RESULTS}/GCTA/${TRAIT}/${MODEL} --qcovar ${covar} --threads 4

From 2c5a01ba9bcd053b232b84a466bf19b6f990a39a Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Thu, 22 Jul 2021 11:32:02 +1200
Subject: [PATCH 18/23] adding subshells ({})  into "for covar in.." lines of
 code

---
 ...DAK_GCTA_heritability_with_trait_values.sh | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
index d476f61..1614fb2 100644
--- a/scripts/LDAK_GCTA_heritability_with_trait_values.sh
+++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
@@ -3,7 +3,7 @@ POP=nphpca
 RESULTS=${POP}_results
 TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
-for covar in ls ${RESULTS}/*.covar
+for covar in {ls ${RESULTS}/*.covar}
 do
 
  MODEL=$(basename ${covar} .covar)
@@ -15,7 +15,7 @@ do
 done > ${POP}_${TRAIT}.log
 
 TRAIT=t2d
-for covar in ls ${RESULTS}/*.covar
+for covar in {ls ${RESULTS}/*.covar}
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.078
@@ -31,7 +31,7 @@ POP=eastpca
 RESULTS=${POP}_results
 TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
-for covar in ls ${RESULTS}/*.covar
+for covar in {ls ${RESULTS}/*.covar}
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.043
@@ -42,7 +42,7 @@ do
 done > ${POP}_${TRAIT}.log
 
 TRAIT=t2d
-for covar in ls ${RESULTS}/*.covar
+for covar in {ls ${RESULTS}/*.covar}
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.084
@@ -52,12 +52,12 @@ do
 done > ${POP}_${TRAIT}.log
 
 ## West
-
+echo WEST
 POP=westpca
 RESULTS=${POP}_results
 TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
-for covar in ls ${RESULTS}/*.covar
+for covar in {ls ${RESULTS}/*.covar}
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.051
@@ -68,7 +68,7 @@ do
 done > ${POP}_${TRAIT}.log
 
 TRAIT=t2d
-for covar in ls ${RESULTS}/*.covar
+for covar in {ls ${RESULTS}/*.covar}
 do
 
  MODEL=$(basename ${covar} .covar)
@@ -81,12 +81,12 @@ done > ${POP}_${TRAIT}.log
 
 ## Euro
 
-
+echo EURO
 POP=europca
 RESULTS=${POP}_results
 TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
-for covar in ls ${RESULTS}/*.covar
+for covar in {ls ${RESULTS}/*.covar}
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.024
@@ -97,7 +97,7 @@ do
 done > ${POP}_${TRAIT}.log
 
 TRAIT=t2d
-for covar in ls ${RESULTS}/*.covar
+for covar in {ls ${RESULTS}/*.covar}
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.049

From 2735592ad71c0e0d2ad272df227021fa7ee50a69 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Thu, 22 Jul 2021 16:42:29 +1200
Subject: [PATCH 19/23] remove {} from "for covar in.." lines

---
 .../LDAK_GCTA_heritability_with_trait_values.sh  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/scripts/LDAK_GCTA_heritability_with_trait_values.sh b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
index 1614fb2..daeaa9c 100644
--- a/scripts/LDAK_GCTA_heritability_with_trait_values.sh
+++ b/scripts/LDAK_GCTA_heritability_with_trait_values.sh
@@ -3,7 +3,7 @@ POP=nphpca
 RESULTS=${POP}_results
 TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
-for covar in {ls ${RESULTS}/*.covar}
+for covar in ls ${RESULTS}/*.covar
 do
 
  MODEL=$(basename ${covar} .covar)
@@ -15,7 +15,7 @@ do
 done > ${POP}_${TRAIT}.log
 
 TRAIT=t2d
-for covar in {ls ${RESULTS}/*.covar}
+for covar in ls ${RESULTS}/*.covar
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.078
@@ -31,7 +31,7 @@ POP=eastpca
 RESULTS=${POP}_results
 TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
-for covar in {ls ${RESULTS}/*.covar}
+for covar in ls ${RESULTS}/*.covar
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.043
@@ -42,7 +42,7 @@ do
 done > ${POP}_${TRAIT}.log
 
 TRAIT=t2d
-for covar in {ls ${RESULTS}/*.covar}
+for covar in ls ${RESULTS}/*.covar
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.084
@@ -57,7 +57,7 @@ POP=westpca
 RESULTS=${POP}_results
 TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
-for covar in {ls ${RESULTS}/*.covar}
+for covar in ls ${RESULTS}/*.covar
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.051
@@ -68,7 +68,7 @@ do
 done > ${POP}_${TRAIT}.log
 
 TRAIT=t2d
-for covar in {ls ${RESULTS}/*.covar}
+for covar in ls ${RESULTS}/*.covar
 do
 
  MODEL=$(basename ${covar} .covar)
@@ -86,7 +86,7 @@ POP=europca
 RESULTS=${POP}_results
 TRAIT=gout
 mkdir -p ${RESULTS}/{LDAK,GCTA}/${TRAIT}
-for covar in {ls ${RESULTS}/*.covar}
+for covar in ls ${RESULTS}/*.covar
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.024
@@ -97,7 +97,7 @@ do
 done > ${POP}_${TRAIT}.log
 
 TRAIT=t2d
-for covar in {ls ${RESULTS}/*.covar}
+for covar in ls ${RESULTS}/*.covar
 do
  MODEL=$(basename ${covar} .covar)
  PREV=0.049

From 75dfe6b3fd392e92d27b7a29322e6dd0c9cd6451 Mon Sep 17 00:00:00 2001
From: Ben Rangihuna <Benrangihuna@gmail.com>
Date: Wed, 22 Sep 2021 12:12:56 +1200
Subject: [PATCH 20/23] updates to pipeline and supplimental scripts

---
 scripts/cv_pipeline.sh                    |   2 +-
 scripts/make_pheno_and_covar_files.R      |   2 +-
 scripts/make_pop_summary_stats.R          |  49 +++++++++
 scripts/make_sex_specific_pop_files.R     |  60 ++++++++++
 scripts/model_residuals_whole_data.R      |   0
 scripts/pca_select_pops.R                 |   2 +-
 scripts/run_ldak_whole_pop.sh             |   2 +-
 scripts/whole_pop_pipeline.sh             |   6 +-
 scripts/whole_pop_pipeline_CETP_CREBRF.sh | 127 +++++++++++++++++++++
 scripts/whole_pop_pipeline_no_polysnps.sh | 128 ++++++++++++++++++++++
 10 files changed, 371 insertions(+), 7 deletions(-)
 create mode 100644 scripts/make_pop_summary_stats.R
 create mode 100644 scripts/make_sex_specific_pop_files.R
 delete mode 100644 scripts/model_residuals_whole_data.R
 create mode 100644 scripts/whole_pop_pipeline_CETP_CREBRF.sh
 create mode 100644 scripts/whole_pop_pipeline_no_polysnps.sh

diff --git a/scripts/cv_pipeline.sh b/scripts/cv_pipeline.sh
index 253936f..d7b82ef 100644
--- a/scripts/cv_pipeline.sh
+++ b/scripts/cv_pipeline.sh
@@ -19,7 +19,7 @@ parallel 'bash scripts/generate_pop_pca.sh {}' ::: nph east west euro
 # create the cv splits and residuals for all pops and trait combos
 
 # ****  traits must not contain underscores in their names ****
-CV=5 # number fo folds for cross validation
+CV=2 # number fo folds for cross validation
 
 for POP in nph east west euro 
 do
diff --git a/scripts/make_pheno_and_covar_files.R b/scripts/make_pheno_and_covar_files.R
index 303c398..8768f03 100644
--- a/scripts/make_pheno_and_covar_files.R
+++ b/scripts/make_pheno_and_covar_files.R
@@ -16,7 +16,7 @@ euro <- read_delim(here("europca_results/europca_pcafile.eigenvec"), col_names =
 
 all_IIDS <- c(nph$IID, east$IID, west$IID, euro$IID)
 
-tanya <- tanya %>% filter(SUBECT %in% all_IIDS)
+tanya <- tanya %>% filter(SUBJECT %in% all_IIDS)
 
 tanya %>% select(IID = SUBJECT, GOUT) %>%
   mutate(GOUT_recode = case_when(str_detect(GOUT,"Control") ~ 1,
diff --git a/scripts/make_pop_summary_stats.R b/scripts/make_pop_summary_stats.R
new file mode 100644
index 0000000..0f7073a
--- /dev/null
+++ b/scripts/make_pop_summary_stats.R
@@ -0,0 +1,49 @@
+## Remove Pukapuka from West Polynesian Pops
+
+
+tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>%
+  filter(SUBJECT != "Blank") %>%
+  mutate(GOUT_orig_code = GOUT, GOUT = case_when(str_detect(GOUT,"Control") ~ 1,
+                                                 GOUT %in% c("ACR Gout","GP Gout") ~ 2,
+                                                 TRUE ~ NA_real_
+  ),
+  T2D = case_when(TYPE2D == "No" ~ 1,
+                  TYPE2D == "Yes" ~ 2,
+                  TRUE ~ NA_real_
+  ))
+
+## west_pca_tanya.keep is the same as the current westpca.keep file (same PATIENT IDs)
+keep_files <- list.files("/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/", pattern = "pca.keep", full.names = TRUE)
+
+pops <- map_dfr(keep_files, read_tsv, col_names = FALSE)
+tanya_filtered <- tanya  %>% filter(SUBJECT %in% pops$X1)
+
+tanya_no_puka <- tanya_filtered %>% mutate(Westkeep= ifelse(ETH_SPECIFIC == "Pukapukan",0,1))
+
+tanya_no_puka2 <- filter(tanya_no_puka, Westkeep == 1)
+
+height <- tanya_no_puka2 %>% drop_na(HEIGHT)
+BMI <- tanya_no_puka2 %>% drop_na(BMI)
+HDL <- tanya_no_puka2 %>% drop_na(HDL)
+T2D<- tanya_no_puka2 %>% drop_na(T2D)
+GOUT <- tanya_no_puka2 %>% drop_na(GOUT)
+
+library(dplyr)
+
+# There are 2 Mixed Polys, these are NPH participants (NPH group wasnt run)
+data.table::setDT(height)[,list(Mean=mean(HEIGHT), Max=max(HEIGHT), Min=min(HEIGHT), Mean=as.numeric(mean(HEIGHT)), Std=sd(HEIGHT)), by=ANALYSISGROUP_EASTWEST]
+data.table::setDT(BMI)[,list(Mean=mean(BMI), Max=max(BMI), Min=min(BMI), Mean=as.numeric(mean(BMI)), Std=sd(BMI)), by=ANALYSISGROUP_EASTWEST]
+data.table::setDT(HDL)[,list(Mean=mean(HDL), Max=max(HDL), Min=min(HDL), Median=as.numeric(median(HDL)), Std=sd(HDL)), by=ANALYSISGROUP_EASTWEST]
+
+table(height$ANALYSISGROUP_EASTWEST)
+table(BMI$ANALYSISGROUP_EASTWEST)
+table(HDL$ANALYSISGROUP_EASTWEST)
+table(T2D$ANALYSISGROUP_EASTWEST)
+table(height$ANALYSISGROUP_EASTWEST)
+table(height$ANALYSISGROUP_EASTWEST)
+table(tanya$ANALYSISGROUP_EASTWEST)
+
+# Gout and T2D tables
+tanya_no_puka2 %>% group_by(T2D, ANALYSISGROUP_EASTWEST) %>% tally(sort = F)
+tanya_no_puka2 %>% group_by(GOUT, ANALYSISGROUP_EASTWEST) %>% tally(sort = F)
+tanya_no_puka2 %>% group_by(SEX, ANALYSISGROUP_EASTWEST) %>% tally(sort = F)
diff --git a/scripts/make_sex_specific_pop_files.R b/scripts/make_sex_specific_pop_files.R
new file mode 100644
index 0000000..ee21a9a
--- /dev/null
+++ b/scripts/make_sex_specific_pop_files.R
@@ -0,0 +1,60 @@
+# Make sex specific Euro and West Poly keepfiles..
+
+tanya <- read_csv(file = here("data/tanya_data.csv"), col_names = TRUE) %>%
+  filter(SUBJECT != "Blank") %>%
+  mutate(GOUT_orig_code = GOUT, GOUT = case_when(str_detect(GOUT,"Control") ~ 1,
+                                                 GOUT %in% c("ACR Gout","GP Gout") ~ 2,
+                                                 TRUE ~ NA_real_
+  ),
+  T2D = case_when(TYPE2D == "No" ~ 1,
+                  TYPE2D == "Yes" ~ 2,
+                  TRUE ~ NA_real_
+  ))
+
+
+## Read in Euro and West (no Puka) keep files
+westpca <- read.delim(file = "data/westnopukapca.keep", sep = "\t", header = F)
+euro <-  read.delim(file = "data/europca.keep", sep = "\t", header = F)
+
+
+
+## List of Sex specific IDs
+tanya_men <- filter(tanya, SEX == 1) %>% select(PATIENT, ANALYSISGROUP_EASTWEST)
+tanya_women <- filter(tanya, SEX == 2) %>% select(PATIENT, ANALYSISGROUP_EASTWEST)
+
+## Renaming column so that "euro" or "westpca" have matching columns.
+names(tanya_men)[1] <- 'V1'
+names(tanya_women)[1] <- 'V1'
+
+
+
+euro_men <- filter(tanya_men, ANALYSISGROUP_EASTWEST == "European") %>% select(V1)
+euro_women <- filter(tanya_women, ANALYSISGROUP_EASTWEST == "European") %>% select(V1)
+
+west_men <- filter(tanya_men, ANALYSISGROUP_EASTWEST == "West Polynesian") %>% select(V1)
+west_women <- filter(tanya_women, ANALYSISGROUP_EASTWEST == "West Polynesian") %>% select(V1)
+
+
+## Make sex specific population files
+euromale <- merge(euro,euro_men,all = F)
+eurofemale <- merge(euro,euro_women, all = F)
+
+westmale <- merge(westpca,west_men, all = F)
+westfemale <- merge(westpca, west_women, all = F)
+sum(is.na(westfemale$V1))
+sum(is.na(westfemale$V2))
+sum(is.na(westmale$V1))
+sum(is.na(westmale$V2))
+
+ifelse(westfemale$V1==westfemale$V2,"Yes","No")
+
+## make keep files.
+write_delim(euromale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/euromalepca.keep", col_names = F, delim = "\t")
+
+write_delim(eurofemale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/eurofemalepca.keep", col_names = F, delim = "\t")
+
+write_delim(westmale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/westmalepca.keep", col_names = F, delim = "\t")
+
+write_delim(westfemale, file = "/Volumes/scratch/merrimanlab/ben/genomic_predictions/data/westfemalepca.keep", col_names = F, delim = "\t")
+
+
diff --git a/scripts/model_residuals_whole_data.R b/scripts/model_residuals_whole_data.R
deleted file mode 100644
index e69de29..0000000
diff --git a/scripts/pca_select_pops.R b/scripts/pca_select_pops.R
index 0964374..2ba5f73 100644
--- a/scripts/pca_select_pops.R
+++ b/scripts/pca_select_pops.R
@@ -32,7 +32,7 @@ p1 <- pops %>% ggplot(aes(x = PCA2, y = PCA3, colour = pop)) + geom_point() +
   geom_hline(yintercept = 0.0025, colour = "red", linetype = "dashed") +
   geom_vline(xintercept = 0, colour = 'red', linetype = "dashed")
 
-
+## 19 July 2021 Re-run NPH again
 new_nph <- pops %>% filter(pop == "nph" & PCA2 > 0 & PCA3 > 0.0025) %>%
   select(X1,X2) %>% write_delim(file = here("data/nphpca.keep"),
                                 delim = "\t",
diff --git a/scripts/run_ldak_whole_pop.sh b/scripts/run_ldak_whole_pop.sh
index 3c4ce50..96e881f 100644
--- a/scripts/run_ldak_whole_pop.sh
+++ b/scripts/run_ldak_whole_pop.sh
@@ -13,7 +13,7 @@ TRAIT=$(echo $BFILE | cut -d'_' -f2)
 
 MODEL=$2
 
-PHENOCOL=$(echo $MODEL + 4 | bc) # ignore the first 4 columns after FID and IID
+PHENOCOL=$(echo $MODEL + 7 | bc) # ignore the first 7 columns: FID IID MID PID SEX AFF TRUEPHENOVALUE
 cut -d' ' -f1,2,${PHENOCOL} <  ${RESULTS}/${BFILE}.pheno >  ${RESULTS}/${BFILE}.pheno${MODEL}_ldak
 
 
diff --git a/scripts/whole_pop_pipeline.sh b/scripts/whole_pop_pipeline.sh
index 2ae8e68..8ef0c69 100644
--- a/scripts/whole_pop_pipeline.sh
+++ b/scripts/whole_pop_pipeline.sh
@@ -3,12 +3,12 @@
 ## Run:
 
 
-## bash scripts/whole_pop_pipeline.sh data/data {POP_NAME} 
+## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} 
 
 ## 
 
-
-
+# remove duplicate markers and non-snps
+# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered
 
 ORIG_DATA=$1
 POP=$2
diff --git a/scripts/whole_pop_pipeline_CETP_CREBRF.sh b/scripts/whole_pop_pipeline_CETP_CREBRF.sh
new file mode 100644
index 0000000..b04ac7c
--- /dev/null
+++ b/scripts/whole_pop_pipeline_CETP_CREBRF.sh
@@ -0,0 +1,127 @@
+### whole_pop_pipeline.sh
+
+## Run:
+
+
+## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} 
+
+## 
+
+# remove duplicate markers and non-snps
+# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered
+
+ORIG_DATA=$1
+POP=$2
+RESULTS=${POP}_results/
+
+
+
+
+echo "START"
+
+date
+
+
+#### Initial setup #####
+mkdir -p tmp ${RESULTS}/
+
+
+software/plink1.9b6.10 --bfile ${ORIG_DATA} --geno 0.05 --make-bed --out tmp/data_sorted
+
+software/plink1.9b6.10 --bfile tmp/data_sorted --missing-genotype . --merge data/PolySNPS --make-bed --out tmp/dataPolySNPS --allow-no-sex
+
+## sort and keep our population
+software/plink1.9b6.10 --bfile tmp/dataPolySNPS --keep data/${POP}.keep --maf 0.01  --make-bed --out tmp/${POP}_sorteddata
+
+
+# create list of population specific independent SNPs from data
+software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers
+
+#(dont' need to do) :make sure the snps of interest don't get filtered out later on
+#echo RS373863828 >>${RESULTS}/${POP}_pca_markers.prune.in 
+#echo CETP_57004947 >> ${RESULTS}/${POP}_pca_markers.prune.in
+
+######################## GCTA
+#grm
+
+# only run this line once per population to save time
+software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4
+
+
+
+######################## PCAs
+
+### calculate principal components for the population using the independent markers
+software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex
+
+
+
+
+
+####################### LDAK
+
+echo "Start LDAK $(date)"
+### calculate the weights and kinships for LDAK prior to GRM
+software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
+
+
+software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
+
+
+software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile  tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25
+
+echo "Start traits $(date)"
+# example line to use for testing one iteration
+#line="HEIGHT,linear",
+while read line
+do
+	TRAIT=$(echo $line | cut -d',' -f1)
+	REGRESSION_TYPE=$(echo $line | cut -d',' -f2)
+
+	echo "Start ${TRAIT} $(date)"
+	mkdir -p ${RESULTS}/{GCTA,LDAK}
+	##### Generate regression residuals for each model #####
+	# makes tmp/${POP}_${TRAIT}_residuals.txt
+	
+	#bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} 
+	Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/
+
+	##### Create phone files with the residuals
+	### prune and sort the data
+	# prune to only those with residuals
+	# residual are population specific with PCAs included in the model outside this script.
+	# combine residuals to fam file
+	# this step need to be repeated for each of the groups at bayesR section
+	# columns 8-12 are the FIVE residuals
+	# height, egfr, serumurate, diabetes and gout in that order
+	# need to join based on number fo models 2.2 - 2.8  was for 7 models
+	software/plink1.9b6.10 --bfile tmp/dataPolySNPS --keep data/${POP}.keep --make-bed --maf 0.01 --out tmp/${POP}_${TRAIT}
+	# this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols)
+	join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt)  > ${RESULTS}/${POP}_${TRAIT}.pheno 
+
+	# calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models)
+	pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u)	
+
+
+	#### GCTA REML for each model of current trait
+	# run the following with 16 parallel jobs
+	parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam  ::: ${pheno_cols} 
+
+	#### LDAK for each model of current trait
+	parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' :::  tmp/${POP}_${TRAIT}.fam  ::: ${pheno_cols} 
+done < data/pop_trait_models.csv
+
+WD=$(pwd)
+cd ${RESULTS}/LDAK
+# combine all LDAK h2 for pop into a single file
+grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt
+# return to the orignal directory
+cd $WD
+
+
+
+echo "FINISH"
+
+date
+
+
diff --git a/scripts/whole_pop_pipeline_no_polysnps.sh b/scripts/whole_pop_pipeline_no_polysnps.sh
new file mode 100644
index 0000000..6cc87a7
--- /dev/null
+++ b/scripts/whole_pop_pipeline_no_polysnps.sh
@@ -0,0 +1,128 @@
+### whole_pop_pipeline.sh
+
+## Run:
+
+
+## bash scripts/whole_pop_pipeline.sh data/data_filtered {POP_NAME} 
+
+## 
+
+# remove duplicate markers and non-snps
+# plink --bfile data --exclude remove_dup_markers.txt --snps-only just-acgt --make-bed --out data_filtered
+
+ORIG_DATA=$1
+POP=$2
+RESULTS=${POP}_results/
+
+
+
+
+echo "START"
+
+date
+
+
+#### Initial setup #####
+mkdir -p tmp ${RESULTS}/
+
+
+software/plink1.9b6.10 --bfile ${ORIG_DATA} --geno 0.05 --make-bed --out tmp/data_sorted
+
+# No need to add in Poly Snps right now.
+# software/plink1.9b6.10 --bfile tmp/data_sorted --missing-genotype . --merge data/PolySNPS --make-bed --out tmp/dataPolySNPS --allow-no-sex
+
+## sort and keep our population
+software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --maf 0.01  --make-bed --out tmp/${POP}_sorteddata
+
+
+# create list of population specific independent SNPs from data
+software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --indep-pairwise 50 5 0.2 --maf 0.1 --out ${RESULTS}/${POP}_pca_markers
+
+#(dont' need to do) :make sure the snps of interest don't get filtered out later on
+#echo RS373863828 >>${RESULTS}/${POP}_pca_markers.prune.in 
+#echo CETP_57004947 >> ${RESULTS}/${POP}_pca_markers.prune.in
+
+######################## GCTA
+#grm
+
+# only run this line once per population to save time
+software/gcta64 --bfile tmp/${POP}_sorteddata --autosome --make-grm --out ${RESULTS}/${POP}_gcta_grm --threads 4
+
+
+
+######################## PCAs
+
+### calculate principal components for the population using the independent markers
+software/plink1.9b6.10 --bfile tmp/${POP}_sorteddata --pca 10 --out ${RESULTS}/${POP}_pcafile --extract ${RESULTS}/${POP}_pca_markers.prune.in --allow-no-sex
+
+
+
+
+
+####################### LDAK
+
+echo "Start LDAK $(date)"
+### calculate the weights and kinships for LDAK prior to GRM
+software/LDAK/ldak5.linux --cut-weights ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
+
+
+software/LDAK/ldak5.linux --calc-weights-all ${RESULTS}/${POP}_ldak_sections --bfile tmp/${POP}_sorteddata
+
+
+software/LDAK/ldak5.linux --calc-kins-direct ${RESULTS}/${POP}_ldak_kinships --bfile  tmp/${POP}_sorteddata --weights ${RESULTS}/${POP}_ldak_sections/weights.short --power -0.25
+
+echo "Start traits $(date)"
+# example line to use for testing one iteration
+#line="HEIGHT,linear",
+while read line
+do
+	TRAIT=$(echo $line | cut -d',' -f1)
+	REGRESSION_TYPE=$(echo $line | cut -d',' -f2)
+
+	echo "Start ${TRAIT} $(date)"
+	mkdir -p ${RESULTS}/{GCTA,LDAK}
+	##### Generate regression residuals for each model #####
+	# makes tmp/${POP}_${TRAIT}_residuals.txt
+	
+	#bash scripts/generate_models_whole_pop.sh ${POP} ${TRAIT} ${REGRESSION_TYPE} 
+	Rscript scripts/model_residuals_whole_pop.R --trait ${TRAIT} --pop ${POP} --regression ${REGRESSION_TYPE} --out_dir ${RESULTS}/
+
+	##### Create phone files with the residuals
+	### prune and sort the data
+	# prune to only those with residuals
+	# residual are population specific with PCAs included in the model outside this script.
+	# combine residuals to fam file
+	# this step need to be repeated for each of the groups at bayesR section
+	# columns 8-12 are the FIVE residuals
+	# height, egfr, serumurate, diabetes and gout in that order
+	# need to join based on number fo models 2.2 - 2.8  was for 7 models
+	software/plink1.9b6.10 --bfile tmp/data_sorted --keep data/${POP}.keep --make-bed --maf 0.01 --out tmp/${POP}_${TRAIT}
+	# this should join using IID from fam and IID from the pheno (in pheno first and second cols are duplicated) includes entire fam in output so need to adjust accordingly for model numbers (-6 cols)
+	join -1 2 -2 1 -o auto -e "NA" <(sort -k2 tmp/${POP}_${TRAIT}.fam) <(sort -k1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt)  > ${RESULTS}/${POP}_${TRAIT}.pheno 
+
+	# calculate the column indexes, offset by 2 (to account for FID and IID) from all of the residuals -> should be equal to $(seq 1 number_of_models)
+	pheno_cols=$( head -1 ${RESULTS}/${POP}_${TRAIT}.residuals.txt | awk '{ for(i=1;i<=NF-2;i++){print i}}' | sort -u)	
+
+
+	#### GCTA REML for each model of current trait
+	# run the following with 16 parallel jobs
+	parallel -j 16 'nice -n 10 bash scripts/run_gcta_whole_pop.sh {1} {2}' ::: tmp/${POP}_${TRAIT}.fam  ::: ${pheno_cols} 
+
+	#### LDAK for each model of current trait
+	parallel -j 16 'nice -n 10 bash scripts/run_ldak_whole_pop.sh {1} {2} ' :::  tmp/${POP}_${TRAIT}.fam  ::: ${pheno_cols} 
+done < data/pop_trait_models.csv
+
+WD=$(pwd)
+cd ${RESULTS}/LDAK
+# combine all LDAK h2 for pop into a single file
+grep 'K1' *.h2 | cat <(echo FILE $(head -1 ${TRAIT}_1.h2 )) - > ${POP}_all_traits_LDAK_K1_h2.txt
+# return to the orignal directory
+cd $WD
+
+
+
+echo "FINISH"
+
+date
+
+

From 764c564ce445755836ec56d281c44606964e90a9 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Mon, 11 Apr 2022 13:36:34 +1200
Subject: [PATCH 21/23] Update README.md

Clarifying the use of the 'cv_pipeline.sh' script.
---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 5fd5680..c89853c 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,7 @@
 `scripts/` contains the scripts that are used for the calculation of genomic profile scores 
 
-The main workflow script is `cv_pipeline.sh` which will run each of the required steps.
+The main workflow script is `cv_pipeline.sh` which will run each of the required steps. (This pipeline is used to estimate trait heritabilities for each Polynesian Population of interest (i.e not for the CV-trait profile score stages)). 
 
 It starts with a plink formatted dataset containing all populations of interest, subsets out each population, and does cross-validation for each trait of interest in each population using both GCTA and LDAK.
 
-
 `scripts/create_model_reports.R` will create reports for each pop/trait combo as listed in `data/pop_trait_models.csv` and apply a template RMarkdown to pull in and summarise the results from LDAK and GCTA. The template document is `scripts/model_selection_doc.Rmd`.

From 56a1ab40707f071e2edf42519b99a402c9aae552 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Mon, 31 Jul 2023 12:18:46 +1200
Subject: [PATCH 22/23] Update model_residuals_whole_pop.R

added additional package (to (hopefully) overcome issues in BASH).
---
 scripts/model_residuals_whole_pop.R | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R
index ba19326..418925a 100644
--- a/scripts/model_residuals_whole_pop.R
+++ b/scripts/model_residuals_whole_pop.R
@@ -1,3 +1,7 @@
+if(!require(textshaping)){
+  install.packages("here", repos = "https://cloud.r-project.org")
+  library("textshaping")
+}
 if(!require(here)){
   install.packages("here", repos = "https://cloud.r-project.org")
   library("here")

From cccc9a77bb5dd2761c36e115cbd8c89975639383 Mon Sep 17 00:00:00 2001
From: BenRangihuna <75346750+BenRangihuna@users.noreply.github.com>
Date: Mon, 31 Jul 2023 12:36:36 +1200
Subject: [PATCH 23/23] Update model_residuals_whole_pop.R

Initial plan to install package "text shaping" to alleviate BASH issues did not work. So I have changed the code back to its original output.
---
 scripts/model_residuals_whole_pop.R | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/scripts/model_residuals_whole_pop.R b/scripts/model_residuals_whole_pop.R
index 418925a..ba19326 100644
--- a/scripts/model_residuals_whole_pop.R
+++ b/scripts/model_residuals_whole_pop.R
@@ -1,7 +1,3 @@
-if(!require(textshaping)){
-  install.packages("here", repos = "https://cloud.r-project.org")
-  library("textshaping")
-}
 if(!require(here)){
   install.packages("here", repos = "https://cloud.r-project.org")
   library("here")