diff --git a/CONVERTF/example.perl b/CONVERTF/example.perl index 08f6bbc..096bb18 100755 --- a/CONVERTF/example.perl +++ b/CONVERTF/example.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # $parfile = "par.ANCESTRYMAP.EIGENSTRAT"; $parfile = "par.EIGENSTRAT.PED"; diff --git a/CONVERTF/ind2pheno.perl b/CONVERTF/ind2pheno.perl index f5bb149..bd0df1a 100755 --- a/CONVERTF/ind2pheno.perl +++ b/CONVERTF/ind2pheno.perl @@ -1,10 +1,26 @@ -#!/usr/bin/perl +#!/usr/bin/env perl + +sub usage { + my $message = "@_"; + die " +Usage: ind2pheno.perl infile outfile + +Required Arguments: + infile : input .ind file + outfile : output .pheno file + +$message + +"; +} + +unless (@ARGV == 2) {usage("OOPS unexpected number of arguments")} $in = $ARGV[0]; # .ind file $out = $ARGV[1]; # .pheno file -open(IN,$in) || die("COF"); -open(OUT,">$out") || die("COF"); +open(IN,$in) || die("Cannot open file: $in"); +open(OUT,">$out") || die("Cannot open file: $out"); while($line = ) { diff --git a/EIGENSTRAT/example.QTL.perl b/EIGENSTRAT/example.QTL.perl index ee13699..c67a165 100755 --- a/EIGENSTRAT/example.QTL.perl +++ b/EIGENSTRAT/example.QTL.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $ENV{'PATH'} = "../bin:$ENV{'PATH'}"; # MUST put smartpca bin directory in path for smartpca.perl to work diff --git a/EIGENSTRAT/example.oldstyle.perl b/EIGENSTRAT/example.oldstyle.perl index 03bcdda..f3222a5 100755 --- a/EIGENSTRAT/example.oldstyle.perl +++ b/EIGENSTRAT/example.oldstyle.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $ENV{'PATH'} = "../bin:$ENV{'PATH'}"; # MUST put pca bin directory in path for smartpca.perl to work diff --git a/EIGENSTRAT/example.perl b/EIGENSTRAT/example.perl index b555167..c766c61 100755 --- a/EIGENSTRAT/example.perl +++ b/EIGENSTRAT/example.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $ENV{'PATH'} = "../bin:$ENV{'PATH'}"; # MUST put smartpca bin directory in path for smartpca.perl to work diff --git a/POPGEN/elldemo/doplot b/POPGEN/elldemo/doplot index b539c3b..0fed564 100755 --- a/POPGEN/elldemo/doplot +++ b/POPGEN/elldemo/doplot @@ -1,4 +1,6 @@ -#!/usr//bin/perl -w +#!/usr/bin/env perl + +use warnings; system "ploteig -i sicaa.evec -c 1:2 -g aaa -p test4 -x -k -r ctable -e ell4aa.out -t \" Sicily EBA projected conf: 0.95\"" ; diff --git a/POPGEN/elldemo/rescale_ell b/POPGEN/elldemo/rescale_ell index bf1f076..9a54bac 100755 --- a/POPGEN/elldemo/rescale_ell +++ b/POPGEN/elldemo/rescale_ell @@ -1,7 +1,8 @@ -#!/usr//bin/perl -w +#!/usr/bin/env perl ## rescale -i infile -o outfile -a inscale -b outscale +use warnings ; use Getopt::Std ; use File::Basename ; @@ -82,9 +83,6 @@ sub critchi { } -sub usage { - -print "rescale_ell -i infile -o outfile -s outscale [-a inscale]\n" ; -exit 0 ; - +sub usage { + die "Usage: rescale_ell -i infile -o outfile -s outscale [-a inscale]\n" ; } diff --git a/POPGEN/example.perl b/POPGEN/example.perl index fc16773..be757fa 100755 --- a/POPGEN/example.perl +++ b/POPGEN/example.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $ENV{'PATH'} = "../bin:$ENV{'PATH'}"; diff --git a/POPGEN/twexample.perl b/POPGEN/twexample.perl index d562f98..28a9558 100755 --- a/POPGEN/twexample.perl +++ b/POPGEN/twexample.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $command = "../bin/twstats"; $command .= " -t twtable "; diff --git a/bin/evec2pca-ped.perl b/bin/evec2pca-ped.perl index c0395db..626cb2b 100755 --- a/bin/evec2pca-ped.perl +++ b/bin/evec2pca-ped.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl ### translate .evec file to .pca file expected by eigenstrat program ### Note: .evec file does not contain entries for outliers @@ -10,6 +10,26 @@ # ----- indiv names are not found in the .evec file, try the # ----- familyname:indivname combination. +sub usage { + my $message = "@_"; + die " +Usage: evec2pca-ped.perl k example.evec example.ind example.pca + +Required Arguments: + k : the number of principal components in example.evec + file (e.g. 10) + example.evec : file of principal components produced by smartpca + example.ind : individual file + example.pca : file of principal components in file needed by + eigenstrat + +$message + +" +} + +unless (@ARGV == 4) {usage("OOPS unexpected number of arguments")} + $k = $ARGV[0]; $evec = $ARGV[1]; $ind = $ARGV[2]; diff --git a/bin/evec2pca.perl b/bin/evec2pca.perl index cf87faa..65e092f 100755 --- a/bin/evec2pca.perl +++ b/bin/evec2pca.perl @@ -1,9 +1,29 @@ -#!/usr/bin/perl +#!/usr/bin/env perl ### translate .evec file to .pca file expected by eigenstrat program ### Note: .evec file does not contain entries for outliers ### .pca file does contain entries (set to all 0.0) for outliers +sub usage { + my $message = "@_"; + die " +Usage: evec2pca.perl k example.evec example.ind example.pca + +Required Arguments: + k : the number of principal components in example.evec + file (e.g. 10) + example.evec : file of principal components produced by smartpca + example.ind : individual file + example.pca : file of principal components in file needed by + eigenstrat + +$message + +" +} + +unless (@ARGV == 4) {usage("OOPS unexpected number of arguments")} + $k = $ARGV[0]; $evec = $ARGV[1]; $ind = $ARGV[2]; diff --git a/bin/gc.perl b/bin/gc.perl index 4b90f7a..4229ab7 100755 --- a/bin/gc.perl +++ b/bin/gc.perl @@ -1,11 +1,37 @@ -#!/usr/bin/perl +#!/usr/bin/env perl + +sub usage { + my $message = "@_"; + die " +Usage: gc.perl infile outfile + +Required Arguments: + infile : input file of chisq statistics produced by eigenstrat + program. It contains both uncorrected and EIGENSTRAT + statistics for each SNP. + outfile : output file. It lists lambda inflation values (for + both uncorrected and EIGENSTRAT) chisq statistics + after scaling by lambda (uncorrected and EIGENSTRAT) + +Computation of lambda is as described in Devlin and Roeder 1999. +A lambda above 1 indicates inflation in chisq statistics. +By definition, lambda is not allowed to be less than 1. + +Running time of the gc.perl program is very fast. + +$message + +" +} + +unless(@ARGV == 2) {usage("OOPS unexpected number of arguments")} $P = $ARGV[0]; $out = $ARGV[1]; # get data $m=0; -open(P,"$P") || die("COF"); +open(P,"$P") || die("Cannot open file: $P"); while($line =

) { if($line =~ /Chisq/) { last; } } # header lines while($line =

) { @@ -66,7 +92,7 @@ if($lambda2 < 1) { $lambda2 = 1; } # not allowed to be less than 1 # output -open(OUT,">$out") || die("COF"); +open(OUT,">$out") || die("Cannot open file: $out"); print OUT ("Chisq EIGENSTRAT\n"); printf OUT ("lambda=%.03f lambda=%.03f\n",$lambda1,$lambda2); for($m=0; $m<$nSNP; $m++) diff --git a/bin/ploteig b/bin/ploteig index 0871daa..e8e40eb 100755 --- a/bin/ploteig +++ b/bin/ploteig @@ -1,6 +1,7 @@ -#!/usr/local/bin/perl -w +#!/usr/bin/env perl ### ploteig -i eigfile -p pops -c a:b [-t title] [-s stem] [-g gstem] [-o outfile] [-x] [-k] [-y] [-z sep] -r colorstring -m xmul -n ymul +use warnings ; use Getopt::Std ; use File::Basename ; @@ -41,8 +42,7 @@ if (defined $opts{"i"}) { $infile = $opts{"i"} ; } else { - usage() ; - exit 0 ; + usage("OOPS -i parameter compulsory"); } open (FF, $infile) || die "can't open $infile\n" ; @@ -63,7 +63,7 @@ if (defined $opts{"p"}) { $pops = $opts{"p"} ; } else { - die "p parameter compulsory\n" ; + usage("OOPS -p parameter compulsory"); } $popsname = setpops ($pops) ; @@ -142,7 +142,7 @@ $psfile = "$stem.ps" ; $psfile =~ s/xtxt/ps/ ; } system "gnuplot < $gnfile > $psfile" ; -system "/home/np29/bin/fixgreen $psfile" ; +#system "/home/np29/bin/fixgreen $psfile" ; system "ps2pdf $psfile " ; } unlink (@T) unless $keepflag ; @@ -172,25 +172,38 @@ sub setcolor { } sub usage { - -print "ploteig -i eigfile -p pops -c a:b [-t title] [-s stem] [-o outfile] [-x] [-k] -c colorstringh [-m xmul] [-n ymul]\n" ; -print "-i eigfile input file first col indiv-id last col population\n" ; -print "## as output by smartpca in outputvecs \n" ; -print "-c a:b a, b columns to plot. 1:2 would be common and leading 2 eigenvectors\n" ; -print "-p pops Populations to plot. : delimited. eg -p Bantu:San:French\n" ; -print "## pops can also be a filename. List populations 1 per line\n" ; -print "[-s stem] stem will start various output files\n" ; -print "[-o ofile] ofile will be gnuplot control file. Should have xtxt suffix\n"; -print "[-x] make ps and pdf files\n" ; -print "[-k] keep various intermediate files although -x set\n" ; -print "## necessary if .xtxt file is to be hand edited\n" ; -print "[-r colorstringpairs or colorstringfile]\n" ; -print "[-g gstem] make gstem.xtxt gstem.ps gstem.pdf \n" ; -print "[-y] put key at top right inside box (old mode)\n" ; -print "[-t] title (legend)\n" ; - -print "The xtxt file is a gnuplot file and can be easily hand edited. Intermediate files -needed if you want to make your own plot\n" ; + my $message = "@_"; + +# 10 20 30 40 50 60 70 80 90 +#---+----|---+----|---+----|---+----|---+----|---+----|---+----|---+----|---+----| + die " +Usage: ploteig [FLAGS] + +Required Flags: + -i eigfile : input file first col indiv-id last col population + as output by smartpca in outputvecs + -p pops : Populations to plot. : delimited. eg -p Bantu:San:French + pops can also be a filename. List populations 1 per line + +Optional Flags: + -c a:b : a, b columns to plot. 1:2 would be common and leading 2 + eigenvectors + -s stem : stem will start various output files + -o ofile : ofile will be gnuplot control file. Should have xtxt suffix + -x : make ps and pdf files + -k : keep various intermediate files, although setting -x is + necessary if .xtxt file is to be hand edited + -r cfile : colorstringpairs or colorstringfile + -g gstem : make gstem.xtxt gstem.ps gstem.pdf + -y : put key at top right inside box (old mode) + -t : title (legend) + +The xtxt file is a gnuplot file and can be easily hand edited. Intermediate +files needed if you want to make your own plot + +$message + +" ; } sub setpops { diff --git a/bin/rescale_ell b/bin/rescale_ell index bf1f076..9a54bac 100755 --- a/bin/rescale_ell +++ b/bin/rescale_ell @@ -1,7 +1,8 @@ -#!/usr//bin/perl -w +#!/usr/bin/env perl ## rescale -i infile -o outfile -a inscale -b outscale +use warnings ; use Getopt::Std ; use File::Basename ; @@ -82,9 +83,6 @@ sub critchi { } -sub usage { - -print "rescale_ell -i infile -o outfile -s outscale [-a inscale]\n" ; -exit 0 ; - +sub usage { + die "Usage: rescale_ell -i infile -o outfile -s outscale [-a inscale]\n" ; } diff --git a/bin/smarteigenstrat.perl b/bin/smarteigenstrat.perl index 9b81822..dda7d93 100755 --- a/bin/smarteigenstrat.perl +++ b/bin/smarteigenstrat.perl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # perl wrapper for smarteigenstrat program. Run smarteigenstrat.perl with no options for usage @@ -17,8 +17,7 @@ # check for mandatory options foreach $flag ("i","a","b","p","o","l") { unless ($specified{$flag}) { - usage(); - die("Error: -$flag not specified"); + usage("OOPS -$flag not specified"); } } @@ -40,7 +39,7 @@ } # write parameter file -open(PAR, ">$outfilename.par") || die("Error: unable to open $outfilename.par\n"); +open(PAR, ">$outfilename.par") || die("OOPS unable to open $outfilename.par\n"); print PAR "genotypename: $genofilename\n"; print PAR "snpname: $snpfilename\n"; print PAR "indivname: $indfilename\n"; @@ -56,17 +55,65 @@ system($cmd); sub usage { - print "smarteigenstrat.perl -i -a -b -p -o "; - print " -l -k -q "; - print "\n"; - print "-i genotype file (PED, PACKEDPED, EIGENSTRAT, ANCESTRYMAP or PACKEDANCESTRYMAP format)"; - print "-o output file (chisq)\n"; - print "-l logfile (screen output,including error messages)\n"; - print "-q YES for quantitative phenotype or NO otherwise\n"; - print "\n"; - print "For quantitative phenotype, sixth column of .ped file or third column of EIGENSTRAT .ind file\n"; - print "should be real numbers. For non-quantitative phenotype, sixth column of .ped or third column\n"; - print "should be 'Case' or 'Control'\n"; + my $message = "@_"; +# 10 20 30 40 50 60 70 80 90 +#---+----|---+----|---+----|---+----|---+----|---+----|---+----|---+----|---+----| + die " +Usage: smarteigenstrat.perl [FLAGS] + +This program is a PERL wrapper which calls the C program smarteigenstrat. + Note: the bin directory containing smarteigenstrat MUST be in your path. + See ./example.perl for a toy example. +We recommend smarteigenstrat.perl for users who prefer command-line flags. +However, users who prefer parameter files can run smarteigenstrat instead. + +Required Flags: + -i example.geno : genotype file in any format (PED, PACKEDPED, EIGENSTRAT, + ANCESTRYMAP or PACKEDANCESTRYMAP format) + -a example.snp : snp file in any format (see CONVERTF/README) + -b example.ind : individual file in any format (see CONVERTF/README). + We note that phenotype information will be contained in + example.ind, either as Case/Control labels or quantitative + phenotypes if -q set to YES. + -q YES/NO : If set to YES, use quantitative phenotypes in example.ind. + If -q is set to YES, the third column of the input individual + file in EIGENSTRAT format (or sixth column of input + individual file in PED format) should be real numbers. The + value -100.0 signifies 'missing data'. If -q is set to NO, + these values should be 'Case' or 'Control'. The default value + for the -q parameter is NO. + -p example.pca : input file of principal components (output of smartpca.perl) + -k topk : (Default is 10) number of principal components along which to + correct for stratification. Note that l must be less than or + equal to the number of principal components reported in the + file example.pca. + -o example.chisq : chisq association statistics. File contains log of flags to + eigenstrat program, followed by one line per SNP: + The first entry of each line is Armitage chisq statistic + (Armitage, 1955) defined as NSAMPLES x (correlation between + genotype and phenotype)^2. If the set of individuals with + genotype and phenotype both valid is monomorphic for either + genotype or phenotype, then NA is reported. + The second entry of each line is the EIGENSTRAT chisq statistic, + defined as (NSAMPLES-l-1) x (corr between adjusted_genotype + and adjusted_phenotype)^2. If the set of individuals with + genotype and phenotype both valid is monomorphic for either + genotype or phenotype, then NA is reported. + Note: even if l=0, there is a tiny difference between the two + statistics because Armitage uses NSAMPLES while we use + NSAMPLES-1, which we consider to be appropriate. + -l example.log : standard output logfile (screen output,including error messages) + +The running time of smarteigenstrat.perl is very fast compared to the running time + of smartpca.perl. + +For quantitative phenotype, sixth column of .ped file or third column of EIGENSTRAT +.ind file should be real numbers. For non-quantitative phenotype, sixth column of +.ped or third column should be 'Case' or 'Control' + +$message + +" } diff --git a/bin/smartpca.perl b/bin/smartpca.perl index 478be19..cbcfb51 100755 --- a/bin/smartpca.perl +++ b/bin/smartpca.perl @@ -1,8 +1,68 @@ -#!/usr/bin/perl +#!/usr/bin/env perl use Getopt::Std ; use File::Basename ; +sub usage { + my $message = "@_"; +# 10 20 30 40 50 60 70 80 90 +#---+----|---+----|---+----|---+----|---+----|---+----|---+----|---+----|---+----| + die " +Usage: smartpca.perl [FLAGS] + +This program calls the smartpca program (see POPGEN/README). +For this to work, the bin directory containing smartpca MUST be in your path. + +Required Flags: + -i example.geno : genotype file in any format (see CONVERTF/README) + -a example.snp : snp file in any format (see CONVERTF/README) + -b example.ind : indiv file in any format (see CONVERTF/README) + -k k : (Default is 10) number of principal components to output + -o example.pca : output file of principal components. Individuals removed + as outliers will have all values set to 0.0 in this file. + -p example.plot : prefix of output plot files of top 2 principal components. + (labeling individuals according to labels in indiv file) + -e example.eval : output file of all eigenvalues + -l example.log : output logfile + -m maxiter : (Default is 5) maximum number of outlier removal iterations. + To turn off outlier removal, set -m 0. + -t topk : (Default is 10) number of principal components along which + to remove outliers during each outlier removal iteration. + -s sigma : (Default is 6.0) number of standard deviations which an + individual must exceed, along one of topk top principal + components, in order to be removed as an outlier. + +Optional Flags: + -w poplist : compute eigenvectors using populations in poplist only, + where poplist is an ASCII file with one population per line + -y plotlist : output plot will include populations in plotlist only, + where plotlist is an ASCII file with one population per line + -z badsnpname : list of SNPs which should be excluded from the analysis + -q YES/NO : If set to YES, assume that there is a single population and + the population field contains real-valued phenotypes. + (Corresponds to qtmode parameter in smartpca program.) + The default value for this parameter is NO. + +Estimated running time of the smartpca program is + 2.5e-12 * nSNP * NSAMPLES^2 hours if not removing outliers. + 2.5e-12 * nSNP * NSAMPLES^2 hours * (1+m) if m outlier removal iterations. +Thus, under the default of up to 5 outlier removal iterations, running time is + up to 1.5e-11 * nSNP * NSAMPLES^2 hours. + +Recommendation: we advise after running pca to check for large correlations +between each principal component and all variables of interest. For example, +large correlations with % missing data (per sample) could imply assay issues +large correlations with plate membership could imply assay issues +large correlations with phenotype indicate highly mismatched cases vs. controls + which will lead to a loss of power upon applying eigenstrat correction. + If input indiv file contains Case and Control labels only, then + correlations between each principal component and Case/Control status will be + listed at end of output logfile (-l flag). + +$message + +" +} ### process flags # -w poplist is compute eigenvectors using populations in poplist, then project # -y poplistplot is use populations in poplistplot for plot @@ -18,7 +78,7 @@ } foreach $flag ("i","a","b","o","p","e","l") { - unless($specified{$flag}) { die("OOPS -$flag flag not specified"); } + unless($specified{$flag}) { usage("OOPS -$flag flag not specified"); } } getopts('i:a:b:k:o:p:e:l:m:t:s:w:y:z:q:',\%opts); $i = $opts{"i"}; @@ -81,7 +141,7 @@ { ### make string of populations for ploteig based on -y flag input $popstring = ""; - open(Y,$y) || die("COF"); + open(Y,$y) || die("Cannot open file: $y"); while($line = ) { chomp($line);