From 7a661e74e51526392eaf7c1b431b9d65806efccf Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 1 Jun 2017 16:13:25 -0500 Subject: [PATCH 01/53] Implementing Hit and Run algorithm in metropolis --- R/RcppExports.R | 4 +- R/metropolis.r | 198 +++++++++++++++----------- src/RcppExports.cpp | 7 +- src/metropolis_hypergeometric_cpp.cpp | 46 +++++- 4 files changed, 161 insertions(+), 94 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 2ac5db1..5f43098 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,8 +21,8 @@ computeX2sCpp <- function(x, exp) { .Call('algstat_computeX2sCpp', PACKAGE = 'algstat', x, exp) } -metropolis_hypergeometric_cpp <- function(current, moves, iter, thin) { - .Call('algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, iter, thin) +metropolis_hypergeometric_cpp <- function(current, moves, iter, thin, hit_and_run) { + .Call('algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, iter, thin, hit_and_run) } metropolis_uniform_cpp <- function(current, moves, iter, thin) { diff --git a/R/metropolis.r b/R/metropolis.r index 678bf2b..55af35f 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -144,9 +144,10 @@ #' #' metropolis <- function(init, moves, iter = 1E3, burn = 0, thin = 1, - dist = c("hypergeometric","uniform"), engine = c("Cpp","R") + dist = c("hypergeometric","uniform"), engine = c("Cpp","R"), + hit_and_run = FALSE ){ - + ## preliminary checking ################################################## dist <- match.arg(dist) @@ -156,111 +157,134 @@ metropolis <- function(init, moves, iter = 1E3, burn = 0, thin = 1, thin <- 1 } - + ## in R ################################################## if(engine == "R"){ - - nMoves <- ncol(moves) - state <- matrix(nrow = nrow(moves), ncol = iter) - - ## run burn-in - - current <- unname(init) - unifs <- runif(burn) - - message("Running chain (R)... ", appendLF = FALSE) - - if(burn > 0) { - for(k in 1:burn){ - - move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] - propState <- current + move - if(any(propState < 0)){ - prob <- 0 - } else { - if(dist == "hypergeometric"){ - prob <- exp( sum(lfactorial(current)) - sum(lfactorial(propState)) ) - } else { # dist == "uniform" - prob <- 1 + nMoves <- ncol(moves) + state <- matrix(nrow = nrow(moves), ncol = iter) + + ## run burn-in + + current <- unname(init) + unifs <- runif(burn) + + message("Running chain (R)... ", appendLF = FALSE) + + if(burn > 0) { + for(k in 1:burn){ + + if(hit_and_run) + { + move <- moves[,sample(nMoves,1)] + w_move <- move[move != 0] + w_current <- current[move != 0] + w_moves <- -1 * w_current / w_move + lower_bound <- if(any(w_moves < 0)){max(subset(w_moves,subset = w_moves < 0))}else{1} + upper_bound <- if(any(w_moves > 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} + c_s <- sample(c(lower_bound:-1,1:upper_bound),1) + propState <- current + c_s * move + }else{ + move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] + propState <- current + move + } + if(any(propState < 0)){ + prob <- 0 + } else { + if(dist == "hypergeometric"){ + prob <- exp( sum(lfactorial(current)) - sum(lfactorial(propState)) ) + } else { # dist == "uniform" + prob <- 1 + } } + + if(unifs[k] < prob) current <- propState # else current + } + state[,1] <- current + } - if(unifs[k] < prob) current <- propState # else current + ## run main sampler - } - state[,1] <- current - } - - ## run main sampler - - totalRuns <- 0 - probTotal <- 0 - unifs <- runif(iter*thin) - - for(k in 2:iter){ - - for(j in 1:thin){ - - move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] - propState <- current + move + totalRuns <- 0 + probTotal <- 0 + unifs <- runif(iter*thin) - if(any(propState < 0)){ - prob <- 0 - } else { - if(dist == "hypergeometric"){ - prob <- exp( sum(lfactorial(current)) - sum(lfactorial(propState)) ) - } else { # dist == "uniform" - prob <- 1 + for(k in 2:iter){ + + for(j in 1:thin){ + + if(hit_and_run) + { + move <- c(moves[,sample(nMoves,1)]) + w_move <- move[move != 0] + w_current <- current[move != 0] + w_moves <- (-1 * w_current) / w_move + lower_bound <- if(any(w_moves < 0)){max(subset(w_moves,subset = w_moves < 0))}else{1} + upper_bound <- if(any(w_moves > 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} + c_s <- sample(c(lower_bound:-1,1:upper_bound),1) + propState <- current + c_s * move + }else{ + move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] + propState <- current + move + } + + if(any(propState < 0)){ + prob <- 0 + } else { + if(dist == "hypergeometric"){ + prob <- exp( sum(lfactorial(current)) - sum(lfactorial(propState)) ) + } else { # dist == "uniform" + prob <- 1 + } } + probTotal <- probTotal + min(1, prob) + + if(unifs[k*(thin-1)+j] < prob) current <- propState # else current + + totalRuns <- totalRuns + 1 } - probTotal <- probTotal + min(1, prob) - - if(unifs[k*(thin-1)+j] < prob) current <- propState # else current - totalRuns <- totalRuns + 1 + state[,k] <- current } - - state[,k] <- current - } - message("done.") - - ## format output - out <- list( - steps = state, - moves = moves, - acceptProb = probTotal / totalRuns - ) - - - - + message("done.") + + ## format output + out <- list( + steps = state, + moves = moves, + acceptProb = probTotal / totalRuns + ) + + + + } ## in Cpp ################################################## if(engine == "Cpp"){ - current <- unname(init) - allMoves <- cbind(moves, -moves) - sampler <- if(dist == "hypergeometric") { - metropolis_hypergeometric_cpp - } else { - metropolis_uniform_cpp - } - message("Running chain (C++)... ", appendLF = FALSE) - if (burn > 0) current <- sampler(current, allMoves, burn, 1)$steps[,burn] - out <- sampler(current, allMoves, iter, thin) - out$moves <- moves - message("done.") - + current <- unname(init) + allMoves <- cbind(moves, -moves) + sampler <- if(dist == "hypergeometric") { + metropolis_hypergeometric_cpp + } else { + metropolis_uniform_cpp + } + message("Running chain (C++)... ", appendLF = FALSE) + if (burn > 0) current <- sampler(current, allMoves, burn, 1, hit_and_run)$steps[,burn] + out <- sampler(current, allMoves, iter, thin, hit_and_run) + out$moves <- moves + message("done.") + } - - + + ## return output ################################################## - + out[c("steps", "moves", "acceptProb")] } @@ -273,7 +297,7 @@ metropolis <- function(init, moves, iter = 1E3, burn = 0, thin = 1, #' @rdname metropolis #' @export -rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric"){ - metropolis(init, moves, iter, burn = 0, thin = 1, dist = dist) +rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric", hit_and_run = FALSE){ + metropolis(init, moves, iter, burn = 0, thin = 1, dist = dist, hit_and_run) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index f52c788..8fb624b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -66,8 +66,8 @@ BEGIN_RCPP END_RCPP } // metropolis_hypergeometric_cpp -List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin); -RcppExport SEXP algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { +List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin, bool hit_and_run); +RcppExport SEXP algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -75,7 +75,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< IntegerMatrix >::type moves(movesSEXP); Rcpp::traits::input_parameter< int >::type iter(iterSEXP); Rcpp::traits::input_parameter< int >::type thin(thinSEXP); - rcpp_result_gen = Rcpp::wrap(metropolis_hypergeometric_cpp(current, moves, iter, thin)); + Rcpp::traits::input_parameter< bool >::type hit_and_run(hit_and_runSEXP); + rcpp_result_gen = Rcpp::wrap(metropolis_hypergeometric_cpp(current, moves, iter, thin, hit_and_run)); return rcpp_result_gen; END_RCPP } diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 1bd94a9..0bc32bf 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -1,11 +1,15 @@ #include +#include using namespace Rcpp; +// [[Rcpp::plugins(cpp11)]] + // [[Rcpp::export]] List metropolis_hypergeometric_cpp( IntegerVector current, IntegerMatrix moves, - int iter, int thin + int iter, int thin, + bool hit_and_run ){ int nTotalSamples = iter * thin; // total number of steps @@ -19,6 +23,12 @@ List metropolis_hypergeometric_cpp( bool anyIsNegative; IntegerVector move(n); double acceptProb = 0; + NumericVector stepSize(n); + NumericVector lowerBound(n); + NumericVector upperBound(n); + double lb; + double ub; + IntegerVector run(1); Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); @@ -33,10 +43,42 @@ List metropolis_hypergeometric_cpp( for(int k = 0; k < n; ++k){ move[k] = moves(k, whichMove[thin*i+j]-1); } + + // If hit_and_run is true, choose how far to run + if(hit_and_run){ + for(int l = 0; l < n; ++l){ + if(std::isinf(-current[l] / move[l])){ + stepSize[l] = 0; + }else{ + stepSize[l] = -current[l] / move[l]; + } + } + for(int l = 0; l < n; ++l){ + if(stepSize[l] >=0){ + lowerBound[l] = -100000; + }else{ + lowerBound[l] = stepSize[l]; + } + } + for(int l = 0; l < n; ++l){ + if(stepSize[l] <= 0){ + upperBound[l] = 100000; + }else{ + upperBound[l] = stepSize[l]; + } + } + lb = max(lowerBound); + ub = min(upperBound); + run = sample(seq(floor(lb),floor(ub)),1); + if(run[1] == 0){ + run[1] = 1; + } + + } // compute proposal for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + move[k]; + proposal[k] = current[k] + run[1] * move[k]; } // compute probability of transition From 5d2824b4e0b3040084908ce7b2cfb443035e4e98 Mon Sep 17 00:00:00 2001 From: Innerst Date: Tue, 6 Jun 2017 16:17:34 -0500 Subject: [PATCH 02/53] metropolis_hypergeometric_cpp fixes --- .DS_Store | Bin 0 -> 8196 bytes R/RcppExports.R | 4 ++ R/metropolis.r | 2 + src/.DS_Store | Bin 0 -> 6148 bytes src/RcppExports.cpp | 11 ++++++ src/isinfinite.cpp | 11 ++++++ src/isinfinite.h | 20 ++++++++++ src/metropolis_hypergeometric_cpp.cpp | 54 +++++++++++--------------- 8 files changed, 71 insertions(+), 31 deletions(-) create mode 100644 .DS_Store create mode 100644 src/.DS_Store create mode 100644 src/isinfinite.cpp create mode 100644 src/isinfinite.h diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..b87d1589463e86143f695eda7906eb4df63bb6df GIT binary patch literal 8196 zcmeHMOK=oL818QqV1|s)2_b}KQ>+9FW2GP@JVjvhAOV3Q#pVgfGP^Uv=wxQynN83b zO?mTOidvSImM?tZb&VqI;+a-rp{AT9D4u!wtnFs0cPX2n<{c>_1R?|? z1R?|?1R?}(1_Wr&7Dc(vzAvp&9U%}Qa7!W}-VaH7giM8UT*~0pK~-=BASsRjL7_Gm z4U)-Frb0O`C0VFIhLV(_2#*+$;iON7eyLE7ODV${!ovr`krAFy5R6XwlYu)!O3J8? z5Qq?%j(}J`Wz1s*mdlPzKEKF+SmP}*QOWt z20hC#vthei_YalYJ+5nCZtK}T%N)t@>UP)hb;~gelS23PTaMw5w7Esc@b;x;zJAN_ z4`ulL?t(c?q20P;_F0~lwQS3OSW(3%!)r%Jn_HS1ldD%Xk2NMo*RG}Zs+KilWBiDs zENWQYx^pl;_S8G?zIXKf4?g_*8;JnIw+Ndz1(DNN;IO^nZh4&%`y`u}aQrk04(+!1 zmd>u4ty+dxiRtai`-yy%&`O-gDFn;n$e<4}*y zS&&HHao37f>o%U7%PXt2YOSV!SU_l-y(0qMLEX!lj$t_m_YmfeZ{2U11+@~5YLBB2 znKEi&e38mOSN0i{cBg9yA0x2D7seMW%3h&wFPKzIwYDU_ROPzTsiy5s+C%c!ndpa!xBQNtk%{l>OjsN8lvfWQg?U!9)*9a>=y&? z&^>=w-gIc}s!f|ytqbGvsb8)1l?R1?t6`XiqA2yLaL1@m-6=^>U*bZ2s!29g+LY>8 zGwWbG*a2E1qwHDsDtnt9XD8T6c8Z;0XW0b%oto>Js=E9Vpe=-dN=1FP=lq*pxAobNet|fLS>a@irl1wE}FEQ3UtcWGSXPpbYmn4=7 z`f){emn9k$S{^(WbhXCBazTDxCanRf4VBN3)-~EXu@HE6en;JU*Joe!Wo>yFH@MheF{^5 zy~5PtbiuJ*=OAfHyuR8w>fHbY%4E)3asJH1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(isinfinite(x)); + return rcpp_result_gen; +END_RCPP +} // metropolis_hypergeometric_cpp List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin, bool hit_and_run); RcppExport SEXP algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP) { diff --git a/src/isinfinite.cpp b/src/isinfinite.cpp new file mode 100644 index 0000000..45e07e5 --- /dev/null +++ b/src/isinfinite.cpp @@ -0,0 +1,11 @@ +#include +#include "isinfinite.h" +using namespace Rcpp; + + +// [[Rcpp::export]] +LogicalVector isinfinite(NumericVector x){ + return is_infinite(x); +} + + diff --git a/src/isinfinite.h b/src/isinfinite.h new file mode 100644 index 0000000..82e368d --- /dev/null +++ b/src/isinfinite.h @@ -0,0 +1,20 @@ +// +// isinfinite.h +// +// +// Created by Innerst, Grant on 6/5/17. +// +// + +#ifndef ____isinfinite__ +#define ____isinfinite__ + +#include +#include + +#endif /* defined(____isinfinite__) */ + +Rcpp::LogicalVector isinfinite(Rcpp::NumericVector x); + + + diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 0bc32bf..7783eff 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -1,10 +1,11 @@ #include -#include +#include "isinfinite.h" using namespace Rcpp; -// [[Rcpp::plugins(cpp11)]] - // [[Rcpp::export]] + + + List metropolis_hypergeometric_cpp( IntegerVector current, IntegerMatrix moves, @@ -23,9 +24,12 @@ List metropolis_hypergeometric_cpp( bool anyIsNegative; IntegerVector move(n); double acceptProb = 0; - NumericVector stepSize(n); - NumericVector lowerBound(n); - NumericVector upperBound(n); + NumericVector stepSize_1(n); + NumericVector stepSize; + NumericVector current_num(n); + NumericVector move_num(n); + NumericVector upperBound; + NumericVector lowerBound; double lb; double ub; IntegerVector run(1); @@ -35,6 +39,9 @@ List metropolis_hypergeometric_cpp( Function runif("runif"); unifs = runif(nTotalSamples); Function print("print"); + Function subset("subset"); + + for(int i = 0; i < iter; ++i){ for(int j = 0; j < thin; ++j){ @@ -43,49 +50,34 @@ List metropolis_hypergeometric_cpp( for(int k = 0; k < n; ++k){ move[k] = moves(k, whichMove[thin*i+j]-1); } - // If hit_and_run is true, choose how far to run - if(hit_and_run){ - for(int l = 0; l < n; ++l){ - if(std::isinf(-current[l] / move[l])){ - stepSize[l] = 0; - }else{ - stepSize[l] = -current[l] / move[l]; - } - } - for(int l = 0; l < n; ++l){ - if(stepSize[l] >=0){ - lowerBound[l] = -100000; - }else{ - lowerBound[l] = stepSize[l]; - } - } + if(hit_and_run == TRUE){ + current_num = as(current); + move_num = as(move); for(int l = 0; l < n; ++l){ - if(stepSize[l] <= 0){ - upperBound[l] = 100000; - }else{ - upperBound[l] = stepSize[l]; - } + stepSize_1[l] = -1 * current_num[l] / move_num[l]; } + stepSize = subset(stepSize_1, isinfinite(stepSize_1) == FALSE); + lowerBound = subset(stepSize, stepSize < 0); + upperBound = subset(stepSize, stepSize > 0); lb = max(lowerBound); ub = min(upperBound); - run = sample(seq(floor(lb),floor(ub)),1); + run = sample(seq(lb,ub),1); if(run[1] == 0){ run[1] = 1; } - } // compute proposal for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + run[1] * move[k]; + proposal[k] = current[k] + as(run) * move[k]; } // compute probability of transition anyIsNegative = false; for(int k = 0; k < n; ++k){ if(proposal[k] < 0){ - anyIsNegative = true; + anyIsNegative = true; } } From 86ee68d5076be907596a509d0a82ef3710a547da Mon Sep 17 00:00:00 2001 From: Innerst Date: Fri, 9 Jun 2017 15:48:09 -0500 Subject: [PATCH 03/53] hit_and_run in metropolis_uniform_cpp and loglinear --- R/RcppExports.R | 4 +- R/loglinear.r | 5 ++- src/RcppExports.cpp | 26 +++++++++-- src/metropolis_hypergeometric_cpp.cpp | 61 ++++++++++++++++---------- src/metropolis_uniform_cpp.cpp | 62 ++++++++++++++++++++++++--- 5 files changed, 122 insertions(+), 36 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index f64eee7..1a7cfdb 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -29,8 +29,8 @@ metropolis_hypergeometric_cpp <- function(current, moves, iter, thin, hit_and_ru .Call('algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, iter, thin, hit_and_run) } -metropolis_uniform_cpp <- function(current, moves, iter, thin) { - .Call('algstat_metropolis_uniform_cpp', PACKAGE = 'algstat', current, moves, iter, thin) +metropolis_uniform_cpp <- function(current, moves, iter, thin, hit_and_run) { + .Call('algstat_metropolis_uniform_cpp', PACKAGE = 'algstat', current, moves, iter, thin, hit_and_run) } rfiberOne <- function(A, b) { diff --git a/R/loglinear.r b/R/loglinear.r index 73d6c67..a7d8d56 100644 --- a/R/loglinear.r +++ b/R/loglinear.r @@ -593,7 +593,8 @@ loglinear <- function(model, data, init = tab2vec(data), iter = 1E4, burn = 1000, thin = 10, engine = c("Cpp","R"), - method = c("ipf", "mcmc"), moves, + method = c("ipf", "mcmc"), moves, + hit_and_run = FALSE, ...) { @@ -713,7 +714,7 @@ loglinear <- function(model, data, ## run metropolis-hastings ################################################## init <- unname(init) # init - out <- metropolis(init, moves, iter = iter, burn = burn, thin = thin, engine = engine) + out <- metropolis(init, moves, iter = iter, burn = burn, thin = thin, engine = engine, hit_and_run = hit_and_run) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6aeb2e6..8464bb2 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -92,8 +92,8 @@ BEGIN_RCPP END_RCPP } // metropolis_uniform_cpp -List metropolis_uniform_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin); -RcppExport SEXP algstat_metropolis_uniform_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { +List metropolis_uniform_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin, bool hit_and_run); +RcppExport SEXP algstat_metropolis_uniform_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -101,7 +101,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< IntegerMatrix >::type moves(movesSEXP); Rcpp::traits::input_parameter< int >::type iter(iterSEXP); Rcpp::traits::input_parameter< int >::type thin(thinSEXP); - rcpp_result_gen = Rcpp::wrap(metropolis_uniform_cpp(current, moves, iter, thin)); + Rcpp::traits::input_parameter< bool >::type hit_and_run(hit_and_runSEXP); + rcpp_result_gen = Rcpp::wrap(metropolis_uniform_cpp(current, moves, iter, thin, hit_and_run)); return rcpp_result_gen; END_RCPP } @@ -131,3 +132,22 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } + +static const R_CallMethodDef CallEntries[] = { + {"algstat_computeCRsCpp", (DL_FUNC) &algstat_computeCRsCpp, 3}, + {"algstat_computeG2sCpp", (DL_FUNC) &algstat_computeG2sCpp, 2}, + {"algstat_computeNMsCpp", (DL_FUNC) &algstat_computeNMsCpp, 2}, + {"algstat_computeUProbsCpp", (DL_FUNC) &algstat_computeUProbsCpp, 1}, + {"algstat_computeX2sCpp", (DL_FUNC) &algstat_computeX2sCpp, 2}, + {"algstat_isinfinite", (DL_FUNC) &algstat_isinfinite, 1}, + {"algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &algstat_metropolis_hypergeometric_cpp, 5}, + {"algstat_metropolis_uniform_cpp", (DL_FUNC) &algstat_metropolis_uniform_cpp, 5}, + {"algstat_rfiberOne", (DL_FUNC) &algstat_rfiberOne, 2}, + {"algstat_walk", (DL_FUNC) &algstat_walk, 4}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_algstat(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 7783eff..9b33c8e 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -1,7 +1,11 @@ #include +#include +#include #include "isinfinite.h" using namespace Rcpp; +// [[Rcpp::plugins("cpp11")]] + // [[Rcpp::export]] @@ -25,54 +29,67 @@ List metropolis_hypergeometric_cpp( IntegerVector move(n); double acceptProb = 0; NumericVector stepSize_1(n); - NumericVector stepSize; NumericVector current_num(n); NumericVector move_num(n); - NumericVector upperBound; - NumericVector lowerBound; + NumericVector s(nMoves); + for(int i = 0; i < nMoves; ++i){ + int num = 0; + for(int j = 0; j < n;++j){ + if(moves(i,j) != 0){ + ++num; + } + } + s[i] = num; + } + int ss = min(s); + NumericVector stepSize(ss); + NumericVector upperBound(ss); + NumericVector lowerBound(ss); double lb; double ub; - IntegerVector run(1); + int run; Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); Function runif("runif"); unifs = runif(nTotalSamples); Function print("print"); - Function subset("subset"); - - for(int i = 0; i < iter; ++i){ for(int j = 0; j < thin; ++j){ - + // make move for(int k = 0; k < n; ++k){ move[k] = moves(k, whichMove[thin*i+j]-1); } - // If hit_and_run is true, choose how far to run if(hit_and_run == TRUE){ current_num = as(current); move_num = as(move); - for(int l = 0; l < n; ++l){ - stepSize_1[l] = -1 * current_num[l] / move_num[l]; + for(int i = 0; i < n; ++i){ + stepSize_1[i] = -1 * current_num[i] / move_num[i]; } - stepSize = subset(stepSize_1, isinfinite(stepSize_1) == FALSE); - lowerBound = subset(stepSize, stepSize < 0); - upperBound = subset(stepSize, stepSize > 0); + stepSize = stepSize_1[isinfinite(stepSize_1) == FALSE]; + lowerBound = stepSize[stepSize < 0]; + upperBound = stepSize[stepSize > 0]; lb = max(lowerBound); ub = min(upperBound); - run = sample(seq(lb,ub),1); - if(run[1] == 0){ - run[1] = 1; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(lb,ub); + run = dis(gen); + if(run == 0){ + run = 1; } } - - // compute proposal - for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + as(run) * move[k]; + if(hit_and_run == TRUE){ + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + run * move[k]; + } + }else{ + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + move[k]; + } } - // compute probability of transition anyIsNegative = false; for(int k = 0; k < n; ++k){ diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index 9274428..38b8e3d 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -1,12 +1,17 @@ #include +#include +#include +#include "isinfinite.h" using namespace Rcpp; +// [[Rcpp::plugins("cpp11")]] + // [[Rcpp::export]] List metropolis_uniform_cpp( IntegerVector current, IntegerMatrix moves, - int iter, int - thin + int iter, int thin, + bool hit_and_run ){ int nTotalSamples = iter * thin; // total number of steps @@ -20,6 +25,26 @@ List metropolis_uniform_cpp( bool anyIsNegative; IntegerVector move(n); double acceptProb = 0; + NumericVector stepSize_1(n); + NumericVector current_num(n); + NumericVector move_num(n); + NumericVector s(nMoves); + for(int i = 0; i < nMoves; ++i){ + int num = 0; + for(int j = 0; j < n;++j){ + if(moves(i,j) != 0){ + ++num; + } + } + s[i] = num; + } + int ss = min(s); + NumericVector stepSize(ss); + NumericVector upperBound(ss); + NumericVector lowerBound(ss); + double lb; + double ub; + int run; Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); @@ -34,12 +59,35 @@ List metropolis_uniform_cpp( for(int k = 0; k < n; ++k){ move[k] = moves(k, whichMove[thin*i+j]-1); } - - // compute proposal - for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + move[k]; + if(hit_and_run == TRUE){ + current_num = as(current); + move_num = as(move); + for(int i = 0; i < n; ++i){ + stepSize_1[i] = -1 * current_num[i] / move_num[i]; + } + stepSize = stepSize_1[isinfinite(stepSize_1) == FALSE]; + lowerBound = stepSize[stepSize < 0]; + upperBound = stepSize[stepSize > 0]; + lb = max(lowerBound); + ub = min(upperBound); + //run = sample(seq(lb,ub),1); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(lb,ub); + run = dis(gen); + if(run == 0){ + run = 1; + } + } + if(hit_and_run == TRUE){ + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + run * move[k]; + } + }else{ + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + move[k]; + } } - // compute probability of transition anyIsNegative = false; for(int k = 0; k < n; ++k){ From 52ffef78902990fa3f5918d2290189e8bf914ae0 Mon Sep 17 00:00:00 2001 From: Innerst Date: Wed, 28 Jun 2017 15:09:03 -0500 Subject: [PATCH 04/53] hit_and_run cleanup and sis in algstat --- .DS_Store | Bin 8196 -> 8196 bytes DESCRIPTION | 2 +- R/RcppExports.R | 12 +-- R/loglinear.r | 6 +- R/metropolis.r | 38 ++++++-- man/loglinear.Rd | 2 +- man/metropolis.Rd | 11 ++- src/RcppExports.cpp | 36 +++---- src/isinfinite.cpp | 11 --- src/isinfinite.h | 20 ---- src/metropolis_hypergeometric_cpp.cpp | 99 ++++++++++--------- src/metropolis_uniform_cpp.cpp | 73 ++++++-------- src/sis_tbl.cpp | 133 ++++++++++++++++++++++++++ src/sis_tbl.h | 9 ++ 14 files changed, 291 insertions(+), 161 deletions(-) delete mode 100644 src/isinfinite.cpp delete mode 100644 src/isinfinite.h create mode 100644 src/sis_tbl.cpp create mode 100644 src/sis_tbl.h diff --git a/.DS_Store b/.DS_Store index b87d1589463e86143f695eda7906eb4df63bb6df..ab93edd3dcba1891f4e8ac44b27b4c7790f4de5f 100644 GIT binary patch delta 614 zcmZp1XmQwZUVy{G$UsNI)WUS~b%996zR8Ay;;c^2E@G~e`9)?=ej)flLO_t;PEtxL zG9|Gj)iEU{HHCwNL)jtLVe$r1yUA|CN%}h&cQWo`+|77|@hsyt#`_Flz{Cil8Qh^X zlwy1^`L3`5BmZPRkrkpUntH|-woVbr`MJ5Nc_qamd5O8Hldp=X@W+8{^GPf&@h?ct zOHG;lPsE{~N0vvKlT(1+{^!p2@`>@%@t2hGNXNNz-WZbXh0 zr4~nSa`Rd)ij;EF$^i$G9e;)2<^y4Q&Mqh^udHoo>JN;CLYf{n z27X&QRU_OruIU=%d#U->UR@1o{2^7Dm2>5MLeB^#H|ZJ9Ou0}lCiIZdxLA}*fTB;GLAySKJ>!f-@J4Bjb+APT{ic~N|p`0X|BV#}3DMc~zuM{`G zuv?NIg%{yX_!Pd8!0+vuwcY4}4+9v4iXkwBpko{pFfff7%wi7nSi};Rv4T~sVI3RT U#1?iU9>)i_@% diff --git a/DESCRIPTION b/DESCRIPTION index 2beb91b..78cc1d3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,7 @@ Depends: mpoly, latter, m2r -LinkingTo: Rcpp +LinkingTo: Rcpp, rcdd Imports: stringr, reshape2, diff --git a/R/RcppExports.R b/R/RcppExports.R index 1a7cfdb..5b03504 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,12 +21,8 @@ computeX2sCpp <- function(x, exp) { .Call('algstat_computeX2sCpp', PACKAGE = 'algstat', x, exp) } -isinfinite <- function(x) { - .Call('algstat_isinfinite', PACKAGE = 'algstat', x) -} - -metropolis_hypergeometric_cpp <- function(current, moves, iter, thin, hit_and_run) { - .Call('algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, iter, thin, hit_and_run) +metropolis_hypergeometric_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS) { + .Call('algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS) } metropolis_uniform_cpp <- function(current, moves, iter, thin, hit_and_run) { @@ -37,6 +33,10 @@ rfiberOne <- function(A, b) { .Call('algstat_rfiberOne', PACKAGE = 'algstat', A, b) } +sis_tbl <- function(A, suff_stats) { + .Call('algstat_sis_tbl', PACKAGE = 'algstat', A, suff_stats) +} + walk <- function(current, moves, iter, thin) { .Call('algstat_walk', PACKAGE = 'algstat', current, moves, iter, thin) } diff --git a/R/loglinear.r b/R/loglinear.r index a7d8d56..3068337 100644 --- a/R/loglinear.r +++ b/R/loglinear.r @@ -595,6 +595,7 @@ loglinear <- function(model, data, thin = 10, engine = c("Cpp","R"), method = c("ipf", "mcmc"), moves, hit_and_run = FALSE, + SIS = FALSE, ...) { @@ -672,6 +673,8 @@ loglinear <- function(model, data, # make configuration (model) matrix A <- hmat(dim(data), facets) } + + suff_stats <- unname(A %*% init) ## construct A matrix and compute moves ################################################## @@ -714,7 +717,8 @@ loglinear <- function(model, data, ## run metropolis-hastings ################################################## init <- unname(init) # init - out <- metropolis(init, moves, iter = iter, burn = burn, thin = thin, engine = engine, hit_and_run = hit_and_run) + out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, + engine = engine, hit_and_run = hit_and_run, SIS = SIS) diff --git a/R/metropolis.r b/R/metropolis.r index c1a4c98..1ad8f84 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -145,9 +145,9 @@ #' } #' #' -metropolis <- function(init, moves, iter = 1E3, burn = 0, thin = 1, +metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, thin = 1, dist = c("hypergeometric","uniform"), engine = c("Cpp","R"), - hit_and_run = FALSE + hit_and_run = FALSE, SIS = FALSE ){ ## preliminary checking @@ -179,13 +179,24 @@ metropolis <- function(init, moves, iter = 1E3, burn = 0, thin = 1, if(hit_and_run) { - move <- moves[,sample(nMoves,1)] + move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] w_move <- move[move != 0] w_current <- current[move != 0] w_moves <- -1 * w_current / w_move lower_bound <- if(any(w_moves < 0)){max(subset(w_moves,subset = w_moves < 0))}else{1} upper_bound <- if(any(w_moves > 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} - c_s <- sample(c(lower_bound:-1,1:upper_bound),1) + w_propStatelow <- current + lower_bound * move + w_propStateup <- current + upper_bound * move + if(any(w_propStatelow < 0)){ + lower_bound <- 1 + } + if(any(w_propStateup < 0)){ + upper_bound <- -1 + } + c_s <- sample(lower_bound:upper_bound,1) + if(c_s == 0){ + c_s <- 1 + } propState <- current + c_s * move }else{ move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] @@ -219,13 +230,24 @@ metropolis <- function(init, moves, iter = 1E3, burn = 0, thin = 1, if(hit_and_run) { - move <- c(moves[,sample(nMoves,1)]) + move <- moves[,sample(nMoves,1)] w_move <- move[move != 0] w_current <- current[move != 0] w_moves <- (-1 * w_current) / w_move lower_bound <- if(any(w_moves < 0)){max(subset(w_moves,subset = w_moves < 0))}else{1} upper_bound <- if(any(w_moves > 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} - c_s <- sample(c(lower_bound:-1,1:upper_bound),1) + w_propStatelow <- current + lower_bound * move + w_propStateup <- current + upper_bound * move + if(any(w_propStatelow < 0)){ + lower_bound <- 1 + } + if(any(w_propStateup < 0)){ + upper_bound <- -1 + } + c_s <- sample(lower_bound:upper_bound,1) + if(c_s == 0){ + c_s <- 1 + } propState <- current + c_s * move }else{ move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] @@ -276,8 +298,8 @@ metropolis <- function(init, moves, iter = 1E3, burn = 0, thin = 1, metropolis_uniform_cpp } message("Running chain (C++)... ", appendLF = FALSE) - if (burn > 0) current <- sampler(current, allMoves, burn, 1, hit_and_run)$steps[,burn] - out <- sampler(current, allMoves, iter, thin, hit_and_run) + if (burn > 0) current <- sampler(current, allMoves, suff_stats, config, burn, 1, hit_and_run, SIS)$steps[,burn] + out <- sampler(current, allMoves, suff_stats, config, iter, thin, hit_and_run, SIS) out$moves <- moves message("done.") diff --git a/man/loglinear.Rd b/man/loglinear.Rd index 321bff8..479bc42 100644 --- a/man/loglinear.Rd +++ b/man/loglinear.Rd @@ -7,7 +7,7 @@ \usage{ loglinear(model, data, init = tab2vec(data), iter = 10000, burn = 1000, thin = 10, engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, - ...) + hit_and_run = FALSE, SIS = FALSE, ...) } \arguments{ \item{model}{hierarchical log-linear model specification} diff --git a/man/metropolis.Rd b/man/metropolis.Rd index 157158b..a51c50e 100644 --- a/man/metropolis.Rd +++ b/man/metropolis.Rd @@ -5,10 +5,12 @@ \alias{rawMetropolis} \title{The Metropolis Algorithm} \usage{ -metropolis(init, moves, iter = 1000, burn = 0, thin = 1, - dist = c("hypergeometric", "uniform"), engine = c("Cpp", "R")) +metropolis(init, moves, suff_stats, config, iter = 1000, burn = 0, + thin = 1, dist = c("hypergeometric", "uniform"), engine = c("Cpp", "R"), + hit_and_run = FALSE, SIS = FALSE) -rawMetropolis(init, moves, iter = 1000, dist = "hypergeometric") +rawMetropolis(init, moves, iter = 1000, dist = "hypergeometric", + hit_and_run = FALSE) } \arguments{ \item{init}{the initial step} @@ -26,6 +28,9 @@ they are arranged as the columns of a matrix.} or "uniform"} \item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} + +\item{hit_and_run}{Whether or not to use the hit and run algorithm in +the metropolis algorithm} } \value{ a list diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 8464bb2..27e1a41 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -65,29 +65,21 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// isinfinite -LogicalVector isinfinite(NumericVector x); -RcppExport SEXP algstat_isinfinite(SEXP xSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< NumericVector >::type x(xSEXP); - rcpp_result_gen = Rcpp::wrap(isinfinite(x)); - return rcpp_result_gen; -END_RCPP -} // metropolis_hypergeometric_cpp -List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin, bool hit_and_run); -RcppExport SEXP algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP) { +List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS); +RcppExport SEXP algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type current(currentSEXP); Rcpp::traits::input_parameter< IntegerMatrix >::type moves(movesSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type suff_stats(suff_statsSEXP); + Rcpp::traits::input_parameter< IntegerMatrix >::type config(configSEXP); Rcpp::traits::input_parameter< int >::type iter(iterSEXP); Rcpp::traits::input_parameter< int >::type thin(thinSEXP); Rcpp::traits::input_parameter< bool >::type hit_and_run(hit_and_runSEXP); - rcpp_result_gen = Rcpp::wrap(metropolis_hypergeometric_cpp(current, moves, iter, thin, hit_and_run)); + Rcpp::traits::input_parameter< bool >::type SIS(SISSEXP); + rcpp_result_gen = Rcpp::wrap(metropolis_hypergeometric_cpp(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS)); return rcpp_result_gen; END_RCPP } @@ -118,6 +110,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// sis_tbl +IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats); +RcppExport SEXP algstat_sis_tbl(SEXP ASEXP, SEXP suff_statsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerMatrix >::type A(ASEXP); + Rcpp::traits::input_parameter< IntegerVector >::type suff_stats(suff_statsSEXP); + rcpp_result_gen = Rcpp::wrap(sis_tbl(A, suff_stats)); + return rcpp_result_gen; +END_RCPP +} // walk IntegerMatrix walk(IntegerVector current, IntegerMatrix moves, int iter, int thin); RcppExport SEXP algstat_walk(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { @@ -139,10 +143,10 @@ static const R_CallMethodDef CallEntries[] = { {"algstat_computeNMsCpp", (DL_FUNC) &algstat_computeNMsCpp, 2}, {"algstat_computeUProbsCpp", (DL_FUNC) &algstat_computeUProbsCpp, 1}, {"algstat_computeX2sCpp", (DL_FUNC) &algstat_computeX2sCpp, 2}, - {"algstat_isinfinite", (DL_FUNC) &algstat_isinfinite, 1}, - {"algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &algstat_metropolis_hypergeometric_cpp, 5}, + {"algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &algstat_metropolis_hypergeometric_cpp, 8}, {"algstat_metropolis_uniform_cpp", (DL_FUNC) &algstat_metropolis_uniform_cpp, 5}, {"algstat_rfiberOne", (DL_FUNC) &algstat_rfiberOne, 2}, + {"algstat_sis_tbl", (DL_FUNC) &algstat_sis_tbl, 2}, {"algstat_walk", (DL_FUNC) &algstat_walk, 4}, {NULL, NULL, 0} }; diff --git a/src/isinfinite.cpp b/src/isinfinite.cpp deleted file mode 100644 index 45e07e5..0000000 --- a/src/isinfinite.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include -#include "isinfinite.h" -using namespace Rcpp; - - -// [[Rcpp::export]] -LogicalVector isinfinite(NumericVector x){ - return is_infinite(x); -} - - diff --git a/src/isinfinite.h b/src/isinfinite.h deleted file mode 100644 index 82e368d..0000000 --- a/src/isinfinite.h +++ /dev/null @@ -1,20 +0,0 @@ -// -// isinfinite.h -// -// -// Created by Innerst, Grant on 6/5/17. -// -// - -#ifndef ____isinfinite__ -#define ____isinfinite__ - -#include -#include - -#endif /* defined(____isinfinite__) */ - -Rcpp::LogicalVector isinfinite(Rcpp::NumericVector x); - - - diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 9b33c8e..0ac5847 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -1,58 +1,45 @@ #include -#include -#include -#include "isinfinite.h" +#include "sis_tbl.h" using namespace Rcpp; -// [[Rcpp::plugins("cpp11")]] - // [[Rcpp::export]] List metropolis_hypergeometric_cpp( IntegerVector current, - IntegerMatrix moves, + IntegerMatrix moves, + IntegerVector suff_stats, + IntegerMatrix config, int iter, int thin, - bool hit_and_run + bool hit_and_run, + bool SIS ){ - int nTotalSamples = iter * thin; // total number of steps int n = current.size(); // number of cells int nMoves = moves.ncol(); // number of moves IntegerMatrix steps(n, iter); // columns are states IntegerVector whichMove(nTotalSamples); // move selection NumericVector unifs(nTotalSamples); // for transition probabilities + NumericVector unifs2(nTotalSamples); IntegerVector proposal(n); // the proposed moves double prob; // the probability of transition bool anyIsNegative; IntegerVector move(n); double acceptProb = 0; - NumericVector stepSize_1(n); - NumericVector current_num(n); - NumericVector move_num(n); - NumericVector s(nMoves); - for(int i = 0; i < nMoves; ++i){ - int num = 0; - for(int j = 0; j < n;++j){ - if(moves(i,j) != 0){ - ++num; - } - } - s[i] = num; - } - int ss = min(s); - NumericVector stepSize(ss); - NumericVector upperBound(ss); - NumericVector lowerBound(ss); - double lb; - double ub; - int run; - - Function sample("sample"); + IntegerVector current_num; + IntegerVector move_num; + IntegerVector stepSize; + IntegerVector upperBound; + IntegerVector lowerBound; + int lb; + int ub; + IntegerVector run; + whichMove = sample(nMoves, nTotalSamples, 1); Function runif("runif"); unifs = runif(nTotalSamples); + unifs2 = runif(nTotalSamples); Function print("print"); for(int i = 0; i < iter; ++i){ @@ -63,33 +50,43 @@ List metropolis_hypergeometric_cpp( move[k] = moves(k, whichMove[thin*i+j]-1); } if(hit_and_run == TRUE){ - current_num = as(current); - move_num = as(move); - for(int i = 0; i < n; ++i){ - stepSize_1[i] = -1 * current_num[i] / move_num[i]; - } - stepSize = stepSize_1[isinfinite(stepSize_1) == FALSE]; + current_num = current[move != 0]; + move_num = move[move != 0]; + stepSize = (-1 * current_num) / move_num; lowerBound = stepSize[stepSize < 0]; upperBound = stepSize[stepSize > 0]; lb = max(lowerBound); ub = min(upperBound); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution dis(lb,ub); - run = dis(gen); - if(run == 0){ - run = 1; + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0){ + lb = 1; + } + if(test2[i] < 0){ + ub = -1; + } + } + IntegerVector range = seq(lb,ub); + run = sample(range,1); + if(run[1] == 0){ + run[1] = 1; } } if(hit_and_run == TRUE){ for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + run * move[k]; + proposal[k] = current[k] + as(run) * move[k]; } }else{ for(int k = 0; k < n; ++k){ proposal[k] = current[k] + move[k]; } } + if(SIS){ + if(unifs2[i] < .05){ + proposal = sis_tbl(config, suff_stats); + } + } // compute probability of transition anyIsNegative = false; for(int k = 0; k < n; ++k){ @@ -97,40 +94,40 @@ List metropolis_hypergeometric_cpp( anyIsNegative = true; } } - + if(anyIsNegative){ prob = 0; } else { prob = exp( sum(lgamma(current+1)) - sum(lgamma(proposal+1)) ); } - + if(prob > 1){ prob = 1; } - + // store acceptance probability acceptProb = acceptProb + prob / nTotalSamples; - + // make move if(unifs[thin*i+j] < prob){ for(int k = 0; k < n; ++k){ current[k] = proposal[k]; } } - + } - + // assign state move for(int k = 0; k < n; ++k){ steps(k,i) = current[k]; } } - + // create out list List out = List::create( Rcpp::Named("steps") = steps, Rcpp::Named("acceptProb") = acceptProb ); - + return out; } diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index 38b8e3d..d37bbf9 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -1,10 +1,6 @@ #include -#include -#include -#include "isinfinite.h" -using namespace Rcpp; -// [[Rcpp::plugins("cpp11")]] +using namespace Rcpp; // [[Rcpp::export]] List metropolis_uniform_cpp( @@ -25,63 +21,54 @@ List metropolis_uniform_cpp( bool anyIsNegative; IntegerVector move(n); double acceptProb = 0; - NumericVector stepSize_1(n); - NumericVector current_num(n); - NumericVector move_num(n); - NumericVector s(nMoves); - for(int i = 0; i < nMoves; ++i){ - int num = 0; - for(int j = 0; j < n;++j){ - if(moves(i,j) != 0){ - ++num; - } - } - s[i] = num; - } - int ss = min(s); - NumericVector stepSize(ss); - NumericVector upperBound(ss); - NumericVector lowerBound(ss); - double lb; - double ub; - int run; - - Function sample("sample"); + IntegerVector current_num; + IntegerVector move_num; + IntegerVector stepSize; + IntegerVector upperBound; + IntegerVector lowerBound; + int lb; + int ub; + IntegerVector run; + whichMove = sample(nMoves, nTotalSamples, 1); Function runif("runif"); unifs = runif(nTotalSamples); Function print("print"); - + for(int i = 0; i < iter; ++i){ for(int j = 0; j < thin; ++j){ - + // make move for(int k = 0; k < n; ++k){ move[k] = moves(k, whichMove[thin*i+j]-1); } if(hit_and_run == TRUE){ - current_num = as(current); - move_num = as(move); - for(int i = 0; i < n; ++i){ - stepSize_1[i] = -1 * current_num[i] / move_num[i]; - } - stepSize = stepSize_1[isinfinite(stepSize_1) == FALSE]; + current_num = current[move != 0]; + move_num = move[move != 0]; + stepSize = (-1 * current_num) / move_num; lowerBound = stepSize[stepSize < 0]; upperBound = stepSize[stepSize > 0]; lb = max(lowerBound); ub = min(upperBound); - //run = sample(seq(lb,ub),1); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution dis(lb,ub); - run = dis(gen); - if(run == 0){ - run = 1; + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0){ + lb = 1; + } + if(test2[i] < 0){ + ub = -1; + } + } + IntegerVector range = seq(lb,ub); + run = sample(range,1); + if(run[1] == 0){ + run[1] = 1; } } if(hit_and_run == TRUE){ for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + run * move[k]; + proposal[k] = current[k] + as(run) * move[k]; } }else{ for(int k = 0; k < n; ++k){ diff --git a/src/sis_tbl.cpp b/src/sis_tbl.cpp new file mode 100644 index 0000000..162f084 --- /dev/null +++ b/src/sis_tbl.cpp @@ -0,0 +1,133 @@ +#include +#include "rcddAPI.h" +using namespace Rcpp; + +// [[Rcpp::export]] + +IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { + int w = 0; + int n = A.nrow(); + int n1 = A.ncol(); + IntegerVector tbl(n1); + NumericMatrix constr(n+n1, n1+2); + NumericVector objfun(n1+1); + IntegerVector maxs(n1); + while(w < 1){ + IntegerMatrix work = A; + IntegerVector work2 = suff_stats; + int min; + int max; + LogicalVector first(1); + first[0] = true; + LogicalVector second(1); + second[0] = false; + CharacterVector solver(1); + solver = "DualSimplex"; + bool isAnyNegative = false; + bool lpsolved = true; + for(int i = 0; i= n && l == 0){ + constr(k,l) = 0; + }if(k >= n && l == 1){ + constr(k,l) = 0; + } + for(int m = 2; m < n1 + 2; ++m){ + if(k >= n && l == m){ + if(z == m){ + constr(k,l) = -1; + }else{ + constr(k,l) = 0; + } + } + } + } + if(k >= n){ + ++z; + } + } + + SEXP out1 = lpcdd_f(constr, objfun, first, solver); + String solution = VECTOR_ELT(out1, 0); + if(solution == "Optimal"){ + IntegerVector val = VECTOR_ELT(out1,3); + min = Rcpp::as(val); + }else{ + lpsolved = false; + break; + } + SEXP out2 = lpcdd_f(constr, objfun, second, solver); + String solution2 = VECTOR_ELT(out2, 0); + if(solution2 == "Optimal"){ + IntegerVector val2 = VECTOR_ELT(out2,3); + max = Rcpp::as(val2); + }else{ + lpsolved = false; + break; + } + if(min == max){ + tbl[i] = min; + } else { + IntegerVector range = seq(min,max); + IntegerVector value = sample(range,1); + tbl[i] = Rcpp::as(value); + } + // Update constraints(work and work2) + IntegerVector index; + int y = 0; + + for(int o = 0; o < n; ++o){ + if(work(o,i) == 1){ + work(o,i) = 0; + index[y] = o; + ++y; + } + } + int x = 0; + for(int p = 0; p < n; ++p){ + if(p == index[x]){ + work2[p] = work2[p] - tbl[i]; + ++x; + } + } + } + //Check if elements are non-zero + for(int q = 0;q < n; ++q){ + if(tbl[q] < 0){ + isAnyNegative = true; + } + } + if(isAnyNegative == false && lpsolved == true){ + ++w; + } + } + return tbl; +} + + + +/*** R +tbl <- c(50, 54, 56, 43, 59, 32, 54, 45, 82) +#Configuration matrix of a 3 by 3 matrix + A <- hmat(c(3,3),1:2) + suff_stats <- A %*% t(t(tbl)) + sis_tbl(A, suff_stats) +*/ diff --git a/src/sis_tbl.h b/src/sis_tbl.h new file mode 100644 index 0000000..1315b33 --- /dev/null +++ b/src/sis_tbl.h @@ -0,0 +1,9 @@ +// sis_tbl.h + +#ifndef __SIS_TBL_H__ +#define __SIS_TBL_H__ + +#include +#endif + +Rcpp::IntegerVector sis_tbl(Rcpp::IntegerMatrix A, Rcpp::IntegerVector suff_stats); \ No newline at end of file From 9a313f99f8a4e907cf770f9c0f7da5bda79f2edb Mon Sep 17 00:00:00 2001 From: Innerst Date: Mon, 10 Jul 2017 10:53:22 -0500 Subject: [PATCH 05/53] non-uniform move sampling; sis fix --- R/RcppExports.R | 4 +- R/metropolis.r | 82 ++++++++++++++++---- R/sis_tbl.R | 32 ++++++++ src/RcppExports.cpp | 9 ++- src/metropolis_hypergeometric_cpp.cpp | 61 ++++++++++++--- src/sis_tbl.cpp | 105 +++++++++++++------------- 6 files changed, 212 insertions(+), 81 deletions(-) create mode 100644 R/sis_tbl.R diff --git a/R/RcppExports.R b/R/RcppExports.R index 5b03504..f7abfb3 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,8 +21,8 @@ computeX2sCpp <- function(x, exp) { .Call('algstat_computeX2sCpp', PACKAGE = 'algstat', x, exp) } -metropolis_hypergeometric_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS) { - .Call('algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS) +metropolis_hypergeometric_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) { + .Call('algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) } metropolis_uniform_cpp <- function(current, moves, iter, thin, hit_and_run) { diff --git a/R/metropolis.r b/R/metropolis.r index 1ad8f84..f580283 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -147,7 +147,7 @@ #' metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, thin = 1, dist = c("hypergeometric","uniform"), engine = c("Cpp","R"), - hit_and_run = FALSE, SIS = FALSE + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE ){ ## preliminary checking @@ -166,17 +166,26 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th nMoves <- ncol(moves) state <- matrix(nrow = nrow(moves), ncol = iter) - ## run burn-in current <- unname(init) - unifs <- runif(burn) + train_current <- unname(init) message("Running chain (R)... ", appendLF = FALSE) + + #Setting up non-uniform move sampling framework + if(non_uniform == TRUE){ + move_dist <- rep(1,nMoves) + counter <- nMoves + } + unifs <- runif(burn) + if(non_uniform == TRUE){ + move_prob <- runif(burn) + } if(burn > 0) { for(k in 1:burn){ - + #Hit and Run option if(hit_and_run) { move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] @@ -198,11 +207,29 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th c_s <- 1 } propState <- current + c_s * move - }else{ + } + + #Non-uniform move sampling option + if(non_uniform == TRUE) + { + + for(l in 1:nMoves){ + if(move_prob[k] <= sum(move_dist[1:l])/counter){ + move <- moves[,l] + which_move <- l + } + } + move <- sample(c(-1,1), 1) * move + propState <- current + move + } + + else{ move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] propState <- current + move } - if(any(propState < 0)){ + + + if(any(propState < 0)){ prob <- 0 } else { if(dist == "hypergeometric"){ @@ -212,7 +239,15 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th } } - if(unifs[k] < prob) current <- propState # else current + if(non_uniform == TRUE){ + if(unifs[k] < prob){ + current <- propState + move_dist[which_move] <- move_dist[which_move] + 1 + counter <- counter + 1 + } + }else{ + if(unifs[k] < prob) current <- propState # else current + } } state[,1] <- current @@ -222,7 +257,7 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th totalRuns <- 0 probTotal <- 0 - unifs <- runif(iter*thin) + unifs <- runif(iter*thin) for(k in 2:iter){ @@ -249,7 +284,21 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th c_s <- 1 } propState <- current + c_s * move - }else{ + } + if(non_uniform == TRUE) + { + move_prob <- runif(1) + for(l in 1:nMoves){ + if(move_prob <= sum(move_dist[1:l])/counter){ + move <- moves[,l] + which_move <- l + break() + } + } + move <- sample(c(-1,1), 1) * move + propState <- current + move + } + else{ move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] propState <- current + move } @@ -265,7 +314,15 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th } probTotal <- probTotal + min(1, prob) - if(unifs[k*(thin-1)+j] < prob) current <- propState # else current + if(non_uniform == TRUE){ + if(unifs[k*(thin-1)+j] < prob){ + current <- propState + move_dist[which_move] <- move_dist[which_move] + 1 + counter <- counter + 1 + } + }else{ + if(unifs[k*(thin-1)+j] < prob) current <- propState # else current + } totalRuns <- totalRuns + 1 } @@ -273,7 +330,6 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th state[,k] <- current } message("done.") - ## format output out <- list( steps = state, @@ -298,8 +354,8 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th metropolis_uniform_cpp } message("Running chain (C++)... ", appendLF = FALSE) - if (burn > 0) current <- sampler(current, allMoves, suff_stats, config, burn, 1, hit_and_run, SIS)$steps[,burn] - out <- sampler(current, allMoves, suff_stats, config, iter, thin, hit_and_run, SIS) + if (burn > 0) current <- sampler(current, allMoves, suff_stats, config, burn, 1, hit_and_run, SIS, non_uniform)$steps[,burn] + out <- sampler(current, allMoves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) out$moves <- moves message("done.") diff --git a/R/sis_tbl.R b/R/sis_tbl.R new file mode 100644 index 0000000..4f58123 --- /dev/null +++ b/R/sis_tbl.R @@ -0,0 +1,32 @@ +sis_table <- function(config_mat, suff_statistics){ + #Need to check if config_mat is a mat, suff_statistics is vector/mat, + #length of suff_statistics is same as number of rows of config_mat + + #Matricies and vectors to work with! + work_A <- config_mat + work_suff <- suff_statistics + tbl_elts <- ncol(config_mat) + num_const <- nrow(config_mat) + tbl <- vector(mode = "numeric", length = tbl_elts) + + for(i in 1:tbl_elts){ + constr <- unname(rbind(cbind(rep(1,num_const), work_suff, work_A), + cbind(rep(0,tbl_elts), rep(0,tbl_elts), diag(-1,tbl_elts)))) + objfun <- vector(mode = "numeric", length = tbl_elts) + objfun[i] <- -1 + min_lp <- lpcdd(constr, objfun) + max_lp <- lpcdd(constr, objfun, minimize = FALSE) + + if(min_lp[1] == "Optimal" && max_lp[1] == "Optimal"){ + print(minimum <- as.numeric(unname(min_lp[4]))) + print(maximum <- as.numeric(unname(max_lp[4]))) + tbl[i] <- if(isTRUE(all.equal(minimum, maximum))){minimum} + else{sample(minimum:maximum, 1)} + } else { tbl[i] <- 0 } + #Update constraints and sufficient statistics + index <- which(work_A[,i] == 1) + work_A[index,i] <- 0 + work_suff[index] <- work_suff[index] - tbl[i] + } + return(tbl) +} \ No newline at end of file diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 27e1a41..deb1338 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -66,8 +66,8 @@ BEGIN_RCPP END_RCPP } // metropolis_hypergeometric_cpp -List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS); -RcppExport SEXP algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP) { +List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform); +RcppExport SEXP algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -79,7 +79,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< int >::type thin(thinSEXP); Rcpp::traits::input_parameter< bool >::type hit_and_run(hit_and_runSEXP); Rcpp::traits::input_parameter< bool >::type SIS(SISSEXP); - rcpp_result_gen = Rcpp::wrap(metropolis_hypergeometric_cpp(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS)); + Rcpp::traits::input_parameter< bool >::type non_uniform(non_uniformSEXP); + rcpp_result_gen = Rcpp::wrap(metropolis_hypergeometric_cpp(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform)); return rcpp_result_gen; END_RCPP } @@ -143,7 +144,7 @@ static const R_CallMethodDef CallEntries[] = { {"algstat_computeNMsCpp", (DL_FUNC) &algstat_computeNMsCpp, 2}, {"algstat_computeUProbsCpp", (DL_FUNC) &algstat_computeUProbsCpp, 1}, {"algstat_computeX2sCpp", (DL_FUNC) &algstat_computeX2sCpp, 2}, - {"algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &algstat_metropolis_hypergeometric_cpp, 8}, + {"algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &algstat_metropolis_hypergeometric_cpp, 9}, {"algstat_metropolis_uniform_cpp", (DL_FUNC) &algstat_metropolis_uniform_cpp, 5}, {"algstat_rfiberOne", (DL_FUNC) &algstat_rfiberOne, 2}, {"algstat_sis_tbl", (DL_FUNC) &algstat_sis_tbl, 2}, diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 0ac5847..65dcd18 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -5,7 +5,6 @@ using namespace Rcpp; // [[Rcpp::export]] - List metropolis_hypergeometric_cpp( IntegerVector current, IntegerMatrix moves, @@ -13,7 +12,7 @@ List metropolis_hypergeometric_cpp( IntegerMatrix config, int iter, int thin, bool hit_and_run, - bool SIS + bool SIS, bool non_uniform ){ int nTotalSamples = iter * thin; // total number of steps int n = current.size(); // number of cells @@ -22,6 +21,7 @@ List metropolis_hypergeometric_cpp( IntegerVector whichMove(nTotalSamples); // move selection NumericVector unifs(nTotalSamples); // for transition probabilities NumericVector unifs2(nTotalSamples); + NumericVector unifs3(nTotalSamples); IntegerVector proposal(n); // the proposed moves double prob; // the probability of transition bool anyIsNegative; @@ -40,16 +40,44 @@ List metropolis_hypergeometric_cpp( Function runif("runif"); unifs = runif(nTotalSamples); unifs2 = runif(nTotalSamples); + unifs3 = runif(nTotalSamples); + unifs3[0] = .999; Function print("print"); + + NumericVector move_dist = rep(1.0, nMoves); + double counter = moves.ncol(); + int which_move; + for(int i = 0; i < iter; ++i){ for(int j = 0; j < thin; ++j){ + if(non_uniform == true){ + for(int l = 0; l < nMoves; ++l){ + double sums = 0; + for(int m = 0; m < l+1; ++m){ + sums = sums + move_dist[m]; + } + + if(unifs3[thin*i+j] <= sums / counter){ + + for(int k = 0; k < n; ++k){ + move[k] = moves(k, l); + } + which_move = l; + break; + } + } + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + move[k]; + } + }else{ + // make move for(int k = 0; k < n; ++k){ move[k] = moves(k, whichMove[thin*i+j]-1); } - if(hit_and_run == TRUE){ + if(hit_and_run == true){ current_num = current[move != 0]; move_num = move[move != 0]; stepSize = (-1 * current_num) / move_num; @@ -82,6 +110,7 @@ List metropolis_hypergeometric_cpp( proposal[k] = current[k] + move[k]; } } + } if(SIS){ if(unifs2[i] < .05){ proposal = sis_tbl(config, suff_stats); @@ -104,17 +133,31 @@ List metropolis_hypergeometric_cpp( if(prob > 1){ prob = 1; } - + // store acceptance probability acceptProb = acceptProb + prob / nTotalSamples; - // make move - if(unifs[thin*i+j] < prob){ - for(int k = 0; k < n; ++k){ - current[k] = proposal[k]; + + if(non_uniform == true){ + + if(unifs[thin*i+j] < prob){ + for(int k = 0; k < n; ++k){ + current[k] = proposal[k]; + } + + move_dist[which_move] = move_dist[which_move] + 1; + ++counter; + } + }else{ + // make move + if(unifs[thin*i+j] < prob){ + + for(int k = 0; k < n; ++k){ + current[k] = proposal[k]; + } + } } - } // assign state move diff --git a/src/sis_tbl.cpp b/src/sis_tbl.cpp index 162f084..a20c36c 100644 --- a/src/sis_tbl.cpp +++ b/src/sis_tbl.cpp @@ -11,51 +11,55 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { IntegerVector tbl(n1); NumericMatrix constr(n+n1, n1+2); NumericVector objfun(n1+1); - IntegerVector maxs(n1); + int p = 0; + Function print("print"); + IntegerMatrix work; + IntegerVector work2; + int min, max; + while(w < 1){ - IntegerMatrix work = A; - IntegerVector work2 = suff_stats; - int min; - int max; - LogicalVector first(1); - first[0] = true; - LogicalVector second(1); - second[0] = false; - CharacterVector solver(1); - solver = "DualSimplex"; - bool isAnyNegative = false; - bool lpsolved = true; - for(int i = 0; i= n && l == 0){ - constr(k,l) = 0; - }if(k >= n && l == 1){ - constr(k,l) = 0; - } - for(int m = 2; m < n1 + 2; ++m){ - if(k >= n && l == m){ - if(z == m){ - constr(k,l) = -1; - }else{ - constr(k,l) = 0; + int z = 2; + for(int k = 0; k < n+n1 ; ++k){ + for(int l = 0; l < n1 + 2; ++l){ + if(k < n && l == 0) constr(k,l) = 1; + + if(k < n && l == 1) constr(k,l) = work2[k]; + + for(int m = 2; m < n1 +2; ++m){ + + if(k < n && l == m) constr(k,l) = work(k,m-2); + + } + if(k >= n && l == 0){ + constr(k,l) = 0; + } + if(k >= n && l == 1){ + constr(k,l) = 0; + } + for(int m = 2; m < n1 + 2; ++m){ + if(k >= n && l == m){ + if(z == m){ + constr(k,l) = -1; + }else{ + constr(k,l) = 0; } } } @@ -83,13 +87,9 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { lpsolved = false; break; } - if(min == max){ - tbl[i] = min; - } else { IntegerVector range = seq(min,max); IntegerVector value = sample(range,1); tbl[i] = Rcpp::as(value); - } // Update constraints(work and work2) IntegerVector index; int y = 0; @@ -118,16 +118,15 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { if(isAnyNegative == false && lpsolved == true){ ++w; } + print(tbl); + ++p; + if(p > 2){ + break; + } } return tbl; } -/*** R -tbl <- c(50, 54, 56, 43, 59, 32, 54, 45, 82) -#Configuration matrix of a 3 by 3 matrix - A <- hmat(c(3,3),1:2) - suff_stats <- A %*% t(t(tbl)) - sis_tbl(A, suff_stats) -*/ + From 2e46158b1e15039bcc06fd797b915289f357ce4e Mon Sep 17 00:00:00 2001 From: Innerst Date: Mon, 10 Jul 2017 11:22:35 -0500 Subject: [PATCH 06/53] random fix --- src/metropolis_hypergeometric_cpp.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 65dcd18..025c3b2 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -41,7 +41,6 @@ List metropolis_hypergeometric_cpp( unifs = runif(nTotalSamples); unifs2 = runif(nTotalSamples); unifs3 = runif(nTotalSamples); - unifs3[0] = .999; Function print("print"); From 746eae55646a40ca2299ff63b1fa7eaab2b2a328 Mon Sep 17 00:00:00 2001 From: Innerst Date: Mon, 10 Jul 2017 14:50:55 -0500 Subject: [PATCH 07/53] another sis fix, non_uniform to metropolis_uniform_cpp --- NAMESPACE | 3 +- R/metropolis.r | 6 ++ R/sis_tbl.R | 4 +- src/metropolis_uniform_cpp.cpp | 122 +++++++++++++++++++++++---------- src/sis_tbl.cpp | 1 - 5 files changed, 97 insertions(+), 39 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b971e29..ab367c7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -70,4 +70,5 @@ importFrom(stats,runif) importFrom(stats,sd) importFrom(utils,combn) importFrom(utils,download.file) -useDynLib(algstat) +importFrom(rcdd, lpcdd) +useDynLib(algstat) \ No newline at end of file diff --git a/R/metropolis.r b/R/metropolis.r index f580283..c0f2ec9 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -303,6 +303,12 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th propState <- current + move } + if(SIS == TRUE){ + if(runif(1) <= .05){ + propState <- sis_table(config, suff_stats) + } + } + if(any(propState < 0)){ prob <- 0 } else { diff --git a/R/sis_tbl.R b/R/sis_tbl.R index 4f58123..414a597 100644 --- a/R/sis_tbl.R +++ b/R/sis_tbl.R @@ -18,8 +18,8 @@ sis_table <- function(config_mat, suff_statistics){ max_lp <- lpcdd(constr, objfun, minimize = FALSE) if(min_lp[1] == "Optimal" && max_lp[1] == "Optimal"){ - print(minimum <- as.numeric(unname(min_lp[4]))) - print(maximum <- as.numeric(unname(max_lp[4]))) + minimum <- as.numeric(unname(min_lp[4])) + maximum <- as.numeric(unname(max_lp[4])) tbl[i] <- if(isTRUE(all.equal(minimum, maximum))){minimum} else{sample(minimum:maximum, 1)} } else { tbl[i] <- 0 } diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index d37bbf9..6dfea3e 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -5,9 +5,12 @@ using namespace Rcpp; // [[Rcpp::export]] List metropolis_uniform_cpp( IntegerVector current, - IntegerMatrix moves, - int iter, int thin, - bool hit_and_run + IntegerMatrix moves, + IntegerVector suff_stats, + IntegerMatrix config, + int iter, int thin, + bool hit_and_run, + bool SIS, bool non_uniform ){ int nTotalSamples = iter * thin; // total number of steps @@ -16,6 +19,8 @@ List metropolis_uniform_cpp( IntegerMatrix steps(n, iter); // columns are states IntegerVector whichMove(nTotalSamples); // move selection NumericVector unifs(nTotalSamples); // for transition probabilities + NumericVector unifs2(nTotalSamples); + NumericVector unifs3(nTotalSamples); IntegerVector proposal(n); // the proposed moves double prob; // the probability of transition bool anyIsNegative; @@ -33,46 +38,79 @@ List metropolis_uniform_cpp( whichMove = sample(nMoves, nTotalSamples, 1); Function runif("runif"); unifs = runif(nTotalSamples); + unifs2 = runif(nTotalSamples); + unifs3 = runif(nTotalSamples); Function print("print"); + NumericVector move_dist = rep(1.0, nMoves); + double counter = moves.ncol(); + int which_move; + for(int i = 0; i < iter; ++i){ for(int j = 0; j < thin; ++j){ - // make move - for(int k = 0; k < n; ++k){ - move[k] = moves(k, whichMove[thin*i+j]-1); - } - if(hit_and_run == TRUE){ - current_num = current[move != 0]; - move_num = move[move != 0]; - stepSize = (-1 * current_num) / move_num; - lowerBound = stepSize[stepSize < 0]; - upperBound = stepSize[stepSize > 0]; - lb = max(lowerBound); - ub = min(upperBound); - IntegerVector test1 = current + lb * move; - IntegerVector test2 = current + ub * move; - for(int i = 0; i < n; ++i){ - if(test1[i] < 0){ - lb = 1; + if(non_uniform == true){ + for(int l = 0; l < nMoves; ++l){ + double sums = 0; + for(int m = 0; m < l+1; ++m){ + sums = sums + move_dist[m]; } - if(test2[i] < 0){ - ub = -1; + + if(unifs3[thin*i+j] <= sums / counter){ + + for(int k = 0; k < n; ++k){ + move[k] = moves(k, l); + } + which_move = l; + break; } } - IntegerVector range = seq(lb,ub); - run = sample(range,1); - if(run[1] == 0){ - run[1] = 1; - } - } - if(hit_and_run == TRUE){ for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + as(run) * move[k]; + proposal[k] = current[k] + move[k]; } }else{ + + // make move for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + move[k]; + move[k] = moves(k, whichMove[thin*i+j]-1); + } + if(hit_and_run == true){ + current_num = current[move != 0]; + move_num = move[move != 0]; + stepSize = (-1 * current_num) / move_num; + lowerBound = stepSize[stepSize < 0]; + upperBound = stepSize[stepSize > 0]; + lb = max(lowerBound); + ub = min(upperBound); + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0){ + lb = 1; + } + if(test2[i] < 0){ + ub = -1; + } + } + IntegerVector range = seq(lb,ub); + run = sample(range,1); + if(run[1] == 0){ + run[1] = 1; + } + } + if(hit_and_run == TRUE){ + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + as(run) * move[k]; + } + }else{ + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + move[k]; + } + } + } + if(SIS){ + if(unifs2[i] < .05){ + proposal = sis_tbl(config, suff_stats); } } // compute probability of transition @@ -96,10 +134,24 @@ List metropolis_uniform_cpp( // store acceptance probability acceptProb = acceptProb + prob / nTotalSamples; - // make move - if(unifs[thin*i+j] < prob){ - for(int k = 0; k < n; ++k){ - current[k] = proposal[k]; + if(non_uniform == true){ + + if(unifs[thin*i+j] < prob){ + for(int k = 0; k < n; ++k){ + current[k] = proposal[k]; + } + + move_dist[which_move] = move_dist[which_move] + 1; + ++counter; + } + }else{ + // make move + if(unifs[thin*i+j] < prob){ + + for(int k = 0; k < n; ++k){ + current[k] = proposal[k]; + } + } } diff --git a/src/sis_tbl.cpp b/src/sis_tbl.cpp index a20c36c..acdb238 100644 --- a/src/sis_tbl.cpp +++ b/src/sis_tbl.cpp @@ -118,7 +118,6 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { if(isAnyNegative == false && lpsolved == true){ ++w; } - print(tbl); ++p; if(p > 2){ break; From 6bfc88578090e1294eded0ab7d5a06c576e77394 Mon Sep 17 00:00:00 2001 From: Innerst Date: Tue, 11 Jul 2017 11:07:28 -0500 Subject: [PATCH 08/53] adding include file --- src/metropolis_uniform_cpp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index 6dfea3e..348fe8c 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -1,5 +1,5 @@ #include - +#include "sis_tbl.h" using namespace Rcpp; // [[Rcpp::export]] From 768ba096795d78077639caef0543e0825876e87b Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 13 Jul 2017 15:12:42 -0500 Subject: [PATCH 09/53] updating sis and loglinear --- R/loglinear.r | 6 ++- src/sis_tbl.cpp | 139 ++++++++++++++++++++++++++---------------------- 2 files changed, 80 insertions(+), 65 deletions(-) diff --git a/R/loglinear.r b/R/loglinear.r index 3068337..b2157ce 100644 --- a/R/loglinear.r +++ b/R/loglinear.r @@ -593,9 +593,10 @@ loglinear <- function(model, data, init = tab2vec(data), iter = 1E4, burn = 1000, thin = 10, engine = c("Cpp","R"), - method = c("ipf", "mcmc"), moves, + method = c("ipf", "mcmc"),moves, hit_and_run = FALSE, SIS = FALSE, + non_uniform = FALSE, ...) { @@ -718,7 +719,8 @@ loglinear <- function(model, data, ################################################## init <- unname(init) # init out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine, hit_and_run = hit_and_run, SIS = SIS) + engine = engine, hit_and_run = hit_and_run, SIS = SIS, + non_uniform = non_uniform) diff --git a/src/sis_tbl.cpp b/src/sis_tbl.cpp index acdb238..5e1adb9 100644 --- a/src/sis_tbl.cpp +++ b/src/sis_tbl.cpp @@ -5,72 +5,79 @@ using namespace Rcpp; // [[Rcpp::export]] IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { - int w = 0; - int n = A.nrow(); - int n1 = A.ncol(); - IntegerVector tbl(n1); - NumericMatrix constr(n+n1, n1+2); - NumericVector objfun(n1+1); - int p = 0; + int w = 0; // counter for while loop + int d = A.nrow(); // number of sufficient statisitcs + int r = A.ncol(); // number of cells in the table + IntegerVector tbl(r); //table to return + NumericMatrix constr(d+r, r+2); // constraint matrix + NumericVector objfun(r+1); // objective function + int p = 0; // counter for error purposes + IntegerMatrix work_A(A.nrow(), A.ncol()); // work matrix to edit configuration matrix + IntegerVector work_suff_stats(suff_stats.size()); // work vector to edit sufficient statistics + int min, max; // integers to calculate our range + Function print("print"); - IntegerMatrix work; - IntegerVector work2; - int min, max; + // Elements needed to run the linear program solver + LogicalVector first(1); + first[0] = true; + LogicalVector second(1); + second[0] = false; + CharacterVector solver(1); + solver = "DualSimplex"; + // Theoretically, the loop will run until "correct" table is produced while(w < 1){ - work = clone(A); - work2 = clone(suff_stats); - LogicalVector first(1); - first[0] = true; - LogicalVector second(1); - second[0] = false; - CharacterVector solver(1); - solver = "DualSimplex"; - bool isAnyNegative = false; + + for(int i = 0; i < A.nrow(); ++i){ + for(int j = 0; j < A.ncol(); ++j){ + work_A(i,j) = A(i,j); + } + } + for(int i = 0; i < suff_stats.size(); ++i) work_suff_stats[i] = suff_stats[i]; + + //Logical for checking purposes bool lpsolved = true; - for(int i = 0; i= n && l == 0){ - constr(k,l) = 0; + for(int k = 0; k < d+r ; ++k){ + for(int l = 0; l < r + 2; ++l){ + //First column for equalities + if(k < d && l == 0) constr(k,l) = 1; + //Second column for sufficient statistics (rhs) + if(k < d && l == 1) constr(k,l) = work_suff_stats[k]; + //Rest of the columns for A matrix + for(int m = 2; m < r +2; ++m){ + if(k < d && l == m) constr(k,l) = work_A(k,m-2); } - if(k >= n && l == 1){ - constr(k,l) = 0; + //First column for inequalities + if(k >= d && l == 0){ + constr(k,l) = 0; } - for(int m = 2; m < n1 + 2; ++m){ - if(k >= n && l == m){ - if(z == m){ - constr(k,l) = -1; - }else{ - constr(k,l) = 0; + //Second column for right hand side + if(k >= d && l == 1){ + constr(k,l) = 0; } + //Constructing coefficients for each cell to be positive + for(int m = 2; m < r + 2; ++m){ + if(k >= d && l == m){ + constr(k,l) = (z == m) ? -1:0; } } } - if(k >= n){ + if(k >= d){ ++z; } } - + //Running linear program solver to find range of possible values(min, max) SEXP out1 = lpcdd_f(constr, objfun, first, solver); String solution = VECTOR_ELT(out1, 0); + //If solution Optimal continue, else break and start over if(solution == "Optimal"){ IntegerVector val = VECTOR_ELT(out1,3); min = Rcpp::as(val); @@ -80,6 +87,7 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { } SEXP out2 = lpcdd_f(constr, objfun, second, solver); String solution2 = VECTOR_ELT(out2, 0); + //If solution Optimal continue, else break and start over if(solution2 == "Optimal"){ IntegerVector val2 = VECTOR_ELT(out2,3); max = Rcpp::as(val2); @@ -87,37 +95,36 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { lpsolved = false; break; } + //Calculate the range and sample from that range to populate the table IntegerVector range = seq(min,max); IntegerVector value = sample(range,1); tbl[i] = Rcpp::as(value); - // Update constraints(work and work2) + + // Update constraints(work_A and work_suff_stats) IntegerVector index; int y = 0; - for(int o = 0; o < n; ++o){ - if(work(o,i) == 1){ - work(o,i) = 0; + + for(int o = 0; o < d; ++o){ + if(work_A(o,i) != 0){ + work_A(o,i) = 0; index[y] = o; ++y; } } int x = 0; - for(int p = 0; p < n; ++p){ + for(int p = 0; p < d; ++p){ if(p == index[x]){ - work2[p] = work2[p] - tbl[i]; + work_suff_stats[p] = work_suff_stats[p] - tbl[i] * A(p,i); ++x; } } } - //Check if elements are non-zero - for(int q = 0;q < n; ++q){ - if(tbl[q] < 0){ - isAnyNegative = true; - } - } - if(isAnyNegative == false && lpsolved == true){ + // If all linear programs are solved, index w and end the loop + if(lpsolved == true){ ++w; } + // If error continues, only let it continue two times ++p; if(p > 2){ break; @@ -127,5 +134,11 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { } - +/***R +library(algstat) +tbl <- rep(21,8) +A <- matrix(c(rep(1,8), 1:8), nrow = 2, byrow = TRUE) +suff_stats <- A %*% t(t(tbl)) +sis_tbl(A, suff_stats) +*/ From 70bfc215f7e34d6f8b00b978100c134eac5ca198 Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 13 Jul 2017 15:35:50 -0500 Subject: [PATCH 10/53] updating sis again --- src/sis_tbl.cpp | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/src/sis_tbl.cpp b/src/sis_tbl.cpp index 5e1adb9..ba8a111 100644 --- a/src/sis_tbl.cpp +++ b/src/sis_tbl.cpp @@ -25,6 +25,8 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { second[0] = false; CharacterVector solver(1); solver = "DualSimplex"; + + // Theoretically, the loop will run until "correct" table is produced while(w < 1){ @@ -35,8 +37,8 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { } for(int i = 0; i < suff_stats.size(); ++i) work_suff_stats[i] = suff_stats[i]; - //Logical for checking purposes - bool lpsolved = true; + + bool lpsolved = true; //Logical for checking purposes for(int i = 0; i= d && l == 0){ - constr(k,l) = 0; - } - //Second column for right hand side - if(k >= d && l == 1){ - constr(k,l) = 0; - } + + if(k >= d && l == 0) constr(k,l) = 0;//First column for inequalities + + if(k >= d && l == 1)constr(k,l) = 0;//Second column for right hand side + //Constructing coefficients for each cell to be positive for(int m = 2; m < r + 2; ++m){ - if(k >= d && l == m){ - constr(k,l) = (z == m) ? -1:0; - } + if(k >= d && l == m) constr(k,l) = (z == m) ? -1:0; } } - if(k >= d){ - ++z; - } + if(k >= d) ++z; } //Running linear program solver to find range of possible values(min, max) SEXP out1 = lpcdd_f(constr, objfun, first, solver); @@ -104,7 +100,8 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { IntegerVector index; int y = 0; - + //Updating work_A by changing non-zero A elements to zero in the column + //Keep track of where non-zero elements were for(int o = 0; o < d; ++o){ if(work_A(o,i) != 0){ work_A(o,i) = 0; @@ -112,6 +109,7 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { ++y; } } + //Updating work_suff_stats where elements of work_A were changed int x = 0; for(int p = 0; p < d; ++p){ if(p == index[x]){ @@ -121,14 +119,11 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { } } // If all linear programs are solved, index w and end the loop - if(lpsolved == true){ - ++w; - } + if(lpsolved == true) ++w; + // If error continues, only let it continue two times ++p; - if(p > 2){ - break; - } + if(p > 2) break; } return tbl; } From 66671d7632a3691dd0845a9cd1d3b6642b79a942 Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 13 Jul 2017 15:37:35 -0500 Subject: [PATCH 11/53] sis --- src/sis_tbl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sis_tbl.cpp b/src/sis_tbl.cpp index ba8a111..085ae00 100644 --- a/src/sis_tbl.cpp +++ b/src/sis_tbl.cpp @@ -131,8 +131,8 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { /***R library(algstat) -tbl <- rep(21,8) -A <- matrix(c(rep(1,8), 1:8), nrow = 2, byrow = TRUE) +tbl <- rep(50,8) +A <- hmat(c(4,2), 1:2) suff_stats <- A %*% t(t(tbl)) sis_tbl(A, suff_stats) */ From e3140b310f2a0fecdc46e027313292af102fead6 Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 27 Jul 2017 13:53:09 -0500 Subject: [PATCH 12/53] poisson and logistic regression in R --- R/logistic.r | 335 +++++++++++++++++++++++++++++++++++++++++++ R/pmat.R | 43 ++++++ R/poisson.R | 390 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 768 insertions(+) create mode 100644 R/logistic.r create mode 100644 R/pmat.R create mode 100644 R/poisson.R diff --git a/R/logistic.r b/R/logistic.r new file mode 100644 index 0000000..77d34e5 --- /dev/null +++ b/R/logistic.r @@ -0,0 +1,335 @@ +log_reg <- function(model, data, + iter = 1E4, burn = 1000, + thin = 10, engine = c("Cpp","R"), + method = c("ipf", "mcmc"), moves, + hit_and_run = FALSE, + SIS = FALSE, + non_uniform = FALSE, + ...) +{ + + ## set/check args + ################################################## + + engine <- match.arg(engine) + method <- match.arg(method) + argList <- as.list(match.call(expand.dots = TRUE))[-1] + + if("formula" %in% names(argList)){ + .Deprecated(msg = + 'the formula argument is deprecated, please use "model" instead.' + ) + } + + + ## reshape data + ################################################## + + # data <- suppressMessages(teshape(data, "tab")) + # p <- length(dim(data)) + # nCells <- length(data) + + ## if a pure array is given, give names for later + # if(is.array(data) && is.null(dimnames(data))) data <- array2tab(data) + + ## other basic objects + # varsNlevels <- dimnames(data) + vars <- names(data) + + + + + ## check for sampling zeros + ################################################## + if(any(data == 0L)) message( + "Care ought be taken with tables with sampling zeros to ensure the MLE exists." + ) + + + ## parse model specification (formula for vector of r_k's) + ################################################## + + modelGivenByMatrix <- ifelse(is.matrix(model), TRUE, FALSE) + + if(modelGivenByMatrix){ + A <- model + data <- suppressMessages(teshape(data, "tab")) + init <- tab2vec(data) + } else { + # if it's a formula, convert to list + if(is.formula(model)){ + + ## parse formula + fString <- as.character(model) + response <- fString[2] + predString <- fString[3] + + + + ## make list of facets + model <- strsplit(predString, " \\+ ")[[1]] + model <- strsplit(model, " \\* ") + + + if(length(model) == 1){ + init <- unlist(data[vars == response]) + p <- 1 + data <- data[vars == model] + + }else{ + #If model specifiaction, then make table + data <- supressMessages(teshape(cbind(data[vars == response],data[vars %in% model]), "tab", freqVar = response)) + p <- length(dim(data)) + init <- tab2vec(data) + } + } + + + + # make facets (list of index vecs); if model specified with variable + # names, convert them to indices + if(all(unlist(model) %in% vars)){ # variable names + varname2index <- 1:p + names(varname2index) <- vars[vars != response] + facets <- lapply(model, function(varsInFacet) varname2index[varsInFacet]) + } else if(all(unlist(model) %in% 1:length(vars))){ # by indices + facets <- lapply(model, as.integer) # to fix the ~ 1 + 2 case, parsed as chars + } else { + stop("Invalid model specification, see ?poisson") + } + + # make configuration (model) matrix + pois_A <- pmat(dim(data), facets) + A <- lawrence(pois_A) + } + suff_stats <- unname(A %*% init) + + ## construct A matrix and compute moves + ################################################## + + if(missing(moves) && !is.null(getOption("4ti2_path"))){ + + message("Computing Markov moves (4ti2)... ", appendLF = FALSE) + moves <- markov(A) + message("done.", appendLF = TRUE) + + } else if(missing(moves) && is.null(getOption("4ti2_path"))){ + + warning( + "No moves were provided and 4ti2 is not found.\n", + " The resulting chain is likely not connected and strongly autocorrelated.\n", + " See ?loglinear. Consider using rmove to generate SIS moves in advance.", + immediate. = TRUE + ) + message("Computing 1000 SIS moves... ", appendLF = FALSE) + moves <- rmove(n = 1000, A = A, b = A %*% tab2vec(data), ...) + message("done.", appendLF = TRUE) + + } else if(is.character(moves)){ + + movesMat <- NULL + stopifnot(all(moves %in% c("lattice", "markov", "groebner", "grobner", "graver", "sis"))) + if("lattice" %in% moves) movesMat <- cbind(movesMat, zbasis(A)) + if("markov" %in% moves) movesMat <- cbind(movesMat, markov(A)) + if("groebner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) + if("grobner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) + if("graver" %in% moves) stop("graver not yet implemented.") + moves <- movesMat + + } + + stopifnot(is.array(moves)) + + + + ## run metropolis-hastings + ################################################## + init <- unname(init) # init + out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, + engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform) + + + + ## compute data chi square + ################################################## + if(modelGivenByMatrix && method == "ipf"){ + message( + "Iterative proportional fitting is not yet implemented\n", + " for models specified by configuration matrices.\n", + " Changing to method = \"mcmc\"..." + ) + method <- "mcmc" + } + if(method == "ipf"){ + exp <- loglin(data, facets, fit = TRUE, print = FALSE)$fit + } else if(method == "mcmc"){ + exp <- vec2tab(rowMeans(out$steps), dim(data)) + dimnames(exp) <- dimnames(data) + } + e <- unname(tab2vec(exp)) + u <- t(t(unname(tab2vec(data)))) + PR <- computeUProbsCpp(matrix(u)) # unnormd prob; numers LAS 1.1.10 + X2 <- computeX2sCpp(u, e) + G2 <- computeG2sCpp(u, e) + FT <- computeCRsCpp(u, e, -.5) + CR <- computeCRsCpp(u, e, 2/3) + NM <- computeNMsCpp(u, e) + + + ## compute MCMC chi squares + ################################################## + PRs <- computeUProbsCpp(out$steps) # unnormd probs; numers LAS 1.1.10 + X2s <- computeX2sCpp(out$steps, e) + G2s <- computeG2sCpp(out$steps, e) + FTs <- computeCRsCpp(out$steps, e, -.5) + CRs <- computeCRsCpp(out$steps, e, 2/3) + NMs <- computeNMsCpp(out$steps, e) + + + ## compute parameters + ################################################## + if(!modelGivenByMatrix){ + # in principle, there should be one parameter for every cell. + # there are prod(dim(data)) cells. + # a good reference is BFH, p. 35 (and to a lesser extent 43) + # the prod(dim(data)[terms[[j]]] - 1) line below is like + # (I - 1) (J - 1) (K - 1) + # CDA p.79 also helpful + dimSatModel <- nCells - 1 + degFreedom <- rep.int(0, 2^p) # there are 2^p possible subsets of vars, and + # therefore there are 2^p possible terms + + # possibleTerms are more "types of terms" as opposed to individual terms + # for example, an entry c(1,3) would refer to all combinations of levels + # of variables 1 and 3; ie (# var 1 levels - 1) * (# var 3 levels - 1) + # individual terms (parameters) + possibleTerms <- subsets(p, include_null = TRUE) + names(possibleTerms) <- sapply(possibleTerms, paste, collapse = " ") + names(possibleTerms)[which(names(possibleTerms) == "")] <- "(Intercept)" + nVarLvls <- dim(data) + # paramsPerTerm <- lapply(possibleTerms, function(x){ + # if(length(x) == 0) return(1L) + # prod(nVarLvls[x] - 1) + # }) + + + # similarly, there are the terms in the model + termsInModel <- unique(unlist(lapply( + lapply(facets, as.character), # to avoid subsets(2) + subsets, include_null = TRUE), + recursive = FALSE + )) + termsInModel <- lapply(termsInModel, as.integer) + names(termsInModel) <- sapply(termsInModel, paste, collapse = " ") + names(termsInModel)[which(names(termsInModel) == "")] <- "(Intercept)" + paramsPerTermInModel <- lapply(termsInModel, function(x){ + if(length(x) == 0) return(1L) + prod(nVarLvls[x] - 1) + }) + names(paramsPerTermInModel) <- unname(sapply(termsInModel, function(x){ + if(length(x) == 0) return("(Intercept)") + paste(names(dimnames(data))[x], collapse = ".") + })) + nParamsInModel <- sum(unlist(paramsPerTermInModel)) + dimModel <- nParamsInModel - 1 # the - 1 accounts for the overall mean + overallAsymptoticDegFreedom <- (dimSatModel - dimModel) + + + # compute the parameters + log_fit <- exp + log_fit[exp > 0] <- log(exp[exp > 0]) + param <- as.list(rep(NA, length(termsInModel))) + names(param) <- names(paramsPerTermInModel) + for(k in seq_along(param)){ + if(length(termsInModel[[k]]) == 0){ + param[[k]] <- mean(log_fit) + log_fit <- log_fit - param[[k]] + } else { + param[[k]] <- apply(log_fit, termsInModel[[k]], mean) + log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) + } + } + # for every step, fit mle + # then decompose mle + # problem : they all have the same marginals, so the same + # mles! + # idea 1 : sample from the multinomial with the same sample + # size (so different marginals), estimate, then decompose + # idea 2 : bootstrap sample from the table, estimate, decompose + # i think i like idea 2 better. + + + # reorder the param estimates in the order of subsets + # so you have the intercept, then all first order terms, and so on + goodOrder <- sapply( + c("(Intercept)", subsets(names(dimnames(data)))), + paste, collapse = "." + ) + param <- param[goodOrder[goodOrder %in% names(param)]] + out$param <- param + + } + + ## compute residuals and model selection, agresti p.81, 216, 324 + ################################################## + out$residuals <- exp + out$residuals[exp > 0] <- + (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) + + if(!modelGivenByMatrix){ + k <- nParamsInModel # = number of params + n <- sum(data) # = sample size + L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood + BIC <- log(n)*k - 2*L + AIC <- 2*k - 2*L + AICc <- AIC + 2*k*(k+1)/(n-k-1) + out$df <- paramsPerTermInModel + out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) + } + + ## add A matrix, p.value and return + ################################################## + out$call <- match.call() + out$obs <- data + out$exp <- exp + out$A <- A + + out$p.value <- c( + PR = mean(PRs <= PR), + X2 = mean(X2s >= X2), + G2 = mean(G2s >= G2), + FT = mean(FTs >= FT), + CR = mean(CRs >= CR), + NM = mean(NMs >= NM) + ) + + out$p.value.std.err <- c( + PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), + X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), + G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), + FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), + CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), + NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) + ) + + out$mid.p.value <- c( + PR = mean(PRs < PR) + mean(PRs == PR)/2, + X2 = mean(X2s > X2) + mean(X2s == X2)/2, + G2 = mean(G2s > G2) + mean(G2s == G2)/2, + FT = mean(FTs > FT) + mean(FTs == FT)/2, + CR = mean(CRs > CR) + mean(CRs == CR)/2, + NM = mean(NMs > NM) + mean(NMs == NM)/2 + ) + + out$iter <- iter + out$burn <- burn + out$thin <- thin + out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) + out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) + out$cells <- nCells + out$method <- method + + class(out) <- "logistic" + out +} + \ No newline at end of file diff --git a/R/pmat.R b/R/pmat.R new file mode 100644 index 0000000..f1a19e5 --- /dev/null +++ b/R/pmat.R @@ -0,0 +1,43 @@ +#' Construct a Model Matrix for Poisson Regression +#' +#' Determine the A matrix associated with a hierarchical model on a +#' contingency table for Poisson Regression. +#' +#' @param levels a vector containing the number of levels of each +#' variable +#' @param facets the facets generating the hierarchical model, a +#' list of vectors of variable indices + +#' @return a matrix +#' @export pmat + +pmat <- function(levels, facets){ + + #Basic Variables + num_covariates <- length(levels) + cov_index <- 1:num_covariates + num_elements <- prod(levels) + num_rows <- length(facets) + + #Base level combos + varsNlvls <- lapply(as.list(levels), function(x) number2Glyph(1:x)) + baseLvls <- expand.grid(rev(varsNlvls))[,num_covariates:1] + + #Make initial matrix + A <- matrix(0L, nrow = num_rows, ncol = num_elements) + + for(i in 1:num_rows){ + if(length(facets[[i]]) == 1){ + + A[i, ] <- baseLvls[,facets[[i]]] + }else{ + inter_terms <- cov_index %in% facets[[i]] + sub_mat <- as.matrix(baseLvls[ ,inter_terms]) + class(sub_mat) <- "numeric" + A[i,] <- apply(sub_mat, 1, prod) + } + } + return(rbind(rep(1, num_elements), A)) +} + +number2Glyph <- function(n) c(0:9, letters, LETTERS)[n+1] diff --git a/R/poisson.R b/R/poisson.R new file mode 100644 index 0000000..4b0ecbb --- /dev/null +++ b/R/poisson.R @@ -0,0 +1,390 @@ +#' Fit a Poisson Regression model with algebraic methods +#' +#' +#' +#' @param model hierarchical poisson model specification +#' @param data data, as a data frame with frequencies. see \code{\link{teshape}} if your data is not +#' in that format +#' @param init the initialization of the chain. by default, this is +#' the observed table +#' @param iter number of chain iterations +#' @param burn burn-in +#' @param thin thinning +#' @param engine C++ or R? (C++ yields roughly a 20-25x speedup) +#' @param method should the expected value (exp) be fit using +#' iterative proportional fitting (via loglin) or the MCMC as the +#' average of the steps? +#' @param moves the markov moves for the mcmc (as columns of a +#' matrix). +#' @param ... ... +#' @return a list containing named elements \itemize{ \item +#' \code{steps}: an integer matrix whose columns represent +#' individual samples from the mcmc. \item \code{moves}: the moves +#' used for the proposal distribution in the mcmc, computed with +#' 4ti2 (note that only the positive moves are given). \item +#' \code{acceptProb}: the average acceptance probability of the +#' moves, including the thinned moves. \item \code{param}: the +#' fitted parameters of the log linear model. \item \code{df}: +#' parameters per term in the model \item \code{quality}: model +#' selection statistics AIC, AICc, and BIC. \item +#' \code{residuals}: the (unstandardized) pearson residuals (O - +#' E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the +#' contingency table given. \item \code{exp}: the fit contingency +#' table as an integer array. \item \code{A}: the sufficient +#' statistics computing matrix (from Tmaker). \item +#' \code{p.value}: the exact p-values of individual tests, +#' accurate to Monte-Carlo error. these are computed as the +#' proportion of samples with statistics equal to or larger than +#' the oberved statistic. \item \code{mid.p.value}: the mid +#' p.values, see Agresti pp.20--21. \item \code{statistic}: the +#' pearson's chi-squared (X2), likelihood ratio (G2), +#' Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified +#' chi-squared (NM) statistics computed for the table given. \item +#' \code{sampsStats}: the statistics computed for each mcmc +#' sample. \item \code{cells}: the number of cells in the table. +#' \item \code{method}: the method used to estimate the table. } +#' @export pois_reg + + + + + + +pois_reg <- function(model, data, + iter = 1E4, burn = 1000, + thin = 10, engine = c("Cpp","R"), + method = c("ipf", "mcmc"), moves, + hit_and_run = FALSE, + SIS = FALSE, + non_uniform = FALSE, + ...) +{ + + ## set/check args + ################################################## + + engine <- match.arg(engine) + method <- match.arg(method) + argList <- as.list(match.call(expand.dots = TRUE))[-1] + + if("formula" %in% names(argList)){ + .Deprecated(msg = + 'the formula argument is deprecated, please use "model" instead.' + ) + } + + + ## reshape data + ################################################## + + # data <- suppressMessages(teshape(data, "tab")) + # p <- length(dim(data)) + # nCells <- length(data) + + ## if a pure array is given, give names for later + # if(is.array(data) && is.null(dimnames(data))) data <- array2tab(data) + + ## other basic objects + #varsNlevels <- dimnames(data) + vars <- names(data) + + + + + ## check for sampling zeros + ################################################## + if(any(data == 0L)) message( + "Care ought be taken with tables with sampling zeros to ensure the MLE exists." + ) + + + ## parse model specification (formula for vector of r_k's) + ################################################## + + modelGivenByMatrix <- ifelse(is.matrix(model), TRUE, FALSE) + + if(modelGivenByMatrix){ + A <- model + data <- suppressMessages(teshape(data, "tab")) + init <- tab2vec(data) + nCells <- length(init) + } else { + # if it's a formula, convert to list + if(is.formula(model)){ + + ## parse formula + fString <- as.character(model) + response <- fString[2] + predString <- fString[3] + + + + ## make list of facets + model <- strsplit(predString, " \\+ ")[[1]] + model <- strsplit(model, " \\* ") + + + if(length(model) == 1){ + init <- unlist(data[vars == response]) + nCells <- length(init) + p <- 1 + data <- data[vars == model] + + }else{ + #If model specifiaction, then make table + data <- suppressMessages(teshape(cbind(data[vars == response],data[vars %in% model]), "tab", freqVar = response)) + p <- length(dim(data)) + init <- tab2vec(data) + nCells <- length(init) + } + } + + + + # make facets (list of index vecs); if model specified with variable + # names, convert them to indices + if(all(unlist(model) %in% vars)){ # variable names + varname2index <- 1:p + names(varname2index) <- vars[vars != response] + facets <- lapply(model, function(varsInFacet) varname2index[varsInFacet]) + } else if(all(unlist(model) %in% 1:length(vars))){ # by indices + facets <- lapply(model, as.integer) # to fix the ~ 1 + 2 case, parsed as chars + } else { + stop("Invalid model specification, see ?pois_reg") + } + + # make configuration (model) matrix + A <- pmat(dim(data), facets) + } + #len <- length(init) + #A <- matrix(c(rep(1, len), 1:len), nrow = 2, byrow = TRUE) + suff_stats <- unname(A %*% init) + + ## construct A matrix and compute moves + ################################################## + + if(missing(moves) && !is.null(getOption("4ti2_path"))){ + + message("Computing Markov moves (4ti2)... ", appendLF = FALSE) + moves <- markov(A) + message("done.", appendLF = TRUE) + + } else if(missing(moves) && is.null(getOption("4ti2_path"))){ + + warning( + "No moves were provided and 4ti2 is not found.\n", + " The resulting chain is likely not connected and strongly autocorrelated.\n", + " See ?loglinear. Consider using rmove to generate SIS moves in advance.", + immediate. = TRUE + ) + message("Computing 1000 SIS moves... ", appendLF = FALSE) + moves <- rmove(n = 1000, A = A, b = A %*% tab2vec(data), ...) + message("done.", appendLF = TRUE) + + } else if(is.character(moves)){ + + movesMat <- NULL + stopifnot(all(moves %in% c("lattice", "markov", "groebner", "grobner", "graver", "sis"))) + if("lattice" %in% moves) movesMat <- cbind(movesMat, zbasis(A)) + if("markov" %in% moves) movesMat <- cbind(movesMat, markov(A)) + if("groebner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) + if("grobner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) + if("graver" %in% moves) stop("graver not yet implemented.") + moves <- movesMat + + } + + stopifnot(is.array(moves)) + + + + ## run metropolis-hastings + ################################################## + init <- unname(init) # init + out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, + engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform) + + + + ## compute data chi square + ################################################## + if(modelGivenByMatrix && method == "ipf"){ + message( + "Iterative proportional fitting is not yet implemented\n", + " for models specified by configuration matrices.\n", + " Changing to method = \"mcmc\"..." + ) + method <- "mcmc" + } + if(method == "ipf"){ + exp <- loglin(data, facets, fit = TRUE, print = FALSE)$fit + } else if(method == "mcmc"){ + exp <- vec2tab(rowMeans(out$steps), dim(data)) + dimnames(exp) <- dimnames(data) + } + e <- unname(tab2vec(exp)) + u <- t(t(unname(tab2vec(data)))) + PR <- computeUProbsCpp(matrix(u)) # unnormd prob; numers LAS 1.1.10 + X2 <- computeX2sCpp(u, e) + G2 <- computeG2sCpp(u, e) + FT <- computeCRsCpp(u, e, -.5) + CR <- computeCRsCpp(u, e, 2/3) + NM <- computeNMsCpp(u, e) + + + ## compute MCMC chi squares + ################################################## + PRs <- computeUProbsCpp(out$steps) # unnormd probs; numers LAS 1.1.10 + X2s <- computeX2sCpp(out$steps, e) + G2s <- computeG2sCpp(out$steps, e) + FTs <- computeCRsCpp(out$steps, e, -.5) + CRs <- computeCRsCpp(out$steps, e, 2/3) + NMs <- computeNMsCpp(out$steps, e) + + + ## compute parameters + ################################################## + if(!modelGivenByMatrix){ + # in principle, there should be one parameter for every cell. + # there are prod(dim(data)) cells. + # a good reference is BFH, p. 35 (and to a lesser extent 43) + # the prod(dim(data)[terms[[j]]] - 1) line below is like + # (I - 1) (J - 1) (K - 1) + # CDA p.79 also helpful + dimSatModel <- nCells - 1 + degFreedom <- rep.int(0, 2^p) # there are 2^p possible subsets of vars, and + # therefore there are 2^p possible terms + + # possibleTerms are more "types of terms" as opposed to individual terms + # for example, an entry c(1,3) would refer to all combinations of levels + # of variables 1 and 3; ie (# var 1 levels - 1) * (# var 3 levels - 1) + # individual terms (parameters) + possibleTerms <- subsets(p, include_null = TRUE) + names(possibleTerms) <- sapply(possibleTerms, paste, collapse = " ") + names(possibleTerms)[which(names(possibleTerms) == "")] <- "(Intercept)" + nVarLvls <- dim(data) + # paramsPerTerm <- lapply(possibleTerms, function(x){ + # if(length(x) == 0) return(1L) + # prod(nVarLvls[x] - 1) + # }) + + + # similarly, there are the terms in the model + termsInModel <- unique(unlist(lapply( + lapply(facets, as.character), # to avoid subsets(2) + subsets, include_null = TRUE), + recursive = FALSE + )) + termsInModel <- lapply(termsInModel, as.integer) + names(termsInModel) <- sapply(termsInModel, paste, collapse = " ") + names(termsInModel)[which(names(termsInModel) == "")] <- "(Intercept)" + paramsPerTermInModel <- lapply(termsInModel, function(x){ + if(length(x) == 0) return(1L) + prod(nVarLvls[x] - 1) + }) + names(paramsPerTermInModel) <- unname(sapply(termsInModel, function(x){ + if(length(x) == 0) return("(Intercept)") + paste(names(dimnames(data))[x], collapse = ".") + })) + nParamsInModel <- sum(unlist(paramsPerTermInModel)) + dimModel <- nParamsInModel - 1 # the - 1 accounts for the overall mean + overallAsymptoticDegFreedom <- (dimSatModel - dimModel) + + + # compute the parameters + log_fit <- exp + log_fit[exp > 0] <- log(exp[exp > 0]) + param <- as.list(rep(NA, length(termsInModel))) + names(param) <- names(paramsPerTermInModel) + for(k in seq_along(param)){ + if(length(termsInModel[[k]]) == 0){ + param[[k]] <- mean(log_fit) + log_fit <- log_fit - param[[k]] + } else { + param[[k]] <- apply(log_fit, termsInModel[[k]], mean) + log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) + } + } + # for every step, fit mle + # then decompose mle + # problem : they all have the same marginals, so the same + # mles! + # idea 1 : sample from the multinomial with the same sample + # size (so different marginals), estimate, then decompose + # idea 2 : bootstrap sample from the table, estimate, decompose + # i think i like idea 2 better. + + + # reorder the param estimates in the order of subsets + # so you have the intercept, then all first order terms, and so on + goodOrder <- sapply( + c("(Intercept)", subsets(names(dimnames(data)))), + paste, collapse = "." + ) + param <- param[goodOrder[goodOrder %in% names(param)]] + out$param <- param + + } + + ## compute residuals and model selection, agresti p.81, 216, 324 + ################################################## + out$residuals <- exp + out$residuals[exp > 0] <- + (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) + + if(!modelGivenByMatrix){ + k <- nParamsInModel # = number of params + n <- sum(data) # = sample size + L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood + BIC <- log(n)*k - 2*L + AIC <- 2*k - 2*L + AICc <- AIC + 2*k*(k+1)/(n-k-1) + out$df <- paramsPerTermInModel + out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) + } + + ## add A matrix, p.value and return + ################################################## + out$call <- match.call() + out$obs <- data + out$exp <- exp + out$A <- A + + out$p.value <- c( + PR = mean(PRs <= PR), + X2 = mean(X2s >= X2), + G2 = mean(G2s >= G2), + FT = mean(FTs >= FT), + CR = mean(CRs >= CR), + NM = mean(NMs >= NM) + ) + + out$p.value.std.err <- c( + PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), + X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), + G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), + FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), + CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), + NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) + ) + + out$mid.p.value <- c( + PR = mean(PRs < PR) + mean(PRs == PR)/2, + X2 = mean(X2s > X2) + mean(X2s == X2)/2, + G2 = mean(G2s > G2) + mean(G2s == G2)/2, + FT = mean(FTs > FT) + mean(FTs == FT)/2, + CR = mean(CRs > CR) + mean(CRs == CR)/2, + NM = mean(NMs > NM) + mean(NMs == NM)/2 + ) + + out$iter <- iter + out$burn <- burn + out$thin <- thin + out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) + out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) + out$cells <- nCells + out$method <- method + + class(out) <- "poisson" + out +} From 6b23d54fb2cd2a580d7d22b2f5537df7c4d801cd Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 17 Aug 2017 13:22:59 -0500 Subject: [PATCH 13/53] various fixes to new schemes, poisson and logistic regression adds and fixes --- .DS_Store | Bin 8196 -> 8196 bytes NAMESPACE | 7 +-- R/RcppExports.R | 22 ++++----- R/metropolis.r | 21 +++++--- R/pmat.R | 31 +++--------- R/poisson.R | 2 - man/loglinear.Rd | 2 +- man/metropolis.Rd | 2 +- man/pmat.Rd | 22 +++++++++ man/pois_reg.Rd | 67 ++++++++++++++++++++++++++ src/RcppExports.cpp | 48 +++++++++--------- src/metropolis_hypergeometric_cpp.cpp | 65 +++++++++++++------------ src/metropolis_uniform_cpp.cpp | 36 +++++++++----- src/sis_tbl.cpp | 11 +---- 14 files changed, 216 insertions(+), 120 deletions(-) create mode 100644 man/pmat.Rd create mode 100644 man/pois_reg.Rd diff --git a/.DS_Store b/.DS_Store index ab93edd3dcba1891f4e8ac44b27b4c7790f4de5f..3505ba1a3e7e261c28e1a4de630a883985245764 100644 GIT binary patch delta 121 zcmZp1XmOa}FUrQiz`)4BAi$8H1Z4e4D#*z!F4 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} + + if(any(w_moves == 0)){ w_propStatelow <- current + lower_bound * move w_propStateup <- current + upper_bound * move if(any(w_propStatelow < 0)){ @@ -202,6 +204,8 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th if(any(w_propStateup < 0)){ upper_bound <- -1 } + } + c_s <- sample(lower_bound:upper_bound,1) if(c_s == 0){ c_s <- 1 @@ -271,14 +275,17 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th w_moves <- (-1 * w_current) / w_move lower_bound <- if(any(w_moves < 0)){max(subset(w_moves,subset = w_moves < 0))}else{1} upper_bound <- if(any(w_moves > 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} - w_propStatelow <- current + lower_bound * move - w_propStateup <- current + upper_bound * move - if(any(w_propStatelow < 0)){ - lower_bound <- 1 - } - if(any(w_propStateup < 0)){ - upper_bound <- -1 + if(any(w_moves == 0)){ + w_propStatelow <- current + lower_bound * move + w_propStateup <- current + upper_bound * move + if(any(w_propStatelow < 0)){ + lower_bound <- 1 + } + if(any(w_propStateup < 0)){ + upper_bound <- -1 + } } + c_s <- sample(lower_bound:upper_bound,1) if(c_s == 0){ c_s <- 1 diff --git a/R/pmat.R b/R/pmat.R index f1a19e5..09088e0 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -12,32 +12,17 @@ #' @export pmat pmat <- function(levels, facets){ - - #Basic Variables + + levels <- levels[levels != 1] num_covariates <- length(levels) - cov_index <- 1:num_covariates - num_elements <- prod(levels) - num_rows <- length(facets) - - #Base level combos - varsNlvls <- lapply(as.list(levels), function(x) number2Glyph(1:x)) - baseLvls <- expand.grid(rev(varsNlvls))[,num_covariates:1] + mat_list <- list() - #Make initial matrix - A <- matrix(0L, nrow = num_rows, ncol = num_elements) + for(i in 1:num_covariates){ + mat_list[[i]] <- matrix(c(rep(1,levels[i]), 1:levels[i]), nrow = 2, byrow = T) + } - for(i in 1:num_rows){ - if(length(facets[[i]]) == 1){ + return(do.call(kprod, mat_list)) - A[i, ] <- baseLvls[,facets[[i]]] - }else{ - inter_terms <- cov_index %in% facets[[i]] - sub_mat <- as.matrix(baseLvls[ ,inter_terms]) - class(sub_mat) <- "numeric" - A[i,] <- apply(sub_mat, 1, prod) - } - } - return(rbind(rep(1, num_elements), A)) } -number2Glyph <- function(n) c(0:9, letters, LETTERS)[n+1] + diff --git a/R/poisson.R b/R/poisson.R index 4b0ecbb..291967c 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -156,8 +156,6 @@ pois_reg <- function(model, data, # make configuration (model) matrix A <- pmat(dim(data), facets) } - #len <- length(init) - #A <- matrix(c(rep(1, len), 1:len), nrow = 2, byrow = TRUE) suff_stats <- unname(A %*% init) ## construct A matrix and compute moves diff --git a/man/loglinear.Rd b/man/loglinear.Rd index 479bc42..53defb4 100644 --- a/man/loglinear.Rd +++ b/man/loglinear.Rd @@ -7,7 +7,7 @@ \usage{ loglinear(model, data, init = tab2vec(data), iter = 10000, burn = 1000, thin = 10, engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, - hit_and_run = FALSE, SIS = FALSE, ...) + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, ...) } \arguments{ \item{model}{hierarchical log-linear model specification} diff --git a/man/metropolis.Rd b/man/metropolis.Rd index a51c50e..b869af6 100644 --- a/man/metropolis.Rd +++ b/man/metropolis.Rd @@ -7,7 +7,7 @@ \usage{ metropolis(init, moves, suff_stats, config, iter = 1000, burn = 0, thin = 1, dist = c("hypergeometric", "uniform"), engine = c("Cpp", "R"), - hit_and_run = FALSE, SIS = FALSE) + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE) rawMetropolis(init, moves, iter = 1000, dist = "hypergeometric", hit_and_run = FALSE) diff --git a/man/pmat.Rd b/man/pmat.Rd new file mode 100644 index 0000000..2831452 --- /dev/null +++ b/man/pmat.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pmat.R +\name{pmat} +\alias{pmat} +\title{Construct a Model Matrix for Poisson Regression} +\usage{ +pmat(levels, facets) +} +\arguments{ +\item{levels}{a vector containing the number of levels of each +variable} + +\item{facets}{the facets generating the hierarchical model, a +list of vectors of variable indices} +} +\value{ +a matrix +} +\description{ +Determine the A matrix associated with a hierarchical model on a +contingency table for Poisson Regression. +} diff --git a/man/pois_reg.Rd b/man/pois_reg.Rd new file mode 100644 index 0000000..9b4b457 --- /dev/null +++ b/man/pois_reg.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/poisson.R +\name{pois_reg} +\alias{pois_reg} +\title{Fit a Poisson Regression model with algebraic methods} +\usage{ +pois_reg(model, data, iter = 10000, burn = 1000, thin = 10, + engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, ...) +} +\arguments{ +\item{model}{hierarchical log-linear model specification} + +\item{data}{data, as a data frame with frequencies. see \code{\link{teshape}} if your data is not +in that format} + +\item{iter}{number of chain iterations} + +\item{burn}{burn-in} + +\item{thin}{thinning} + +\item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} + +\item{method}{should the expected value (exp) be fit using +iterative proportional fitting (via loglin) or the MCMC as the +average of the steps?} + +\item{moves}{the markov moves for the mcmc (as columns of a +matrix).} + +\item{...}{...} + +\item{init}{the initialization of the chain. by default, this is +the observed table} +} +\value{ +a list containing named elements \itemize{ \item + \code{steps}: an integer matrix whose columns represent + individual samples from the mcmc. \item \code{moves}: the moves + used for the proposal distribution in the mcmc, computed with + 4ti2 (note that only the positive moves are given). \item + \code{acceptProb}: the average acceptance probability of the + moves, including the thinned moves. \item \code{param}: the + fitted parameters of the log linear model. \item \code{df}: + parameters per term in the model \item \code{quality}: model + selection statistics AIC, AICc, and BIC. \item + \code{residuals}: the (unstandardized) pearson residuals (O - + E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the + contingency table given. \item \code{exp}: the fit contingency + table as an integer array. \item \code{A}: the sufficient + statistics computing matrix (from Tmaker). \item + \code{p.value}: the exact p-values of individual tests, + accurate to Monte-Carlo error. these are computed as the + proportion of samples with statistics equal to or larger than + the oberved statistic. \item \code{mid.p.value}: the mid + p.values, see Agresti pp.20--21. \item \code{statistic}: the + pearson's chi-squared (X2), likelihood ratio (G2), + Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified + chi-squared (NM) statistics computed for the table given. \item + \code{sampsStats}: the statistics computed for each mcmc + sample. \item \code{cells}: the number of cells in the table. + \item \code{method}: the method used to estimate the table. } +} +\description{ +Fit a Poisson Regression model with algebraic methods +} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index deb1338..04e82f3 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -7,7 +7,7 @@ using namespace Rcpp; // computeCRsCpp NumericVector computeCRsCpp(NumericMatrix x, NumericVector exp, double lambda); -RcppExport SEXP algstat_computeCRsCpp(SEXP xSEXP, SEXP expSEXP, SEXP lambdaSEXP) { +RcppExport SEXP _algstat_computeCRsCpp(SEXP xSEXP, SEXP expSEXP, SEXP lambdaSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -20,7 +20,7 @@ END_RCPP } // computeG2sCpp NumericVector computeG2sCpp(NumericMatrix x, NumericVector exp); -RcppExport SEXP algstat_computeG2sCpp(SEXP xSEXP, SEXP expSEXP) { +RcppExport SEXP _algstat_computeG2sCpp(SEXP xSEXP, SEXP expSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -32,7 +32,7 @@ END_RCPP } // computeNMsCpp NumericVector computeNMsCpp(NumericMatrix x, NumericVector exp); -RcppExport SEXP algstat_computeNMsCpp(SEXP xSEXP, SEXP expSEXP) { +RcppExport SEXP _algstat_computeNMsCpp(SEXP xSEXP, SEXP expSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -44,7 +44,7 @@ END_RCPP } // computeUProbsCpp NumericVector computeUProbsCpp(NumericMatrix x); -RcppExport SEXP algstat_computeUProbsCpp(SEXP xSEXP) { +RcppExport SEXP _algstat_computeUProbsCpp(SEXP xSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -55,7 +55,7 @@ END_RCPP } // computeX2sCpp NumericVector computeX2sCpp(NumericMatrix x, NumericVector exp); -RcppExport SEXP algstat_computeX2sCpp(SEXP xSEXP, SEXP expSEXP) { +RcppExport SEXP _algstat_computeX2sCpp(SEXP xSEXP, SEXP expSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -67,7 +67,7 @@ END_RCPP } // metropolis_hypergeometric_cpp List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform); -RcppExport SEXP algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP) { +RcppExport SEXP _algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -85,23 +85,27 @@ BEGIN_RCPP END_RCPP } // metropolis_uniform_cpp -List metropolis_uniform_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin, bool hit_and_run); -RcppExport SEXP algstat_metropolis_uniform_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP) { +List metropolis_uniform_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform); +RcppExport SEXP _algstat_metropolis_uniform_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< IntegerVector >::type current(currentSEXP); Rcpp::traits::input_parameter< IntegerMatrix >::type moves(movesSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type suff_stats(suff_statsSEXP); + Rcpp::traits::input_parameter< IntegerMatrix >::type config(configSEXP); Rcpp::traits::input_parameter< int >::type iter(iterSEXP); Rcpp::traits::input_parameter< int >::type thin(thinSEXP); Rcpp::traits::input_parameter< bool >::type hit_and_run(hit_and_runSEXP); - rcpp_result_gen = Rcpp::wrap(metropolis_uniform_cpp(current, moves, iter, thin, hit_and_run)); + Rcpp::traits::input_parameter< bool >::type SIS(SISSEXP); + Rcpp::traits::input_parameter< bool >::type non_uniform(non_uniformSEXP); + rcpp_result_gen = Rcpp::wrap(metropolis_uniform_cpp(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform)); return rcpp_result_gen; END_RCPP } // rfiberOne List rfiberOne(IntegerMatrix A, IntegerVector b); -RcppExport SEXP algstat_rfiberOne(SEXP ASEXP, SEXP bSEXP) { +RcppExport SEXP _algstat_rfiberOne(SEXP ASEXP, SEXP bSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -113,7 +117,7 @@ END_RCPP } // sis_tbl IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats); -RcppExport SEXP algstat_sis_tbl(SEXP ASEXP, SEXP suff_statsSEXP) { +RcppExport SEXP _algstat_sis_tbl(SEXP ASEXP, SEXP suff_statsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -125,7 +129,7 @@ END_RCPP } // walk IntegerMatrix walk(IntegerVector current, IntegerMatrix moves, int iter, int thin); -RcppExport SEXP algstat_walk(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { +RcppExport SEXP _algstat_walk(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -139,16 +143,16 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"algstat_computeCRsCpp", (DL_FUNC) &algstat_computeCRsCpp, 3}, - {"algstat_computeG2sCpp", (DL_FUNC) &algstat_computeG2sCpp, 2}, - {"algstat_computeNMsCpp", (DL_FUNC) &algstat_computeNMsCpp, 2}, - {"algstat_computeUProbsCpp", (DL_FUNC) &algstat_computeUProbsCpp, 1}, - {"algstat_computeX2sCpp", (DL_FUNC) &algstat_computeX2sCpp, 2}, - {"algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &algstat_metropolis_hypergeometric_cpp, 9}, - {"algstat_metropolis_uniform_cpp", (DL_FUNC) &algstat_metropolis_uniform_cpp, 5}, - {"algstat_rfiberOne", (DL_FUNC) &algstat_rfiberOne, 2}, - {"algstat_sis_tbl", (DL_FUNC) &algstat_sis_tbl, 2}, - {"algstat_walk", (DL_FUNC) &algstat_walk, 4}, + {"_algstat_computeCRsCpp", (DL_FUNC) &_algstat_computeCRsCpp, 3}, + {"_algstat_computeG2sCpp", (DL_FUNC) &_algstat_computeG2sCpp, 2}, + {"_algstat_computeNMsCpp", (DL_FUNC) &_algstat_computeNMsCpp, 2}, + {"_algstat_computeUProbsCpp", (DL_FUNC) &_algstat_computeUProbsCpp, 1}, + {"_algstat_computeX2sCpp", (DL_FUNC) &_algstat_computeX2sCpp, 2}, + {"_algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &_algstat_metropolis_hypergeometric_cpp, 9}, + {"_algstat_metropolis_uniform_cpp", (DL_FUNC) &_algstat_metropolis_uniform_cpp, 9}, + {"_algstat_rfiberOne", (DL_FUNC) &_algstat_rfiberOne, 2}, + {"_algstat_sis_tbl", (DL_FUNC) &_algstat_sis_tbl, 2}, + {"_algstat_walk", (DL_FUNC) &_algstat_walk, 4}, {NULL, NULL, 0} }; diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 025c3b2..6f94104 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -36,6 +36,7 @@ List metropolis_hypergeometric_cpp( int ub; IntegerVector run; + Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); Function runif("runif"); unifs = runif(nTotalSamples); @@ -44,14 +45,14 @@ List metropolis_hypergeometric_cpp( Function print("print"); - NumericVector move_dist = rep(1.0, nMoves); - double counter = moves.ncol(); + NumericVector move_dist = rep(10.0, nMoves); + double counter = sum(move_dist); int which_move; for(int i = 0; i < iter; ++i){ for(int j = 0; j < thin; ++j){ - if(non_uniform == true){ + if(non_uniform){ for(int l = 0; l < nMoves; ++l){ double sums = 0; for(int m = 0; m < l+1; ++m){ @@ -67,40 +68,45 @@ List metropolis_hypergeometric_cpp( break; } } - for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + move[k]; - } + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + move[k]; + } + }else{ // make move for(int k = 0; k < n; ++k){ move[k] = moves(k, whichMove[thin*i+j]-1); } - if(hit_and_run == true){ - current_num = current[move != 0]; - move_num = move[move != 0]; - stepSize = (-1 * current_num) / move_num; - lowerBound = stepSize[stepSize < 0]; - upperBound = stepSize[stepSize > 0]; - lb = max(lowerBound); - ub = min(upperBound); - IntegerVector test1 = current + lb * move; - IntegerVector test2 = current + ub * move; - for(int i = 0; i < n; ++i){ - if(test1[i] < 0){ - lb = 1; - } - if(test2[i] < 0){ - ub = -1; + if(hit_and_run){ + current_num = current[move != 0]; + move_num = move[move != 0]; + stepSize = (-1 * current_num) / move_num; + lowerBound = stepSize[stepSize < 0]; + upperBound = stepSize[stepSize > 0]; + lb = max(lowerBound); + ub = min(upperBound); + + if(is_true(any(stepSize == 0))){ + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0) lb = 1; + if(test2[i] < 0) ub = -1; } } - IntegerVector range = seq(lb,ub); - run = sample(range,1); - if(run[1] == 0){ - run[1] = 1; + if(lb > ub){ + run[0] = 1; + }else{ + IntegerVector range = seq(lb,ub); + + run = Rcpp::sample(range,1); + } + if(run[0] == 0){ + run[0] = 1; } } - if(hit_and_run == TRUE){ + if(hit_and_run){ for(int k = 0; k < n; ++k){ proposal[k] = current[k] + as(run) * move[k]; } @@ -111,9 +117,7 @@ List metropolis_hypergeometric_cpp( } } if(SIS){ - if(unifs2[i] < .05){ - proposal = sis_tbl(config, suff_stats); - } + if(unifs2[i] < .01) proposal = sis_tbl(config, suff_stats); } // compute probability of transition anyIsNegative = false; @@ -173,3 +177,4 @@ List metropolis_hypergeometric_cpp( return out; } + diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index 348fe8c..f317a5d 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -35,6 +35,7 @@ List metropolis_uniform_cpp( int ub; IntegerVector run; + Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); Function runif("runif"); unifs = runif(nTotalSamples); @@ -76,26 +77,38 @@ List metropolis_uniform_cpp( } if(hit_and_run == true){ current_num = current[move != 0]; + move_num = move[move != 0]; + stepSize = (-1 * current_num) / move_num; + lowerBound = stepSize[stepSize < 0]; + upperBound = stepSize[stepSize > 0]; + lb = max(lowerBound); + ub = min(upperBound); - IntegerVector test1 = current + lb * move; - IntegerVector test2 = current + ub * move; - for(int i = 0; i < n; ++i){ - if(test1[i] < 0){ - lb = 1; - } - if(test2[i] < 0){ - ub = -1; + + if(is_true(any(stepSize == 0))){ + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0) lb = 1; + if(test2[i] < 0) ub = -1; } } + if(lb > ub){ + run[0] = 1; + }else{ + IntegerVector range = seq(lb,ub); - run = sample(range,1); - if(run[1] == 0){ - run[1] = 1; + + run = Rcpp::sample(range,1); + + } + if(run[0] == 0){ + run[0] = 1; } } if(hit_and_run == TRUE){ @@ -171,3 +184,4 @@ List metropolis_uniform_cpp( return out; } + diff --git a/src/sis_tbl.cpp b/src/sis_tbl.cpp index 085ae00..bf8d9fa 100644 --- a/src/sis_tbl.cpp +++ b/src/sis_tbl.cpp @@ -92,6 +92,8 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { break; } //Calculate the range and sample from that range to populate the table + if(min == max - 1) min = max; + IntegerVector range = seq(min,max); IntegerVector value = sample(range,1); tbl[i] = Rcpp::as(value); @@ -128,12 +130,3 @@ IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { return tbl; } - -/***R -library(algstat) -tbl <- rep(50,8) -A <- hmat(c(4,2), 1:2) -suff_stats <- A %*% t(t(tbl)) -sis_tbl(A, suff_stats) -*/ - From 04c5b18601eabb2f7c382c799b1f00b5ed94fd9d Mon Sep 17 00:00:00 2001 From: Innerst Date: Fri, 25 Aug 2017 15:03:54 -0500 Subject: [PATCH 14/53] hit_and_run inner MCMC addition --- R/metropolis.r | 43 ++++++++++- src/metropolis_hypergeometric_cpp.cpp | 105 ++++++++++++++++++++------ 2 files changed, 125 insertions(+), 23 deletions(-) diff --git a/R/metropolis.r b/R/metropolis.r index b25a0ee..0769e67 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -169,7 +169,6 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th ## run burn-in current <- unname(init) - train_current <- unname(init) message("Running chain (R)... ", appendLF = FALSE) @@ -275,6 +274,46 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th w_moves <- (-1 * w_current) / w_move lower_bound <- if(any(w_moves < 0)){max(subset(w_moves,subset = w_moves < 0))}else{1} upper_bound <- if(any(w_moves > 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} + + #New part + # #Option 1 Enumerate tables and + # line <- lower_bound:upper_bound + # #Enumerate tables on the line + # tables <- matrix(0L, nrow =length(init) , ncol = length(line)) + # for(i in 1:length(line)){ + # tables[,i] <- current + line[i]*move + # } + # probs <- apply(tables, 2, function(x) 1/(sum(lfactorial(x)))) + # prob_dist <- probs / sum(probs) + # unif <- runif(1) + # dummy <- prob_dist[1] + # for(i in 1:(length(prob_dist) -1)){ + # if(unif < dummy){ + # propState <- tables[,i] + # break() + # } + # dummy <- dummy + prob_dist[i+1] + # } + #Option 2 + # line <- lower_bound:upper_bound + # w_current <- current + # unifs2 <- runif(2*length(line)) + # for(i in 1:(2*length(line))){ + # w_propState <- w_current + sample(c(-1,1), 1)*move + # if(any(w_propState < 0)){ + # prob <- 0 + # } else { + # if(dist == "hypergeometric"){ + # prob <- exp( sum(lfactorial(w_current)) - sum(lfactorial(w_propState)) ) + # } else { # dist == "uniform" + # prob <- 1 + # } + # } + # if(unifs2[i] < prob) w_current <- w_propState # else w_current + # } + # propState <- w_current + + if(any(w_moves == 0)){ w_propStatelow <- current + lower_bound * move w_propStateup <- current + upper_bound * move @@ -390,7 +429,7 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th #' @rdname metropolis #' @export -rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric", hit_and_run = FALSE){ +rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric", hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE){ metropolis(init, moves, iter, burn = 0, thin = 1, dist = dist, hit_and_run) } diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 6f94104..389e3c6 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -24,6 +24,7 @@ List metropolis_hypergeometric_cpp( NumericVector unifs3(nTotalSamples); IntegerVector proposal(n); // the proposed moves double prob; // the probability of transition + double prob2; bool anyIsNegative; IntegerVector move(n); double acceptProb = 0; @@ -35,6 +36,7 @@ List metropolis_hypergeometric_cpp( int lb; int ub; IntegerVector run; + IntegerVector constant = IntegerVector::create(-1,1); Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); @@ -86,31 +88,92 @@ List metropolis_hypergeometric_cpp( upperBound = stepSize[stepSize > 0]; lb = max(lowerBound); ub = min(upperBound); - - if(is_true(any(stepSize == 0))){ - IntegerVector test1 = current + lb * move; - IntegerVector test2 = current + ub * move; - for(int i = 0; i < n; ++i){ - if(test1[i] < 0) lb = 1; - if(test2[i] < 0) ub = -1; + + // Enumerating all tables + // IntegerVector line = seq(lb, ub); + // int line_length = line.size(); + // IntegerMatrix tables(n, line_length); + // for(int i = 0; i < line_length;++i){ + // for(int j = 0; j < n;++j){ + // tables(j,i) = current[j] + line[i] * move[j]; + // } + // } + + // MCMC inside MCMC + IntegerVector line = seq(lb, ub); + int line_length = line.size(); + IntegerVector w_current(n); + IntegerVector w_proposal(n); + for(int m = 0; m < n;++m){ + w_current[m] = current[m]; + } + + for(int l = 0; l < line_length;++l){ + int constant2 = as(Rcpp::sample(constant, 1)); + for(int k = 0; k < n;++k){ + w_proposal[k] = w_current[k] + constant2 * move[k]; + } + bool anyIsNegative2; + anyIsNegative2 = false; + for(int k = 0; k < n; ++k){ + if(w_proposal[k] < 0){ + anyIsNegative2 = true; } } - if(lb > ub){ - run[0] = 1; - }else{ - IntegerVector range = seq(lb,ub); - - run = Rcpp::sample(range,1); - } - if(run[0] == 0){ - run[0] = 1; + + if(anyIsNegative2){ + prob2 = 0; + } else { + prob2 = exp( sum(lgamma(w_current+1)) - sum(lgamma(w_proposal+1)) ); } - } - if(hit_and_run){ - for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + as(run) * move[k]; + + if(prob2 > 1){ + prob2 = 1; } - }else{ + + // make move + if(unifs[l] < prob2){ + for(int k = 0; k < n; ++k){ + w_current[k] = w_proposal[k]; + } + } + } + for(int k = 0; k < n; ++k){ + proposal[k] = w_current[k]; + } + + + //Attempt at recursively calling MCMC routine + // List MCMC_out = metropolis_hypergeometric_cpp(current, as(move), suff_stats, config, 50, 1, false, false, false); + // IntegerMatrix mini_steps = MCMC_out[0]; + // int step_length = mini_steps.ncol(); + // proposal = mini_steps(_, step_length); + + // Base Hit and Run + // if(is_true(any(stepSize == 0))){ + // IntegerVector test1 = current + lb * move; + // IntegerVector test2 = current + ub * move; + // for(int i = 0; i < n; ++i){ + // if(test1[i] < 0) lb = 1; + // if(test2[i] < 0) ub = -1; + // } + // } + // if(lb > ub){ + // run[0] = 1; + // }else{ + // IntegerVector range = seq(lb,ub); + // + // run = Rcpp::sample(range,1); + // } + // if(run[0] == 0){ + // run[0] = 1; + // } + // if(hit_and_run){ + // for(int k = 0; k < n; ++k){ + // proposal[k] = current[k] + as(run) * move[k]; + // } + // } + }else{ for(int k = 0; k < n; ++k){ proposal[k] = current[k] + move[k]; } From 58acad58892493077645cdcf24e273fab74d9e11 Mon Sep 17 00:00:00 2001 From: Innerst Date: Fri, 25 Aug 2017 15:04:45 -0500 Subject: [PATCH 15/53] pois_reg name fix --- R/poisson.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/poisson.R b/R/poisson.R index 291967c..62b9975 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -86,7 +86,7 @@ pois_reg <- function(model, data, ## other basic objects #varsNlevels <- dimnames(data) - vars <- names(data) + vars <- names(data) @@ -172,7 +172,7 @@ pois_reg <- function(model, data, warning( "No moves were provided and 4ti2 is not found.\n", " The resulting chain is likely not connected and strongly autocorrelated.\n", - " See ?loglinear. Consider using rmove to generate SIS moves in advance.", + " See ?pois_reg. Consider using rmove to generate SIS moves in advance.", immediate. = TRUE ) message("Computing 1000 SIS moves... ", appendLF = FALSE) From e7d8a63db570a857cd695fa620b655006f82f12c Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 21 Sep 2017 22:35:41 -0500 Subject: [PATCH 16/53] poisson, metropolis fixes --- R/RcppExports.R | 4 +- R/metropolis.r | 11 +++-- R/pmat.R | 34 +++++++++++-- R/poisson.R | 10 ++-- src/RcppExports.cpp | 9 ++-- src/metropolis_hypergeometric_cpp.cpp | 70 ++++++++++++++------------- src/metropolis_uniform_cpp.cpp | 62 +++++++++++++++++++++--- 7 files changed, 140 insertions(+), 60 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 13aa3b8..4f4cd75 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,8 +21,8 @@ computeX2sCpp <- function(x, exp) { .Call('_algstat_computeX2sCpp', PACKAGE = 'algstat', x, exp) } -metropolis_hypergeometric_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) { - .Call('_algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) +metropolis_hypergeometric_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) { + .Call('_algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) } metropolis_uniform_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) { diff --git a/R/metropolis.r b/R/metropolis.r index 0769e67..8575706 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -15,8 +15,11 @@ #' @param dist steady-state distribution; "hypergeometric" (default) #' or "uniform" #' @param engine C++ or R? (C++ yields roughly a 20-25x speedup) -#' @param hit_and_run Whether or not to use the hit and run algorithm in +#' @param hit_and_run Whether or not to use the discrete hit and run algorithm in #' the metropolis algorithm +#' @param adaptive Option inside hit_and_run option. If TRUE, hit and run will choose a proposal distribution adaptively. Defaulted to FALSE. +#' @param SIS If TRUE, with a small probability the move will be chosen randomly from the uniform distribution +#' on the fiber using Sequential Importance "Like" Sampling methods. Defaulted to FALSE #' @name metropolis #' @return a list #' @export metropolis @@ -147,7 +150,7 @@ #' metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, thin = 1, dist = c("hypergeometric","uniform"), engine = c("Cpp","R"), - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, adaptive = FALSE ){ ## preliminary checking @@ -406,8 +409,8 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th metropolis_uniform_cpp } message("Running chain (C++)... ", appendLF = FALSE) - if (burn > 0) current <- sampler(current, allMoves, suff_stats, config, burn, 1, hit_and_run, SIS, non_uniform)$steps[,burn] - out <- sampler(current, allMoves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) + if (burn > 0) current <- sampler(current, allMoves, suff_stats, config, burn, 1, hit_and_run, SIS, non_uniform, adaptive)$steps[,burn] + out <- sampler(current, allMoves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) out$moves <- moves message("done.") diff --git a/R/pmat.R b/R/pmat.R index 09088e0..160c44a 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -13,16 +13,42 @@ pmat <- function(levels, facets){ + #Setup the levels levels <- levels[levels != 1] num_covariates <- length(levels) mat_list <- list() + exp_cov <-1:num_covariates - for(i in 1:num_covariates){ - mat_list[[i]] <- matrix(c(rep(1,levels[i]), 1:levels[i]), nrow = 2, byrow = T) + #Checking heirarchical sturcture of facets + if(any(sapply(facets, length) > 1)){ + long_list_elts <- facets[which(sapply(facets, length)>1)] + + unique_vals <- unique(unlist(long_list_elts)) + + heirarc <- as.list(c(unique_vals, long_list_elts)) + + facets <- union(heirarc, facets) } - return(do.call(kprod, mat_list)) - + #List of config matrices, one for each covariate + for(i in exp_cov){ + mat_list[[i]] <- matrix(c(rep(1,levels[i]), 1:levels[i]), nrow = 2, byrow = TRUE) + } + + #Full heirarchicial config matrix with all interactions included + full_mat <- do.call(kprod, mat_list) + + #All possible combinations of covariates (powerset like) to be compared to facets + if(length(exp_cov) == 1) facet_list <- list(exp_cov) + else{ + facet_list <- list(integer(0)) + for(i in seq_along(exp_cov)){ + facet_list <- c(facet_list, lapply(facet_list, function(x) c(x,exp_cov[i]))) + } + facet_list <- facet_list[-1] + } + #return the configuration matrix which includes only the elements need for the heirarchical model + return(full_mat[c(TRUE, facet_list %in% facets),]) } diff --git a/R/poisson.R b/R/poisson.R index 62b9975..938e94a 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -77,7 +77,7 @@ pois_reg <- function(model, data, ## reshape data ################################################## - # data <- suppressMessages(teshape(data, "tab")) + data <- suppressMessages(teshape(data, "freq")) # p <- length(dim(data)) # nCells <- length(data) @@ -117,8 +117,11 @@ pois_reg <- function(model, data, response <- fString[2] predString <- fString[3] - - + #Rename the response variable to fit formula syntax id needed + if(any(response != vars)){ + colnames(data)[colnames(data) == "freq"] <- response + vars <- names(data) + } ## make list of facets model <- strsplit(predString, " \\+ ")[[1]] model <- strsplit(model, " \\* ") @@ -129,7 +132,6 @@ pois_reg <- function(model, data, nCells <- length(init) p <- 1 data <- data[vars == model] - }else{ #If model specifiaction, then make table data <- suppressMessages(teshape(cbind(data[vars == response],data[vars %in% model]), "tab", freqVar = response)) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 04e82f3..0354641 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -66,8 +66,8 @@ BEGIN_RCPP END_RCPP } // metropolis_hypergeometric_cpp -List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform); -RcppExport SEXP _algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP) { +List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform, bool adaptive); +RcppExport SEXP _algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP, SEXP adaptiveSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -80,7 +80,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type hit_and_run(hit_and_runSEXP); Rcpp::traits::input_parameter< bool >::type SIS(SISSEXP); Rcpp::traits::input_parameter< bool >::type non_uniform(non_uniformSEXP); - rcpp_result_gen = Rcpp::wrap(metropolis_hypergeometric_cpp(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform)); + Rcpp::traits::input_parameter< bool >::type adaptive(adaptiveSEXP); + rcpp_result_gen = Rcpp::wrap(metropolis_hypergeometric_cpp(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive)); return rcpp_result_gen; END_RCPP } @@ -148,7 +149,7 @@ static const R_CallMethodDef CallEntries[] = { {"_algstat_computeNMsCpp", (DL_FUNC) &_algstat_computeNMsCpp, 2}, {"_algstat_computeUProbsCpp", (DL_FUNC) &_algstat_computeUProbsCpp, 1}, {"_algstat_computeX2sCpp", (DL_FUNC) &_algstat_computeX2sCpp, 2}, - {"_algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &_algstat_metropolis_hypergeometric_cpp, 9}, + {"_algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &_algstat_metropolis_hypergeometric_cpp, 10}, {"_algstat_metropolis_uniform_cpp", (DL_FUNC) &_algstat_metropolis_uniform_cpp, 9}, {"_algstat_rfiberOne", (DL_FUNC) &_algstat_rfiberOne, 2}, {"_algstat_sis_tbl", (DL_FUNC) &_algstat_sis_tbl, 2}, diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 389e3c6..d2e4472 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -12,7 +12,8 @@ List metropolis_hypergeometric_cpp( IntegerMatrix config, int iter, int thin, bool hit_and_run, - bool SIS, bool non_uniform + bool SIS, bool non_uniform, + bool adaptive ){ int nTotalSamples = iter * thin; // total number of steps int n = current.size(); // number of cells @@ -37,6 +38,8 @@ List metropolis_hypergeometric_cpp( int ub; IntegerVector run; IntegerVector constant = IntegerVector::create(-1,1); + IntegerVector w_current(n); + IntegerVector w_proposal(n); Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); @@ -100,10 +103,9 @@ List metropolis_hypergeometric_cpp( // } // MCMC inside MCMC - IntegerVector line = seq(lb, ub); - int line_length = line.size(); - IntegerVector w_current(n); - IntegerVector w_proposal(n); + //IntegerVector line = seq(lb, ub); + if(adaptive){ + int line_length = ub-lb; for(int m = 0; m < n;++m){ w_current[m] = current[m]; } @@ -141,38 +143,38 @@ List metropolis_hypergeometric_cpp( for(int k = 0; k < n; ++k){ proposal[k] = w_current[k]; } - - //Attempt at recursively calling MCMC routine - // List MCMC_out = metropolis_hypergeometric_cpp(current, as(move), suff_stats, config, 50, 1, false, false, false); - // IntegerMatrix mini_steps = MCMC_out[0]; - // int step_length = mini_steps.ncol(); - // proposal = mini_steps(_, step_length); + // List MCMC_out = metropolis_hypergeometric_cpp(current, as(move), suff_stats, config, 50, 1, false, false, false); + // IntegerMatrix mini_steps = MCMC_out[0]; + // int step_length = mini_steps.ncol(); + // proposal = mini_steps(_, step_length); + } else { // Base Hit and Run - // if(is_true(any(stepSize == 0))){ - // IntegerVector test1 = current + lb * move; - // IntegerVector test2 = current + ub * move; - // for(int i = 0; i < n; ++i){ - // if(test1[i] < 0) lb = 1; - // if(test2[i] < 0) ub = -1; - // } - // } - // if(lb > ub){ - // run[0] = 1; - // }else{ - // IntegerVector range = seq(lb,ub); - // - // run = Rcpp::sample(range,1); - // } - // if(run[0] == 0){ - // run[0] = 1; - // } - // if(hit_and_run){ - // for(int k = 0; k < n; ++k){ - // proposal[k] = current[k] + as(run) * move[k]; - // } - // } + if(is_true(any(stepSize == 0))){ + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0) lb = 1; + if(test2[i] < 0) ub = -1; + } + } + if(lb > ub){ + run[0] = 1; + }else{ + IntegerVector range = seq(lb,ub); + + run = Rcpp::sample(range,1); + } + if(run[0] == 0){ + run[0] = 1; + } + if(hit_and_run){ + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + as(run) * move[k]; + } + } + } }else{ for(int k = 0; k < n; ++k){ proposal[k] = current[k] + move[k]; diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index f317a5d..c06510c 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -10,7 +10,8 @@ List metropolis_uniform_cpp( IntegerMatrix config, int iter, int thin, bool hit_and_run, - bool SIS, bool non_uniform + bool SIS, bool non_uniform, + bool adaptive ){ int nTotalSamples = iter * thin; // total number of steps @@ -23,6 +24,7 @@ List metropolis_uniform_cpp( NumericVector unifs3(nTotalSamples); IntegerVector proposal(n); // the proposed moves double prob; // the probability of transition + double prob2; bool anyIsNegative; IntegerVector move(n); double acceptProb = 0; @@ -33,6 +35,9 @@ List metropolis_uniform_cpp( IntegerVector lowerBound; int lb; int ub; + IntegerVector constant = IntegerVector::create(-1,1); + IntegerVector w_current(n); + IntegerVector w_proposal(n); IntegerVector run; Function sample("sample"); @@ -77,19 +82,59 @@ List metropolis_uniform_cpp( } if(hit_and_run == true){ current_num = current[move != 0]; - move_num = move[move != 0]; - stepSize = (-1 * current_num) / move_num; - lowerBound = stepSize[stepSize < 0]; - upperBound = stepSize[stepSize > 0]; - lb = max(lowerBound); - ub = min(upperBound); + if(adaptive){ + int line_length = ub-lb; + for(int m = 0; m < n;++m){ + w_current[m] = current[m]; + } + + for(int l = 0; l < line_length;++l){ + int constant2 = as(Rcpp::sample(constant, 1)); + for(int k = 0; k < n;++k){ + w_proposal[k] = w_current[k] + constant2 * move[k]; + } + bool anyIsNegative2; + anyIsNegative2 = false; + for(int k = 0; k < n; ++k){ + if(w_proposal[k] < 0){ + anyIsNegative2 = true; + } + } + + if(anyIsNegative2){ + prob2 = 0; + } else { + prob2 = exp( sum(lgamma(w_current+1)) - sum(lgamma(w_proposal+1)) ); + } + + if(prob2 > 1){ + prob2 = 1; + } + + // make move + if(unifs[l] < prob2){ + for(int k = 0; k < n; ++k){ + w_current[k] = w_proposal[k]; + } + } + } + for(int k = 0; k < n; ++k){ + proposal[k] = w_current[k]; + } + //Attempt at recursively calling MCMC routine + // List MCMC_out = metropolis_hypergeometric_cpp(current, as(move), suff_stats, config, 50, 1, false, false, false); + // IntegerMatrix mini_steps = MCMC_out[0]; + // int step_length = mini_steps.ncol(); + // proposal = mini_steps(_, step_length); + } else { + if(is_true(any(stepSize == 0))){ IntegerVector test1 = current + lb * move; IntegerVector test2 = current + ub * move; @@ -115,7 +160,8 @@ List metropolis_uniform_cpp( for(int k = 0; k < n; ++k){ proposal[k] = current[k] + as(run) * move[k]; } - }else{ + } + } else { for(int k = 0; k < n; ++k){ proposal[k] = current[k] + move[k]; } From e2d607e8d429a366714a0ccd8c7459c3910ee2f7 Mon Sep 17 00:00:00 2001 From: Innerst Date: Fri, 22 Sep 2017 09:51:01 -0500 Subject: [PATCH 17/53] pois_reg fixes --- R/RcppExports.R | 4 ++-- R/poisson.R | 8 +++++--- src/RcppExports.cpp | 9 +++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 4f4cd75..b1b3a31 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -25,8 +25,8 @@ metropolis_hypergeometric_cpp <- function(current, moves, suff_stats, config, it .Call('_algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) } -metropolis_uniform_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) { - .Call('_algstat_metropolis_uniform_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform) +metropolis_uniform_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) { + .Call('_algstat_metropolis_uniform_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) } rfiberOne <- function(A, b) { diff --git a/R/poisson.R b/R/poisson.R index 938e94a..abcffea 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -117,8 +117,8 @@ pois_reg <- function(model, data, response <- fString[2] predString <- fString[3] - #Rename the response variable to fit formula syntax id needed - if(any(response != vars)){ + #Rename the response variable to fit formula syntax if needed + if(!(response %in% vars)){ colnames(data)[colnames(data) == "freq"] <- response vars <- names(data) } @@ -134,7 +134,9 @@ pois_reg <- function(model, data, data <- data[vars == model] }else{ #If model specifiaction, then make table - data <- suppressMessages(teshape(cbind(data[vars == response],data[vars %in% model]), "tab", freqVar = response)) + augdata <- cbind(data[vars == response],data[vars %in% model]) + vars <- names(augdata) + data <- suppressMessages(teshape(augdata, "tab", freqVar = response)) p <- length(dim(data)) init <- tab2vec(data) nCells <- length(init) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 0354641..26da1a9 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -86,8 +86,8 @@ BEGIN_RCPP END_RCPP } // metropolis_uniform_cpp -List metropolis_uniform_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform); -RcppExport SEXP _algstat_metropolis_uniform_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP) { +List metropolis_uniform_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform, bool adaptive); +RcppExport SEXP _algstat_metropolis_uniform_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP, SEXP adaptiveSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -100,7 +100,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< bool >::type hit_and_run(hit_and_runSEXP); Rcpp::traits::input_parameter< bool >::type SIS(SISSEXP); Rcpp::traits::input_parameter< bool >::type non_uniform(non_uniformSEXP); - rcpp_result_gen = Rcpp::wrap(metropolis_uniform_cpp(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform)); + Rcpp::traits::input_parameter< bool >::type adaptive(adaptiveSEXP); + rcpp_result_gen = Rcpp::wrap(metropolis_uniform_cpp(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive)); return rcpp_result_gen; END_RCPP } @@ -150,7 +151,7 @@ static const R_CallMethodDef CallEntries[] = { {"_algstat_computeUProbsCpp", (DL_FUNC) &_algstat_computeUProbsCpp, 1}, {"_algstat_computeX2sCpp", (DL_FUNC) &_algstat_computeX2sCpp, 2}, {"_algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &_algstat_metropolis_hypergeometric_cpp, 10}, - {"_algstat_metropolis_uniform_cpp", (DL_FUNC) &_algstat_metropolis_uniform_cpp, 9}, + {"_algstat_metropolis_uniform_cpp", (DL_FUNC) &_algstat_metropolis_uniform_cpp, 10}, {"_algstat_rfiberOne", (DL_FUNC) &_algstat_rfiberOne, 2}, {"_algstat_sis_tbl", (DL_FUNC) &_algstat_sis_tbl, 2}, {"_algstat_walk", (DL_FUNC) &_algstat_walk, 4}, From 3a17045253f0be726c4ca0a0ba1c3db74243d714 Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 5 Oct 2017 12:24:36 -0500 Subject: [PATCH 18/53] Mass changes to pmat and poisson --- R/pmat.R | 42 +++++-- R/poisson.R | 356 ++++++++++++++++++++++++++++------------------------ 2 files changed, 223 insertions(+), 175 deletions(-) diff --git a/R/pmat.R b/R/pmat.R index 160c44a..4e46da4 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -13,11 +13,30 @@ pmat <- function(levels, facets){ + #########Old Setup######### #Setup the levels - levels <- levels[levels != 1] - num_covariates <- length(levels) - mat_list <- list() - exp_cov <-1:num_covariates + #levels <- levels[levels != 1] + #num_covariates <- length(levels) + #mat_list <- list() + #exp_cov <-1:num_covariates + ############################# + #Small function to make single covariate configuration matrix + func <- function(x){ + rbind(rep(1, length(x)), x) + } + + ######### New Setup ######## + if(is.vector(levels)){ + num_covariates <- 1 + full_mat <- func(levels) + }else{ + num_covariates <- ncol(levels) + #Make single covariate configuration matrix for each covariate + mat_list <- alply(levels, 2, func) + #Full heirarchicial config matrix with all interactions included + full_mat <- do.call(kprod, mat_list) + } + exp_cov <- 1:num_covariates #Checking heirarchical sturcture of facets if(any(sapply(facets, length) > 1)){ @@ -30,17 +49,18 @@ pmat <- function(levels, facets){ facets <- union(heirarc, facets) } + ########Old####### #List of config matrices, one for each covariate - for(i in exp_cov){ - mat_list[[i]] <- matrix(c(rep(1,levels[i]), 1:levels[i]), nrow = 2, byrow = TRUE) - } + #for(i in exp_cov){ + # mat_list[[i]] <- matrix(c(rep(1,levels[i]), 1:levels[i]), nrow = 2, byrow = TRUE) + #} + ################ - #Full heirarchicial config matrix with all interactions included - full_mat <- do.call(kprod, mat_list) #All possible combinations of covariates (powerset like) to be compared to facets - if(length(exp_cov) == 1) facet_list <- list(exp_cov) - else{ + if(length(exp_cov) == 1) { + facet_list <- list(exp_cov) + }else{ facet_list <- list(integer(0)) for(i in seq_along(exp_cov)){ facet_list <- c(facet_list, lapply(facet_list, function(x) c(x,exp_cov[i]))) diff --git a/R/poisson.R b/R/poisson.R index abcffea..7119ccc 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -77,7 +77,7 @@ pois_reg <- function(model, data, ## reshape data ################################################## - data <- suppressMessages(teshape(data, "freq")) + # data <- suppressMessages(teshape(data, "freq")) # p <- length(dim(data)) # nCells <- length(data) @@ -93,9 +93,9 @@ pois_reg <- function(model, data, ## check for sampling zeros ################################################## - if(any(data == 0L)) message( - "Care ought be taken with tables with sampling zeros to ensure the MLE exists." - ) + #if(any(data == 0L)) message( + # "Care ought be taken with tables with sampling zeros to ensure the MLE exists." + #) ## parse model specification (formula for vector of r_k's) @@ -111,6 +111,8 @@ pois_reg <- function(model, data, } else { # if it's a formula, convert to list if(is.formula(model)){ + ## Reshape data + data <- model.frame(model, data) ## parse formula fString <- as.character(model) @@ -118,27 +120,28 @@ pois_reg <- function(model, data, predString <- fString[3] #Rename the response variable to fit formula syntax if needed - if(!(response %in% vars)){ - colnames(data)[colnames(data) == "freq"] <- response - vars <- names(data) - } + #if(!(response %in% vars)){ + # colnames(data)[colnames(data) == "freq"] <- response + # vars <- names(data) + #} ## make list of facets model <- strsplit(predString, " \\+ ")[[1]] model <- strsplit(model, " \\* ") + ##Format the data + data <- ddply(data, unique(unlist(model)), "sum") + if(length(model) == 1){ - init <- unlist(data[vars == response]) + init <- data$sum nCells <- length(init) p <- 1 - data <- data[vars == model] }else{ #If model specifiaction, then make table - augdata <- cbind(data[vars == response],data[vars %in% model]) - vars <- names(augdata) - data <- suppressMessages(teshape(augdata, "tab", freqVar = response)) - p <- length(dim(data)) - init <- tab2vec(data) + #data <- suppressMessages(teshape(data, "tab", freqVar = response)) + #p <- length(dim(data)) + p <- ncol(data) - 1 + init <- data$sum nCells <- length(init) } } @@ -156,9 +159,15 @@ pois_reg <- function(model, data, } else { stop("Invalid model specification, see ?pois_reg") } + ##Levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) + if(ncol(data) <= 2){ + levls <- unique(data[,-ncol(data)]) + }else{ + levls <- apply(data[,-ncol(data)], 2, unique) + } # make configuration (model) matrix - A <- pmat(dim(data), facets) + A <- pmat(levls, facets) } suff_stats <- unname(A %*% init) @@ -210,180 +219,199 @@ pois_reg <- function(model, data, ## compute data chi square ################################################## - if(modelGivenByMatrix && method == "ipf"){ - message( - "Iterative proportional fitting is not yet implemented\n", - " for models specified by configuration matrices.\n", - " Changing to method = \"mcmc\"..." - ) - method <- "mcmc" - } - if(method == "ipf"){ - exp <- loglin(data, facets, fit = TRUE, print = FALSE)$fit - } else if(method == "mcmc"){ - exp <- vec2tab(rowMeans(out$steps), dim(data)) - dimnames(exp) <- dimnames(data) - } - e <- unname(tab2vec(exp)) - u <- t(t(unname(tab2vec(data)))) + # if(modelGivenByMatrix && method == "ipf"){ + # message( + # "Iterative proportional fitting is not yet implemented\n", + # " for models specified by configuration matrices.\n", + # " Changing to method = \"mcmc\"..." + # ) + # method <- "mcmc" + # } + # if(method == "ipf"){ + # exp <- loglin(data, facets, fit = TRUE, print = FALSE)$fit + # } else if(method == "mcmc"){ + # exp <- vec2tab(rowMeans(out$steps), dim(data)) + # dimnames(exp) <- dimnames(data) + # } + # e <- unname(tab2vec(exp)) + +#####Issue Here!!!!!!!############# + ###Old + #u <- t(t(unname(tab2vec(data)))) + ###New + u <- t(t(data$sum)) PR <- computeUProbsCpp(matrix(u)) # unnormd prob; numers LAS 1.1.10 - X2 <- computeX2sCpp(u, e) - G2 <- computeG2sCpp(u, e) - FT <- computeCRsCpp(u, e, -.5) - CR <- computeCRsCpp(u, e, 2/3) - NM <- computeNMsCpp(u, e) + # X2 <- computeX2sCpp(u, e) + # G2 <- computeG2sCpp(u, e) + # FT <- computeCRsCpp(u, e, -.5) + # CR <- computeCRsCpp(u, e, 2/3) + # NM <- computeNMsCpp(u, e) ## compute MCMC chi squares ################################################## PRs <- computeUProbsCpp(out$steps) # unnormd probs; numers LAS 1.1.10 - X2s <- computeX2sCpp(out$steps, e) - G2s <- computeG2sCpp(out$steps, e) - FTs <- computeCRsCpp(out$steps, e, -.5) - CRs <- computeCRsCpp(out$steps, e, 2/3) - NMs <- computeNMsCpp(out$steps, e) - - - ## compute parameters - ################################################## - if(!modelGivenByMatrix){ - # in principle, there should be one parameter for every cell. - # there are prod(dim(data)) cells. - # a good reference is BFH, p. 35 (and to a lesser extent 43) - # the prod(dim(data)[terms[[j]]] - 1) line below is like - # (I - 1) (J - 1) (K - 1) - # CDA p.79 also helpful - dimSatModel <- nCells - 1 - degFreedom <- rep.int(0, 2^p) # there are 2^p possible subsets of vars, and - # therefore there are 2^p possible terms - - # possibleTerms are more "types of terms" as opposed to individual terms - # for example, an entry c(1,3) would refer to all combinations of levels - # of variables 1 and 3; ie (# var 1 levels - 1) * (# var 3 levels - 1) - # individual terms (parameters) - possibleTerms <- subsets(p, include_null = TRUE) - names(possibleTerms) <- sapply(possibleTerms, paste, collapse = " ") - names(possibleTerms)[which(names(possibleTerms) == "")] <- "(Intercept)" - nVarLvls <- dim(data) - # paramsPerTerm <- lapply(possibleTerms, function(x){ - # if(length(x) == 0) return(1L) - # prod(nVarLvls[x] - 1) - # }) - - - # similarly, there are the terms in the model - termsInModel <- unique(unlist(lapply( - lapply(facets, as.character), # to avoid subsets(2) - subsets, include_null = TRUE), - recursive = FALSE - )) - termsInModel <- lapply(termsInModel, as.integer) - names(termsInModel) <- sapply(termsInModel, paste, collapse = " ") - names(termsInModel)[which(names(termsInModel) == "")] <- "(Intercept)" - paramsPerTermInModel <- lapply(termsInModel, function(x){ - if(length(x) == 0) return(1L) - prod(nVarLvls[x] - 1) - }) - names(paramsPerTermInModel) <- unname(sapply(termsInModel, function(x){ - if(length(x) == 0) return("(Intercept)") - paste(names(dimnames(data))[x], collapse = ".") - })) - nParamsInModel <- sum(unlist(paramsPerTermInModel)) - dimModel <- nParamsInModel - 1 # the - 1 accounts for the overall mean - overallAsymptoticDegFreedom <- (dimSatModel - dimModel) - - - # compute the parameters - log_fit <- exp - log_fit[exp > 0] <- log(exp[exp > 0]) - param <- as.list(rep(NA, length(termsInModel))) - names(param) <- names(paramsPerTermInModel) - for(k in seq_along(param)){ - if(length(termsInModel[[k]]) == 0){ - param[[k]] <- mean(log_fit) - log_fit <- log_fit - param[[k]] - } else { - param[[k]] <- apply(log_fit, termsInModel[[k]], mean) - log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) - } - } - # for every step, fit mle - # then decompose mle - # problem : they all have the same marginals, so the same - # mles! - # idea 1 : sample from the multinomial with the same sample - # size (so different marginals), estimate, then decompose - # idea 2 : bootstrap sample from the table, estimate, decompose - # i think i like idea 2 better. - - - # reorder the param estimates in the order of subsets - # so you have the intercept, then all first order terms, and so on - goodOrder <- sapply( - c("(Intercept)", subsets(names(dimnames(data)))), - paste, collapse = "." - ) - param <- param[goodOrder[goodOrder %in% names(param)]] - out$param <- param - - } - + # X2s <- computeX2sCpp(out$steps, e) + # G2s <- computeG2sCpp(out$steps, e) + # FTs <- computeCRsCpp(out$steps, e, -.5) + # CRs <- computeCRsCpp(out$steps, e, 2/3) + # NMs <- computeNMsCpp(out$steps, e) + + # + # ## compute parameters + # ################################################## + # if(!modelGivenByMatrix){ + # # in principle, there should be one parameter for every cell. + # # there are prod(dim(data)) cells. + # # a good reference is BFH, p. 35 (and to a lesser extent 43) + # # the prod(dim(data)[terms[[j]]] - 1) line below is like + # # (I - 1) (J - 1) (K - 1) + # # CDA p.79 also helpful + # dimSatModel <- nCells - 1 + # degFreedom <- rep.int(0, 2^p) # there are 2^p possible subsets of vars, and + # # therefore there are 2^p possible terms + # + # # possibleTerms are more "types of terms" as opposed to individual terms + # # for example, an entry c(1,3) would refer to all combinations of levels + # # of variables 1 and 3; ie (# var 1 levels - 1) * (# var 3 levels - 1) + # # individual terms (parameters) + # possibleTerms <- subsets(p, include_null = TRUE) + # names(possibleTerms) <- sapply(possibleTerms, paste, collapse = " ") + # names(possibleTerms)[which(names(possibleTerms) == "")] <- "(Intercept)" + # nVarLvls <- dim(data) + # # paramsPerTerm <- lapply(possibleTerms, function(x){ + # # if(length(x) == 0) return(1L) + # # prod(nVarLvls[x] - 1) + # # }) + # + # + # # similarly, there are the terms in the model + # termsInModel <- unique(unlist(lapply( + # lapply(facets, as.character), # to avoid subsets(2) + # subsets, include_null = TRUE), + # recursive = FALSE + # )) + # termsInModel <- lapply(termsInModel, as.integer) + # names(termsInModel) <- sapply(termsInModel, paste, collapse = " ") + # names(termsInModel)[which(names(termsInModel) == "")] <- "(Intercept)" + # paramsPerTermInModel <- lapply(termsInModel, function(x){ + # if(length(x) == 0) return(1L) + # prod(nVarLvls[x] - 1) + # }) + # names(paramsPerTermInModel) <- unname(sapply(termsInModel, function(x){ + # if(length(x) == 0) return("(Intercept)") + # paste(names(dimnames(data))[x], collapse = ".") + # })) + # nParamsInModel <- sum(unlist(paramsPerTermInModel)) + # dimModel <- nParamsInModel - 1 # the - 1 accounts for the overall mean + # overallAsymptoticDegFreedom <- (dimSatModel - dimModel) + # + # + # # compute the parameters + # log_fit <- exp + # log_fit[exp > 0] <- log(exp[exp > 0]) + # param <- as.list(rep(NA, length(termsInModel))) + # names(param) <- names(paramsPerTermInModel) + # for(k in seq_along(param)){ + # if(length(termsInModel[[k]]) == 0){ + # param[[k]] <- mean(log_fit) + # log_fit <- log_fit - param[[k]] + # } else { + # param[[k]] <- apply(log_fit, termsInModel[[k]], mean) + # log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) + # } + # } + # # for every step, fit mle + # # then decompose mle + # # problem : they all have the same marginals, so the same + # # mles! + # # idea 1 : sample from the multinomial with the same sample + # # size (so different marginals), estimate, then decompose + # # idea 2 : bootstrap sample from the table, estimate, decompose + # # i think i like idea 2 better. + # + # + # # reorder the param estimates in the order of subsets + # # so you have the intercept, then all first order terms, and so on + # goodOrder <- sapply( + # c("(Intercept)", subsets(names(dimnames(data)))), + # paste, collapse = "." + # ) + # param <- param[goodOrder[goodOrder %in% names(param)]] + # out$param <- param + # + # } + # ## compute residuals and model selection, agresti p.81, 216, 324 ################################################## - out$residuals <- exp - out$residuals[exp > 0] <- - (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) - - if(!modelGivenByMatrix){ - k <- nParamsInModel # = number of params - n <- sum(data) # = sample size - L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood - BIC <- log(n)*k - 2*L - AIC <- 2*k - 2*L - AICc <- AIC + 2*k*(k+1)/(n-k-1) - out$df <- paramsPerTermInModel - out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) - } + # out$residuals <- exp + # out$residuals[exp > 0] <- + # (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) + # + # if(!modelGivenByMatrix){ + # k <- nParamsInModel # = number of params + # n <- sum(data) # = sample size + # L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood + # BIC <- log(n)*k - 2*L + # AIC <- 2*k - 2*L + # AICc <- AIC + 2*k*(k+1)/(n-k-1) + # out$df <- paramsPerTermInModel + # out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) + # } ## add A matrix, p.value and return ################################################## out$call <- match.call() out$obs <- data - out$exp <- exp - out$A <- A + # out$exp <- exp + # out$A <- A out$p.value <- c( - PR = mean(PRs <= PR), - X2 = mean(X2s >= X2), - G2 = mean(G2s >= G2), - FT = mean(FTs >= FT), - CR = mean(CRs >= CR), - NM = mean(NMs >= NM) + PR = mean(PRs <= PR) ) + # out$p.value <- c( + # PR = mean(PRs <= PR), + # X2 = mean(X2s >= X2), + # G2 = mean(G2s >= G2), + # FT = mean(FTs >= FT), + # CR = mean(CRs >= CR), + # NM = mean(NMs >= NM) + # ) + out$p.value.std.err <- c( - PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), - X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), - G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), - FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), - CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), - NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) + PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter) ) + # out$p.value.std.err <- c( + # PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), + # X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), + # G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), + # FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), + # CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), + # NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) + # ) + out$mid.p.value <- c( - PR = mean(PRs < PR) + mean(PRs == PR)/2, - X2 = mean(X2s > X2) + mean(X2s == X2)/2, - G2 = mean(G2s > G2) + mean(G2s == G2)/2, - FT = mean(FTs > FT) + mean(FTs == FT)/2, - CR = mean(CRs > CR) + mean(CRs == CR)/2, - NM = mean(NMs > NM) + mean(NMs == NM)/2 + PR = mean(PRs < PR) + mean(PRs == PR)/2 ) + # out$mid.p.value <- c( + # PR = mean(PRs < PR) + mean(PRs == PR)/2, + # X2 = mean(X2s > X2) + mean(X2s == X2)/2, + # G2 = mean(G2s > G2) + mean(G2s == G2)/2, + # FT = mean(FTs > FT) + mean(FTs == FT)/2, + # CR = mean(CRs > CR) + mean(CRs == CR)/2, + # NM = mean(NMs > NM) + mean(NMs == NM)/2 + # ) + out$iter <- iter out$burn <- burn out$thin <- thin - out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) - out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) + out$statistic <- c(PR = PR) + # out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) + out$sampsStats <- list(PRs = PRs) + # out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) out$cells <- nCells out$method <- method From f000b88b6d4ac1ae939e9d4b8652061d29ac091f Mon Sep 17 00:00:00 2001 From: Innerst Date: Wed, 11 Oct 2017 08:51:19 -0500 Subject: [PATCH 19/53] pois_reg can now have multiple covariates --- NAMESPACE | 2 +- R/pmat.R | 9 ++++----- R/poisson.R | 25 ++++++++++++++++++------- man/metropolis.Rd | 12 +++++++++--- man/pois_reg.Rd | 6 +++--- 5 files changed, 35 insertions(+), 19 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d45f657..14849ea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(print,loglinear) S3method(print,m2) S3method(print,polyOptim) S3method(print,spectral) +S3method(summary,bertini) export(Amaker) export(Emaker) export(Mmaker) @@ -45,7 +46,6 @@ export(setLattEPath) export(setLattePath) export(spectral) export(subsets) -export(summary.bertini) export(tab2array) export(tab2vec) export(tabFill) diff --git a/R/pmat.R b/R/pmat.R index 4e46da4..b75220e 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -12,7 +12,6 @@ #' @export pmat pmat <- function(levels, facets){ - #########Old Setup######### #Setup the levels #levels <- levels[levels != 1] @@ -26,18 +25,18 @@ pmat <- function(levels, facets){ } ######### New Setup ######## - if(is.vector(levels)){ + if(!is.list(levels)){ num_covariates <- 1 full_mat <- func(levels) }else{ - num_covariates <- ncol(levels) + num_covariates <- length(levels) #Make single covariate configuration matrix for each covariate - mat_list <- alply(levels, 2, func) + mat_list <- lapply(levels, func) #Full heirarchicial config matrix with all interactions included full_mat <- do.call(kprod, mat_list) } exp_cov <- 1:num_covariates - + #Checking heirarchical sturcture of facets if(any(sapply(facets, length) > 1)){ long_list_elts <- facets[which(sapply(facets, length)>1)] diff --git a/R/poisson.R b/R/poisson.R index 7119ccc..9c1cc4e 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -3,8 +3,7 @@ #' #' #' @param model hierarchical poisson model specification -#' @param data data, as a data frame with frequencies. see \code{\link{teshape}} if your data is not -#' in that format +#' @param data data, as a data frame with raw data with discrete covariates #' @param init the initialization of the chain. by default, this is #' the observed table #' @param iter number of chain iterations @@ -43,6 +42,7 @@ #' \code{sampsStats}: the statistics computed for each mcmc #' sample. \item \code{cells}: the number of cells in the table. #' \item \code{method}: the method used to estimate the table. } +#' @importFrom stats model.frame #' @export pois_reg @@ -86,7 +86,7 @@ pois_reg <- function(model, data, ## other basic objects #varsNlevels <- dimnames(data) - vars <- names(data) + @@ -114,6 +114,9 @@ pois_reg <- function(model, data, ## Reshape data data <- model.frame(model, data) + # name data + vars <- names(data) + ## parse formula fString <- as.character(model) response <- fString[2] @@ -131,7 +134,6 @@ pois_reg <- function(model, data, ##Format the data data <- ddply(data, unique(unlist(model)), "sum") - if(length(model) == 1){ init <- data$sum nCells <- length(init) @@ -162,13 +164,22 @@ pois_reg <- function(model, data, ##Levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) if(ncol(data) <= 2){ - levls <- unique(data[,-ncol(data)]) + lvls <- unique(data[,-ncol(data)]) }else{ - levls <- apply(data[,-ncol(data)], 2, unique) + lvls <- apply(data[,-ncol(data)], 2, unique) } # make configuration (model) matrix - A <- pmat(levls, facets) + A <- pmat(lvls, facets) } + + + # check to see if all level configurations are there (need work here) + lvlsInData <- as.list(as.data.frame(t(expand.grid(lvls)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) + + # subset A by levels that are present + A <- A[,lvlsInData] + + # find the sufficient statistics suff_stats <- unname(A %*% init) ## construct A matrix and compute moves diff --git a/man/metropolis.Rd b/man/metropolis.Rd index b869af6..260d7fc 100644 --- a/man/metropolis.Rd +++ b/man/metropolis.Rd @@ -7,10 +7,11 @@ \usage{ metropolis(init, moves, suff_stats, config, iter = 1000, burn = 0, thin = 1, dist = c("hypergeometric", "uniform"), engine = c("Cpp", "R"), - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE) + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, + adaptive = FALSE) rawMetropolis(init, moves, iter = 1000, dist = "hypergeometric", - hit_and_run = FALSE) + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE) } \arguments{ \item{init}{the initial step} @@ -29,8 +30,13 @@ or "uniform"} \item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} -\item{hit_and_run}{Whether or not to use the hit and run algorithm in +\item{hit_and_run}{Whether or not to use the discrete hit and run algorithm in the metropolis algorithm} + +\item{SIS}{If TRUE, with a small probability the move will be chosen randomly from the uniform distribution +on the fiber using Sequential Importance "Like" Sampling methods. Defaulted to FALSE} + +\item{adaptive}{Option inside hit_and_run option. If TRUE, hit and run will choose a proposal distribution adaptively. Defaulted to FALSE.} } \value{ a list diff --git a/man/pois_reg.Rd b/man/pois_reg.Rd index 9b4b457..c9f6ee5 100644 --- a/man/pois_reg.Rd +++ b/man/pois_reg.Rd @@ -9,10 +9,9 @@ pois_reg(model, data, iter = 10000, burn = 1000, thin = 10, hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, ...) } \arguments{ -\item{model}{hierarchical log-linear model specification} +\item{model}{hierarchical poisson model specification} -\item{data}{data, as a data frame with frequencies. see \code{\link{teshape}} if your data is not -in that format} +\item{data}{data, as a data frame with raw data with discrete covariates} \item{iter}{number of chain iterations} @@ -61,6 +60,7 @@ a list containing named elements \itemize{ \item \code{sampsStats}: the statistics computed for each mcmc sample. \item \code{cells}: the number of cells in the table. \item \code{method}: the method used to estimate the table. } + @importFrom stats model.frame } \description{ Fit a Poisson Regression model with algebraic methods From 150647867071f0c9f603a06f13e2c610d440fd1f Mon Sep 17 00:00:00 2001 From: Innerst Date: Fri, 10 Nov 2017 10:00:28 -0600 Subject: [PATCH 20/53] changes to logistic and poisson regression --- R/logistic.r | 445 +++++++++++++++++++++++++++++------------------- R/poisson.R | 39 ++--- man/log_reg.Rd | 68 ++++++++ man/pois_reg.Rd | 3 +- 4 files changed, 353 insertions(+), 202 deletions(-) create mode 100644 man/log_reg.Rd diff --git a/R/logistic.r b/R/logistic.r index 77d34e5..e428c31 100644 --- a/R/logistic.r +++ b/R/logistic.r @@ -1,11 +1,59 @@ +#' Fit a Logistic Regression model with algebraic methods +#' +#' +#' +#' @param model hierarchical poisson model specification +#' @param data data, as a data frame with raw data with discrete covariates +#' @param init the initialization of the chain. by default, this is +#' the observed table +#' @param iter number of chain iterations +#' @param burn burn-in +#' @param thin thinning +#' @param engine C++ or R? (C++ yields roughly a 20-25x speedup) +#' @param method should the expected value (exp) be fit using +#' iterative proportional fitting (via loglin) or the MCMC as the +#' average of the steps? +#' @param moves the markov moves for the mcmc (as columns of a +#' matrix). +#' @param ... ... +#' @return a list containing named elements \itemize{ \item +#' \code{steps}: an integer matrix whose columns represent +#' individual samples from the mcmc. \item \code{moves}: the moves +#' used for the proposal distribution in the mcmc, computed with +#' 4ti2 (note that only the positive moves are given). \item +#' \code{acceptProb}: the average acceptance probability of the +#' moves, including the thinned moves. \item \code{param}: the +#' fitted parameters of the log linear model. \item \code{df}: +#' parameters per term in the model \item \code{quality}: model +#' selection statistics AIC, AICc, and BIC. \item +#' \code{residuals}: the (unstandardized) pearson residuals (O - +#' E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the +#' contingency table given. \item \code{exp}: the fit contingency +#' table as an integer array. \item \code{A}: the sufficient +#' statistics computing matrix (from Tmaker). \item +#' \code{p.value}: the exact p-values of individual tests, +#' accurate to Monte-Carlo error. these are computed as the +#' proportion of samples with statistics equal to or larger than +#' the oberved statistic. \item \code{mid.p.value}: the mid +#' p.values, see Agresti pp.20--21. \item \code{statistic}: the +#' pearson's chi-squared (X2), likelihood ratio (G2), +#' Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified +#' chi-squared (NM) statistics computed for the table given. \item +#' \code{sampsStats}: the statistics computed for each mcmc +#' sample. \item \code{cells}: the number of cells in the table. +#' \item \code{method}: the method used to estimate the table. } +#' @importFrom stats model.frame +#' @export log_reg + log_reg <- function(model, data, - iter = 1E4, burn = 1000, - thin = 10, engine = c("Cpp","R"), - method = c("ipf", "mcmc"), moves, - hit_and_run = FALSE, - SIS = FALSE, - non_uniform = FALSE, - ...) + iter = 1E4, burn = 1000, + thin = 10, engine = c("Cpp","R"), + method = c("ipf", "mcmc"), moves, + hit_and_run = FALSE, + SIS = FALSE, + non_uniform = FALSE, + adaptive = FALSE, + ...) { ## set/check args @@ -25,7 +73,7 @@ log_reg <- function(model, data, ## reshape data ################################################## - # data <- suppressMessages(teshape(data, "tab")) + # data <- suppressMessages(teshape(data, "freq")) # p <- length(dim(data)) # nCells <- length(data) @@ -33,31 +81,37 @@ log_reg <- function(model, data, # if(is.array(data) && is.null(dimnames(data))) data <- array2tab(data) ## other basic objects - # varsNlevels <- dimnames(data) - vars <- names(data) + #varsNlevels <- dimnames(data) + ## check for sampling zeros ################################################## - if(any(data == 0L)) message( - "Care ought be taken with tables with sampling zeros to ensure the MLE exists." - ) + #if(any(data == 0L)) message( + # "Care ought be taken with tables with sampling zeros to ensure the MLE exists." + #) ## parse model specification (formula for vector of r_k's) ################################################## - + modelGivenByMatrix <- ifelse(is.matrix(model), TRUE, FALSE) if(modelGivenByMatrix){ A <- model data <- suppressMessages(teshape(data, "tab")) init <- tab2vec(data) + nCells <- length(init) } else { # if it's a formula, convert to list if(is.formula(model)){ + ## reshape data + data <- model.frame(model, data) + + # name data + vars <- names(data) ## parse formula fString <- as.character(model) @@ -65,22 +119,27 @@ log_reg <- function(model, data, predString <- fString[3] - ## make list of facets model <- strsplit(predString, " \\+ ")[[1]] model <- strsplit(model, " \\* ") - + + ## format the data + names(data)[names(data) == response] <- "response" + data <- rbind(ddply(data, unique(unlist(model)), summarise, sum = sum(response)), + ddply(data, unique(unlist(model)), summarise, sum = length(response) - sum(response)) + ) if(length(model) == 1){ - init <- unlist(data[vars == response]) + + init <- data$sum + nCells <- length(init) p <- 1 - data <- data[vars == model] - }else{ - #If model specifiaction, then make table - data <- supressMessages(teshape(cbind(data[vars == response],data[vars %in% model]), "tab", freqVar = response)) - p <- length(dim(data)) - init <- tab2vec(data) + } else { + # if model specifiaction, then make table + p <- ncol(data) - 1 + init <- data$sum + nCells <- length(init) } } @@ -95,13 +154,29 @@ log_reg <- function(model, data, } else if(all(unlist(model) %in% 1:length(vars))){ # by indices facets <- lapply(model, as.integer) # to fix the ~ 1 + 2 case, parsed as chars } else { - stop("Invalid model specification, see ?poisson") + stop("Invalid model specification, see ?log_reg") } + ## levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) + if(ncol(data) <= 2){ + lvls <- unique(data[,-ncol(data)]) + } else { + lvls <- lapply(data[,-ncol(data)], unique) + } # make configuration (model) matrix - pois_A <- pmat(dim(data), facets) - A <- lawrence(pois_A) + A <- pmat(lvls, facets) } + + # check to see if all level configurations are there (need work here) + lvlsInData <- as.list(as.data.frame(t(expand.grid(lvls)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) + + # subset A by levels that are present + A <- A[,lvlsInData] + + # compute the Lawrence lifting of A + A <- lawrence(A) + + # find the sufficient statistics suff_stats <- unname(A %*% init) ## construct A matrix and compute moves @@ -118,7 +193,7 @@ log_reg <- function(model, data, warning( "No moves were provided and 4ti2 is not found.\n", " The resulting chain is likely not connected and strongly autocorrelated.\n", - " See ?loglinear. Consider using rmove to generate SIS moves in advance.", + " See ?pois_reg. Consider using rmove to generate SIS moves in advance.", immediate. = TRUE ) message("Computing 1000 SIS moves... ", appendLF = FALSE) @@ -146,190 +221,204 @@ log_reg <- function(model, data, ################################################## init <- unname(init) # init out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform) + engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform, adaptive = adaptive) ## compute data chi square ################################################## - if(modelGivenByMatrix && method == "ipf"){ - message( - "Iterative proportional fitting is not yet implemented\n", - " for models specified by configuration matrices.\n", - " Changing to method = \"mcmc\"..." - ) - method <- "mcmc" - } - if(method == "ipf"){ - exp <- loglin(data, facets, fit = TRUE, print = FALSE)$fit - } else if(method == "mcmc"){ - exp <- vec2tab(rowMeans(out$steps), dim(data)) - dimnames(exp) <- dimnames(data) - } - e <- unname(tab2vec(exp)) - u <- t(t(unname(tab2vec(data)))) + # if(modelGivenByMatrix && method == "ipf"){ + # message( + # "Iterative proportional fitting is not yet implemented\n", + # " for models specified by configuration matrices.\n", + # " Changing to method = \"mcmc\"..." + # ) + # method <- "mcmc" + # } + # if(method == "ipf"){ + # exp <- loglin(data, facets, fit = TRUE, print = FALSE)$fit + # } else if(method == "mcmc"){ + # exp <- vec2tab(rowMeans(out$steps), dim(data)) + # dimnames(exp) <- dimnames(data) + # } + # e <- unname(tab2vec(exp)) + + u <- t(t(data$sum)) PR <- computeUProbsCpp(matrix(u)) # unnormd prob; numers LAS 1.1.10 - X2 <- computeX2sCpp(u, e) - G2 <- computeG2sCpp(u, e) - FT <- computeCRsCpp(u, e, -.5) - CR <- computeCRsCpp(u, e, 2/3) - NM <- computeNMsCpp(u, e) + # X2 <- computeX2sCpp(u, e) + # G2 <- computeG2sCpp(u, e) + # FT <- computeCRsCpp(u, e, -.5) + # CR <- computeCRsCpp(u, e, 2/3) + # NM <- computeNMsCpp(u, e) ## compute MCMC chi squares ################################################## PRs <- computeUProbsCpp(out$steps) # unnormd probs; numers LAS 1.1.10 - X2s <- computeX2sCpp(out$steps, e) - G2s <- computeG2sCpp(out$steps, e) - FTs <- computeCRsCpp(out$steps, e, -.5) - CRs <- computeCRsCpp(out$steps, e, 2/3) - NMs <- computeNMsCpp(out$steps, e) - - - ## compute parameters - ################################################## - if(!modelGivenByMatrix){ - # in principle, there should be one parameter for every cell. - # there are prod(dim(data)) cells. - # a good reference is BFH, p. 35 (and to a lesser extent 43) - # the prod(dim(data)[terms[[j]]] - 1) line below is like - # (I - 1) (J - 1) (K - 1) - # CDA p.79 also helpful - dimSatModel <- nCells - 1 - degFreedom <- rep.int(0, 2^p) # there are 2^p possible subsets of vars, and - # therefore there are 2^p possible terms - - # possibleTerms are more "types of terms" as opposed to individual terms - # for example, an entry c(1,3) would refer to all combinations of levels - # of variables 1 and 3; ie (# var 1 levels - 1) * (# var 3 levels - 1) - # individual terms (parameters) - possibleTerms <- subsets(p, include_null = TRUE) - names(possibleTerms) <- sapply(possibleTerms, paste, collapse = " ") - names(possibleTerms)[which(names(possibleTerms) == "")] <- "(Intercept)" - nVarLvls <- dim(data) - # paramsPerTerm <- lapply(possibleTerms, function(x){ - # if(length(x) == 0) return(1L) - # prod(nVarLvls[x] - 1) - # }) - - - # similarly, there are the terms in the model - termsInModel <- unique(unlist(lapply( - lapply(facets, as.character), # to avoid subsets(2) - subsets, include_null = TRUE), - recursive = FALSE - )) - termsInModel <- lapply(termsInModel, as.integer) - names(termsInModel) <- sapply(termsInModel, paste, collapse = " ") - names(termsInModel)[which(names(termsInModel) == "")] <- "(Intercept)" - paramsPerTermInModel <- lapply(termsInModel, function(x){ - if(length(x) == 0) return(1L) - prod(nVarLvls[x] - 1) - }) - names(paramsPerTermInModel) <- unname(sapply(termsInModel, function(x){ - if(length(x) == 0) return("(Intercept)") - paste(names(dimnames(data))[x], collapse = ".") - })) - nParamsInModel <- sum(unlist(paramsPerTermInModel)) - dimModel <- nParamsInModel - 1 # the - 1 accounts for the overall mean - overallAsymptoticDegFreedom <- (dimSatModel - dimModel) - - - # compute the parameters - log_fit <- exp - log_fit[exp > 0] <- log(exp[exp > 0]) - param <- as.list(rep(NA, length(termsInModel))) - names(param) <- names(paramsPerTermInModel) - for(k in seq_along(param)){ - if(length(termsInModel[[k]]) == 0){ - param[[k]] <- mean(log_fit) - log_fit <- log_fit - param[[k]] - } else { - param[[k]] <- apply(log_fit, termsInModel[[k]], mean) - log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) - } - } - # for every step, fit mle - # then decompose mle - # problem : they all have the same marginals, so the same - # mles! - # idea 1 : sample from the multinomial with the same sample - # size (so different marginals), estimate, then decompose - # idea 2 : bootstrap sample from the table, estimate, decompose - # i think i like idea 2 better. - - - # reorder the param estimates in the order of subsets - # so you have the intercept, then all first order terms, and so on - goodOrder <- sapply( - c("(Intercept)", subsets(names(dimnames(data)))), - paste, collapse = "." - ) - param <- param[goodOrder[goodOrder %in% names(param)]] - out$param <- param - - } - + # X2s <- computeX2sCpp(out$steps, e) + # G2s <- computeG2sCpp(out$steps, e) + # FTs <- computeCRsCpp(out$steps, e, -.5) + # CRs <- computeCRsCpp(out$steps, e, 2/3) + # NMs <- computeNMsCpp(out$steps, e) + + # + # ## compute parameters + # ################################################## + # if(!modelGivenByMatrix){ + # # in principle, there should be one parameter for every cell. + # # there are prod(dim(data)) cells. + # # a good reference is BFH, p. 35 (and to a lesser extent 43) + # # the prod(dim(data)[terms[[j]]] - 1) line below is like + # # (I - 1) (J - 1) (K - 1) + # # CDA p.79 also helpful + # dimSatModel <- nCells - 1 + # degFreedom <- rep.int(0, 2^p) # there are 2^p possible subsets of vars, and + # # therefore there are 2^p possible terms + # + # # possibleTerms are more "types of terms" as opposed to individual terms + # # for example, an entry c(1,3) would refer to all combinations of levels + # # of variables 1 and 3; ie (# var 1 levels - 1) * (# var 3 levels - 1) + # # individual terms (parameters) + # possibleTerms <- subsets(p, include_null = TRUE) + # names(possibleTerms) <- sapply(possibleTerms, paste, collapse = " ") + # names(possibleTerms)[which(names(possibleTerms) == "")] <- "(Intercept)" + # nVarLvls <- dim(data) + # # paramsPerTerm <- lapply(possibleTerms, function(x){ + # # if(length(x) == 0) return(1L) + # # prod(nVarLvls[x] - 1) + # # }) + # + # + # # similarly, there are the terms in the model + # termsInModel <- unique(unlist(lapply( + # lapply(facets, as.character), # to avoid subsets(2) + # subsets, include_null = TRUE), + # recursive = FALSE + # )) + # termsInModel <- lapply(termsInModel, as.integer) + # names(termsInModel) <- sapply(termsInModel, paste, collapse = " ") + # names(termsInModel)[which(names(termsInModel) == "")] <- "(Intercept)" + # paramsPerTermInModel <- lapply(termsInModel, function(x){ + # if(length(x) == 0) return(1L) + # prod(nVarLvls[x] - 1) + # }) + # names(paramsPerTermInModel) <- unname(sapply(termsInModel, function(x){ + # if(length(x) == 0) return("(Intercept)") + # paste(names(dimnames(data))[x], collapse = ".") + # })) + # nParamsInModel <- sum(unlist(paramsPerTermInModel)) + # dimModel <- nParamsInModel - 1 # the - 1 accounts for the overall mean + # overallAsymptoticDegFreedom <- (dimSatModel - dimModel) + # + # + # # compute the parameters + # log_fit <- exp + # log_fit[exp > 0] <- log(exp[exp > 0]) + # param <- as.list(rep(NA, length(termsInModel))) + # names(param) <- names(paramsPerTermInModel) + # for(k in seq_along(param)){ + # if(length(termsInModel[[k]]) == 0){ + # param[[k]] <- mean(log_fit) + # log_fit <- log_fit - param[[k]] + # } else { + # param[[k]] <- apply(log_fit, termsInModel[[k]], mean) + # log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) + # } + # } + # # for every step, fit mle + # # then decompose mle + # # problem : they all have the same marginals, so the same + # # mles! + # # idea 1 : sample from the multinomial with the same sample + # # size (so different marginals), estimate, then decompose + # # idea 2 : bootstrap sample from the table, estimate, decompose + # # i think i like idea 2 better. + # + # + # # reorder the param estimates in the order of subsets + # # so you have the intercept, then all first order terms, and so on + # goodOrder <- sapply( + # c("(Intercept)", subsets(names(dimnames(data)))), + # paste, collapse = "." + # ) + # param <- param[goodOrder[goodOrder %in% names(param)]] + # out$param <- param + # + # } + # ## compute residuals and model selection, agresti p.81, 216, 324 ################################################## - out$residuals <- exp - out$residuals[exp > 0] <- - (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) - - if(!modelGivenByMatrix){ - k <- nParamsInModel # = number of params - n <- sum(data) # = sample size - L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood - BIC <- log(n)*k - 2*L - AIC <- 2*k - 2*L - AICc <- AIC + 2*k*(k+1)/(n-k-1) - out$df <- paramsPerTermInModel - out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) - } + # out$residuals <- exp + # out$residuals[exp > 0] <- + # (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) + # + # if(!modelGivenByMatrix){ + # k <- nParamsInModel # = number of params + # n <- sum(data) # = sample size + # L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood + # BIC <- log(n)*k - 2*L + # AIC <- 2*k - 2*L + # AICc <- AIC + 2*k*(k+1)/(n-k-1) + # out$df <- paramsPerTermInModel + # out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) + # } ## add A matrix, p.value and return ################################################## out$call <- match.call() out$obs <- data - out$exp <- exp - out$A <- A + # out$exp <- exp + # out$A <- A out$p.value <- c( - PR = mean(PRs <= PR), - X2 = mean(X2s >= X2), - G2 = mean(G2s >= G2), - FT = mean(FTs >= FT), - CR = mean(CRs >= CR), - NM = mean(NMs >= NM) + PR = mean(PRs <= PR) ) + # out$p.value <- c( + # PR = mean(PRs <= PR), + # X2 = mean(X2s >= X2), + # G2 = mean(G2s >= G2), + # FT = mean(FTs >= FT), + # CR = mean(CRs >= CR), + # NM = mean(NMs >= NM) + # ) + out$p.value.std.err <- c( - PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), - X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), - G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), - FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), - CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), - NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) + PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter) ) + # out$p.value.std.err <- c( + # PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), + # X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), + # G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), + # FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), + # CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), + # NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) + # ) + out$mid.p.value <- c( - PR = mean(PRs < PR) + mean(PRs == PR)/2, - X2 = mean(X2s > X2) + mean(X2s == X2)/2, - G2 = mean(G2s > G2) + mean(G2s == G2)/2, - FT = mean(FTs > FT) + mean(FTs == FT)/2, - CR = mean(CRs > CR) + mean(CRs == CR)/2, - NM = mean(NMs > NM) + mean(NMs == NM)/2 + PR = mean(PRs < PR) + mean(PRs == PR)/2 ) + # out$mid.p.value <- c( + # PR = mean(PRs < PR) + mean(PRs == PR)/2, + # X2 = mean(X2s > X2) + mean(X2s == X2)/2, + # G2 = mean(G2s > G2) + mean(G2s == G2)/2, + # FT = mean(FTs > FT) + mean(FTs == FT)/2, + # CR = mean(CRs > CR) + mean(CRs == CR)/2, + # NM = mean(NMs > NM) + mean(NMs == NM)/2 + # ) + out$iter <- iter out$burn <- burn out$thin <- thin - out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) - out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) + out$statistic <- c(PR = PR) + # out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) + out$sampsStats <- list(PRs = PRs) + # out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) out$cells <- nCells out$method <- method class(out) <- "logistic" out -} - \ No newline at end of file +} \ No newline at end of file diff --git a/R/poisson.R b/R/poisson.R index 9c1cc4e..743d9b9 100644 --- a/R/poisson.R +++ b/R/poisson.R @@ -57,6 +57,7 @@ pois_reg <- function(model, data, hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, + adaptive = FALSE, ...) { @@ -111,7 +112,7 @@ pois_reg <- function(model, data, } else { # if it's a formula, convert to list if(is.formula(model)){ - ## Reshape data + ## reshape data data <- model.frame(model, data) # name data @@ -122,26 +123,23 @@ pois_reg <- function(model, data, response <- fString[2] predString <- fString[3] - #Rename the response variable to fit formula syntax if needed - #if(!(response %in% vars)){ - # colnames(data)[colnames(data) == "freq"] <- response - # vars <- names(data) - #} + ## make list of facets model <- strsplit(predString, " \\+ ")[[1]] model <- strsplit(model, " \\* ") - - ##Format the data - data <- ddply(data, unique(unlist(model)), "sum") + + ## format the data + names(data)[names(data) == response] <- "response" + data <- ddply(data, unique(unlist(model)), summarise, sum = sum(response)) if(length(model) == 1){ + init <- data$sum nCells <- length(init) p <- 1 - }else{ - #If model specifiaction, then make table - #data <- suppressMessages(teshape(data, "tab", freqVar = response)) - #p <- length(dim(data)) + + } else { + # if model specifiaction, then make table p <- ncol(data) - 1 init <- data$sum nCells <- length(init) @@ -161,18 +159,17 @@ pois_reg <- function(model, data, } else { stop("Invalid model specification, see ?pois_reg") } - ##Levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) - + ## levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) + if(ncol(data) <= 2){ lvls <- unique(data[,-ncol(data)]) - }else{ - lvls <- apply(data[,-ncol(data)], 2, unique) + } else { + lvls <- lapply(data[,-ncol(data)], unique) } # make configuration (model) matrix A <- pmat(lvls, facets) } - # check to see if all level configurations are there (need work here) lvlsInData <- as.list(as.data.frame(t(expand.grid(lvls)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) @@ -224,7 +221,7 @@ pois_reg <- function(model, data, ################################################## init <- unname(init) # init out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform) + engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform, adaptive = adaptive) @@ -246,10 +243,6 @@ pois_reg <- function(model, data, # } # e <- unname(tab2vec(exp)) -#####Issue Here!!!!!!!############# - ###Old - #u <- t(t(unname(tab2vec(data)))) - ###New u <- t(t(data$sum)) PR <- computeUProbsCpp(matrix(u)) # unnormd prob; numers LAS 1.1.10 # X2 <- computeX2sCpp(u, e) diff --git a/man/log_reg.Rd b/man/log_reg.Rd new file mode 100644 index 0000000..1a44e9b --- /dev/null +++ b/man/log_reg.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/logistic.r +\name{log_reg} +\alias{log_reg} +\title{Fit a Logistic Regression model with algebraic methods} +\usage{ +log_reg(model, data, iter = 10000, burn = 1000, thin = 10, + engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, ...) +} +\arguments{ +\item{model}{hierarchical poisson model specification} + +\item{data}{data, as a data frame with raw data with discrete covariates} + +\item{iter}{number of chain iterations} + +\item{burn}{burn-in} + +\item{thin}{thinning} + +\item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} + +\item{method}{should the expected value (exp) be fit using +iterative proportional fitting (via loglin) or the MCMC as the +average of the steps?} + +\item{moves}{the markov moves for the mcmc (as columns of a +matrix).} + +\item{...}{...} + +\item{init}{the initialization of the chain. by default, this is +the observed table} +} +\value{ +a list containing named elements \itemize{ \item + \code{steps}: an integer matrix whose columns represent + individual samples from the mcmc. \item \code{moves}: the moves + used for the proposal distribution in the mcmc, computed with + 4ti2 (note that only the positive moves are given). \item + \code{acceptProb}: the average acceptance probability of the + moves, including the thinned moves. \item \code{param}: the + fitted parameters of the log linear model. \item \code{df}: + parameters per term in the model \item \code{quality}: model + selection statistics AIC, AICc, and BIC. \item + \code{residuals}: the (unstandardized) pearson residuals (O - + E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the + contingency table given. \item \code{exp}: the fit contingency + table as an integer array. \item \code{A}: the sufficient + statistics computing matrix (from Tmaker). \item + \code{p.value}: the exact p-values of individual tests, + accurate to Monte-Carlo error. these are computed as the + proportion of samples with statistics equal to or larger than + the oberved statistic. \item \code{mid.p.value}: the mid + p.values, see Agresti pp.20--21. \item \code{statistic}: the + pearson's chi-squared (X2), likelihood ratio (G2), + Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified + chi-squared (NM) statistics computed for the table given. \item + \code{sampsStats}: the statistics computed for each mcmc + sample. \item \code{cells}: the number of cells in the table. + \item \code{method}: the method used to estimate the table. } + @importFrom stats model.frame + @export log_reg +} +\description{ +Fit a Logistic Regression model with algebraic methods +} diff --git a/man/pois_reg.Rd b/man/pois_reg.Rd index c9f6ee5..12cde28 100644 --- a/man/pois_reg.Rd +++ b/man/pois_reg.Rd @@ -6,7 +6,8 @@ \usage{ pois_reg(model, data, iter = 10000, burn = 1000, thin = 10, engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, ...) + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, + adaptive = FALSE, ...) } \arguments{ \item{model}{hierarchical poisson model specification} From 7345e161866c441ca07c576842735249a4fcf1cc Mon Sep 17 00:00:00 2001 From: Innerst Date: Mon, 27 Nov 2017 15:07:13 -0600 Subject: [PATCH 21/53] Aglm function added, pois_reg and log_reg deleted --- .DS_Store | Bin 8196 -> 10244 bytes NAMESPACE | 5 +- R/.DS_Store | Bin 0 -> 6148 bytes R/Aglm.r | 337 ++++++++++++++++++++++++++++++++++++++ R/algstat.r | 3 +- R/apoisson.R | 395 -------------------------------------------- R/logistic.r | 424 ------------------------------------------------ R/pmat.R | 17 +- R/poisson.R | 424 ------------------------------------------------ man/Aglm.Rd | 132 +++++++++++++++ man/apoisson.Rd | 149 ----------------- man/log_reg.Rd | 68 -------- man/pois_reg.Rd | 68 -------- 13 files changed, 475 insertions(+), 1547 deletions(-) create mode 100644 R/.DS_Store create mode 100644 R/Aglm.r delete mode 100644 R/apoisson.R delete mode 100644 R/logistic.r delete mode 100644 R/poisson.R create mode 100644 man/Aglm.Rd delete mode 100644 man/apoisson.Rd delete mode 100644 man/log_reg.Rd delete mode 100644 man/pois_reg.Rd diff --git a/.DS_Store b/.DS_Store index 3505ba1a3e7e261c28e1a4de630a883985245764..81babd2bd558b802a105f7adfde182cd6b6e6e27 100644 GIT binary patch delta 776 zcmZp1XbF&DU|?W$DortDU{C-uIe-{M3-C-V6q~50$PW@=1o9d3lYp$B8{^jUvqQv! zCL0OvVX?#}n!>@sq3n<~d99fJ zWG|5<>n)7?7>_YtV7$n9iSaVy9mc1O?-;)WT?7P7K=*-x7nFulj6Z-3ppzy)6;Wdp zoy;S;LS9u%-^9|+DIz&PH#aq}q&N;_oljzMiGM+AUTVtZlcFlDA$f_psgwVRI@HT3 z$*6D$oZt}PxFR7bB`u>aqsbw#l|z6D%;nLM(c=_I#uUa}#yrLX#zMwo#u~<2#(KsE#wNyY#vaB# z#(u_$jI$VLGtOn4$GCuTtLp#& delta 152 zcmZn(XmOBWU|?W$DortDU;r^WfEYvza8E20o2aMA&jykQ@)`1zfUF-IH1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 0] <- log(exp[exp > 0]) -# param <- as.list(rep(NA, length(termsInModel))) -# names(param) <- names(paramsPerTermInModel) -# for(k in seq_along(param)){ -# if(length(termsInModel[[k]]) == 0){ -# param[[k]] <- mean(log_fit) -# log_fit <- log_fit - param[[k]] -# } else { -# param[[k]] <- apply(log_fit, termsInModel[[k]], mean) -# log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) -# } -# } -# # for every step, fit mle -# # then decompose mle -# # problem : they all have the same marginals, so the same -# # mles! -# # idea 1 : sample from the multinomial with the same sample -# # size (so different marginals), estimate, then decompose -# # idea 2 : bootstrap sample from the table, estimate, decompose -# # i think i like idea 2 better. -# -# -# # reorder the param estimates in the order of subsets -# # so you have the intercept, then all first order terms, and so on -# goodOrder <- sapply( -# c("(Intercept)", subsets(names(dimnames(data)))), -# paste, collapse = "." -# ) -# param <- param[goodOrder[goodOrder %in% names(param)]] -# out$param <- param -# -# } -# - ## compute residuals and model selection, agresti p.81, 216, 324 - ################################################## - # out$residuals <- exp - # out$residuals[exp > 0] <- - # (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) - # - # if(!modelGivenByMatrix){ - # k <- nParamsInModel # = number of params - # n <- sum(data) # = sample size - # L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood - # BIC <- log(n)*k - 2*L - # AIC <- 2*k - 2*L - # AICc <- AIC + 2*k*(k+1)/(n-k-1) - # out$df <- paramsPerTermInModel - # out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) - # } - - ## add A matrix, p.value and return - ################################################## - out$call <- match.call() - out$obs <- data - out$exp <- exp - # out$A <- A - - out$p.value <- c( - PR = mean(PRs <= PR) - ) - - # out$p.value <- c( - # PR = mean(PRs <= PR), - # X2 = mean(X2s >= X2), - # G2 = mean(G2s >= G2), - # FT = mean(FTs >= FT), - # CR = mean(CRs >= CR), - # NM = mean(NMs >= NM) - # ) - - out$p.value.std.err <- c( - PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter) - ) - - # out$p.value.std.err <- c( - # PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), - # X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), - # G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), - # FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), - # CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), - # NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) - # ) - - out$mid.p.value <- c( - PR = mean(PRs < PR) + mean(PRs == PR)/2 - ) - - # out$mid.p.value <- c( - # PR = mean(PRs < PR) + mean(PRs == PR)/2, - # X2 = mean(X2s > X2) + mean(X2s == X2)/2, - # G2 = mean(G2s > G2) + mean(G2s == G2)/2, - # FT = mean(FTs > FT) + mean(FTs == FT)/2, - # CR = mean(CRs > CR) + mean(CRs == CR)/2, - # NM = mean(NMs > NM) + mean(NMs == NM)/2 - # ) - - out$iter <- iter - out$burn <- burn - out$thin <- thin - out$statistic <- c(PR = PR) - # out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) - out$sampsStats <- list(PRs = PRs) - # out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) - out$cells <- nCells - out$method <- method - - class(out) <- "poisson" - out -} - - - - diff --git a/R/logistic.r b/R/logistic.r deleted file mode 100644 index e428c31..0000000 --- a/R/logistic.r +++ /dev/null @@ -1,424 +0,0 @@ -#' Fit a Logistic Regression model with algebraic methods -#' -#' -#' -#' @param model hierarchical poisson model specification -#' @param data data, as a data frame with raw data with discrete covariates -#' @param init the initialization of the chain. by default, this is -#' the observed table -#' @param iter number of chain iterations -#' @param burn burn-in -#' @param thin thinning -#' @param engine C++ or R? (C++ yields roughly a 20-25x speedup) -#' @param method should the expected value (exp) be fit using -#' iterative proportional fitting (via loglin) or the MCMC as the -#' average of the steps? -#' @param moves the markov moves for the mcmc (as columns of a -#' matrix). -#' @param ... ... -#' @return a list containing named elements \itemize{ \item -#' \code{steps}: an integer matrix whose columns represent -#' individual samples from the mcmc. \item \code{moves}: the moves -#' used for the proposal distribution in the mcmc, computed with -#' 4ti2 (note that only the positive moves are given). \item -#' \code{acceptProb}: the average acceptance probability of the -#' moves, including the thinned moves. \item \code{param}: the -#' fitted parameters of the log linear model. \item \code{df}: -#' parameters per term in the model \item \code{quality}: model -#' selection statistics AIC, AICc, and BIC. \item -#' \code{residuals}: the (unstandardized) pearson residuals (O - -#' E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the -#' contingency table given. \item \code{exp}: the fit contingency -#' table as an integer array. \item \code{A}: the sufficient -#' statistics computing matrix (from Tmaker). \item -#' \code{p.value}: the exact p-values of individual tests, -#' accurate to Monte-Carlo error. these are computed as the -#' proportion of samples with statistics equal to or larger than -#' the oberved statistic. \item \code{mid.p.value}: the mid -#' p.values, see Agresti pp.20--21. \item \code{statistic}: the -#' pearson's chi-squared (X2), likelihood ratio (G2), -#' Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified -#' chi-squared (NM) statistics computed for the table given. \item -#' \code{sampsStats}: the statistics computed for each mcmc -#' sample. \item \code{cells}: the number of cells in the table. -#' \item \code{method}: the method used to estimate the table. } -#' @importFrom stats model.frame -#' @export log_reg - -log_reg <- function(model, data, - iter = 1E4, burn = 1000, - thin = 10, engine = c("Cpp","R"), - method = c("ipf", "mcmc"), moves, - hit_and_run = FALSE, - SIS = FALSE, - non_uniform = FALSE, - adaptive = FALSE, - ...) -{ - - ## set/check args - ################################################## - - engine <- match.arg(engine) - method <- match.arg(method) - argList <- as.list(match.call(expand.dots = TRUE))[-1] - - if("formula" %in% names(argList)){ - .Deprecated(msg = - 'the formula argument is deprecated, please use "model" instead.' - ) - } - - - ## reshape data - ################################################## - - # data <- suppressMessages(teshape(data, "freq")) - # p <- length(dim(data)) - # nCells <- length(data) - - ## if a pure array is given, give names for later - # if(is.array(data) && is.null(dimnames(data))) data <- array2tab(data) - - ## other basic objects - #varsNlevels <- dimnames(data) - - - - - - ## check for sampling zeros - ################################################## - #if(any(data == 0L)) message( - # "Care ought be taken with tables with sampling zeros to ensure the MLE exists." - #) - - - ## parse model specification (formula for vector of r_k's) - ################################################## - - modelGivenByMatrix <- ifelse(is.matrix(model), TRUE, FALSE) - - if(modelGivenByMatrix){ - A <- model - data <- suppressMessages(teshape(data, "tab")) - init <- tab2vec(data) - nCells <- length(init) - } else { - # if it's a formula, convert to list - if(is.formula(model)){ - ## reshape data - data <- model.frame(model, data) - - # name data - vars <- names(data) - - ## parse formula - fString <- as.character(model) - response <- fString[2] - predString <- fString[3] - - - ## make list of facets - model <- strsplit(predString, " \\+ ")[[1]] - model <- strsplit(model, " \\* ") - - ## format the data - names(data)[names(data) == response] <- "response" - data <- rbind(ddply(data, unique(unlist(model)), summarise, sum = sum(response)), - ddply(data, unique(unlist(model)), summarise, sum = length(response) - sum(response)) - ) - - if(length(model) == 1){ - - init <- data$sum - nCells <- length(init) - p <- 1 - - } else { - # if model specifiaction, then make table - p <- ncol(data) - 1 - init <- data$sum - nCells <- length(init) - } - } - - - - # make facets (list of index vecs); if model specified with variable - # names, convert them to indices - if(all(unlist(model) %in% vars)){ # variable names - varname2index <- 1:p - names(varname2index) <- vars[vars != response] - facets <- lapply(model, function(varsInFacet) varname2index[varsInFacet]) - } else if(all(unlist(model) %in% 1:length(vars))){ # by indices - facets <- lapply(model, as.integer) # to fix the ~ 1 + 2 case, parsed as chars - } else { - stop("Invalid model specification, see ?log_reg") - } - ## levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) - - if(ncol(data) <= 2){ - lvls <- unique(data[,-ncol(data)]) - } else { - lvls <- lapply(data[,-ncol(data)], unique) - } - # make configuration (model) matrix - A <- pmat(lvls, facets) - } - - # check to see if all level configurations are there (need work here) - lvlsInData <- as.list(as.data.frame(t(expand.grid(lvls)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) - - # subset A by levels that are present - A <- A[,lvlsInData] - - # compute the Lawrence lifting of A - A <- lawrence(A) - - # find the sufficient statistics - suff_stats <- unname(A %*% init) - - ## construct A matrix and compute moves - ################################################## - - if(missing(moves) && !is.null(getOption("4ti2_path"))){ - - message("Computing Markov moves (4ti2)... ", appendLF = FALSE) - moves <- markov(A) - message("done.", appendLF = TRUE) - - } else if(missing(moves) && is.null(getOption("4ti2_path"))){ - - warning( - "No moves were provided and 4ti2 is not found.\n", - " The resulting chain is likely not connected and strongly autocorrelated.\n", - " See ?pois_reg. Consider using rmove to generate SIS moves in advance.", - immediate. = TRUE - ) - message("Computing 1000 SIS moves... ", appendLF = FALSE) - moves <- rmove(n = 1000, A = A, b = A %*% tab2vec(data), ...) - message("done.", appendLF = TRUE) - - } else if(is.character(moves)){ - - movesMat <- NULL - stopifnot(all(moves %in% c("lattice", "markov", "groebner", "grobner", "graver", "sis"))) - if("lattice" %in% moves) movesMat <- cbind(movesMat, zbasis(A)) - if("markov" %in% moves) movesMat <- cbind(movesMat, markov(A)) - if("groebner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) - if("grobner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) - if("graver" %in% moves) stop("graver not yet implemented.") - moves <- movesMat - - } - - stopifnot(is.array(moves)) - - - - ## run metropolis-hastings - ################################################## - init <- unname(init) # init - out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform, adaptive = adaptive) - - - - ## compute data chi square - ################################################## - # if(modelGivenByMatrix && method == "ipf"){ - # message( - # "Iterative proportional fitting is not yet implemented\n", - # " for models specified by configuration matrices.\n", - # " Changing to method = \"mcmc\"..." - # ) - # method <- "mcmc" - # } - # if(method == "ipf"){ - # exp <- loglin(data, facets, fit = TRUE, print = FALSE)$fit - # } else if(method == "mcmc"){ - # exp <- vec2tab(rowMeans(out$steps), dim(data)) - # dimnames(exp) <- dimnames(data) - # } - # e <- unname(tab2vec(exp)) - - u <- t(t(data$sum)) - PR <- computeUProbsCpp(matrix(u)) # unnormd prob; numers LAS 1.1.10 - # X2 <- computeX2sCpp(u, e) - # G2 <- computeG2sCpp(u, e) - # FT <- computeCRsCpp(u, e, -.5) - # CR <- computeCRsCpp(u, e, 2/3) - # NM <- computeNMsCpp(u, e) - - - ## compute MCMC chi squares - ################################################## - PRs <- computeUProbsCpp(out$steps) # unnormd probs; numers LAS 1.1.10 - # X2s <- computeX2sCpp(out$steps, e) - # G2s <- computeG2sCpp(out$steps, e) - # FTs <- computeCRsCpp(out$steps, e, -.5) - # CRs <- computeCRsCpp(out$steps, e, 2/3) - # NMs <- computeNMsCpp(out$steps, e) - - # - # ## compute parameters - # ################################################## - # if(!modelGivenByMatrix){ - # # in principle, there should be one parameter for every cell. - # # there are prod(dim(data)) cells. - # # a good reference is BFH, p. 35 (and to a lesser extent 43) - # # the prod(dim(data)[terms[[j]]] - 1) line below is like - # # (I - 1) (J - 1) (K - 1) - # # CDA p.79 also helpful - # dimSatModel <- nCells - 1 - # degFreedom <- rep.int(0, 2^p) # there are 2^p possible subsets of vars, and - # # therefore there are 2^p possible terms - # - # # possibleTerms are more "types of terms" as opposed to individual terms - # # for example, an entry c(1,3) would refer to all combinations of levels - # # of variables 1 and 3; ie (# var 1 levels - 1) * (# var 3 levels - 1) - # # individual terms (parameters) - # possibleTerms <- subsets(p, include_null = TRUE) - # names(possibleTerms) <- sapply(possibleTerms, paste, collapse = " ") - # names(possibleTerms)[which(names(possibleTerms) == "")] <- "(Intercept)" - # nVarLvls <- dim(data) - # # paramsPerTerm <- lapply(possibleTerms, function(x){ - # # if(length(x) == 0) return(1L) - # # prod(nVarLvls[x] - 1) - # # }) - # - # - # # similarly, there are the terms in the model - # termsInModel <- unique(unlist(lapply( - # lapply(facets, as.character), # to avoid subsets(2) - # subsets, include_null = TRUE), - # recursive = FALSE - # )) - # termsInModel <- lapply(termsInModel, as.integer) - # names(termsInModel) <- sapply(termsInModel, paste, collapse = " ") - # names(termsInModel)[which(names(termsInModel) == "")] <- "(Intercept)" - # paramsPerTermInModel <- lapply(termsInModel, function(x){ - # if(length(x) == 0) return(1L) - # prod(nVarLvls[x] - 1) - # }) - # names(paramsPerTermInModel) <- unname(sapply(termsInModel, function(x){ - # if(length(x) == 0) return("(Intercept)") - # paste(names(dimnames(data))[x], collapse = ".") - # })) - # nParamsInModel <- sum(unlist(paramsPerTermInModel)) - # dimModel <- nParamsInModel - 1 # the - 1 accounts for the overall mean - # overallAsymptoticDegFreedom <- (dimSatModel - dimModel) - # - # - # # compute the parameters - # log_fit <- exp - # log_fit[exp > 0] <- log(exp[exp > 0]) - # param <- as.list(rep(NA, length(termsInModel))) - # names(param) <- names(paramsPerTermInModel) - # for(k in seq_along(param)){ - # if(length(termsInModel[[k]]) == 0){ - # param[[k]] <- mean(log_fit) - # log_fit <- log_fit - param[[k]] - # } else { - # param[[k]] <- apply(log_fit, termsInModel[[k]], mean) - # log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) - # } - # } - # # for every step, fit mle - # # then decompose mle - # # problem : they all have the same marginals, so the same - # # mles! - # # idea 1 : sample from the multinomial with the same sample - # # size (so different marginals), estimate, then decompose - # # idea 2 : bootstrap sample from the table, estimate, decompose - # # i think i like idea 2 better. - # - # - # # reorder the param estimates in the order of subsets - # # so you have the intercept, then all first order terms, and so on - # goodOrder <- sapply( - # c("(Intercept)", subsets(names(dimnames(data)))), - # paste, collapse = "." - # ) - # param <- param[goodOrder[goodOrder %in% names(param)]] - # out$param <- param - # - # } - # - ## compute residuals and model selection, agresti p.81, 216, 324 - ################################################## - # out$residuals <- exp - # out$residuals[exp > 0] <- - # (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) - # - # if(!modelGivenByMatrix){ - # k <- nParamsInModel # = number of params - # n <- sum(data) # = sample size - # L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood - # BIC <- log(n)*k - 2*L - # AIC <- 2*k - 2*L - # AICc <- AIC + 2*k*(k+1)/(n-k-1) - # out$df <- paramsPerTermInModel - # out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) - # } - - ## add A matrix, p.value and return - ################################################## - out$call <- match.call() - out$obs <- data - # out$exp <- exp - # out$A <- A - - out$p.value <- c( - PR = mean(PRs <= PR) - ) - - # out$p.value <- c( - # PR = mean(PRs <= PR), - # X2 = mean(X2s >= X2), - # G2 = mean(G2s >= G2), - # FT = mean(FTs >= FT), - # CR = mean(CRs >= CR), - # NM = mean(NMs >= NM) - # ) - - out$p.value.std.err <- c( - PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter) - ) - - # out$p.value.std.err <- c( - # PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), - # X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), - # G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), - # FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), - # CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), - # NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) - # ) - - out$mid.p.value <- c( - PR = mean(PRs < PR) + mean(PRs == PR)/2 - ) - - # out$mid.p.value <- c( - # PR = mean(PRs < PR) + mean(PRs == PR)/2, - # X2 = mean(X2s > X2) + mean(X2s == X2)/2, - # G2 = mean(G2s > G2) + mean(G2s == G2)/2, - # FT = mean(FTs > FT) + mean(FTs == FT)/2, - # CR = mean(CRs > CR) + mean(CRs == CR)/2, - # NM = mean(NMs > NM) + mean(NMs == NM)/2 - # ) - - out$iter <- iter - out$burn <- burn - out$thin <- thin - out$statistic <- c(PR = PR) - # out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) - out$sampsStats <- list(PRs = PRs) - # out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) - out$cells <- nCells - out$method <- method - - class(out) <- "logistic" - out -} \ No newline at end of file diff --git a/R/pmat.R b/R/pmat.R index b75220e..e8e6769 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -12,19 +12,12 @@ #' @export pmat pmat <- function(levels, facets){ - #########Old Setup######### - #Setup the levels - #levels <- levels[levels != 1] - #num_covariates <- length(levels) - #mat_list <- list() - #exp_cov <-1:num_covariates - ############################# + #Small function to make single covariate configuration matrix func <- function(x){ rbind(rep(1, length(x)), x) } - ######### New Setup ######## if(!is.list(levels)){ num_covariates <- 1 full_mat <- func(levels) @@ -48,14 +41,6 @@ pmat <- function(levels, facets){ facets <- union(heirarc, facets) } - ########Old####### - #List of config matrices, one for each covariate - #for(i in exp_cov){ - # mat_list[[i]] <- matrix(c(rep(1,levels[i]), 1:levels[i]), nrow = 2, byrow = TRUE) - #} - ################ - - #All possible combinations of covariates (powerset like) to be compared to facets if(length(exp_cov) == 1) { facet_list <- list(exp_cov) diff --git a/R/poisson.R b/R/poisson.R deleted file mode 100644 index 743d9b9..0000000 --- a/R/poisson.R +++ /dev/null @@ -1,424 +0,0 @@ -#' Fit a Poisson Regression model with algebraic methods -#' -#' -#' -#' @param model hierarchical poisson model specification -#' @param data data, as a data frame with raw data with discrete covariates -#' @param init the initialization of the chain. by default, this is -#' the observed table -#' @param iter number of chain iterations -#' @param burn burn-in -#' @param thin thinning -#' @param engine C++ or R? (C++ yields roughly a 20-25x speedup) -#' @param method should the expected value (exp) be fit using -#' iterative proportional fitting (via loglin) or the MCMC as the -#' average of the steps? -#' @param moves the markov moves for the mcmc (as columns of a -#' matrix). -#' @param ... ... -#' @return a list containing named elements \itemize{ \item -#' \code{steps}: an integer matrix whose columns represent -#' individual samples from the mcmc. \item \code{moves}: the moves -#' used for the proposal distribution in the mcmc, computed with -#' 4ti2 (note that only the positive moves are given). \item -#' \code{acceptProb}: the average acceptance probability of the -#' moves, including the thinned moves. \item \code{param}: the -#' fitted parameters of the log linear model. \item \code{df}: -#' parameters per term in the model \item \code{quality}: model -#' selection statistics AIC, AICc, and BIC. \item -#' \code{residuals}: the (unstandardized) pearson residuals (O - -#' E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the -#' contingency table given. \item \code{exp}: the fit contingency -#' table as an integer array. \item \code{A}: the sufficient -#' statistics computing matrix (from Tmaker). \item -#' \code{p.value}: the exact p-values of individual tests, -#' accurate to Monte-Carlo error. these are computed as the -#' proportion of samples with statistics equal to or larger than -#' the oberved statistic. \item \code{mid.p.value}: the mid -#' p.values, see Agresti pp.20--21. \item \code{statistic}: the -#' pearson's chi-squared (X2), likelihood ratio (G2), -#' Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified -#' chi-squared (NM) statistics computed for the table given. \item -#' \code{sampsStats}: the statistics computed for each mcmc -#' sample. \item \code{cells}: the number of cells in the table. -#' \item \code{method}: the method used to estimate the table. } -#' @importFrom stats model.frame -#' @export pois_reg - - - - - - -pois_reg <- function(model, data, - iter = 1E4, burn = 1000, - thin = 10, engine = c("Cpp","R"), - method = c("ipf", "mcmc"), moves, - hit_and_run = FALSE, - SIS = FALSE, - non_uniform = FALSE, - adaptive = FALSE, - ...) -{ - - ## set/check args - ################################################## - - engine <- match.arg(engine) - method <- match.arg(method) - argList <- as.list(match.call(expand.dots = TRUE))[-1] - - if("formula" %in% names(argList)){ - .Deprecated(msg = - 'the formula argument is deprecated, please use "model" instead.' - ) - } - - - ## reshape data - ################################################## - - # data <- suppressMessages(teshape(data, "freq")) - # p <- length(dim(data)) - # nCells <- length(data) - - ## if a pure array is given, give names for later - # if(is.array(data) && is.null(dimnames(data))) data <- array2tab(data) - - ## other basic objects - #varsNlevels <- dimnames(data) - - - - - - ## check for sampling zeros - ################################################## - #if(any(data == 0L)) message( - # "Care ought be taken with tables with sampling zeros to ensure the MLE exists." - #) - - - ## parse model specification (formula for vector of r_k's) - ################################################## - - modelGivenByMatrix <- ifelse(is.matrix(model), TRUE, FALSE) - - if(modelGivenByMatrix){ - A <- model - data <- suppressMessages(teshape(data, "tab")) - init <- tab2vec(data) - nCells <- length(init) - } else { - # if it's a formula, convert to list - if(is.formula(model)){ - ## reshape data - data <- model.frame(model, data) - - # name data - vars <- names(data) - - ## parse formula - fString <- as.character(model) - response <- fString[2] - predString <- fString[3] - - - ## make list of facets - model <- strsplit(predString, " \\+ ")[[1]] - model <- strsplit(model, " \\* ") - - ## format the data - names(data)[names(data) == response] <- "response" - data <- ddply(data, unique(unlist(model)), summarise, sum = sum(response)) - - if(length(model) == 1){ - - init <- data$sum - nCells <- length(init) - p <- 1 - - } else { - # if model specifiaction, then make table - p <- ncol(data) - 1 - init <- data$sum - nCells <- length(init) - } - } - - - - # make facets (list of index vecs); if model specified with variable - # names, convert them to indices - if(all(unlist(model) %in% vars)){ # variable names - varname2index <- 1:p - names(varname2index) <- vars[vars != response] - facets <- lapply(model, function(varsInFacet) varname2index[varsInFacet]) - } else if(all(unlist(model) %in% 1:length(vars))){ # by indices - facets <- lapply(model, as.integer) # to fix the ~ 1 + 2 case, parsed as chars - } else { - stop("Invalid model specification, see ?pois_reg") - } - ## levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) - - if(ncol(data) <= 2){ - lvls <- unique(data[,-ncol(data)]) - } else { - lvls <- lapply(data[,-ncol(data)], unique) - } - # make configuration (model) matrix - A <- pmat(lvls, facets) - } - - # check to see if all level configurations are there (need work here) - lvlsInData <- as.list(as.data.frame(t(expand.grid(lvls)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) - - # subset A by levels that are present - A <- A[,lvlsInData] - - # find the sufficient statistics - suff_stats <- unname(A %*% init) - - ## construct A matrix and compute moves - ################################################## - - if(missing(moves) && !is.null(getOption("4ti2_path"))){ - - message("Computing Markov moves (4ti2)... ", appendLF = FALSE) - moves <- markov(A) - message("done.", appendLF = TRUE) - - } else if(missing(moves) && is.null(getOption("4ti2_path"))){ - - warning( - "No moves were provided and 4ti2 is not found.\n", - " The resulting chain is likely not connected and strongly autocorrelated.\n", - " See ?pois_reg. Consider using rmove to generate SIS moves in advance.", - immediate. = TRUE - ) - message("Computing 1000 SIS moves... ", appendLF = FALSE) - moves <- rmove(n = 1000, A = A, b = A %*% tab2vec(data), ...) - message("done.", appendLF = TRUE) - - } else if(is.character(moves)){ - - movesMat <- NULL - stopifnot(all(moves %in% c("lattice", "markov", "groebner", "grobner", "graver", "sis"))) - if("lattice" %in% moves) movesMat <- cbind(movesMat, zbasis(A)) - if("markov" %in% moves) movesMat <- cbind(movesMat, markov(A)) - if("groebner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) - if("grobner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) - if("graver" %in% moves) stop("graver not yet implemented.") - moves <- movesMat - - } - - stopifnot(is.array(moves)) - - - - ## run metropolis-hastings - ################################################## - init <- unname(init) # init - out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform, adaptive = adaptive) - - - - ## compute data chi square - ################################################## - # if(modelGivenByMatrix && method == "ipf"){ - # message( - # "Iterative proportional fitting is not yet implemented\n", - # " for models specified by configuration matrices.\n", - # " Changing to method = \"mcmc\"..." - # ) - # method <- "mcmc" - # } - # if(method == "ipf"){ - # exp <- loglin(data, facets, fit = TRUE, print = FALSE)$fit - # } else if(method == "mcmc"){ - # exp <- vec2tab(rowMeans(out$steps), dim(data)) - # dimnames(exp) <- dimnames(data) - # } - # e <- unname(tab2vec(exp)) - - u <- t(t(data$sum)) - PR <- computeUProbsCpp(matrix(u)) # unnormd prob; numers LAS 1.1.10 - # X2 <- computeX2sCpp(u, e) - # G2 <- computeG2sCpp(u, e) - # FT <- computeCRsCpp(u, e, -.5) - # CR <- computeCRsCpp(u, e, 2/3) - # NM <- computeNMsCpp(u, e) - - - ## compute MCMC chi squares - ################################################## - PRs <- computeUProbsCpp(out$steps) # unnormd probs; numers LAS 1.1.10 - # X2s <- computeX2sCpp(out$steps, e) - # G2s <- computeG2sCpp(out$steps, e) - # FTs <- computeCRsCpp(out$steps, e, -.5) - # CRs <- computeCRsCpp(out$steps, e, 2/3) - # NMs <- computeNMsCpp(out$steps, e) - - # - # ## compute parameters - # ################################################## - # if(!modelGivenByMatrix){ - # # in principle, there should be one parameter for every cell. - # # there are prod(dim(data)) cells. - # # a good reference is BFH, p. 35 (and to a lesser extent 43) - # # the prod(dim(data)[terms[[j]]] - 1) line below is like - # # (I - 1) (J - 1) (K - 1) - # # CDA p.79 also helpful - # dimSatModel <- nCells - 1 - # degFreedom <- rep.int(0, 2^p) # there are 2^p possible subsets of vars, and - # # therefore there are 2^p possible terms - # - # # possibleTerms are more "types of terms" as opposed to individual terms - # # for example, an entry c(1,3) would refer to all combinations of levels - # # of variables 1 and 3; ie (# var 1 levels - 1) * (# var 3 levels - 1) - # # individual terms (parameters) - # possibleTerms <- subsets(p, include_null = TRUE) - # names(possibleTerms) <- sapply(possibleTerms, paste, collapse = " ") - # names(possibleTerms)[which(names(possibleTerms) == "")] <- "(Intercept)" - # nVarLvls <- dim(data) - # # paramsPerTerm <- lapply(possibleTerms, function(x){ - # # if(length(x) == 0) return(1L) - # # prod(nVarLvls[x] - 1) - # # }) - # - # - # # similarly, there are the terms in the model - # termsInModel <- unique(unlist(lapply( - # lapply(facets, as.character), # to avoid subsets(2) - # subsets, include_null = TRUE), - # recursive = FALSE - # )) - # termsInModel <- lapply(termsInModel, as.integer) - # names(termsInModel) <- sapply(termsInModel, paste, collapse = " ") - # names(termsInModel)[which(names(termsInModel) == "")] <- "(Intercept)" - # paramsPerTermInModel <- lapply(termsInModel, function(x){ - # if(length(x) == 0) return(1L) - # prod(nVarLvls[x] - 1) - # }) - # names(paramsPerTermInModel) <- unname(sapply(termsInModel, function(x){ - # if(length(x) == 0) return("(Intercept)") - # paste(names(dimnames(data))[x], collapse = ".") - # })) - # nParamsInModel <- sum(unlist(paramsPerTermInModel)) - # dimModel <- nParamsInModel - 1 # the - 1 accounts for the overall mean - # overallAsymptoticDegFreedom <- (dimSatModel - dimModel) - # - # - # # compute the parameters - # log_fit <- exp - # log_fit[exp > 0] <- log(exp[exp > 0]) - # param <- as.list(rep(NA, length(termsInModel))) - # names(param) <- names(paramsPerTermInModel) - # for(k in seq_along(param)){ - # if(length(termsInModel[[k]]) == 0){ - # param[[k]] <- mean(log_fit) - # log_fit <- log_fit - param[[k]] - # } else { - # param[[k]] <- apply(log_fit, termsInModel[[k]], mean) - # log_fit <- sweep(log_fit, termsInModel[[k]], param[[k]]) - # } - # } - # # for every step, fit mle - # # then decompose mle - # # problem : they all have the same marginals, so the same - # # mles! - # # idea 1 : sample from the multinomial with the same sample - # # size (so different marginals), estimate, then decompose - # # idea 2 : bootstrap sample from the table, estimate, decompose - # # i think i like idea 2 better. - # - # - # # reorder the param estimates in the order of subsets - # # so you have the intercept, then all first order terms, and so on - # goodOrder <- sapply( - # c("(Intercept)", subsets(names(dimnames(data)))), - # paste, collapse = "." - # ) - # param <- param[goodOrder[goodOrder %in% names(param)]] - # out$param <- param - # - # } - # - ## compute residuals and model selection, agresti p.81, 216, 324 - ################################################## - # out$residuals <- exp - # out$residuals[exp > 0] <- - # (data[exp > 0] - exp[exp > 0]) / sqrt(exp[exp > 0]) - # - # if(!modelGivenByMatrix){ - # k <- nParamsInModel # = number of params - # n <- sum(data) # = sample size - # L <- dmultinom(u, sum(u), e, TRUE) # maximized log-likelihood - # BIC <- log(n)*k - 2*L - # AIC <- 2*k - 2*L - # AICc <- AIC + 2*k*(k+1)/(n-k-1) - # out$df <- paramsPerTermInModel - # out$quality <- c(AIC = AIC, AICc = AICc, BIC = BIC) - # } - - ## add A matrix, p.value and return - ################################################## - out$call <- match.call() - out$obs <- data - # out$exp <- exp - # out$A <- A - - out$p.value <- c( - PR = mean(PRs <= PR) - ) - - # out$p.value <- c( - # PR = mean(PRs <= PR), - # X2 = mean(X2s >= X2), - # G2 = mean(G2s >= G2), - # FT = mean(FTs >= FT), - # CR = mean(CRs >= CR), - # NM = mean(NMs >= NM) - # ) - - out$p.value.std.err <- c( - PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter) - ) - - # out$p.value.std.err <- c( - # PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter), - # X2 = sqrt(mean(X2s >= X2)*(1-mean(X2s >= X2))/iter), - # G2 = sqrt(mean(G2s >= G2)*(1-mean(G2s >= G2))/iter), - # FT = sqrt(mean(FTs >= FT)*(1-mean(FTs >= FT))/iter), - # CR = sqrt(mean(CRs >= CR)*(1-mean(CRs >= CR))/iter), - # NM = sqrt(mean(NMs >= NM)*(1-mean(NMs >= NM))/iter) - # ) - - out$mid.p.value <- c( - PR = mean(PRs < PR) + mean(PRs == PR)/2 - ) - - # out$mid.p.value <- c( - # PR = mean(PRs < PR) + mean(PRs == PR)/2, - # X2 = mean(X2s > X2) + mean(X2s == X2)/2, - # G2 = mean(G2s > G2) + mean(G2s == G2)/2, - # FT = mean(FTs > FT) + mean(FTs == FT)/2, - # CR = mean(CRs > CR) + mean(CRs == CR)/2, - # NM = mean(NMs > NM) + mean(NMs == NM)/2 - # ) - - out$iter <- iter - out$burn <- burn - out$thin <- thin - out$statistic <- c(PR = PR) - # out$statistic <- c(PR = PR, X2 = X2, G2 = G2, FT = FT, CR = CR, NM = NM) - out$sampsStats <- list(PRs = PRs) - # out$sampsStats <- list(PRs = PRs, X2s = X2s, G2s = G2s, FTs = FTs, CRs = CRs, NMs = NMs) - out$cells <- nCells - out$method <- method - - class(out) <- "poisson" - out -} diff --git a/man/Aglm.Rd b/man/Aglm.Rd new file mode 100644 index 0000000..b27cf59 --- /dev/null +++ b/man/Aglm.Rd @@ -0,0 +1,132 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Aglm.r +\name{Aglm} +\alias{Aglm} +\title{Fitting generalized linear models with algebraic methods} +\usage{ +Aglm(model, data, family = poisson(), iter = 10000, burn = 10000, + thin = 100, engine = c("Cpp", "R"), moves, hit_and_run = FALSE, + SIS = FALSE, non_uniform = FALSE, adaptive = FALSE, ...) +} +\arguments{ +\item{model}{model specification, either in terms of a configuration matrix or a symbolic +description of the model to be fitted} + +\item{data}{data, as a data frame with raw data with discrete covariates} + +\item{family}{a description of the error distirbution and link function used in the model} + +\item{iter}{number of chain iterations} + +\item{burn}{burn-in} + +\item{thin}{thinning} + +\item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} + +\item{moves}{the markov moves for the mcmc (as columns of a +matrix).} + +\item{...}{...} +} +\value{ +a list containing named elements \itemize{ \item + \code{steps}: an integer matrix whose columns represent + individual samples from the mcmc. \item \code{moves}: the moves + used for the proposal distribution in the mcmc, computed with + 4ti2 (note that only the positive moves are given). \item + \code{acceptProb}: the average acceptance probability of the + moves, including the thinned moves. \item \code{call}: the call. + \item \code{obs}: the summarized data. + \item \code{A}: the sufficient + statistics computing matrix. + \item \code{sufficientStatistics}: The sufficient statistics of the model. + \item \code{p.value}: the exact p-values of individual tests, + accurate to Monte-Carlo error. these are computed as the + proportion of samples with statistics equal to or larger than + the oberved statistic. \item \code{mid.p.value}: the mid + p.values, see Agresti pp.20--21. \item \code{sampsStats}: + the statistics computed for each mcmc + sample. \item \code{cells}: the number of cells in the table. } +} +\description{ +Fitting generalized linear models with algebraic methods +} +\examples{ + + library(ggplot2);theme_set(theme_bw()) + + # generating data and running a poisson regression model + # pick beta 0 and beta 1 + b0 <- 1; b1 <- 0.3 + + # generate data + n <- 100 + x <- sample(1:5, n, replace = T) + y <- rpois(n, lambda = exp(b0 + b1*x)) + df <- data.frame( + x = x, + y = y + ) + + # function output + out <- Aglm(y ~ x, data = df, family = poisson()) + + # check convergence through trace plot + qplot(1:10000, out$sampStats$PRs, geom = "line") + + # compare Aglm and glm predictions with the truth + + # model fitting with glm + mod <- glm(y ~ x, data = df, family = poisson()) + + # truth + exp(b0 + b1*(1:5)) + + # glm predictions + predict(mod, data.frame(x = 1:5), type = "response") + + # Aglm predictions + rowMeans(out$steps) / ddply(df, "x", nrow)$V1 + + + + # generating data and running a logistic regression model + + # helper functions + link <- function(p) log(p/(1-p)) + invlink <- function(x) 1 / (1+exp(-x)) + + + # create a fake data set + + # one covariate + b0 <- 0.5; b1 <- 0.2 + + n <- 100 + x <- sample(1:5, n, replace = T) + y <- rbinom(n = n, size = 1, prob = invlink(b0 + b1*x)) + df <- data.frame( + x = x, + y = y + ) + + # Aglm + out <- Aglm(y ~ x, data = df, family = binomial()) + + # check convergence through trace plot + qplot(1:10000, out$sampStats$PRs, geom = "line") + + # using glm + mod <- glm(y ~ x, data = df, family = binomial()) + + # truth + invlink(b0 + b1*out$obs[,1]) + + # glm predictions + predict(mod, data.frame(x = c(1:5)), type = "response") + + # Aglm predictions + rowMeans(out$steps) / ddply(df, "x", nrow)$V1 + +} diff --git a/man/apoisson.Rd b/man/apoisson.Rd deleted file mode 100644 index f8a97a3..0000000 --- a/man/apoisson.Rd +++ /dev/null @@ -1,149 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/apoisson.R -\name{apoisson} -\alias{apoisson} -\title{Fit a hierarchical Poisson regression models with algebraic -methods} -\usage{ -apoisson(model, data, init = tab2vec(data), iter = 10000, burn = 1000, - thin = 10, engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, - ...) -} -\arguments{ -\item{model}{hierarchical log-linear model specification} - -\item{data}{data, typically as a table but can be in different -formats. see \code{\link{teshape}}} - -\item{init}{the initialization of the chain. by default, this is -the observed table} - -\item{iter}{number of chain iterations} - -\item{burn}{burn-in} - -\item{thin}{thinning} - -\item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} - -\item{method}{should the expected value (exp) be fit using -iterative proportional fitting (via loglin) or the MCMC as the -average of the steps?} - -\item{moves}{the markov moves for the mcmc (as columns of a -matrix).} - -\item{...}{...} -} -\value{ -a list containing named elements \itemize{ \item - \code{steps}: an integer matrix whose columns represent - individual samples from the mcmc. \item \code{moves}: the moves - used for the proposal distribution in the mcmc, computed with - 4ti2 (note that only the positive moves are given). \item - \code{acceptProb}: the average acceptance probability of the - moves, including the thinned moves. \item \code{param}: the - fitted parameters of the log linear model. \item \code{df}: - parameters per term in the model \item \code{quality}: model - selection statistics AIC, AICc, and BIC. \item - \code{residuals}: the (unstandardized) pearson residuals (O - - E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the - contingency table given. \item \code{exp}: the fit contingency - table as an integer array. \item \code{A}: the sufficient - statistics computing matrix (from Tmaker). \item - \code{p.value}: the exact p-values of individual tests, - accurate to Monte-Carlo error. these are computed as the - proportion of samples with statistics equal to or larger than - the oberved statistic. \item \code{mid.p.value}: the mid - p.values, see Agresti pp.20--21. \item \code{statistic}: the - pearson's chi-squared (X2), likelihood ratio (G2), - Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified - chi-squared (NM) statistics computed for the table given. \item - \code{sampsStats}: the statistics computed for each mcmc - sample. \item \code{cells}: the number of cells in the table. - \item \code{method}: the method used to estimate the table. } -} -\description{ -Fit a hierarchical Poisson regression models with algebraic -methods -} -\examples{ - -\dontrun{ - - -## handedness introductory example -############################################################ - -data(handy) - -(out <- loglinear(~ Gender + Handedness, data = handy)) - -# you can also specify the same model using variable indices... -(out <- loglinear(~ 1 + 2, data = handy)) - -# ... or as a list of facets given by indices -(out <- loglinear(list(1, 2), data = handy)) - -# ... or as a list of facets given by name -(out <- loglinear(list("Gender", "Handedness"), data = handy)) - -# ... and even via a pre-computed configuration matrix -# this method does come with somewhat reduced output -A <- hmat(c(2, 2), 1:2) -(out <- loglinear(A, data = handy)) - - - -# loglinear performs the same tasks as loglin and loglm, -# but loglinear gives the exact test p values and more goodness-of-fit statistics -stats::loglin(handy, list(1, 2)) -MASS::loglm(~ Gender + Handedness, data = handy) -# loglm is just a wrapper of loglin - -# we can check loglinear's output with -fisher.test(handy)$p.value -out$p.value - - - - - - - - - - -} - - -} -\references{ -Diaconis, P. and B. Sturmfels (1998). Algebraic - Algorithms for Sampling from Conditional Distributions. - \emph{The Annals of Statistics} 26(1), pp.363-397. - -Drton, M., B. Sturmfels, and S. Sullivant (2009). - \emph{Lectures on Algebraic Statistics}, Basel: Birkhauser - Verlag AG. - -Aoki, S., H. Hara, and A. Takemura (2012). - \emph{Markov Bases in Algebraic Statistics}, Springer. - -Agresti, A. (2002). \emph{Categorical Data Analysis}, - Basel: John Wiley & Sons, 2ed. - -Agresti, A. (1992). A Survey of Exact Inference for - Contingency Tables \emph{Statistical Science} 7(1), pp.131-153. - -Read, T. and Cressie, N. (1998). - \emph{Goodness-of-Fit Statistics for Discrete Multivariate - Data}, Springer-Verlag. -} -\seealso{ -\code{\link{loglin}}, \code{\link{loglm}}, - \code{\link{metropolis}} -} -\author{ -David Kahle -} diff --git a/man/log_reg.Rd b/man/log_reg.Rd deleted file mode 100644 index 1a44e9b..0000000 --- a/man/log_reg.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/logistic.r -\name{log_reg} -\alias{log_reg} -\title{Fit a Logistic Regression model with algebraic methods} -\usage{ -log_reg(model, data, iter = 10000, burn = 1000, thin = 10, - engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, ...) -} -\arguments{ -\item{model}{hierarchical poisson model specification} - -\item{data}{data, as a data frame with raw data with discrete covariates} - -\item{iter}{number of chain iterations} - -\item{burn}{burn-in} - -\item{thin}{thinning} - -\item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} - -\item{method}{should the expected value (exp) be fit using -iterative proportional fitting (via loglin) or the MCMC as the -average of the steps?} - -\item{moves}{the markov moves for the mcmc (as columns of a -matrix).} - -\item{...}{...} - -\item{init}{the initialization of the chain. by default, this is -the observed table} -} -\value{ -a list containing named elements \itemize{ \item - \code{steps}: an integer matrix whose columns represent - individual samples from the mcmc. \item \code{moves}: the moves - used for the proposal distribution in the mcmc, computed with - 4ti2 (note that only the positive moves are given). \item - \code{acceptProb}: the average acceptance probability of the - moves, including the thinned moves. \item \code{param}: the - fitted parameters of the log linear model. \item \code{df}: - parameters per term in the model \item \code{quality}: model - selection statistics AIC, AICc, and BIC. \item - \code{residuals}: the (unstandardized) pearson residuals (O - - E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the - contingency table given. \item \code{exp}: the fit contingency - table as an integer array. \item \code{A}: the sufficient - statistics computing matrix (from Tmaker). \item - \code{p.value}: the exact p-values of individual tests, - accurate to Monte-Carlo error. these are computed as the - proportion of samples with statistics equal to or larger than - the oberved statistic. \item \code{mid.p.value}: the mid - p.values, see Agresti pp.20--21. \item \code{statistic}: the - pearson's chi-squared (X2), likelihood ratio (G2), - Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified - chi-squared (NM) statistics computed for the table given. \item - \code{sampsStats}: the statistics computed for each mcmc - sample. \item \code{cells}: the number of cells in the table. - \item \code{method}: the method used to estimate the table. } - @importFrom stats model.frame - @export log_reg -} -\description{ -Fit a Logistic Regression model with algebraic methods -} diff --git a/man/pois_reg.Rd b/man/pois_reg.Rd deleted file mode 100644 index 12cde28..0000000 --- a/man/pois_reg.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/poisson.R -\name{pois_reg} -\alias{pois_reg} -\title{Fit a Poisson Regression model with algebraic methods} -\usage{ -pois_reg(model, data, iter = 10000, burn = 1000, thin = 10, - engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, - adaptive = FALSE, ...) -} -\arguments{ -\item{model}{hierarchical poisson model specification} - -\item{data}{data, as a data frame with raw data with discrete covariates} - -\item{iter}{number of chain iterations} - -\item{burn}{burn-in} - -\item{thin}{thinning} - -\item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} - -\item{method}{should the expected value (exp) be fit using -iterative proportional fitting (via loglin) or the MCMC as the -average of the steps?} - -\item{moves}{the markov moves for the mcmc (as columns of a -matrix).} - -\item{...}{...} - -\item{init}{the initialization of the chain. by default, this is -the observed table} -} -\value{ -a list containing named elements \itemize{ \item - \code{steps}: an integer matrix whose columns represent - individual samples from the mcmc. \item \code{moves}: the moves - used for the proposal distribution in the mcmc, computed with - 4ti2 (note that only the positive moves are given). \item - \code{acceptProb}: the average acceptance probability of the - moves, including the thinned moves. \item \code{param}: the - fitted parameters of the log linear model. \item \code{df}: - parameters per term in the model \item \code{quality}: model - selection statistics AIC, AICc, and BIC. \item - \code{residuals}: the (unstandardized) pearson residuals (O - - E) / sqrt(E) \item \code{call}: the call. \item \code{obs}: the - contingency table given. \item \code{exp}: the fit contingency - table as an integer array. \item \code{A}: the sufficient - statistics computing matrix (from Tmaker). \item - \code{p.value}: the exact p-values of individual tests, - accurate to Monte-Carlo error. these are computed as the - proportion of samples with statistics equal to or larger than - the oberved statistic. \item \code{mid.p.value}: the mid - p.values, see Agresti pp.20--21. \item \code{statistic}: the - pearson's chi-squared (X2), likelihood ratio (G2), - Freeman-Tukey (FT), Cressie-Read (CR), and Neyman modified - chi-squared (NM) statistics computed for the table given. \item - \code{sampsStats}: the statistics computed for each mcmc - sample. \item \code{cells}: the number of cells in the table. - \item \code{method}: the method used to estimate the table. } - @importFrom stats model.frame -} -\description{ -Fit a Poisson Regression model with algebraic methods -} From 515511f49d5e9102ae0e7fe888eec5558a74d7a7 Mon Sep 17 00:00:00 2001 From: David Kahle Date: Tue, 28 Nov 2017 09:07:35 -0600 Subject: [PATCH 22/53] rm .DS_Store forever (hopefully) --- .DS_Store | Bin 10244 -> 0 bytes .gitignore | 2 ++ R/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 2 insertions(+) delete mode 100644 .DS_Store delete mode 100644 R/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 81babd2bd558b802a105f7adfde182cd6b6e6e27..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10244 zcmeHMTWl0n7(QQHV1|x#T1tUs8&--ywL)p>1rXTYC!Ye`k%nDQLLb%7!?bwLcUV(>z zhk%EGhk%EGhrnn+fHa#2DP34E_7Lz8@DLbAfWIFCba5MW?U1m}tAiqE3qUrFik(1d zl_$ssTpM)lkg!xk0m7B6aD}_YfN-bsfZHzU+96?uJA=FVfO}`SI}|u~r~U&@bB3U> zUhE;@AutjFv3IWz_uD;TBXR#;Sa=30scGZJPmtt^>HMc=PTbV6)#`7Eg<`#SWhlDQ z$>|X-?568Nv90lRqh%Sx=}@>?*ZMou>?+HQg>+MkCb+h@=w`&~ue5s2h~3^G`fJf6 zv8^5Il)9+aN3L}tQ)||3J**pg>~2ZszYcZUz(84PNkL#yQR!eoV4$q5kkZAagM;cm zNt%&gQn9YBXK?omFTQl(<%5Ub|4_^x?i++nPD0}Z<-zI=yRS|_k@A{2$|x-SDgg@R z24Bs}+MHENhnmHQw!SB(Tc+KtMQySjYKBc1t?sf+cjzr$meJd7+8t_USIFqJaq^^-$`UHfcFk2~;1Of7d zVJt6LD3UUXQ7$WD)od;6WKlN2_OSizd3J;yWyjcYc9NZ9XV{PIEc=!H#?Gqfz6rmU;D8*7#qXxBDg>`5~3)-*&ozSrj2D;IUUAPxRxF3&TA0ETwIDj{B z1n=PlKE-GF9H;O#e#Tk+A%>O`XYcLtQ45RnWCeTOg`D!}<=N|BS-Wb=>QT(zGm|)H zCudK|nd*>u@$$-H9wa^bL<%~r6SQ*Z7N?U#=u0^~jhOAvRc6Ksl1!W?Vym`8;>$su zaN21-%Rie_PnAS=u0LO*rJ<%cS*5@~pA%nA6}1kbBeFV9)Rrpcd=aQ=qIRjWjMH5m zFKTm@3RzZ}$jtSp%h!UV9}`1=Aby@>7ua7Ik917Kjl{|YSd0}2qKa6#66;ZidNiO3 zo6&&~!iW$jP4wU{*tiFS4ma<^19%X7@em%ylXwcx;8{F}S8y1w;&mLwF}#JhaU3Uc z8s8+bbWIXVzZER)YlxbLWo~hDjHNQ0R$13|D?;T(Bs?YMCD%xVTsRVhXz%5hvahnL zI;Un7FXgc&L~3TnauT8>!PyjaR>3G19KL|x!MpeXpWq98=M0w(OHHlF$76g?sf!%F z0=db!UgTzW{`}(dWf$ef(T*z-nIv7hTDehjmAqk5@XztjOB9=1{I@2GO`%f6Kd9nj zQ=*h5ip>&*KADEa#=pW}ksxEr99M3xmZ4m3zGOeKU+9DAclPJM$ju!jH|^L&a??q2 zvz sb4BNTh3ec>6~`|~<=lTrSjV06jyu`^>CXUf|L^Vpui+u|^|=548)V);@c;k- diff --git a/.gitignore b/.gitignore index 0a74739..a020294 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ src/*.o src/*.so src/*.dll +R/.DS_Store +.DS_Store diff --git a/R/.DS_Store b/R/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Tue, 28 Nov 2017 11:39:36 -0600 Subject: [PATCH 23/53] cleanup and aglm namechange --- NAMESPACE | 2 +- R/Aglm.r | 29 +++++++++++++---------------- man/Aglm.Rd | 29 ++++++++++++++--------------- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 28a9824..d16a005 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,13 +6,13 @@ S3method(print,m2) S3method(print,polyOptim) S3method(print,spectral) S3method(summary,bertini) -export(Aglm) export(Amaker) export(Emaker) export(Mmaker) export(Pmaker) export(Smaker) export(Tmaker) +export(aglm) export(array2tab) export(bertini) export(bump) diff --git a/R/Aglm.r b/R/Aglm.r index 1672c7b..30fc4b3 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -39,7 +39,7 @@ #' b0 <- 1; b1 <- 0.3 #' #' # generate data -#' n <- 100 +#' n <- 5000 #' x <- sample(1:5, n, replace = T) #' y <- rpois(n, lambda = exp(b0 + b1*x)) #' df <- data.frame( @@ -48,12 +48,12 @@ #' ) #' #' # function output -#' out <- Aglm(y ~ x, data = df, family = poisson()) +#' out <- aglm(y ~ x, data = df, family = poisson(), thin = 2000) #' #' # check convergence through trace plot -#' qplot(1:10000, out$sampStats$PRs, geom = "line") +#' qplot(1:10000, out$sampsStats$PRs, geom = "line") #' -#' # compare Aglm and glm predictions with the truth +#' # compare aglm and glm predictions with the truth #' #' # model fitting with glm #' mod <- glm(y ~ x, data = df, family = poisson()) @@ -64,7 +64,7 @@ #' # glm predictions #' predict(mod, data.frame(x = 1:5), type = "response") #' -#' # Aglm predictions +#' # aglm predictions #' rowMeans(out$steps) / ddply(df, "x", nrow)$V1 #' #' @@ -89,11 +89,11 @@ #' y = y #' ) #' -#' # Aglm -#' out <- Aglm(y ~ x, data = df, family = binomial()) +#' # aglm +#' out <- aglm(y ~ x, data = df, family = binomial()) #' #' # check convergence through trace plot -#' qplot(1:10000, out$sampStats$PRs, geom = "line") +#' qplot(1:10000, out$sampsStats$PRs, geom = "line") #' #' # using glm #' mod <- glm(y ~ x, data = df, family = binomial()) @@ -104,19 +104,16 @@ #' # glm predictions #' predict(mod, data.frame(x = c(1:5)), type = "response") #' -#' # Aglm predictions +#' # aglm predictions #' rowMeans(out$steps) / ddply(df, "x", nrow)$V1 #' #' @export -Aglm <- function(model, data, family = poisson(), +aglm <- function(model, data, family = poisson(), iter = 1E4, burn = 10000, thin = 100, engine = c("Cpp","R"), - moves, hit_and_run = FALSE, - SIS = FALSE, - non_uniform = FALSE, - adaptive = FALSE, + moves, ...) { ## set/check args @@ -286,7 +283,7 @@ Aglm <- function(model, data, family = poisson(), ################################################## init <- unname(init) # init out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine, hit_and_run = hit_and_run, SIS = SIS, non_uniform = non_uniform, adaptive = adaptive) + engine = engine) u <- t(t(data$sum)) @@ -325,7 +322,7 @@ Aglm <- function(model, data, family = poisson(), out$cells <- nCells out$method <- method - class(out) <- "Aglm" + class(out) <- "aglm" out diff --git a/man/Aglm.Rd b/man/Aglm.Rd index b27cf59..31a0e3b 100644 --- a/man/Aglm.Rd +++ b/man/Aglm.Rd @@ -1,12 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Aglm.r -\name{Aglm} -\alias{Aglm} +% Please edit documentation in R/aglm.r +\name{aglm} +\alias{aglm} \title{Fitting generalized linear models with algebraic methods} \usage{ -Aglm(model, data, family = poisson(), iter = 10000, burn = 10000, - thin = 100, engine = c("Cpp", "R"), moves, hit_and_run = FALSE, - SIS = FALSE, non_uniform = FALSE, adaptive = FALSE, ...) +aglm(model, data, family = poisson(), iter = 10000, burn = 10000, + thin = 100, engine = c("Cpp", "R"), moves, ...) } \arguments{ \item{model}{model specification, either in terms of a configuration matrix or a symbolic @@ -61,7 +60,7 @@ Fitting generalized linear models with algebraic methods b0 <- 1; b1 <- 0.3 # generate data - n <- 100 + n <- 5000 x <- sample(1:5, n, replace = T) y <- rpois(n, lambda = exp(b0 + b1*x)) df <- data.frame( @@ -70,12 +69,12 @@ Fitting generalized linear models with algebraic methods ) # function output - out <- Aglm(y ~ x, data = df, family = poisson()) + out <- aglm(y ~ x, data = df, family = poisson(), thin = 2000) # check convergence through trace plot - qplot(1:10000, out$sampStats$PRs, geom = "line") + qplot(1:10000, out$sampsStats$PRs, geom = "line") - # compare Aglm and glm predictions with the truth + # compare aglm and glm predictions with the truth # model fitting with glm mod <- glm(y ~ x, data = df, family = poisson()) @@ -86,7 +85,7 @@ Fitting generalized linear models with algebraic methods # glm predictions predict(mod, data.frame(x = 1:5), type = "response") - # Aglm predictions + # aglm predictions rowMeans(out$steps) / ddply(df, "x", nrow)$V1 @@ -111,11 +110,11 @@ Fitting generalized linear models with algebraic methods y = y ) - # Aglm - out <- Aglm(y ~ x, data = df, family = binomial()) + # aglm + out <- aglm(y ~ x, data = df, family = binomial()) # check convergence through trace plot - qplot(1:10000, out$sampStats$PRs, geom = "line") + qplot(1:10000, out$sampsStats$PRs, geom = "line") # using glm mod <- glm(y ~ x, data = df, family = binomial()) @@ -126,7 +125,7 @@ Fitting generalized linear models with algebraic methods # glm predictions predict(mod, data.frame(x = c(1:5)), type = "response") - # Aglm predictions + # aglm predictions rowMeans(out$steps) / ddply(df, "x", nrow)$V1 } From 8c05b005d1b9c7edc77ace00a09d0d142eb8254b Mon Sep 17 00:00:00 2001 From: Innerst Date: Mon, 4 Dec 2017 13:55:16 -0600 Subject: [PATCH 24/53] fixes to the syntax etc. --- R/Aglm.r | 15 +++++++------- R/loglinear.r | 6 +----- R/metropolis.r | 2 +- R/pmat.R | 53 +++++++++++++++++++++++++++-------------------- man/Aglm.Rd | 2 +- man/loglinear.Rd | 2 +- man/metropolis.Rd | 3 ++- man/pmat.Rd | 15 ++++++++++++-- 8 files changed, 57 insertions(+), 41 deletions(-) diff --git a/R/Aglm.r b/R/Aglm.r index 30fc4b3..c3569fb 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -3,7 +3,7 @@ #' #' @param model model specification, either in terms of a configuration matrix or a symbolic #' description of the model to be fitted -#' @param data data, as a data frame with raw data with discrete covariates +#' @param data data, as a data frame of raw data with ordinal discrete covariates #' @param family a description of the error distirbution and link function used in the model #' @param iter number of chain iterations #' @param burn burn-in @@ -215,21 +215,20 @@ aglm <- function(model, data, family = poisson(), } else if(all(unlist(model) %in% 1:length(vars))){ # by indices facets <- lapply(model, as.integer) # to fix the ~ 1 + 2 case, parsed as chars } else { - stop("Invalid model specification, see ?Aglm") + stop("Invalid model specification, see ?aglm") } ## levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) - if(ncol(data) <= 2){ - lvls <- unique(data[,-ncol(data)]) + levls <- unique(data[,-ncol(data)]) } else { - lvls <- lapply(data[,-ncol(data)], unique) + levls <- lapply(data[,-ncol(data)], unique) } # make configuration (model) matrix - A <- pmat(lvls, facets) + A <- pmat(levls, facets) } # check to see if all level configurations are there (need work here) - lvlsInData <- as.list(as.data.frame(t(expand.grid(lvls)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) + lvlsInData <- as.list(as.data.frame(t(expand.grid(levls)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) # subset A by levels that are present A <- A[,lvlsInData] @@ -283,7 +282,7 @@ aglm <- function(model, data, family = poisson(), ################################################## init <- unname(init) # init out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine) + engine = engine, ...) u <- t(t(data$sum)) diff --git a/R/loglinear.r b/R/loglinear.r index b2157ce..84a2927 100644 --- a/R/loglinear.r +++ b/R/loglinear.r @@ -594,9 +594,6 @@ loglinear <- function(model, data, iter = 1E4, burn = 1000, thin = 10, engine = c("Cpp","R"), method = c("ipf", "mcmc"),moves, - hit_and_run = FALSE, - SIS = FALSE, - non_uniform = FALSE, ...) { @@ -719,8 +716,7 @@ loglinear <- function(model, data, ################################################## init <- unname(init) # init out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine, hit_and_run = hit_and_run, SIS = SIS, - non_uniform = non_uniform) + engine = engine, ...) diff --git a/R/metropolis.r b/R/metropolis.r index 8575706..268884f 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -432,7 +432,7 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th #' @rdname metropolis #' @export -rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric", hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE){ +rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric", hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, adaptive = FALSE){ metropolis(init, moves, iter, burn = 0, thin = 1, dist = dist, hit_and_run) } diff --git a/R/pmat.R b/R/pmat.R index e8e6769..4083b24 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -3,56 +3,65 @@ #' Determine the A matrix associated with a hierarchical model on a #' contingency table for Poisson Regression. #' -#' @param levels a vector containing the number of levels of each +#' @param levels a list containing the number of levels of each #' variable #' @param facets the facets generating the hierarchical model, a #' list of vectors of variable indices -#' @return a matrix +#' @return The configuration matrix for the given levels and facets +#' +#' @examples +#' # Single Covariate +#' levels <- 1:5 +#' facets <- list(1) +#' pmat(levels, facets) +#' +#' # multiple covariates, each has levels 1, ..., 5 +#' levels <- list(1:5, 1:5) +#' facets <- list(1, 2, c(1,2)) +#' pmat(levels, facets) #' @export pmat pmat <- function(levels, facets){ - + #Small function to make single covariate configuration matrix - func <- function(x){ - rbind(rep(1, length(x)), x) - } + func <- function(x) rbind(rep(1, length(x)), x) if(!is.list(levels)){ - num_covariates <- 1 - full_mat <- func(levels) + numCovariates <- 1 + fullMat <- func(levels) }else{ - num_covariates <- length(levels) + numCovariates <- length(levels) #Make single covariate configuration matrix for each covariate - mat_list <- lapply(levels, func) + matList <- lapply(levels, func) #Full heirarchicial config matrix with all interactions included - full_mat <- do.call(kprod, mat_list) + fullMat <- do.call(kprod, matList) } - exp_cov <- 1:num_covariates + expCov <- 1:numCovariates #Checking heirarchical sturcture of facets if(any(sapply(facets, length) > 1)){ - long_list_elts <- facets[which(sapply(facets, length)>1)] + longListElts <- facets[which(sapply(facets, length) > 1)] - unique_vals <- unique(unlist(long_list_elts)) + uniqueVals <- unique(unlist(longListElts)) - heirarc <- as.list(c(unique_vals, long_list_elts)) + heirarc <- as.list(c(uniqueVals, longListElts)) facets <- union(heirarc, facets) } #All possible combinations of covariates (powerset like) to be compared to facets - if(length(exp_cov) == 1) { - facet_list <- list(exp_cov) + if(length(expCov) == 1) { + facetList <- list(expCov) }else{ - facet_list <- list(integer(0)) - for(i in seq_along(exp_cov)){ - facet_list <- c(facet_list, lapply(facet_list, function(x) c(x,exp_cov[i]))) + facetList <- list(integer(0)) + for(i in seq_along(expCov)){ + facetList <- c(facetList, lapply(facetList, function(x) c(x,expCov[i]))) } - facet_list <- facet_list[-1] + facetList <- facetList[-1] } #return the configuration matrix which includes only the elements need for the heirarchical model - return(full_mat[c(TRUE, facet_list %in% facets),]) + return(fullMat[c(TRUE, facetList %in% facets),]) } diff --git a/man/Aglm.Rd b/man/Aglm.Rd index 31a0e3b..cabb3b6 100644 --- a/man/Aglm.Rd +++ b/man/Aglm.Rd @@ -11,7 +11,7 @@ aglm(model, data, family = poisson(), iter = 10000, burn = 10000, \item{model}{model specification, either in terms of a configuration matrix or a symbolic description of the model to be fitted} -\item{data}{data, as a data frame with raw data with discrete covariates} +\item{data}{data, as a data frame of raw data with ordinal discrete covariates} \item{family}{a description of the error distirbution and link function used in the model} diff --git a/man/loglinear.Rd b/man/loglinear.Rd index 53defb4..321bff8 100644 --- a/man/loglinear.Rd +++ b/man/loglinear.Rd @@ -7,7 +7,7 @@ \usage{ loglinear(model, data, init = tab2vec(data), iter = 10000, burn = 1000, thin = 10, engine = c("Cpp", "R"), method = c("ipf", "mcmc"), moves, - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, ...) + ...) } \arguments{ \item{model}{hierarchical log-linear model specification} diff --git a/man/metropolis.Rd b/man/metropolis.Rd index 260d7fc..eae92d3 100644 --- a/man/metropolis.Rd +++ b/man/metropolis.Rd @@ -11,7 +11,8 @@ metropolis(init, moves, suff_stats, config, iter = 1000, burn = 0, adaptive = FALSE) rawMetropolis(init, moves, iter = 1000, dist = "hypergeometric", - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE) + hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, + adaptive = FALSE) } \arguments{ \item{init}{the initial step} diff --git a/man/pmat.Rd b/man/pmat.Rd index 2831452..4d23cf2 100644 --- a/man/pmat.Rd +++ b/man/pmat.Rd @@ -7,16 +7,27 @@ pmat(levels, facets) } \arguments{ -\item{levels}{a vector containing the number of levels of each +\item{levels}{a list containing the number of levels of each variable} \item{facets}{the facets generating the hierarchical model, a list of vectors of variable indices} } \value{ -a matrix +The configuration matrix for the given levels and facets } \description{ Determine the A matrix associated with a hierarchical model on a contingency table for Poisson Regression. } +\examples{ +# Single Covariate +levels <- 1:5 +facets <- list(1) +pmat(levels, facets) + +# multiple covariates, each has levels 1, ..., 5 +levels <- list(1:5, 1:5) +facets <- list(1, 2, c(1,2)) +pmat(levels, facets) +} From 8c3166f04a1dbb868d56920e8f2bb5589612f5a0 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Sun, 17 Dec 2017 12:23:22 -0600 Subject: [PATCH 25/53] naming convention changes --- R/Aglm.r | 6 +-- R/algstat.r | 2 +- R/loglinear.r | 4 +- R/metropolis.r | 142 ++++++++++++++++++++++++------------------------- R/sis_tbl.R | 44 +++++++-------- 5 files changed, 99 insertions(+), 99 deletions(-) diff --git a/R/Aglm.r b/R/Aglm.r index c3569fb..3cb28f5 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -238,7 +238,7 @@ aglm <- function(model, data, family = poisson(), A <- lawrence(A) } # find the sufficient statistics - suff_stats <- unname(A %*% init) + suffStats <- unname(A %*% init) ## construct A matrix and compute moves ################################################## @@ -281,7 +281,7 @@ aglm <- function(model, data, family = poisson(), ## run metropolis-hastings ################################################## init <- unname(init) # init - out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, + out <- metropolis(init, moves, suffStats = suffStats, config = unname(A), iter = iter, burn = burn, thin = thin, engine = engine, ...) @@ -298,7 +298,7 @@ aglm <- function(model, data, family = poisson(), out$call <- match.call() out$obs <- data out$A <- A - out$sufficientStatistics <- suff_stats + out$sufficientStatistics <- suffStats out$p.value <- c( PR = mean(PRs <= PR) diff --git a/R/algstat.r b/R/algstat.r index 8160b18..a9cb72d 100644 --- a/R/algstat.r +++ b/R/algstat.r @@ -9,7 +9,7 @@ #' 4ti2 (through latter). #' #' @import stringr mpoly reshape2 Rcpp lpSolve parallel memoise -#' ggplot2 latter m2r +#' ggplot2 latter m2r #' @importFrom stats deriv dmultinom loglin runif sd model.frame #' @importFrom utils combn download.file #' @importFrom plyr ddply diff --git a/R/loglinear.r b/R/loglinear.r index 84a2927..deedf87 100644 --- a/R/loglinear.r +++ b/R/loglinear.r @@ -672,7 +672,7 @@ loglinear <- function(model, data, A <- hmat(dim(data), facets) } - suff_stats <- unname(A %*% init) + suffStats <- unname(A %*% init) ## construct A matrix and compute moves ################################################## @@ -715,7 +715,7 @@ loglinear <- function(model, data, ## run metropolis-hastings ################################################## init <- unname(init) # init - out <- metropolis(init, moves, suff_stats = suff_stats, config = unname(A), iter = iter, burn = burn, thin = thin, + out <- metropolis(init, moves, suffStats = suffStats, config = unname(A), iter = iter, burn = burn, thin = thin, engine = engine, ...) diff --git a/R/metropolis.r b/R/metropolis.r index 268884f..999e535 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -15,9 +15,9 @@ #' @param dist steady-state distribution; "hypergeometric" (default) #' or "uniform" #' @param engine C++ or R? (C++ yields roughly a 20-25x speedup) -#' @param hit_and_run Whether or not to use the discrete hit and run algorithm in +#' @param hitAndRun Whether or not to use the discrete hit and run algorithm in #' the metropolis algorithm -#' @param adaptive Option inside hit_and_run option. If TRUE, hit and run will choose a proposal distribution adaptively. Defaulted to FALSE. +#' @param adaptive Option inside hitAndRun option. If TRUE, hit and run will choose a proposal distribution adaptively. Defaulted to FALSE. #' @param SIS If TRUE, with a small probability the move will be chosen randomly from the uniform distribution #' on the fiber using Sequential Importance "Like" Sampling methods. Defaulted to FALSE #' @name metropolis @@ -148,9 +148,9 @@ #' } #' #' -metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, thin = 1, +metropolis <- function(init, moves, suffStats, config, iter = 1E3, burn = 0, thin = 1, dist = c("hypergeometric","uniform"), engine = c("Cpp","R"), - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, adaptive = FALSE + hitAndRun = FALSE, SIS = FALSE, nonUniform = FALSE, adaptive = FALSE ){ ## preliminary checking @@ -176,53 +176,53 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th message("Running chain (R)... ", appendLF = FALSE) #Setting up non-uniform move sampling framework - if(non_uniform == TRUE){ - move_dist <- rep(1,nMoves) + if(nonUniform == TRUE){ + moveDist <- rep(1,nMoves) counter <- nMoves } unifs <- runif(burn) - if(non_uniform == TRUE){ - move_prob <- runif(burn) + if(nonUniform == TRUE){ + moveProb <- runif(burn) } if(burn > 0) { for(k in 1:burn){ #Hit and Run option - if(hit_and_run) + if(hitAndRun) { move <- sample(c(-1,1), 1) * moves[,sample(nMoves,1)] - w_move <- move[move != 0] - w_current <- current[move != 0] - w_moves <- -1 * w_current / w_move - lower_bound <- if(any(w_moves < 0)){max(subset(w_moves,subset = w_moves < 0))}else{1} - upper_bound <- if(any(w_moves > 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} + workMove <- move[move != 0] + workCurrent <- current[move != 0] + workMoves <- -1 * workCurrent / workMove + lowerBound <- if(any(workMoves < 0)){max(subset(workMoves,subset = workMoves < 0))}else{1} + upperBound <- if(any(workMoves > 0)){min(subset(workMoves,subset = workMoves > 0))}else{-1} - if(any(w_moves == 0)){ - w_propStatelow <- current + lower_bound * move - w_propStateup <- current + upper_bound * move - if(any(w_propStatelow < 0)){ - lower_bound <- 1 + if(any(workMoves == 0)){ + workPropStatelow <- current + lowerBound * move + workPropStateup <- current + upperBound * move + if(any(workPropStatelow < 0)){ + lowerBound <- 1 } - if(any(w_propStateup < 0)){ - upper_bound <- -1 + if(any(workPropStateup < 0)){ + upperBound <- -1 } } - c_s <- sample(lower_bound:upper_bound,1) - if(c_s == 0){ - c_s <- 1 + multiple <- sample(lowerBound:upperBound,1) + if(multiple == 0){ + multiple <- 1 } - propState <- current + c_s * move + propState <- current + multiple * move } #Non-uniform move sampling option - if(non_uniform == TRUE) + if(nonUniform == TRUE) { for(l in 1:nMoves){ - if(move_prob[k] <= sum(move_dist[1:l])/counter){ + if(moveProb[k] <= sum(moveDist[1:l])/counter){ move <- moves[,l] - which_move <- l + whichMove <- l } } move <- sample(c(-1,1), 1) * move @@ -245,10 +245,10 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th } } - if(non_uniform == TRUE){ + if(nonUniform == TRUE){ if(unifs[k] < prob){ current <- propState - move_dist[which_move] <- move_dist[which_move] + 1 + moveDist[whichMove] <- moveDist[whichMove] + 1 counter <- counter + 1 } }else{ @@ -269,78 +269,78 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th for(j in 1:thin){ - if(hit_and_run) + if(hitAndRun) { move <- moves[,sample(nMoves,1)] - w_move <- move[move != 0] - w_current <- current[move != 0] - w_moves <- (-1 * w_current) / w_move - lower_bound <- if(any(w_moves < 0)){max(subset(w_moves,subset = w_moves < 0))}else{1} - upper_bound <- if(any(w_moves > 0)){min(subset(w_moves,subset = w_moves > 0))}else{-1} + workMove <- move[move != 0] + workCurrent <- current[move != 0] + workMoves <- (-1 * workCurrent) / workMove + lowerBound <- if(any(workMoves < 0)){max(subset(workMoves,subset = workMoves < 0))}else{1} + upperBound <- if(any(workMoves > 0)){min(subset(workMoves,subset = workMoves > 0))}else{-1} #New part # #Option 1 Enumerate tables and - # line <- lower_bound:upper_bound + # line <- lowerBound:upperBound # #Enumerate tables on the line # tables <- matrix(0L, nrow =length(init) , ncol = length(line)) # for(i in 1:length(line)){ # tables[,i] <- current + line[i]*move # } # probs <- apply(tables, 2, function(x) 1/(sum(lfactorial(x)))) - # prob_dist <- probs / sum(probs) + # probDist <- probs / sum(probs) # unif <- runif(1) - # dummy <- prob_dist[1] - # for(i in 1:(length(prob_dist) -1)){ + # dummy <- probDist[1] + # for(i in 1:(length(probDist) -1)){ # if(unif < dummy){ # propState <- tables[,i] # break() # } - # dummy <- dummy + prob_dist[i+1] + # dummy <- dummy + probDist[i+1] # } #Option 2 - # line <- lower_bound:upper_bound - # w_current <- current + # line <- lowerBound:upperBound + # workCurrent <- current # unifs2 <- runif(2*length(line)) # for(i in 1:(2*length(line))){ - # w_propState <- w_current + sample(c(-1,1), 1)*move - # if(any(w_propState < 0)){ + # workPropState <- workCurrent + sample(c(-1,1), 1)*move + # if(any(workPropState < 0)){ # prob <- 0 # } else { # if(dist == "hypergeometric"){ - # prob <- exp( sum(lfactorial(w_current)) - sum(lfactorial(w_propState)) ) + # prob <- exp( sum(lfactorial(workCurrent)) - sum(lfactorial(workPropState)) ) # } else { # dist == "uniform" # prob <- 1 # } # } - # if(unifs2[i] < prob) w_current <- w_propState # else w_current + # if(unifs2[i] < prob) workCurrent <- workPropState # else workCurrent # } - # propState <- w_current + # propState <- workCurrent - if(any(w_moves == 0)){ - w_propStatelow <- current + lower_bound * move - w_propStateup <- current + upper_bound * move - if(any(w_propStatelow < 0)){ - lower_bound <- 1 + if(any(workMoves == 0)){ + workPropStatelow <- current + lowerBound * move + workPropStateup <- current + upperBound * move + if(any(workPropStatelow < 0)){ + lowerBound <- 1 } - if(any(w_propStateup < 0)){ - upper_bound <- -1 + if(any(workPropStateup < 0)){ + upperBound <- -1 } } - c_s <- sample(lower_bound:upper_bound,1) - if(c_s == 0){ - c_s <- 1 + multiple <- sample(lowerBound:upperBound,1) + if(multiple == 0){ + multiple <- 1 } - propState <- current + c_s * move + propState <- current + multiple * move } - if(non_uniform == TRUE) + if(nonUniform) { - move_prob <- runif(1) + moveProb <- runif(1) for(l in 1:nMoves){ - if(move_prob <= sum(move_dist[1:l])/counter){ + if(moveProb <= sum(moveDist[1:l])/counter){ move <- moves[,l] - which_move <- l + whichMove <- l break() } } @@ -352,9 +352,9 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th propState <- current + move } - if(SIS == TRUE){ + if(SIS){ if(runif(1) <= .05){ - propState <- sis_table(config, suff_stats) + propState <- sis_table(config, suffStats) } } @@ -369,10 +369,10 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th } probTotal <- probTotal + min(1, prob) - if(non_uniform == TRUE){ + if(nonUniform){ if(unifs[k*(thin-1)+j] < prob){ current <- propState - move_dist[which_move] <- move_dist[which_move] + 1 + moveDist[whichMove] <- moveDist[whichMove] + 1 counter <- counter + 1 } }else{ @@ -409,8 +409,8 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th metropolis_uniform_cpp } message("Running chain (C++)... ", appendLF = FALSE) - if (burn > 0) current <- sampler(current, allMoves, suff_stats, config, burn, 1, hit_and_run, SIS, non_uniform, adaptive)$steps[,burn] - out <- sampler(current, allMoves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) + if (burn > 0) current <- sampler(current, allMoves, suffStats, config, burn, 1, hitAndRun, SIS, nonUniform, adaptive)$steps[,burn] + out <- sampler(current, allMoves, suffStats, config, iter, thin, hitAndRun, SIS, nonUniform, adaptive) out$moves <- moves message("done.") @@ -432,7 +432,7 @@ metropolis <- function(init, moves, suff_stats, config, iter = 1E3, burn = 0, th #' @rdname metropolis #' @export -rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric", hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, adaptive = FALSE){ - metropolis(init, moves, iter, burn = 0, thin = 1, dist = dist, hit_and_run) +rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric", hitAndRun = FALSE, SIS = FALSE, nonUniform = FALSE, adaptive = FALSE){ + metropolis(init, moves, iter, burn = 0, thin = 1, dist = dist, hitAndRun) } diff --git a/R/sis_tbl.R b/R/sis_tbl.R index 414a597..7a524a2 100644 --- a/R/sis_tbl.R +++ b/R/sis_tbl.R @@ -1,32 +1,32 @@ -sis_table <- function(config_mat, suff_statistics){ - #Need to check if config_mat is a mat, suff_statistics is vector/mat, - #length of suff_statistics is same as number of rows of config_mat +sis_table <- function(configMat, suffStatistics){ + # need to check if configMat is a mat, suffStatistics is vector/mat, + # length of suffStatistics is same as number of rows of configMat - #Matricies and vectors to work with! - work_A <- config_mat - work_suff <- suff_statistics - tbl_elts <- ncol(config_mat) - num_const <- nrow(config_mat) - tbl <- vector(mode = "numeric", length = tbl_elts) + # matricies and vectors to work with! + workA <- configMat + workSuff <- suffStatistics + tblElts <- ncol(configMat) + numConstraints <- nrow(configMat) + tbl <- vector(mode = "numeric", length = tblElts) - for(i in 1:tbl_elts){ - constr <- unname(rbind(cbind(rep(1,num_const), work_suff, work_A), - cbind(rep(0,tbl_elts), rep(0,tbl_elts), diag(-1,tbl_elts)))) - objfun <- vector(mode = "numeric", length = tbl_elts) + for(i in 1:tblElts){ + constr <- unname(rbind(cbind(rep(1,numConstraints), workSuff, workA), + cbind(rep(0,tblElts), rep(0,tblElts), diag(-1,tblElts)))) + objfun <- vector(mode = "numeric", length = tblElts) objfun[i] <- -1 - min_lp <- lpcdd(constr, objfun) - max_lp <- lpcdd(constr, objfun, minimize = FALSE) + minLp <- lpcdd(constr, objfun) + maxLp <- lpcdd(constr, objfun, minimize = FALSE) - if(min_lp[1] == "Optimal" && max_lp[1] == "Optimal"){ - minimum <- as.numeric(unname(min_lp[4])) - maximum <- as.numeric(unname(max_lp[4])) + if(minLp[1] == "Optimal" && maxLp[1] == "Optimal"){ + minimum <- as.numeric(unname(minLp[4])) + maximum <- as.numeric(unname(maxLp[4])) tbl[i] <- if(isTRUE(all.equal(minimum, maximum))){minimum} else{sample(minimum:maximum, 1)} } else { tbl[i] <- 0 } - #Update constraints and sufficient statistics - index <- which(work_A[,i] == 1) - work_A[index,i] <- 0 - work_suff[index] <- work_suff[index] - tbl[i] + # update constraints and sufficient statistics + index <- which(workA[,i] == 1) + workA[index,i] <- 0 + workSuff[index] <- workSuff[index] - tbl[i] } return(tbl) } \ No newline at end of file From 7bb6d5d0a6b8ef68aa8b180b179686b537dc5ac3 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Sat, 30 Dec 2017 14:13:30 -0600 Subject: [PATCH 26/53] metropolis example added, commented code removed --- R/metropolis.r | 68 ++++++++++----------------- src/metropolis_hypergeometric_cpp.cpp | 16 +------ 2 files changed, 26 insertions(+), 58 deletions(-) diff --git a/R/metropolis.r b/R/metropolis.r index 999e535..6b54aec 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -16,10 +16,14 @@ #' or "uniform" #' @param engine C++ or R? (C++ yields roughly a 20-25x speedup) #' @param hitAndRun Whether or not to use the discrete hit and run algorithm in -#' the metropolis algorithm -#' @param adaptive Option inside hitAndRun option. If TRUE, hit and run will choose a proposal distribution adaptively. Defaulted to FALSE. +#' the metropolis algorithm. Defaulted to FALSE #' @param SIS If TRUE, with a small probability the move will be chosen randomly from the uniform distribution #' on the fiber using Sequential Importance "Like" Sampling methods. Defaulted to FALSE +#' @param nonUniform If TRUE, moves will be chosen adaptively using a move weighting system that uses information +#' from previous steps. Defaulted to FALSE +#' @param adaptive Option when hitAndRun = TRUE. If adaptive = TRUE, hit and run will choose a proposal distribution adaptively. +#' Defaulted to FALSE +#' #' @name metropolis #' @return a list #' @export metropolis @@ -43,7 +47,7 @@ #' # view convergence through trace plot #' qplot(1:1000, out$steps[1,]) #' -#' # sampling from the hypergeometric distribution +#' # sampling from the uniform distribution #' out <- metropolis(init, moves, dist = "uniform") #' qplot(out$steps[1,]) #' @@ -112,13 +116,30 @@ #' #' #' +#' # examples using the extra options inside metropolis #' #' +#' data("HairEyeColor") +#' tbl <- tab2vec(apply(HairEyeColor, c(1, 2), sum)) +#' A <- hmat(c(4,4),1:2) +#' moves <- markov(A) +#' suffStats <- A %*% tbl #' +#' # base metropolis algorithm +#' base <- metropolis(tbl, moves, suffStats, A) #' +#' # hit and run option +#' har <- metropolis(tbl, moves, suffStats, A, hitAndRun = TRUE) #' +#' # check convergence through trace plots +#' baseStats <- copmuteUProbsCpp(base$steps) +#' harStats <- computeUProbsCpp(har$steps) #' +#' data <- data.frame(baseStats = baseStats, harStats = harStats, steps = 1:1000) #' +#' ggplot(data = data) + geom_line(aes(steps, baseStats)) + +#' geom_line(aes(steps, harStats), color = "red") + +#' labs(x = "Steps", y = "UNLL value", title = "Base Algorithm vs. Algorithm with Hit and Run option in red") #' #' #' showSteps <- function(steps){ @@ -277,45 +298,6 @@ metropolis <- function(init, moves, suffStats, config, iter = 1E3, burn = 0, thi workMoves <- (-1 * workCurrent) / workMove lowerBound <- if(any(workMoves < 0)){max(subset(workMoves,subset = workMoves < 0))}else{1} upperBound <- if(any(workMoves > 0)){min(subset(workMoves,subset = workMoves > 0))}else{-1} - - #New part - # #Option 1 Enumerate tables and - # line <- lowerBound:upperBound - # #Enumerate tables on the line - # tables <- matrix(0L, nrow =length(init) , ncol = length(line)) - # for(i in 1:length(line)){ - # tables[,i] <- current + line[i]*move - # } - # probs <- apply(tables, 2, function(x) 1/(sum(lfactorial(x)))) - # probDist <- probs / sum(probs) - # unif <- runif(1) - # dummy <- probDist[1] - # for(i in 1:(length(probDist) -1)){ - # if(unif < dummy){ - # propState <- tables[,i] - # break() - # } - # dummy <- dummy + probDist[i+1] - # } - #Option 2 - # line <- lowerBound:upperBound - # workCurrent <- current - # unifs2 <- runif(2*length(line)) - # for(i in 1:(2*length(line))){ - # workPropState <- workCurrent + sample(c(-1,1), 1)*move - # if(any(workPropState < 0)){ - # prob <- 0 - # } else { - # if(dist == "hypergeometric"){ - # prob <- exp( sum(lfactorial(workCurrent)) - sum(lfactorial(workPropState)) ) - # } else { # dist == "uniform" - # prob <- 1 - # } - # } - # if(unifs2[i] < prob) workCurrent <- workPropState # else workCurrent - # } - # propState <- workCurrent - if(any(workMoves == 0)){ workPropStatelow <- current + lowerBound * move @@ -433,6 +415,6 @@ metropolis <- function(init, moves, suffStats, config, iter = 1E3, burn = 0, thi #' @rdname metropolis #' @export rawMetropolis <- function(init, moves, iter = 1E3, dist = "hypergeometric", hitAndRun = FALSE, SIS = FALSE, nonUniform = FALSE, adaptive = FALSE){ - metropolis(init, moves, iter, burn = 0, thin = 1, dist = dist, hitAndRun) + metropolis(init, moves, iter, burn = 0, thin = 1, dist = dist, hitAndRun, SIS, nonUniform, adaptive) } diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index d2e4472..75b3298 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -92,16 +92,6 @@ List metropolis_hypergeometric_cpp( lb = max(lowerBound); ub = min(upperBound); - // Enumerating all tables - // IntegerVector line = seq(lb, ub); - // int line_length = line.size(); - // IntegerMatrix tables(n, line_length); - // for(int i = 0; i < line_length;++i){ - // for(int j = 0; j < n;++j){ - // tables(j,i) = current[j] + line[i] * move[j]; - // } - // } - // MCMC inside MCMC //IntegerVector line = seq(lb, ub); if(adaptive){ @@ -143,11 +133,7 @@ List metropolis_hypergeometric_cpp( for(int k = 0; k < n; ++k){ proposal[k] = w_current[k]; } - //Attempt at recursively calling MCMC routine - // List MCMC_out = metropolis_hypergeometric_cpp(current, as(move), suff_stats, config, 50, 1, false, false, false); - // IntegerMatrix mini_steps = MCMC_out[0]; - // int step_length = mini_steps.ncol(); - // proposal = mini_steps(_, step_length); + } else { // Base Hit and Run From 4dedfee6146c7e821f4f692331842432332a430b Mon Sep 17 00:00:00 2001 From: Innerst Date: Mon, 8 Jan 2018 16:26:22 -0600 Subject: [PATCH 27/53] fix typos, change aglm example --- R/Aglm.r | 2 +- R/metropolis.r | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/Aglm.r b/R/Aglm.r index 3cb28f5..3849d3f 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -90,7 +90,7 @@ #' ) #' #' # aglm -#' out <- aglm(y ~ x, data = df, family = binomial()) +#' out <- aglm(y ~ x, data = df, family = binomial(), thin = 500) #' #' # check convergence through trace plot #' qplot(1:10000, out$sampsStats$PRs, geom = "line") diff --git a/R/metropolis.r b/R/metropolis.r index 6b54aec..812e149 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -132,8 +132,8 @@ #' har <- metropolis(tbl, moves, suffStats, A, hitAndRun = TRUE) #' #' # check convergence through trace plots -#' baseStats <- copmuteUProbsCpp(base$steps) -#' harStats <- computeUProbsCpp(har$steps) +#' baseStats <- algstat:::computeUProbsCpp(base$steps) +#' harStats <- algstat:::computeUProbsCpp(har$steps) #' #' data <- data.frame(baseStats = baseStats, harStats = harStats, steps = 1:1000) #' From 0b17277b132f23d3ea5523f56f8c3720fdf3da25 Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 11 Jan 2018 09:02:22 -0600 Subject: [PATCH 28/53] small fixes after merge --- NAMESPACE | 2 +- R/algstat.r | 2 +- man/Aglm.Rd | 4 ++-- man/metropolis.Rd | 37 ++++++++++++++++++++++++++++--------- src/RcppExports.cpp | 10 ---------- 5 files changed, 32 insertions(+), 23 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4a57f12..e657dd6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,7 +74,7 @@ importFrom(parallel,makeCluster) importFrom(parallel,mclapply) importFrom(parallel,parLapply) importFrom(parallel,stopCluster) -importFrom(plyr, ddply) +importFrom(plyr,ddply) importFrom(reshape2,melt) importFrom(stats,deriv) importFrom(stats,dmultinom) diff --git a/R/algstat.r b/R/algstat.r index 2af4059..7c03144 100644 --- a/R/algstat.r +++ b/R/algstat.r @@ -16,7 +16,7 @@ #' element_blank #' @importFrom parallel mclapply makeCluster stopCluster parLapply detectCores #' @importFrom reshape2 melt -#' @importFrom stats deriv dmultinom loglin runif sd +#' @importFrom stats deriv dmultinom loglin runif sd model.frame #' @importFrom utils combn download.file #' @importFrom plyr ddply #' @useDynLib algstat diff --git a/man/Aglm.Rd b/man/Aglm.Rd index cabb3b6..da38b0c 100644 --- a/man/Aglm.Rd +++ b/man/Aglm.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/aglm.r +% Please edit documentation in R/Aglm.r \name{aglm} \alias{aglm} \title{Fitting generalized linear models with algebraic methods} @@ -111,7 +111,7 @@ Fitting generalized linear models with algebraic methods ) # aglm - out <- aglm(y ~ x, data = df, family = binomial()) + out <- aglm(y ~ x, data = df, family = binomial(), thin = 500) # check convergence through trace plot qplot(1:10000, out$sampsStats$PRs, geom = "line") diff --git a/man/metropolis.Rd b/man/metropolis.Rd index eae92d3..7a5a46a 100644 --- a/man/metropolis.Rd +++ b/man/metropolis.Rd @@ -5,14 +5,12 @@ \alias{rawMetropolis} \title{The Metropolis Algorithm} \usage{ -metropolis(init, moves, suff_stats, config, iter = 1000, burn = 0, +metropolis(init, moves, suffStats, config, iter = 1000, burn = 0, thin = 1, dist = c("hypergeometric", "uniform"), engine = c("Cpp", "R"), - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, - adaptive = FALSE) + hitAndRun = FALSE, SIS = FALSE, nonUniform = FALSE, adaptive = FALSE) rawMetropolis(init, moves, iter = 1000, dist = "hypergeometric", - hit_and_run = FALSE, SIS = FALSE, non_uniform = FALSE, - adaptive = FALSE) + hitAndRun = FALSE, SIS = FALSE, nonUniform = FALSE, adaptive = FALSE) } \arguments{ \item{init}{the initial step} @@ -31,13 +29,17 @@ or "uniform"} \item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} -\item{hit_and_run}{Whether or not to use the discrete hit and run algorithm in -the metropolis algorithm} +\item{hitAndRun}{Whether or not to use the discrete hit and run algorithm in +the metropolis algorithm. Defaulted to FALSE} \item{SIS}{If TRUE, with a small probability the move will be chosen randomly from the uniform distribution on the fiber using Sequential Importance "Like" Sampling methods. Defaulted to FALSE} -\item{adaptive}{Option inside hit_and_run option. If TRUE, hit and run will choose a proposal distribution adaptively. Defaulted to FALSE.} +\item{nonUniform}{If TRUE, moves will be chosen adaptively using a move weighting system that uses information +from previous steps. Defaulted to FALSE} + +\item{adaptive}{Option when hitAndRun = TRUE. If adaptive = TRUE, hit and run will choose a proposal distribution adaptively. +Defaulted to FALSE} } \value{ a list @@ -66,7 +68,7 @@ qplot(out$steps[1,]) # view convergence through trace plot qplot(1:1000, out$steps[1,]) -# sampling from the hypergeometric distribution +# sampling from the uniform distribution out <- metropolis(init, moves, dist = "uniform") qplot(out$steps[1,]) @@ -135,13 +137,30 @@ microbenchmark( +# examples using the extra options inside metropolis +data("HairEyeColor") +tbl <- tab2vec(apply(HairEyeColor, c(1, 2), sum)) +A <- hmat(c(4,4),1:2) +moves <- markov(A) +suffStats <- A \%*\% tbl +# base metropolis algorithm +base <- metropolis(tbl, moves, suffStats, A) +# hit and run option +har <- metropolis(tbl, moves, suffStats, A, hitAndRun = TRUE) +# check convergence through trace plots +baseStats <- algstat:::computeUProbsCpp(base$steps) +harStats <- algstat:::computeUProbsCpp(har$steps) +data <- data.frame(baseStats = baseStats, harStats = harStats, steps = 1:1000) +ggplot(data = data) + geom_line(aes(steps, baseStats)) + +geom_line(aes(steps, harStats), color = "red") + +labs(x = "Steps", y = "UNLL value", title = "Base Algorithm vs. Algorithm with Hit and Run option in red") showSteps <- function(steps){ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6d5335f..26da1a9 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -66,13 +66,8 @@ BEGIN_RCPP END_RCPP } // metropolis_hypergeometric_cpp -<<<<<<< HEAD List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform, bool adaptive); RcppExport SEXP _algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP, SEXP adaptiveSEXP) { -======= -List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin); -RcppExport SEXP _algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { ->>>>>>> b84c9d57a6404c52843645b7db04e0d26e0e01af BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -91,13 +86,8 @@ BEGIN_RCPP END_RCPP } // metropolis_uniform_cpp -<<<<<<< HEAD List metropolis_uniform_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform, bool adaptive); RcppExport SEXP _algstat_metropolis_uniform_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP, SEXP adaptiveSEXP) { -======= -List metropolis_uniform_cpp(IntegerVector current, IntegerMatrix moves, int iter, int thin); -RcppExport SEXP _algstat_metropolis_uniform_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { ->>>>>>> b84c9d57a6404c52843645b7db04e0d26e0e01af BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; From 6be6b68eedffa824fbc7658888792b9dc41b8137 Mon Sep 17 00:00:00 2001 From: Innerst Date: Thu, 11 Jan 2018 12:39:56 -0600 Subject: [PATCH 29/53] example fixes in metropolis.r --- R/metropolis.r | 14 ++++++++------ man/metropolis.Rd | 19 ++++++++++++------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/R/metropolis.r b/R/metropolis.r index 70f4d4c..ef88ffa 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -9,6 +9,8 @@ #' @param init the initial step #' @param moves the moves to be used (the negatives will be added); #' they are arranged as the columns of a matrix. +#' @param suffStats the sufficient statistics of the model. Only used when SIS = TRUE. Defaulted to 0. +#' @param config the configuration matrix that encodes the model. Only used when SIS = TRUE. Defaulted to matrix(0). #' @param iter number of chain iterations #' @param burn burn-in #' @param thin thinning @@ -97,9 +99,9 @@ #' #' A <- hmat(c(2,2), as.list(1:2)) #' moves <- markov(A) -#' outC <- metropolis(tab2vec(handy), moves, 1e4, engine = "Cpp") +#' outC <- metropolis(tab2vec(handy), moves, suffStats = tab2vec(handy) %*% A, config = A, 1e4, engine = "Cpp") #' str(outC) -#' outR <- metropolis(tab2vec(handy), moves, 1e4, engine = "R", thin = 20) +#' outR <- metropolis(tab2vec(handy), moves, suffStats = tab2vec(handy) %*% A, config = A, 1e4, engine = "R", thin = 20) #' str(outR) #' #' # showSteps(out$steps) @@ -107,8 +109,8 @@ #' #' library(microbenchmark) #' microbenchmark( -#' metropolis(tab2vec(handy), moves, engine = "Cpp"), -#' metropolis(tab2vec(handy), moves, engine = "R") +#' metropolis(tab2vec(handy), moves, suffStats = tab2vec(handy) %*% A, config = A,engine = "Cpp"), +#' metropolis(tab2vec(handy), moves, suffStats = tab2vec(handy) %*% A, config = A,engine = "R") #' ) #' #' # cpp ~ 20-25x faster @@ -169,7 +171,7 @@ #' } #' #' -metropolis <- function(init, moves, suffStats, config, iter = 1E3, burn = 0, thin = 1, +metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E3, burn = 0, thin = 1, dist = c("hypergeometric","uniform"), engine = c("Cpp","R"), hitAndRun = FALSE, SIS = FALSE, nonUniform = FALSE, adaptive = FALSE ){ @@ -371,7 +373,7 @@ metropolis <- function(init, moves, suffStats, config, iter = 1E3, burn = 0, thi out <- list( steps = state, moves = moves, - acceptProb = probTotal / totalRuns + accept_prob = probTotal / totalRuns ) diff --git a/man/metropolis.Rd b/man/metropolis.Rd index 7a5a46a..f75c66b 100644 --- a/man/metropolis.Rd +++ b/man/metropolis.Rd @@ -5,9 +5,10 @@ \alias{rawMetropolis} \title{The Metropolis Algorithm} \usage{ -metropolis(init, moves, suffStats, config, iter = 1000, burn = 0, - thin = 1, dist = c("hypergeometric", "uniform"), engine = c("Cpp", "R"), - hitAndRun = FALSE, SIS = FALSE, nonUniform = FALSE, adaptive = FALSE) +metropolis(init, moves, suffStats = 0, config = matrix(0), iter = 1000, + burn = 0, thin = 1, dist = c("hypergeometric", "uniform"), + engine = c("Cpp", "R"), hitAndRun = FALSE, SIS = FALSE, + nonUniform = FALSE, adaptive = FALSE) rawMetropolis(init, moves, iter = 1000, dist = "hypergeometric", hitAndRun = FALSE, SIS = FALSE, nonUniform = FALSE, adaptive = FALSE) @@ -18,6 +19,10 @@ rawMetropolis(init, moves, iter = 1000, dist = "hypergeometric", \item{moves}{the moves to be used (the negatives will be added); they are arranged as the columns of a matrix.} +\item{suffStats}{the sufficient statistics of the model. Only used when SIS = TRUE. Defaulted to 0.} + +\item{config}{the configuration matrix that encodes the model. Only used when SIS = TRUE. Defaulted to matrix(0).} + \item{iter}{number of chain iterations} \item{burn}{burn-in} @@ -118,9 +123,9 @@ fisher.test(handy)$p.value A <- hmat(c(2,2), as.list(1:2)) moves <- markov(A) -outC <- metropolis(tab2vec(handy), moves, 1e4, engine = "Cpp") +outC <- metropolis(tab2vec(handy), moves, suffStats = tab2vec(handy) \%*\% A, config = A, 1e4, engine = "Cpp") str(outC) -outR <- metropolis(tab2vec(handy), moves, 1e4, engine = "R", thin = 20) +outR <- metropolis(tab2vec(handy), moves, suffStats = tab2vec(handy) \%*\% A, config = A, 1e4, engine = "R", thin = 20) str(outR) # showSteps(out$steps) @@ -128,8 +133,8 @@ str(outR) library(microbenchmark) microbenchmark( - metropolis(tab2vec(handy), moves, engine = "Cpp"), - metropolis(tab2vec(handy), moves, engine = "R") + metropolis(tab2vec(handy), moves, suffStats = tab2vec(handy) \%*\% A, config = A,engine = "Cpp"), + metropolis(tab2vec(handy), moves, suffStats = tab2vec(handy) \%*\% A, config = A,engine = "R") ) # cpp ~ 20-25x faster From 25941a843c946e33ef7d8c4af1d724a7ed50ac0a Mon Sep 17 00:00:00 2001 From: Innerst Date: Fri, 2 Mar 2018 11:24:11 -0600 Subject: [PATCH 30/53] plyr to dplyr --- NAMESPACE | 2 +- R/Aglm.r | 27 ++++--- R/algstat.r | 3 +- R/pmat.R | 2 +- man/Aglm.Rd | 4 +- src/metropolis_hypergeometric_cpp.cpp | 109 ++++++++++++++------------ src/metropolis_uniform_cpp.cpp | 10 +-- 7 files changed, 83 insertions(+), 74 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e657dd6..4e65394 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ export(upper) export(variety) export(vec2tab) import(Rcpp) +import(dplyr) import(latter) import(m2r) import(mpoly) @@ -74,7 +75,6 @@ importFrom(parallel,makeCluster) importFrom(parallel,mclapply) importFrom(parallel,parLapply) importFrom(parallel,stopCluster) -importFrom(plyr,ddply) importFrom(reshape2,melt) importFrom(stats,deriv) importFrom(stats,dmultinom) diff --git a/R/Aglm.r b/R/Aglm.r index 3849d3f..dca58ec 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -65,7 +65,7 @@ #' predict(mod, data.frame(x = 1:5), type = "response") #' #' # aglm predictions -#' rowMeans(out$steps) / ddply(df, "x", nrow)$V1 +#' rowMeans(out$steps) / plry::ddply(df, "x", nrow)$V1 #' #' #' @@ -105,7 +105,7 @@ #' predict(mod, data.frame(x = c(1:5)), type = "response") #' #' # aglm predictions -#' rowMeans(out$steps) / ddply(df, "x", nrow)$V1 +#' rowMeans(out$steps) / plyr::ddply(df, "x", nrow)$V1 #' #' @export @@ -176,13 +176,16 @@ aglm <- function(model, data, family = poisson(), ## format the data names(data)[names(data) == response] <- "response" - + if (method == "binomial") { - data <- rbind(ddply(data, unique(unlist(model)), summarise, sum = sum(response)), - ddply(data, unique(unlist(model)), summarise, sum = length(response) - sum(response)) - ) + data <- group_by_(data, unique(unlist(model))) + success <- dplyr::summarise(data, sum = sum(response)) + failure <- dplyr::summarise(data, sum = length(response) - sum(response)) + data <- bind_rows(success, failure) + } else { - data <- ddply(data, unique(unlist(model)), summarise, sum = sum(response)) + data <- group_by_(data, unique(unlist(model))) + data <- dplyr::summarise(data, sum = sum(response)) } ## any 0 levels @@ -219,16 +222,16 @@ aglm <- function(model, data, family = poisson(), } ## levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) if(ncol(data) <= 2){ - levls <- unique(data[,-ncol(data)]) + levels <- unique(data[,-ncol(data)]) } else { - levls <- lapply(data[,-ncol(data)], unique) + levels <- lapply(data[,-ncol(data)], unique) } # make configuration (model) matrix - A <- pmat(levls, facets) + A <- pmat(levels, facets) } # check to see if all level configurations are there (need work here) - lvlsInData <- as.list(as.data.frame(t(expand.grid(levls)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) + lvlsInData <- as.list(as.data.frame(t(expand.grid(levels)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) # subset A by levels that are present A <- A[,lvlsInData] @@ -254,7 +257,7 @@ aglm <- function(model, data, family = poisson(), warning( "No moves were provided and 4ti2 is not found.\n", " The resulting chain is likely not connected and strongly autocorrelated.\n", - " See ?pois_reg. Consider using rmove to generate SIS moves in advance.", + " See ?aglm. Consider using rmove to generate SIS moves in advance.", immediate. = TRUE ) message("Computing 1000 SIS moves... ", appendLF = FALSE) diff --git a/R/algstat.r b/R/algstat.r index 7c03144..b6a4e75 100644 --- a/R/algstat.r +++ b/R/algstat.r @@ -7,7 +7,7 @@ #' process, algstat leverages ports to Macaulay2 (through m2r), Bertini, LattE #' and 4ti2 (through latter). #' -#' @import Rcpp mpoly latter m2r +#' @import Rcpp mpoly latter m2r dplyr #' @importFrom stringr str_detect str_c str_dup str_replace str_replace_all #' str_split str_sub str_sub<- #' @importFrom lpSolve lp @@ -18,7 +18,6 @@ #' @importFrom reshape2 melt #' @importFrom stats deriv dmultinom loglin runif sd model.frame #' @importFrom utils combn download.file -#' @importFrom plyr ddply #' @useDynLib algstat #' @docType package #' @name algstat diff --git a/R/pmat.R b/R/pmat.R index 4083b24..14d4f4a 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -61,7 +61,7 @@ pmat <- function(levels, facets){ facetList <- facetList[-1] } #return the configuration matrix which includes only the elements need for the heirarchical model - return(fullMat[c(TRUE, facetList %in% facets),]) + fullMat[c(TRUE, facetList %in% facets),] } diff --git a/man/Aglm.Rd b/man/Aglm.Rd index da38b0c..747b411 100644 --- a/man/Aglm.Rd +++ b/man/Aglm.Rd @@ -86,7 +86,7 @@ Fitting generalized linear models with algebraic methods predict(mod, data.frame(x = 1:5), type = "response") # aglm predictions - rowMeans(out$steps) / ddply(df, "x", nrow)$V1 + rowMeans(out$steps) / plry::ddply(df, "x", nrow)$V1 @@ -126,6 +126,6 @@ Fitting generalized linear models with algebraic methods predict(mod, data.frame(x = c(1:5)), type = "response") # aglm predictions - rowMeans(out$steps) / ddply(df, "x", nrow)$V1 + rowMeans(out$steps) / plyr::ddply(df, "x", nrow)$V1 } diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 9191a8d..98a6b6b 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -48,7 +48,6 @@ List metropolis_hypergeometric_cpp( unifs2 = runif(nTotalSamples); unifs3 = runif(nTotalSamples); Function print("print"); - NumericVector move_dist = rep(10.0, nMoves); double counter = sum(move_dist); @@ -77,7 +76,7 @@ List metropolis_hypergeometric_cpp( proposal[k] = current[k] + move[k]; } - }else{ + } else { // make move for(int k = 0; k < n; ++k){ @@ -91,49 +90,60 @@ List metropolis_hypergeometric_cpp( upperBound = stepSize[stepSize > 0]; lb = max(lowerBound); ub = min(upperBound); - + // MCMC inside MCMC //IntegerVector line = seq(lb, ub); if(adaptive){ - int line_length = ub-lb; - for(int m = 0; m < n;++m){ - w_current[m] = current[m]; - } - - for(int l = 0; l < line_length;++l){ - int constant2 = as(Rcpp::sample(constant, 1)); - for(int k = 0; k < n;++k){ - w_proposal[k] = w_current[k] + constant2 * move[k]; - } - bool anyIsNegative2; - anyIsNegative2 = false; - for(int k = 0; k < n; ++k){ - if(w_proposal[k] < 0){ - anyIsNegative2 = true; - } + int line_length = ub-lb + 1; + + for(int m = 0; m < n;++m){ + w_current[m] = current[m]; } - if(anyIsNegative2){ - prob2 = 0; - } else { - prob2 = exp( sum(lgamma(w_current+1)) - sum(lgamma(w_proposal+1)) ); + for(int l = 0; l < line_length; ++l){ + int constant2 = as(Rcpp::sample(constant, 1)); + for(int k = 0; k < n;++k){ + w_proposal[k] = w_current[k] + constant2 * move[k]; + } + bool anyIsNegative2; + anyIsNegative2 = false; + for(int k = 0; k < n; ++k){ + if(w_proposal[k] < 0){ + anyIsNegative2 = true; + } + } + + if(anyIsNegative2){ + prob2 = 0; + } else { + prob2 = exp( sum(lgamma(w_current+1)) - sum(lgamma(w_proposal+1)) ); + } + + if(prob2 > 1){ + prob2 = 1; + } + + // make move + if(unifs2[l] < prob2) { + for(int k = 0; k < n; ++k){ + w_current[k] = w_proposal[k]; + } + } } - - if(prob2 > 1){ - prob2 = 1; + bool didMove; + didMove = false; + for(int k = 0; k < n; ++k) { + if(w_current[k] != current[k]) didMove = true; } - - // make move - if(unifs[l] < prob2){ + if(didMove == true) { + for(int k = 0; k < n; ++k) { + proposal[k] = w_current[k]; + } + } else { for(int k = 0; k < n; ++k){ - w_current[k] = w_proposal[k]; + proposal[k] = current[k] + move[k]; } } - } - for(int k = 0; k < n; ++k){ - proposal[k] = w_current[k]; - } - } else { // Base Hit and Run @@ -145,23 +155,23 @@ List metropolis_hypergeometric_cpp( if(test2[i] < 0) ub = -1; } } - if(lb > ub){ - run[0] = 1; - }else{ - IntegerVector range = seq(lb,ub); - - run = Rcpp::sample(range,1); - } - if(run[0] == 0){ - run[0] = 1; + if(lb > ub){ + run = Rcpp::sample(constant, 1); + + } else { + IntegerVector range = seq(lb,ub); + run = Rcpp::sample(range,1); } - if(hit_and_run){ - for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + as(run) * move[k]; + if(run[0] == 0){ + run = Rcpp::sample(constant, 1); + } + if(hit_and_run){ + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + as(run) * move[k]; + } } } - } - }else{ + } else { for(int k = 0; k < n; ++k){ proposal[k] = current[k] + move[k]; } @@ -170,6 +180,7 @@ List metropolis_hypergeometric_cpp( if(SIS){ if(unifs2[i] < .01) proposal = sis_tbl(config, suff_stats); } + // compute probability of transition anyIsNegative = false; for(int k = 0; k < n; ++k){ diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index 80a87e1..187a6d9 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -128,11 +128,7 @@ List metropolis_uniform_cpp( for(int k = 0; k < n; ++k){ proposal[k] = w_current[k]; } - //Attempt at recursively calling MCMC routine - // List MCMC_out = metropolis_hypergeometric_cpp(current, as(move), suff_stats, config, 50, 1, false, false, false); - // IntegerMatrix mini_steps = MCMC_out[0]; - // int step_length = mini_steps.ncol(); - // proposal = mini_steps(_, step_length); + } else { if(is_true(any(stepSize == 0))){ @@ -144,7 +140,7 @@ List metropolis_uniform_cpp( } } if(lb > ub){ - run[0] = 1; + run = Rcpp::sample(constant, 1); }else{ IntegerVector range = seq(lb,ub); @@ -153,7 +149,7 @@ List metropolis_uniform_cpp( } if(run[0] == 0){ - run[0] = 1; + run = Rcpp::sample(constant, 1); } } if(hit_and_run == TRUE){ From a68a445f7d1d59b3366c750c296910b289ee879f Mon Sep 17 00:00:00 2001 From: Innerst Date: Fri, 16 Mar 2018 14:33:22 -0500 Subject: [PATCH 31/53] added extra out-arg to metropolis for testing --- DESCRIPTION | 3 ++- R/metropolis.r | 2 +- src/metropolis_hypergeometric_cpp.cpp | 17 ++++++++++------- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b454226..cc4f1cc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,7 +25,8 @@ Imports: Rcpp, lpSolve, parallel, - ggplot2 + ggplot2, + dplyr Suggests: vcd, microbenchmark diff --git a/R/metropolis.r b/R/metropolis.r index ef88ffa..b56c6b0 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -404,7 +404,7 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E ## return output ################################################## - out[c("steps", "moves", "accept_prob")] + out[c("steps", "moves", "accept_prob", "prob_vec")] } diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 98a6b6b..c20a697 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -27,6 +27,7 @@ List metropolis_hypergeometric_cpp( double prob; // the probability of transition double prob2; bool anyIsNegative; + bool anyIsNegative2; IntegerVector move(n); double accept_prob = 0; IntegerVector current_num; @@ -40,6 +41,8 @@ List metropolis_hypergeometric_cpp( IntegerVector constant = IntegerVector::create(-1,1); IntegerVector w_current(n); IntegerVector w_proposal(n); + NumericVector prob_vec(nTotalSamples); + Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); @@ -92,20 +95,20 @@ List metropolis_hypergeometric_cpp( ub = min(upperBound); // MCMC inside MCMC - //IntegerVector line = seq(lb, ub); if(adaptive){ int line_length = ub-lb + 1; - + for(int m = 0; m < n;++m){ w_current[m] = current[m]; } for(int l = 0; l < line_length; ++l){ + int constant2 = as(Rcpp::sample(constant, 1)); for(int k = 0; k < n;++k){ w_proposal[k] = w_current[k] + constant2 * move[k]; } - bool anyIsNegative2; + anyIsNegative2 = false; for(int k = 0; k < n; ++k){ if(w_proposal[k] < 0){ @@ -122,9 +125,8 @@ List metropolis_hypergeometric_cpp( if(prob2 > 1){ prob2 = 1; } - // make move - if(unifs2[l] < prob2) { + if(unifs[l] < prob2) { for(int k = 0; k < n; ++k){ w_current[k] = w_proposal[k]; } @@ -198,7 +200,7 @@ List metropolis_hypergeometric_cpp( if(prob > 1){ prob = 1; } - +prob_vec[thin*i+j] = prob; // store acceptance probability accept_prob = accept_prob + prob / nTotalSamples; @@ -232,7 +234,8 @@ List metropolis_hypergeometric_cpp( // create out list List out = List::create( Rcpp::Named("steps") = steps, - Rcpp::Named("accept_prob") = accept_prob + Rcpp::Named("accept_prob") = accept_prob, + Rcpp::Named("prob_vec") = prob_vec ); return out; From f704485a5824e19bcb8822c0dac77ec0575f4a30 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Sat, 17 Mar 2018 23:41:12 -0500 Subject: [PATCH 32/53] rm DS_Store --- src/.DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/.DS_Store diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Sat, 14 Apr 2018 14:08:12 -0500 Subject: [PATCH 33/53] initial greedy algorithm and others --- NAMESPACE | 4 +- R/RcppExports.R | 4 ++ R/metropolis.r | 2 +- src/RcppExports.cpp | 15 +++++ src/greedy_preprocess.cpp | 86 +++++++++++++++++++++++++++ src/metropolis_hypergeometric_cpp.cpp | 2 +- 6 files changed, 108 insertions(+), 5 deletions(-) create mode 100644 src/greedy_preprocess.cpp diff --git a/NAMESPACE b/NAMESPACE index cc24332..154febf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,8 +26,6 @@ export(mchoose) export(metropolis) export(plotMatrix) export(pmat) -export(polyOptim) -export(polySolve) export(projectOnto) export(rawMetropolis) export(rfiber) @@ -42,8 +40,8 @@ export(teshape) export(upper) export(vec2tab) import(Rcpp) -import(dplyr) import(bertini) +import(dplyr) import(latter) import(m2r) import(mpoly) diff --git a/R/RcppExports.R b/R/RcppExports.R index b1b3a31..1443086 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,6 +21,10 @@ computeX2sCpp <- function(x, exp) { .Call('_algstat_computeX2sCpp', PACKAGE = 'algstat', x, exp) } +greedy_preprocess <- function(current, moves, iter, thin) { + .Call('_algstat_greedy_preprocess', PACKAGE = 'algstat', current, moves, iter, thin) +} + metropolis_hypergeometric_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) { .Call('_algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) } diff --git a/R/metropolis.r b/R/metropolis.r index b56c6b0..ef88ffa 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -404,7 +404,7 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E ## return output ################################################## - out[c("steps", "moves", "accept_prob", "prob_vec")] + out[c("steps", "moves", "accept_prob")] } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 26da1a9..a863fad 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -65,6 +65,20 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// greedy_preprocess +List greedy_preprocess(IntegerVector current, IntegerMatrix moves, int iter, int thin); +RcppExport SEXP _algstat_greedy_preprocess(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type current(currentSEXP); + Rcpp::traits::input_parameter< IntegerMatrix >::type moves(movesSEXP); + Rcpp::traits::input_parameter< int >::type iter(iterSEXP); + Rcpp::traits::input_parameter< int >::type thin(thinSEXP); + rcpp_result_gen = Rcpp::wrap(greedy_preprocess(current, moves, iter, thin)); + return rcpp_result_gen; +END_RCPP +} // metropolis_hypergeometric_cpp List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform, bool adaptive); RcppExport SEXP _algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP, SEXP adaptiveSEXP) { @@ -150,6 +164,7 @@ static const R_CallMethodDef CallEntries[] = { {"_algstat_computeNMsCpp", (DL_FUNC) &_algstat_computeNMsCpp, 2}, {"_algstat_computeUProbsCpp", (DL_FUNC) &_algstat_computeUProbsCpp, 1}, {"_algstat_computeX2sCpp", (DL_FUNC) &_algstat_computeX2sCpp, 2}, + {"_algstat_greedy_preprocess", (DL_FUNC) &_algstat_greedy_preprocess, 4}, {"_algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &_algstat_metropolis_hypergeometric_cpp, 10}, {"_algstat_metropolis_uniform_cpp", (DL_FUNC) &_algstat_metropolis_uniform_cpp, 10}, {"_algstat_rfiberOne", (DL_FUNC) &_algstat_rfiberOne, 2}, diff --git a/src/greedy_preprocess.cpp b/src/greedy_preprocess.cpp new file mode 100644 index 0000000..c8def4e --- /dev/null +++ b/src/greedy_preprocess.cpp @@ -0,0 +1,86 @@ +#include +using namespace Rcpp; + + +// [[Rcpp::export]] +List greedy_preprocess( + IntegerVector current, + IntegerMatrix moves, + int iter, int thin +){ + + int nTotalSamples = iter * thin; // total number of steps + int n = current.size(); // number of cells + int nMoves = moves.ncol(); // number of moves + IntegerMatrix steps(n, iter); // columns are states + IntegerVector whichMove(nTotalSamples); // move selection + NumericVector unifs(nTotalSamples); // for transition probabilities + IntegerVector proposal(n); // the proposed moves + double prob; // the probability of transition + bool anyIsNegative; + IntegerVector move(n); + double accept_prob = 0; + + Function sample("sample"); + whichMove = sample(nMoves, nTotalSamples, 1); + Function runif("runif"); + unifs = runif(nTotalSamples); + Function print("print"); + + for(int i = 0; i < iter; ++i){ + for(int j = 0; j < thin; ++j){ + + // make move + for(int k = 0; k < n; ++k){ + move[k] = moves(k, whichMove[thin*i+j]-1); + } + + // compute proposal + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + move[k]; + } + + // compute probability of transition + anyIsNegative = false; + for(int k = 0; k < n; ++k){ + if(proposal[k] < 0){ + anyIsNegative = true; + } + } + + if(anyIsNegative){ + prob = 0; + } else { + prob = exp( sum(lgamma(current+1)) - sum(lgamma(proposal+1)) ); + } + + if(prob > 1){ + prob = 1; + } + + // store acceptance probability + accept_prob = accept_prob + prob / nTotalSamples; + + // make move + if(prob == 1){ + for(int k = 0; k < n; ++k){ + current[k] = proposal[k]; + } + } + + } + + // assign state move + for(int k = 0; k < n; ++k){ + steps(k,i) = current[k]; + } + } + + // create out list + List out = List::create( + Rcpp::Named("steps") = steps, + Rcpp::Named("accept_prob") = accept_prob + ); + + return out; +} \ No newline at end of file diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index c20a697..f91f3e0 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -126,7 +126,7 @@ List metropolis_hypergeometric_cpp( prob2 = 1; } // make move - if(unifs[l] < prob2) { + if(unifs[l] < prob2 * 3) { for(int k = 0; k < n; ++k){ w_current[k] = w_proposal[k]; } From 57e090115a9d4d8b311c3567a338490e47af16ed Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Sun, 15 Apr 2018 17:25:36 -0500 Subject: [PATCH 34/53] change code placement to include adaptive scheme --- src/metropolis_hypergeometric_cpp.cpp | 27 +++++++++++++++------------ src/metropolis_uniform_cpp.cpp | 22 +++++++++++++--------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index f91f3e0..cdddca3 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -93,11 +93,22 @@ List metropolis_hypergeometric_cpp( upperBound = stepSize[stepSize > 0]; lb = max(lowerBound); ub = min(upperBound); + + if(is_true(any(stepSize == 0))){ + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0) lb = 1; + if(test2[i] < 0) ub = -1; + } + } // MCMC inside MCMC if(adaptive){ + int line_length = ub-lb + 1; - + if(line_length < 0) line_length = 1; + for(int m = 0; m < n;++m){ w_current[m] = current[m]; } @@ -108,20 +119,19 @@ List metropolis_hypergeometric_cpp( for(int k = 0; k < n;++k){ w_proposal[k] = w_current[k] + constant2 * move[k]; } - + anyIsNegative2 = false; for(int k = 0; k < n; ++k){ if(w_proposal[k] < 0){ anyIsNegative2 = true; } } - if(anyIsNegative2){ prob2 = 0; } else { prob2 = exp( sum(lgamma(w_current+1)) - sum(lgamma(w_proposal+1)) ); } - + if(prob2 > 1){ prob2 = 1; } @@ -149,14 +159,7 @@ List metropolis_hypergeometric_cpp( } else { // Base Hit and Run - if(is_true(any(stepSize == 0))){ - IntegerVector test1 = current + lb * move; - IntegerVector test2 = current + ub * move; - for(int i = 0; i < n; ++i){ - if(test1[i] < 0) lb = 1; - if(test2[i] < 0) ub = -1; - } - } + if(lb > ub){ run = Rcpp::sample(constant, 1); diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index 187a6d9..e28c793 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -89,8 +89,20 @@ List metropolis_uniform_cpp( lb = max(lowerBound); ub = min(upperBound); + if(is_true(any(stepSize == 0))){ + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0) lb = 1; + if(test2[i] < 0) ub = -1; + } + } + if(adaptive){ - int line_length = ub-lb; + + int line_length = ub-lb + 1; + if(line_length < 0) line_length = 1; + for(int m = 0; m < n;++m){ w_current[m] = current[m]; } @@ -131,14 +143,6 @@ List metropolis_uniform_cpp( } else { - if(is_true(any(stepSize == 0))){ - IntegerVector test1 = current + lb * move; - IntegerVector test2 = current + ub * move; - for(int i = 0; i < n; ++i){ - if(test1[i] < 0) lb = 1; - if(test2[i] < 0) ub = -1; - } - } if(lb > ub){ run = Rcpp::sample(constant, 1); }else{ From 9785fb01480eaa413fe83dbe49da6d540b77b00c Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Sun, 13 May 2018 16:57:14 -0500 Subject: [PATCH 35/53] added efficient sample size calc --- R/metropolis.r | 6 ++++-- src/metropolis_hypergeometric_cpp.cpp | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/R/metropolis.r b/R/metropolis.r index ef88ffa..022981e 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -399,12 +399,14 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E message("done.") } - + PRs <- computeUProbsCpp(out$steps) + acfs <- acf(PRs, plot = FALSE) + out$neff <- iter / (1 + 2 * sum(acfs$acf[-1])) ## return output ################################################## - out[c("steps", "moves", "accept_prob")] + out[c("steps", "moves", "accept_prob", "neff")] } diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index cdddca3..d2298cd 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -136,7 +136,7 @@ List metropolis_hypergeometric_cpp( prob2 = 1; } // make move - if(unifs[l] < prob2 * 3) { + if(unifs[l] < prob2 * 3.14159) { for(int k = 0; k < n; ++k){ w_current[k] = w_proposal[k]; } @@ -237,8 +237,7 @@ prob_vec[thin*i+j] = prob; // create out list List out = List::create( Rcpp::Named("steps") = steps, - Rcpp::Named("accept_prob") = accept_prob, - Rcpp::Named("prob_vec") = prob_vec + Rcpp::Named("accept_prob") = accept_prob ); return out; From cd8ac1f10e8f4fb0e098b66a442030d0bc35c146 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Tue, 22 May 2018 11:30:26 -0500 Subject: [PATCH 36/53] ESS modification --- R/metropolis.r | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/metropolis.r b/R/metropolis.r index 022981e..ac64fe1 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -400,9 +400,13 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E } PRs <- computeUProbsCpp(out$steps) - acfs <- acf(PRs, plot = FALSE) - out$neff <- iter / (1 + 2 * sum(acfs$acf[-1])) - + acfs <- acf(PRs, plot = FALSE)$acf[,,1] + if(any(acfs < 0)) { + first_neg <- which(acfs < 0)[1] + out$neff <- iter / (1 + 2 * sum(acfs[2:(first_neg - 1)])) + } else { + out$neff <- iter / (1 + 2 * sum(acfs[-1])) + } ## return output ################################################## From 294be551b661b19748a555ae47ba9c5a7ee18094 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Tue, 22 May 2018 14:16:33 -0500 Subject: [PATCH 37/53] change ess to make match stan function --- R/metropolis.r | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/R/metropolis.r b/R/metropolis.r index ac64fe1..3632314 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -400,13 +400,25 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E } PRs <- computeUProbsCpp(out$steps) - acfs <- acf(PRs, plot = FALSE)$acf[,,1] - if(any(acfs < 0)) { - first_neg <- which(acfs < 0)[1] - out$neff <- iter / (1 + 2 * sum(acfs[2:(first_neg - 1)])) - } else { - out$neff <- iter / (1 + 2 * sum(acfs[-1])) + # acfs <- acf(PRs, plot = FALSE)$acf[,,1] + # if(any(acfs < 0)) { + # first_neg <- which(acfs < 0)[1] + # out$neff <- iter / (1 + 2 * sum(acfs[2:(first_neg - 1)])) + # } else { + # out$neff <- iter / (1 + 2 * sum(acfs[-1])) + # } + auto_cov <- acf(PRs, lag.max = iter - 1, plot = FALSE, type = "covariance")$acf[,,1] + mean_var <- auto_cov[1] * iter / (iter - 1) + rho_hat_sum <- 0 + for (t in 2:iter) { + rho_hat <- 1 - (mean_var - auto_cov[t]) / auto_cov[1] + if (is.nan(rho_hat)) rho_hat <- 0 + if (rho_hat < 0) break + rho_hat_sum <- rho_hat_sum + rho_hat } + ess <- iter + if (rho_hat_sum > 0) out$neff <- ess / (1 + 2 * rho_hat_sum) + ## return output ################################################## From 94f401f130572fe6bfe129a75cacf7d22139bde0 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Tue, 22 May 2018 14:37:56 -0500 Subject: [PATCH 38/53] move neff calculation to seperate function --- R/metropolis.r | 19 ------------------- R/neff.r | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 19 deletions(-) create mode 100644 R/neff.r diff --git a/R/metropolis.r b/R/metropolis.r index 3632314..11c5ac6 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -399,25 +399,6 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E message("done.") } - PRs <- computeUProbsCpp(out$steps) - # acfs <- acf(PRs, plot = FALSE)$acf[,,1] - # if(any(acfs < 0)) { - # first_neg <- which(acfs < 0)[1] - # out$neff <- iter / (1 + 2 * sum(acfs[2:(first_neg - 1)])) - # } else { - # out$neff <- iter / (1 + 2 * sum(acfs[-1])) - # } - auto_cov <- acf(PRs, lag.max = iter - 1, plot = FALSE, type = "covariance")$acf[,,1] - mean_var <- auto_cov[1] * iter / (iter - 1) - rho_hat_sum <- 0 - for (t in 2:iter) { - rho_hat <- 1 - (mean_var - auto_cov[t]) / auto_cov[1] - if (is.nan(rho_hat)) rho_hat <- 0 - if (rho_hat < 0) break - rho_hat_sum <- rho_hat_sum + rho_hat - } - ess <- iter - if (rho_hat_sum > 0) out$neff <- ess / (1 + 2 * rho_hat_sum) ## return output ################################################## diff --git a/R/neff.r b/R/neff.r new file mode 100644 index 0000000..4b4f4d0 --- /dev/null +++ b/R/neff.r @@ -0,0 +1,25 @@ +# Effective Sample Size +# +# Calculates the effective sample size of one MCMC chain. This function is very similar to Stan function rstan:::ess_rfun. +# +# +# the parameter chain should be the output of the metropolis function +# +# + +neff <- function(chain) { + PRs <- computeUProbsCpp(chain$steps) + chain_length <- length(PRs) + auto_cov <- acf(PRs, lag.max = chain_length - 1, plot = FALSE, type = "covariance")$acf[,,1] + mean_var <- auto_cov[1] * chain_length / (chain_length - 1) + rho_hat_sum <- 0 + for (t in 2:chain_length) { + rho_hat <- 1 - (mean_var - auto_cov[t]) / auto_cov[1] + if (is.nan(rho_hat)) rho_hat <- 0 + if (rho_hat < 0) break + rho_hat_sum <- rho_hat_sum + rho_hat + } + ess <- chain_length + if (rho_hat_sum > 0) ess <- ess / (1 + 2 * rho_hat_sum) +ess +} \ No newline at end of file From df0a67fa2f73e00ff5fe48d1829718fdf630d09a Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Tue, 22 May 2018 14:40:11 -0500 Subject: [PATCH 39/53] forgot to take out neff of metropolis --- R/metropolis.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/metropolis.r b/R/metropolis.r index 11c5ac6..b63dc7f 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -403,7 +403,7 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E ## return output ################################################## - out[c("steps", "moves", "accept_prob", "neff")] + out[c("steps", "moves", "accept_prob")] } From 9d90c0f6bd91ce8362cb42f648ce46ea53341891 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Fri, 1 Jun 2018 11:08:47 -0500 Subject: [PATCH 40/53] This is my sample commit --- R/metropolis.r | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/metropolis.r b/R/metropolis.r index b63dc7f..41e3d7e 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -404,6 +404,8 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E ################################################## out[c("steps", "moves", "accept_prob")] + + # This is a change to the function. } From e6dd22163d6d4f6f16acd26b893de3e039dea993 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Fri, 1 Jun 2018 11:51:31 -0500 Subject: [PATCH 41/53] removing comment --- R/metropolis.r | 1 - 1 file changed, 1 deletion(-) diff --git a/R/metropolis.r b/R/metropolis.r index 41e3d7e..539c65c 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -405,7 +405,6 @@ metropolis <- function(init, moves, suffStats = 0, config = matrix(0), iter = 1E out[c("steps", "moves", "accept_prob")] - # This is a change to the function. } From 06012f7a6aa10a5ed7639cfdec5c24ce926d1e18 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Wed, 13 Jun 2018 13:18:28 -0500 Subject: [PATCH 42/53] import dplyr to importFrom dplyr, algm fixes --- NAMESPACE | 4 +++- R/Aglm.r | 9 +++++---- R/algstat.r | 3 ++- man/metropolis.Rd | 2 +- src/metropolis_hypergeometric_cpp.cpp | 6 ++---- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 154febf..5efbd82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,10 +41,12 @@ export(upper) export(vec2tab) import(Rcpp) import(bertini) -import(dplyr) import(latter) import(m2r) import(mpoly) +importFrom(dplyr,bind_rows) +importFrom(dplyr,group_by_) +importFrom(dplyr,summarise) importFrom(ggplot2,coord_equal) importFrom(ggplot2,element_blank) importFrom(ggplot2,ggplot) diff --git a/R/Aglm.r b/R/Aglm.r index dca58ec..397e7ff 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -118,7 +118,7 @@ aglm <- function(model, data, family = poisson(), { ## set/check args ################################################## - + engine <- match.arg(engine) argList <- as.list(match.call(expand.dots = TRUE))[-1] @@ -179,13 +179,13 @@ aglm <- function(model, data, family = poisson(), if (method == "binomial") { data <- group_by_(data, unique(unlist(model))) - success <- dplyr::summarise(data, sum = sum(response)) - failure <- dplyr::summarise(data, sum = length(response) - sum(response)) + success <- summarise(data, sum = sum(response)) + failure <- summarise(data, sum = length(response) - sum(response)) data <- bind_rows(success, failure) } else { data <- group_by_(data, unique(unlist(model))) - data <- dplyr::summarise(data, sum = sum(response)) + data <- summarise(data, sum = sum(response)) } ## any 0 levels @@ -220,6 +220,7 @@ aglm <- function(model, data, family = poisson(), } else { stop("Invalid model specification, see ?aglm") } + facets <- lapply(facets, unname) ## levels (assuming all levels are numeric i.e. (1,2,3,... not Green, Blue, Red, etc.) if(ncol(data) <= 2){ levels <- unique(data[,-ncol(data)]) diff --git a/R/algstat.r b/R/algstat.r index a35eeed..7ebf3d8 100644 --- a/R/algstat.r +++ b/R/algstat.r @@ -7,10 +7,11 @@ #' process, algstat leverages ports to Macaulay2 (through m2r), Bertini, LattE #' and 4ti2 (through latter). #' -#' @import Rcpp mpoly latter m2r dplyr bertini +#' @import Rcpp mpoly latter m2r bertini #' @importFrom stringr str_detect str_c str_dup str_replace str_replace_all #' str_split str_sub str_sub<- #' @importFrom lpSolve lp +#' @importFrom dplyr group_by_ summarise bind_rows #' @importFrom ggplot2 ggplot scale_x_continuous scale_y_continuous #' scale_fill_gradient scale_fill_gradient2 qplot theme_bw coord_equal theme #' element_blank diff --git a/man/metropolis.Rd b/man/metropolis.Rd index f75c66b..f2a57e7 100644 --- a/man/metropolis.Rd +++ b/man/metropolis.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/metropolis.r +% Please edit documentation in R/metropolis.R \name{metropolis} \alias{metropolis} \alias{rawMetropolis} diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index d2298cd..5ec896b 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -41,8 +41,6 @@ List metropolis_hypergeometric_cpp( IntegerVector constant = IntegerVector::create(-1,1); IntegerVector w_current(n); IntegerVector w_proposal(n); - NumericVector prob_vec(nTotalSamples); - Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); @@ -136,7 +134,7 @@ List metropolis_hypergeometric_cpp( prob2 = 1; } // make move - if(unifs[l] < prob2 * 3.14159) { + if(unifs[l] < prob2) { for(int k = 0; k < n; ++k){ w_current[k] = w_proposal[k]; } @@ -203,7 +201,7 @@ List metropolis_hypergeometric_cpp( if(prob > 1){ prob = 1; } -prob_vec[thin*i+j] = prob; + // store acceptance probability accept_prob = accept_prob + prob / nTotalSamples; From 9edab7c1528eb49e9082164dd8d07298be3f413e Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Thu, 26 Jul 2018 22:17:57 -0500 Subject: [PATCH 43/53] typo fix, small changes to aglm function --- NAMESPACE | 2 +- R/Aglm.r | 28 +++++++++++++++++++--------- R/algstat.r | 2 +- R/pmat.R | 6 +++--- man/Aglm.Rd | 2 +- 5 files changed, 25 insertions(+), 15 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 5efbd82..e817366 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,7 +45,7 @@ import(latter) import(m2r) import(mpoly) importFrom(dplyr,bind_rows) -importFrom(dplyr,group_by_) +importFrom(dplyr,group_by) importFrom(dplyr,summarise) importFrom(ggplot2,coord_equal) importFrom(ggplot2,element_blank) diff --git a/R/Aglm.r b/R/Aglm.r index 397e7ff..70c80cb 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -65,7 +65,7 @@ #' predict(mod, data.frame(x = 1:5), type = "response") #' #' # aglm predictions -#' rowMeans(out$steps) / plry::ddply(df, "x", nrow)$V1 +#' rowMeans(out$steps) / plyr::ddply(df, "x", nrow)$V1 #' #' #' @@ -118,7 +118,6 @@ aglm <- function(model, data, family = poisson(), { ## set/check args ################################################## - engine <- match.arg(engine) argList <- as.list(match.call(expand.dots = TRUE))[-1] @@ -178,13 +177,13 @@ aglm <- function(model, data, family = poisson(), names(data)[names(data) == response] <- "response" if (method == "binomial") { - data <- group_by_(data, unique(unlist(model))) + data <- group_by(data, .dots = unique(unlist(model))) success <- summarise(data, sum = sum(response)) failure <- summarise(data, sum = length(response) - sum(response)) data <- bind_rows(success, failure) } else { - data <- group_by_(data, unique(unlist(model))) + data <- group_by(data, .dots = unique(unlist(model))) data <- summarise(data, sum = sum(response)) } @@ -226,13 +225,14 @@ aglm <- function(model, data, family = poisson(), levels <- unique(data[,-ncol(data)]) } else { levels <- lapply(data[,-ncol(data)], unique) + levels <- lapply(levels, sort) } # make configuration (model) matrix A <- pmat(levels, facets) } - # check to see if all level configurations are there (need work here) - lvlsInData <- as.list(as.data.frame(t(expand.grid(levels)))) %in% as.list(as.data.frame(t(data[,-ncol(data)]))) + # check to see if all level configurations are there (need work here) + lvlsInData <- as.list(as.data.frame(t(expand.grid(levels)))) %in% as.list(as.data.frame(t(data[, -ncol(data)]))) # subset A by levels that are present A <- A[,lvlsInData] @@ -285,9 +285,19 @@ aglm <- function(model, data, family = poisson(), ## run metropolis-hastings ################################################## init <- unname(init) # init - out <- metropolis(init, moves, suffStats = suffStats, config = unname(A), iter = iter, burn = burn, thin = thin, - engine = engine, ...) - + out <- + metropolis( + init, + moves, + suffStats = suffStats, + config = unname(A), + iter = iter, + burn = burn, + thin = thin, + engine = engine, + ... + ) + u <- t(t(data$sum)) PR <- computeUProbsCpp(matrix(u)) # unnormd prob; numers LAS 1.1.10 diff --git a/R/algstat.r b/R/algstat.r index 7ebf3d8..e4e4a97 100644 --- a/R/algstat.r +++ b/R/algstat.r @@ -11,7 +11,7 @@ #' @importFrom stringr str_detect str_c str_dup str_replace str_replace_all #' str_split str_sub str_sub<- #' @importFrom lpSolve lp -#' @importFrom dplyr group_by_ summarise bind_rows +#' @importFrom dplyr group_by summarise bind_rows #' @importFrom ggplot2 ggplot scale_x_continuous scale_y_continuous #' scale_fill_gradient scale_fill_gradient2 qplot theme_bw coord_equal theme #' element_blank diff --git a/R/pmat.R b/R/pmat.R index 14d4f4a..fffe54a 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -23,9 +23,9 @@ #' @export pmat pmat <- function(levels, facets){ - + #Small function to make single covariate configuration matrix - func <- function(x) rbind(rep(1, length(x)), x) + func <- function(x) unname(rbind(rep(1, length(x)), x)) if(!is.list(levels)){ numCovariates <- 1 @@ -61,7 +61,7 @@ pmat <- function(levels, facets){ facetList <- facetList[-1] } #return the configuration matrix which includes only the elements need for the heirarchical model - fullMat[c(TRUE, facetList %in% facets),] + fullMat[c(TRUE, facetList %in% type.convert(facets)),] } diff --git a/man/Aglm.Rd b/man/Aglm.Rd index 747b411..4a21cea 100644 --- a/man/Aglm.Rd +++ b/man/Aglm.Rd @@ -86,7 +86,7 @@ Fitting generalized linear models with algebraic methods predict(mod, data.frame(x = 1:5), type = "response") # aglm predictions - rowMeans(out$steps) / plry::ddply(df, "x", nrow)$V1 + rowMeans(out$steps) / plyr::ddply(df, "x", nrow)$V1 From a45b60f1c79b9c8c8b621c27e31f714c825852d9 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Thu, 2 Aug 2018 22:44:28 -0500 Subject: [PATCH 44/53] add segre product --- NAMESPACE | 1 + R/pmat.R | 47 +++++++++++++++++++++++++---------------------- R/segre.R | 35 +++++++++++++++++++++++++++++++++++ man/segre.Rd | 27 +++++++++++++++++++++++++++ 4 files changed, 88 insertions(+), 22 deletions(-) create mode 100644 R/segre.R create mode 100644 man/segre.Rd diff --git a/NAMESPACE b/NAMESPACE index e817366..d735e31 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,6 +31,7 @@ export(rawMetropolis) export(rfiber) export(rmove) export(rvotes) +export(segre) export(spectral) export(subsets) export(tab2array) diff --git a/R/pmat.R b/R/pmat.R index fffe54a..e85a93c 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -22,25 +22,26 @@ #' pmat(levels, facets) #' @export pmat -pmat <- function(levels, facets){ - +pmat <- function(levels, facets) { #Small function to make single covariate configuration matrix - func <- function(x) unname(rbind(rep(1, length(x)), x)) + func <- function(x) + unname(rbind(rep(1, length(x)), x)) - if(!is.list(levels)){ + if (!is.list(levels)) { numCovariates <- 1 fullMat <- func(levels) - }else{ - numCovariates <- length(levels) - #Make single covariate configuration matrix for each covariate - matList <- lapply(levels, func) - #Full heirarchicial config matrix with all interactions included - fullMat <- do.call(kprod, matList) + } else{ + numCovariates <- length(levels) + #Make single covariate configuration matrix for each covariate + matList <- lapply(levels, func) + #Full heirarchicial config matrix with all interactions included + fullMat <- do.call(kprod, matList) + # fullMat <- do.call(segre, matList) } expCov <- 1:numCovariates - + #Checking heirarchical sturcture of facets - if(any(sapply(facets, length) > 1)){ + if (any(sapply(facets, length) > 1)) { longListElts <- facets[which(sapply(facets, length) > 1)] uniqueVals <- unique(unlist(longListElts)) @@ -51,17 +52,19 @@ pmat <- function(levels, facets){ } #All possible combinations of covariates (powerset like) to be compared to facets - if(length(expCov) == 1) { - facetList <- list(expCov) - }else{ - facetList <- list(integer(0)) - for(i in seq_along(expCov)){ - facetList <- c(facetList, lapply(facetList, function(x) c(x,expCov[i]))) - } - facetList <- facetList[-1] - } + if (length(expCov) == 1) { + facetList <- list(expCov) + } else{ + facetList <- list(integer(0)) + for (i in seq_along(expCov)) { + facetList <- + c(facetList, lapply(facetList, function(x) + c(x, expCov[i]))) + } + facetList <- facetList[-1] + } #return the configuration matrix which includes only the elements need for the heirarchical model - fullMat[c(TRUE, facetList %in% type.convert(facets)),] + fullMat[c(TRUE, facetList %in% type.convert(facets)), ] } diff --git a/R/segre.R b/R/segre.R new file mode 100644 index 0000000..b500c19 --- /dev/null +++ b/R/segre.R @@ -0,0 +1,35 @@ +#' Segre Product +#' +#' Compute the Segre product of an arbitrary number of matrices +#' +#' @param ... A listing of matrices +#' +#' @return A matrix that is the Segre product of the specified matrices. +#' @export +#' +#' @examples +#' +#' A <- B <- C <- matrix(c(1,1,1,2,1,3,1,4,1,5), nrow = 2, ncol = 5) +#' +#' # two matrices +#' segre(A, B) +#' +#' # more +#' segre(A, B, C) +segre <- function(...) { + Reduce(function(x,y) { + dim_x <- dim(x) + dim_y <- dim(y) + cols <- dim_x[2] * dim_y[2] + rows <- dim_x[1] + dim_y[1] + mat <- matrix(0L, nrow = rows, ncol = cols) + k <- 1 + for (i in 1:dim_x[2]) { + for (j in 1:dim_y[2]) { + mat[,k] <- c(x[,i],y[,j]) + k <- k+1 + } + } + mat[!duplicated(mat),] + }, list(...)) +} \ No newline at end of file diff --git a/man/segre.Rd b/man/segre.Rd new file mode 100644 index 0000000..942e929 --- /dev/null +++ b/man/segre.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/segre.R +\name{segre} +\alias{segre} +\title{Segre Product} +\usage{ +segre(...) +} +\arguments{ +\item{...}{A listing of matrices} +} +\value{ +A matrix that is the Segre product of the specified matrices. +} +\description{ +Compute the Segre product of an arbitrary number of matrices +} +\examples{ + +A <- B <- C <- matrix(c(1,1,1,2,1,3,1,4,1,5), nrow = 2, ncol = 5) + +# two matrices +segre(A, B) + +# more +segre(A, B, C) +} From 8e5024447a6cec65075daad605de51c1f5c29b57 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Wed, 17 Oct 2018 00:34:06 -0500 Subject: [PATCH 45/53] refactor metropolis acceleration options --- R/RcppExports.R | 8 ++ src/RcppExports.cpp | 26 ++++++ src/adaptive_fun.cpp | 107 ++++++++++++++++++++++++ src/adaptive_fun.h | 9 +++ src/hit_and_run_fun.cpp | 57 +++++++++++++ src/hit_and_run_fun.h | 9 +++ src/metropolis_hypergeometric_cpp.cpp | 112 ++------------------------ 7 files changed, 224 insertions(+), 104 deletions(-) create mode 100644 src/adaptive_fun.cpp create mode 100644 src/adaptive_fun.h create mode 100644 src/hit_and_run_fun.cpp create mode 100644 src/hit_and_run_fun.h diff --git a/R/RcppExports.R b/R/RcppExports.R index 1443086..d2d367a 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,10 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +adaptive_fun <- function(current, move) { + .Call('_algstat_adaptive_fun', PACKAGE = 'algstat', current, move) +} + computeCRsCpp <- function(x, exp, lambda) { .Call('_algstat_computeCRsCpp', PACKAGE = 'algstat', x, exp, lambda) } @@ -25,6 +29,10 @@ greedy_preprocess <- function(current, moves, iter, thin) { .Call('_algstat_greedy_preprocess', PACKAGE = 'algstat', current, moves, iter, thin) } +hit_and_run_fun <- function(current, move) { + .Call('_algstat_hit_and_run_fun', PACKAGE = 'algstat', current, move) +} + metropolis_hypergeometric_cpp <- function(current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) { .Call('_algstat_metropolis_hypergeometric_cpp', PACKAGE = 'algstat', current, moves, suff_stats, config, iter, thin, hit_and_run, SIS, non_uniform, adaptive) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a863fad..1f4aec1 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,18 @@ using namespace Rcpp; +// adaptive_fun +IntegerVector adaptive_fun(IntegerVector current, IntegerVector move); +RcppExport SEXP _algstat_adaptive_fun(SEXP currentSEXP, SEXP moveSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type current(currentSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type move(moveSEXP); + rcpp_result_gen = Rcpp::wrap(adaptive_fun(current, move)); + return rcpp_result_gen; +END_RCPP +} // computeCRsCpp NumericVector computeCRsCpp(NumericMatrix x, NumericVector exp, double lambda); RcppExport SEXP _algstat_computeCRsCpp(SEXP xSEXP, SEXP expSEXP, SEXP lambdaSEXP) { @@ -79,6 +91,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// hit_and_run_fun +IntegerVector hit_and_run_fun(IntegerVector current, IntegerVector move); +RcppExport SEXP _algstat_hit_and_run_fun(SEXP currentSEXP, SEXP moveSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type current(currentSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type move(moveSEXP); + rcpp_result_gen = Rcpp::wrap(hit_and_run_fun(current, move)); + return rcpp_result_gen; +END_RCPP +} // metropolis_hypergeometric_cpp List metropolis_hypergeometric_cpp(IntegerVector current, IntegerMatrix moves, IntegerVector suff_stats, IntegerMatrix config, int iter, int thin, bool hit_and_run, bool SIS, bool non_uniform, bool adaptive); RcppExport SEXP _algstat_metropolis_hypergeometric_cpp(SEXP currentSEXP, SEXP movesSEXP, SEXP suff_statsSEXP, SEXP configSEXP, SEXP iterSEXP, SEXP thinSEXP, SEXP hit_and_runSEXP, SEXP SISSEXP, SEXP non_uniformSEXP, SEXP adaptiveSEXP) { @@ -159,12 +183,14 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { + {"_algstat_adaptive_fun", (DL_FUNC) &_algstat_adaptive_fun, 2}, {"_algstat_computeCRsCpp", (DL_FUNC) &_algstat_computeCRsCpp, 3}, {"_algstat_computeG2sCpp", (DL_FUNC) &_algstat_computeG2sCpp, 2}, {"_algstat_computeNMsCpp", (DL_FUNC) &_algstat_computeNMsCpp, 2}, {"_algstat_computeUProbsCpp", (DL_FUNC) &_algstat_computeUProbsCpp, 1}, {"_algstat_computeX2sCpp", (DL_FUNC) &_algstat_computeX2sCpp, 2}, {"_algstat_greedy_preprocess", (DL_FUNC) &_algstat_greedy_preprocess, 4}, + {"_algstat_hit_and_run_fun", (DL_FUNC) &_algstat_hit_and_run_fun, 2}, {"_algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &_algstat_metropolis_hypergeometric_cpp, 10}, {"_algstat_metropolis_uniform_cpp", (DL_FUNC) &_algstat_metropolis_uniform_cpp, 10}, {"_algstat_rfiberOne", (DL_FUNC) &_algstat_rfiberOne, 2}, diff --git a/src/adaptive_fun.cpp b/src/adaptive_fun.cpp new file mode 100644 index 0000000..1b10070 --- /dev/null +++ b/src/adaptive_fun.cpp @@ -0,0 +1,107 @@ +#include +using namespace Rcpp; + + +// [[Rcpp::export]] +IntegerVector adaptive_fun(IntegerVector current, IntegerVector move) { + + int n = current.size(); + IntegerVector current_num; + IntegerVector move_num; + IntegerVector stepSize; + IntegerVector upperBound; + IntegerVector lowerBound; + int lb; + int ub; + IntegerVector run; + IntegerVector constant = IntegerVector::create(-1,1); + int constant2; + IntegerVector w_current(n); + IntegerVector w_proposal(n); + IntegerVector proposal(n); + bool anyIsNegative; + double prob; + bool didMove; + NumericVector unifs(1000); + Function runif("runif"); + unifs = runif(1000); + + current_num = current[move != 0]; + move_num = move[move != 0]; + stepSize = (-1 * current_num) / move_num; + lowerBound = stepSize[stepSize < 0]; + upperBound = stepSize[stepSize > 0]; + lb = max(lowerBound); + ub = min(upperBound); + + if(is_true(any(stepSize == 0))){ + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0) lb = 1; + if(test2[i] < 0) ub = -1; + } + } + + int line_length = ub-lb + 1; + if(line_length < 0) line_length = 1; + + for(int m = 0; m < n;++m){ + w_current[m] = current[m]; + } + + for(int l = 0; l < line_length; ++l){ + + constant2 = as(Rcpp::sample(constant, 1)); + if(constant2 == 0) constant2 = 1; + + for(int k = 0; k < n;++k){ + w_proposal[k] = w_current[k] + constant2 * move[k]; + } + + anyIsNegative = false; + for(int k = 0; k < n; ++k){ + if(w_proposal[k] < 0){ + anyIsNegative = true; + } + } + if(anyIsNegative){ + prob = 0; + } else { + prob = exp( sum(lgamma(w_current+1)) - sum(lgamma(w_proposal+1)) ); + } + + if(prob > 1){ + prob = 1; + } + // make move + if(unifs[l] < prob) { + for(int k = 0; k < n; ++k){ + w_current[k] = w_proposal[k]; + } + } + } + + didMove = false; + for(int k = 0; k < n; ++k) { + if(w_current[k] != current[k]) didMove = true; + } + if(didMove == true) { + for(int k = 0; k < n; ++k) { + proposal[k] = w_current[k]; + } + } else { + constant2 = as(Rcpp::sample(constant, 1)); + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + constant2 * move[k]; + } + } + + return proposal; +} + + + +/*** R +adaptive_fun(c(8,3,8,1), c(1,-1,-1,1)) +*/ diff --git a/src/adaptive_fun.h b/src/adaptive_fun.h new file mode 100644 index 0000000..ab83704 --- /dev/null +++ b/src/adaptive_fun.h @@ -0,0 +1,9 @@ +// adaptive_fun.h + +#ifndef __ADAPTIVE_FUN_H__ +#define __ADAPTIVE_FUN_H__ + +#include +#endif + +Rcpp::IntegerVector adaptive_fun(Rcpp::IntegerVector current, Rcpp::IntegerVector move); \ No newline at end of file diff --git a/src/hit_and_run_fun.cpp b/src/hit_and_run_fun.cpp new file mode 100644 index 0000000..13e5ff4 --- /dev/null +++ b/src/hit_and_run_fun.cpp @@ -0,0 +1,57 @@ +#include +using namespace Rcpp; + + + +// [[Rcpp::export]] + + +IntegerVector hit_and_run_fun(IntegerVector current, IntegerVector move) { + + int n = current.size(); + IntegerVector current_num; + IntegerVector move_num; + IntegerVector stepSize; + IntegerVector upperBound; + IntegerVector lowerBound; + int lb; + int ub; + IntegerVector run; + IntegerVector constant = IntegerVector::create(-1,1); + IntegerVector proposal(n); + + current_num = current[move != 0]; + move_num = move[move != 0]; + stepSize = (-1 * current_num) / move_num; + lowerBound = stepSize[stepSize < 0]; + upperBound = stepSize[stepSize > 0]; + lb = max(lowerBound); + ub = min(upperBound); + + if(is_true(any(stepSize == 0))){ + IntegerVector test1 = current + lb * move; + IntegerVector test2 = current + ub * move; + for(int i = 0; i < n; ++i){ + if(test1[i] < 0) lb = 1; + if(test2[i] < 0) ub = -1; + } + } + if(lb > ub){ + run = Rcpp::sample(constant, 1); + + } else { + IntegerVector range = seq(lb,ub); + run = Rcpp::sample(range,1); + } + if(run[0] == 0){ + run = Rcpp::sample(constant, 1); + } + + for(int k = 0; k < n; ++k){ + proposal[k] = current[k] + as(run) * move[k]; + } + return proposal; +} + + + diff --git a/src/hit_and_run_fun.h b/src/hit_and_run_fun.h new file mode 100644 index 0000000..5477c92 --- /dev/null +++ b/src/hit_and_run_fun.h @@ -0,0 +1,9 @@ +// hit_and_run_fun.h + +#ifndef __HIT_AND_RUN_FUN_H__ +#define __HIT_AND_RUN_FUN_H__ + +#include +#endif + +Rcpp::IntegerVector hit_and_run_fun(Rcpp::IntegerVector current, Rcpp::IntegerVector move); \ No newline at end of file diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 5ec896b..997c5c2 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -1,5 +1,7 @@ #include #include "sis_tbl.h" +#include "hit_and_run_fun.h" +#include "adaptive_fun.h" using namespace Rcpp; // [[Rcpp::export]] @@ -25,22 +27,9 @@ List metropolis_hypergeometric_cpp( NumericVector unifs3(nTotalSamples); IntegerVector proposal(n); // the proposed moves double prob; // the probability of transition - double prob2; bool anyIsNegative; - bool anyIsNegative2; IntegerVector move(n); double accept_prob = 0; - IntegerVector current_num; - IntegerVector move_num; - IntegerVector stepSize; - IntegerVector upperBound; - IntegerVector lowerBound; - int lb; - int ub; - IntegerVector run; - IntegerVector constant = IntegerVector::create(-1,1); - IntegerVector w_current(n); - IntegerVector w_proposal(n); Function sample("sample"); whichMove = sample(nMoves, nTotalSamples, 1); @@ -83,98 +72,13 @@ List metropolis_hypergeometric_cpp( for(int k = 0; k < n; ++k){ move[k] = moves(k, whichMove[thin*i+j]-1); } - if(hit_and_run){ - current_num = current[move != 0]; - move_num = move[move != 0]; - stepSize = (-1 * current_num) / move_num; - lowerBound = stepSize[stepSize < 0]; - upperBound = stepSize[stepSize > 0]; - lb = max(lowerBound); - ub = min(upperBound); - - if(is_true(any(stepSize == 0))){ - IntegerVector test1 = current + lb * move; - IntegerVector test2 = current + ub * move; - for(int i = 0; i < n; ++i){ - if(test1[i] < 0) lb = 1; - if(test2[i] < 0) ub = -1; - } - } - - // MCMC inside MCMC - if(adaptive){ - - int line_length = ub-lb + 1; - if(line_length < 0) line_length = 1; - - for(int m = 0; m < n;++m){ - w_current[m] = current[m]; - } - - for(int l = 0; l < line_length; ++l){ - - int constant2 = as(Rcpp::sample(constant, 1)); - for(int k = 0; k < n;++k){ - w_proposal[k] = w_current[k] + constant2 * move[k]; - } - anyIsNegative2 = false; - for(int k = 0; k < n; ++k){ - if(w_proposal[k] < 0){ - anyIsNegative2 = true; - } - } - if(anyIsNegative2){ - prob2 = 0; - } else { - prob2 = exp( sum(lgamma(w_current+1)) - sum(lgamma(w_proposal+1)) ); - } - - if(prob2 > 1){ - prob2 = 1; - } - // make move - if(unifs[l] < prob2) { - for(int k = 0; k < n; ++k){ - w_current[k] = w_proposal[k]; - } - } - } - bool didMove; - didMove = false; - for(int k = 0; k < n; ++k) { - if(w_current[k] != current[k]) didMove = true; - } - if(didMove == true) { - for(int k = 0; k < n; ++k) { - proposal[k] = w_current[k]; - } - } else { - for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + move[k]; - } - } - } else { - - // Base Hit and Run + // make proposal + if(hit_and_run) proposal = hit_and_run_fun(current, move); - if(lb > ub){ - run = Rcpp::sample(constant, 1); - - } else { - IntegerVector range = seq(lb,ub); - run = Rcpp::sample(range,1); - } - if(run[0] == 0){ - run = Rcpp::sample(constant, 1); - } - if(hit_and_run){ - for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + as(run) * move[k]; - } - } - } - } else { + if(adaptive) proposal = adaptive_fun(current, move); + + if(hit_and_run == false & adaptive == false & non_uniform == false) { for(int k = 0; k < n; ++k){ proposal[k] = current[k] + move[k]; } @@ -216,7 +120,7 @@ List metropolis_hypergeometric_cpp( move_dist[which_move] = move_dist[which_move] + 1; ++counter; } - }else{ + } else { // make move if(unifs[thin*i+j] < prob){ for(int k = 0; k < n; ++k){ From 82e1377d61d97e7f8ff8468e43cc8189e7a1d262 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Fri, 4 Jan 2019 10:41:51 -0500 Subject: [PATCH 46/53] temporary changes for dissertation illustration --- R/Aglm.r | 1 + R/metropolis.r | 2 +- src/adaptive_fun.cpp | 33 ++++++++++++--------------- src/metropolis_hypergeometric_cpp.cpp | 2 +- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/R/Aglm.r b/R/Aglm.r index 70c80cb..764384b 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -116,6 +116,7 @@ aglm <- function(model, data, family = poisson(), moves, ...) { + ## set/check args ################################################## engine <- match.arg(engine) diff --git a/R/metropolis.r b/R/metropolis.r index 539c65c..a6517fb 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -141,7 +141,7 @@ #' #' ggplot(data = data) + geom_line(aes(steps, baseStats)) + #' geom_line(aes(steps, harStats), color = "red") + -#' labs(x = "Steps", y = "UNLL value", title = "Base Algorithm vs. Algorithm with Hit and Run option in red") +#' labs(x = "Steps", y = "UNLP value", title = "Base Algorithm vs. Algorithm with Hit and Run option in red") #' #' #' showSteps <- function(steps){ diff --git a/src/adaptive_fun.cpp b/src/adaptive_fun.cpp index 1b10070..7b3edea 100644 --- a/src/adaptive_fun.cpp +++ b/src/adaptive_fun.cpp @@ -21,10 +21,9 @@ IntegerVector adaptive_fun(IntegerVector current, IntegerVector move) { IntegerVector proposal(n); bool anyIsNegative; double prob; - bool didMove; + // bool didMove; NumericVector unifs(1000); Function runif("runif"); - unifs = runif(1000); current_num = current[move != 0]; move_num = move[move != 0]; @@ -45,6 +44,7 @@ IntegerVector adaptive_fun(IntegerVector current, IntegerVector move) { int line_length = ub-lb + 1; if(line_length < 0) line_length = 1; + unifs = runif(line_length); for(int m = 0; m < n;++m){ w_current[m] = current[m]; @@ -53,7 +53,6 @@ IntegerVector adaptive_fun(IntegerVector current, IntegerVector move) { for(int l = 0; l < line_length; ++l){ constant2 = as(Rcpp::sample(constant, 1)); - if(constant2 == 0) constant2 = 1; for(int k = 0; k < n;++k){ w_proposal[k] = w_current[k] + constant2 * move[k]; @@ -75,33 +74,29 @@ IntegerVector adaptive_fun(IntegerVector current, IntegerVector move) { prob = 1; } // make move - if(unifs[l] < prob) { + if(unifs[l] < prob * 14) { for(int k = 0; k < n; ++k){ w_current[k] = w_proposal[k]; } } } - didMove = false; - for(int k = 0; k < n; ++k) { - if(w_current[k] != current[k]) didMove = true; - } - if(didMove == true) { +// didMove = false; +// for(int k = 0; k < n; ++k) { +// if(w_current[k] != current[k]) didMove = true; +// } +// if(didMove == true) { for(int k = 0; k < n; ++k) { proposal[k] = w_current[k]; } - } else { - constant2 = as(Rcpp::sample(constant, 1)); - for(int k = 0; k < n; ++k){ - proposal[k] = current[k] + constant2 * move[k]; - } - } +// } else { +// constant2 = as(Rcpp::sample(constant, 1)); +// for(int k = 0; k < n; ++k){ +// proposal[k] = current[k] + constant2 * move[k]; +// } +// } return proposal; } - -/*** R -adaptive_fun(c(8,3,8,1), c(1,-1,-1,1)) -*/ diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index 997c5c2..f418b42 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -39,7 +39,7 @@ List metropolis_hypergeometric_cpp( unifs3 = runif(nTotalSamples); Function print("print"); - NumericVector move_dist = rep(10.0, nMoves); + NumericVector move_dist = rep(1.0, nMoves); double counter = sum(move_dist); int which_move; From 6bb87fc68b87ddd4f8fa7dd9a6c0c60f8cdc7b89 Mon Sep 17 00:00:00 2001 From: Grant Innerst Date: Fri, 4 Jan 2019 21:52:13 -0500 Subject: [PATCH 47/53] reverting to proper value in non-uniform sampler --- src/metropolis_hypergeometric_cpp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metropolis_hypergeometric_cpp.cpp b/src/metropolis_hypergeometric_cpp.cpp index f418b42..997c5c2 100644 --- a/src/metropolis_hypergeometric_cpp.cpp +++ b/src/metropolis_hypergeometric_cpp.cpp @@ -39,7 +39,7 @@ List metropolis_hypergeometric_cpp( unifs3 = runif(nTotalSamples); Function print("print"); - NumericVector move_dist = rep(1.0, nMoves); + NumericVector move_dist = rep(10.0, nMoves); double counter = sum(move_dist); int which_move; From f1feba54f70ef597caf1482f8cbed37a98168061 Mon Sep 17 00:00:00 2001 From: GrantInnerst Date: Tue, 21 May 2019 22:39:34 -0500 Subject: [PATCH 48/53] updates to aglm() --- R/Aglm.r | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/R/Aglm.r b/R/Aglm.r index 764384b..792f871 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -26,10 +26,13 @@ #' \item \code{p.value}: the exact p-values of individual tests, #' accurate to Monte-Carlo error. these are computed as the #' proportion of samples with statistics equal to or larger than -#' the oberved statistic. \item \code{mid.p.value}: the mid -#' p.values, see Agresti pp.20--21. \item \code{sampsStats}: +#' the oberved statistic. +#' \item \code{mid.p.value}: the mid +#' p.values, see Agresti pp.20--21. +#' \item \code{sampsStats}: #' the statistics computed for each mcmc -#' sample. \item \code{cells}: the number of cells in the table. } +#' sample. +#' \item \code{cells}: the number of cells in the table. } #' @examples #' #' library(ggplot2);theme_set(theme_bw()) @@ -112,7 +115,7 @@ aglm <- function(model, data, family = poisson(), iter = 1E4, burn = 10000, - thin = 100, engine = c("Cpp","R"), + thin = 100, engine = c("C++","R"), moves, ...) { @@ -189,9 +192,9 @@ aglm <- function(model, data, family = poisson(), } ## any 0 levels - if (any(data[,-ncol(data)] == 0)) { - stop("Cannot have a covariate with a level 0") - } + # if (any(data[,-ncol(data)] == 0)) { + # stop("Cannot have a covariate with a level 0") + # } if(length(model) == 1){ @@ -248,18 +251,18 @@ aglm <- function(model, data, family = poisson(), ## construct A matrix and compute moves ################################################## - if(missing(moves) && !is.null(getOption("4ti2_path"))){ + if(missing(moves) && has_4ti2()){ message("Computing Markov moves (4ti2)... ", appendLF = FALSE) - moves <- markov(A) + moves <- markov(A, p = "arb") message("done.", appendLF = TRUE) - } else if(missing(moves) && is.null(getOption("4ti2_path"))){ + } else if(missing(moves) && has_4ti2()){ warning( - "No moves were provided and 4ti2 is not found.\n", - " The resulting chain is likely not connected and strongly autocorrelated.\n", - " See ?aglm. Consider using rmove to generate SIS moves in advance.", + "No moves were provided and has_4ti2() = FALSE.\n", + " SIS moves will be used; estimates will likely be biased.\n", + " Consider using rmove() to generate SIS moves in advance.", immediate. = TRUE ) message("Computing 1000 SIS moves... ", appendLF = FALSE) @@ -270,10 +273,10 @@ aglm <- function(model, data, family = poisson(), movesMat <- NULL stopifnot(all(moves %in% c("lattice", "markov", "groebner", "grobner", "graver", "sis"))) - if("lattice" %in% moves) movesMat <- cbind(movesMat, zbasis(A)) - if("markov" %in% moves) movesMat <- cbind(movesMat, markov(A)) - if("groebner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) - if("grobner" %in% moves) movesMat <- cbind(movesMat, groebner(A)) + if("lattice" %in% moves) movesMat <- cbind(movesMat, zbasis(A, p = "arb")) + if("markov" %in% moves) movesMat <- cbind(movesMat, markov(A, p = "arb")) + if("groebner" %in% moves) movesMat <- cbind(movesMat, groebner(A, p = "arb")) + if("grobner" %in% moves) movesMat <- cbind(movesMat, groebner(A, p = "arb")) if("graver" %in% moves) stop("graver not yet implemented.") moves <- movesMat @@ -290,8 +293,6 @@ aglm <- function(model, data, family = poisson(), metropolis( init, moves, - suffStats = suffStats, - config = unname(A), iter = iter, burn = burn, thin = thin, From b319363c6aae9bee950b1e67a97ff65e44641227 Mon Sep 17 00:00:00 2001 From: GrantInnerst Date: Tue, 21 May 2019 22:45:15 -0500 Subject: [PATCH 49/53] remove sis stuff --- src/sis_tbl.cpp | 132 ------------------------------------------------ src/sis_tbl.h | 9 ---- 2 files changed, 141 deletions(-) delete mode 100644 src/sis_tbl.cpp delete mode 100644 src/sis_tbl.h diff --git a/src/sis_tbl.cpp b/src/sis_tbl.cpp deleted file mode 100644 index bf8d9fa..0000000 --- a/src/sis_tbl.cpp +++ /dev/null @@ -1,132 +0,0 @@ -#include -#include "rcddAPI.h" -using namespace Rcpp; - -// [[Rcpp::export]] - -IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats) { - int w = 0; // counter for while loop - int d = A.nrow(); // number of sufficient statisitcs - int r = A.ncol(); // number of cells in the table - IntegerVector tbl(r); //table to return - NumericMatrix constr(d+r, r+2); // constraint matrix - NumericVector objfun(r+1); // objective function - int p = 0; // counter for error purposes - IntegerMatrix work_A(A.nrow(), A.ncol()); // work matrix to edit configuration matrix - IntegerVector work_suff_stats(suff_stats.size()); // work vector to edit sufficient statistics - int min, max; // integers to calculate our range - - Function print("print"); - - // Elements needed to run the linear program solver - LogicalVector first(1); - first[0] = true; - LogicalVector second(1); - second[0] = false; - CharacterVector solver(1); - solver = "DualSimplex"; - - - // Theoretically, the loop will run until "correct" table is produced - while(w < 1){ - - for(int i = 0; i < A.nrow(); ++i){ - for(int j = 0; j < A.ncol(); ++j){ - work_A(i,j) = A(i,j); - } - } - for(int i = 0; i < suff_stats.size(); ++i) work_suff_stats[i] = suff_stats[i]; - - - bool lpsolved = true; //Logical for checking purposes - - for(int i = 0; i= d && l == 0) constr(k,l) = 0;//First column for inequalities - - if(k >= d && l == 1)constr(k,l) = 0;//Second column for right hand side - - //Constructing coefficients for each cell to be positive - for(int m = 2; m < r + 2; ++m){ - if(k >= d && l == m) constr(k,l) = (z == m) ? -1:0; - } - } - if(k >= d) ++z; - } - //Running linear program solver to find range of possible values(min, max) - SEXP out1 = lpcdd_f(constr, objfun, first, solver); - String solution = VECTOR_ELT(out1, 0); - //If solution Optimal continue, else break and start over - if(solution == "Optimal"){ - IntegerVector val = VECTOR_ELT(out1,3); - min = Rcpp::as(val); - }else{ - lpsolved = false; - break; - } - SEXP out2 = lpcdd_f(constr, objfun, second, solver); - String solution2 = VECTOR_ELT(out2, 0); - //If solution Optimal continue, else break and start over - if(solution2 == "Optimal"){ - IntegerVector val2 = VECTOR_ELT(out2,3); - max = Rcpp::as(val2); - }else{ - lpsolved = false; - break; - } - //Calculate the range and sample from that range to populate the table - if(min == max - 1) min = max; - - IntegerVector range = seq(min,max); - IntegerVector value = sample(range,1); - tbl[i] = Rcpp::as(value); - - // Update constraints(work_A and work_suff_stats) - IntegerVector index; - int y = 0; - - //Updating work_A by changing non-zero A elements to zero in the column - //Keep track of where non-zero elements were - for(int o = 0; o < d; ++o){ - if(work_A(o,i) != 0){ - work_A(o,i) = 0; - index[y] = o; - ++y; - } - } - //Updating work_suff_stats where elements of work_A were changed - int x = 0; - for(int p = 0; p < d; ++p){ - if(p == index[x]){ - work_suff_stats[p] = work_suff_stats[p] - tbl[i] * A(p,i); - ++x; - } - } - } - // If all linear programs are solved, index w and end the loop - if(lpsolved == true) ++w; - - // If error continues, only let it continue two times - ++p; - if(p > 2) break; - } - return tbl; -} - diff --git a/src/sis_tbl.h b/src/sis_tbl.h deleted file mode 100644 index 1315b33..0000000 --- a/src/sis_tbl.h +++ /dev/null @@ -1,9 +0,0 @@ -// sis_tbl.h - -#ifndef __SIS_TBL_H__ -#define __SIS_TBL_H__ - -#include -#endif - -Rcpp::IntegerVector sis_tbl(Rcpp::IntegerMatrix A, Rcpp::IntegerVector suff_stats); \ No newline at end of file From 958f7d0576118405d5c01f10ccee96caf469a1b5 Mon Sep 17 00:00:00 2001 From: GrantInnerst Date: Tue, 21 May 2019 22:47:07 -0500 Subject: [PATCH 50/53] fix cpp warning --- R/RcppExports.R | 4 ---- src/RcppExports.cpp | 13 ------------- src/metropolis_uniform_cpp.cpp | 2 +- 3 files changed, 1 insertion(+), 18 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 7a52da2..8019022 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -45,10 +45,6 @@ rfiberOne <- function(A, b) { .Call('_algstat_rfiberOne', PACKAGE = 'algstat', A, b) } -sis_tbl <- function(A, suff_stats) { - .Call('_algstat_sis_tbl', PACKAGE = 'algstat', A, suff_stats) -} - walk <- function(current, moves, iter, thin) { .Call('_algstat_walk', PACKAGE = 'algstat', current, moves, iter, thin) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d415dc6..7bb30ac 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -149,18 +149,6 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// sis_tbl -IntegerVector sis_tbl(IntegerMatrix A, IntegerVector suff_stats); -RcppExport SEXP _algstat_sis_tbl(SEXP ASEXP, SEXP suff_statsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< IntegerMatrix >::type A(ASEXP); - Rcpp::traits::input_parameter< IntegerVector >::type suff_stats(suff_statsSEXP); - rcpp_result_gen = Rcpp::wrap(sis_tbl(A, suff_stats)); - return rcpp_result_gen; -END_RCPP -} // walk IntegerMatrix walk(IntegerVector current, IntegerMatrix moves, int iter, int thin); RcppExport SEXP _algstat_walk(SEXP currentSEXP, SEXP movesSEXP, SEXP iterSEXP, SEXP thinSEXP) { @@ -188,7 +176,6 @@ static const R_CallMethodDef CallEntries[] = { {"_algstat_metropolis_hypergeometric_cpp", (DL_FUNC) &_algstat_metropolis_hypergeometric_cpp, 7}, {"_algstat_metropolis_uniform_cpp", (DL_FUNC) &_algstat_metropolis_uniform_cpp, 7}, {"_algstat_rfiberOne", (DL_FUNC) &_algstat_rfiberOne, 2}, - {"_algstat_sis_tbl", (DL_FUNC) &_algstat_sis_tbl, 2}, {"_algstat_walk", (DL_FUNC) &_algstat_walk, 4}, {NULL, NULL, 0} }; diff --git a/src/metropolis_uniform_cpp.cpp b/src/metropolis_uniform_cpp.cpp index a4fb6ef..1e2091c 100644 --- a/src/metropolis_uniform_cpp.cpp +++ b/src/metropolis_uniform_cpp.cpp @@ -25,7 +25,7 @@ List metropolis_uniform_cpp( int total_count = 0; bool any_negative_cells; IntegerVector move(n_cells); - double accept_prob = 0; + Function sample("sample"); which_move = sample(n_moves, burn + n_total_samples, 1); From bd88401515671244f9a42002ce70a34b446934f4 Mon Sep 17 00:00:00 2001 From: GrantInnerst Date: Thu, 20 Jun 2019 00:07:18 -0500 Subject: [PATCH 51/53] add control argument and change parsing in aglm --- R/Aglm.r | 85 ++++++++++++++++++++++++++++++----------------------- R/pmat.R | 61 +++++++++++++++++++++----------------- man/Aglm.Rd | 24 +++++++-------- 3 files changed, 92 insertions(+), 78 deletions(-) diff --git a/R/Aglm.r b/R/Aglm.r index 792f871..d7fa683 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -5,10 +5,7 @@ #' description of the model to be fitted #' @param data data, as a data frame of raw data with ordinal discrete covariates #' @param family a description of the error distirbution and link function used in the model -#' @param iter number of chain iterations -#' @param burn burn-in -#' @param thin thinning -#' @param engine C++ or R? (C++ yields roughly a 20-25x speedup) +#' @param control a list of arguments that control the MCMC algorithm #' @param moves the markov moves for the mcmc (as columns of a #' matrix). #' @param ... ... @@ -51,7 +48,7 @@ #' ) #' #' # function output -#' out <- aglm(y ~ x, data = df, family = poisson(), thin = 2000) +#' out <- aglm(y ~ x, data = df, family = poisson(), control = list(thin = 2000)) #' #' # check convergence through trace plot #' qplot(1:10000, out$sampsStats$PRs, geom = "line") @@ -93,7 +90,7 @@ #' ) #' #' # aglm -#' out <- aglm(y ~ x, data = df, family = binomial(), thin = 500) +#' out <- aglm(y ~ x, data = df, family = binomial(), control(thin = 500)) #' #' # check convergence through trace plot #' qplot(1:10000, out$sampsStats$PRs, geom = "line") @@ -114,15 +111,13 @@ aglm <- function(model, data, family = poisson(), - iter = 1E4, burn = 10000, - thin = 100, engine = c("C++","R"), - moves, + control = list(...), moves, ...) { ## set/check args ################################################## - engine <- match.arg(engine) + control <- do.call("aglm.control", control) argList <- as.list(match.call(expand.dots = TRUE))[-1] if("formula" %in% names(argList)){ @@ -161,21 +156,19 @@ aglm <- function(model, data, family = poisson(), } else { # if it's a formula, convert to list if(is.formula(model)){ + ## reshape data data <- model.frame(model, data) - # name data - vars <- names(data) - - ## parse formula - fString <- as.character(model) - response <- fString[2] - predString <- fString[3] + ## extract full model + pred_string <- attr(terms(data), "term.labels") + ## name data + vars <- names(data) + response <- vars[1] ## make list of facets - model <- strsplit(predString, " \\+ ")[[1]] - model <- strsplit(model, " \\* ") + model <- strsplit(pred_string, "\\:") ## format the data names(data)[names(data) == response] <- "response" @@ -236,10 +229,10 @@ aglm <- function(model, data, family = poisson(), } # check to see if all level configurations are there (need work here) - lvlsInData <- as.list(as.data.frame(t(expand.grid(levels)))) %in% as.list(as.data.frame(t(data[, -ncol(data)]))) + # lvlsInData <- as.list(as.data.frame(t(expand.grid(levels)))) %in% as.list(as.data.frame(t(data[, -ncol(data)]))) # subset A by levels that are present - A <- A[,lvlsInData] + # A <- A[,lvlsInData] # if family = "binomial" compute the lawernce lifting of A if (method == "binomial") { @@ -293,11 +286,12 @@ aglm <- function(model, data, family = poisson(), metropolis( init, moves, - iter = iter, - burn = burn, - thin = thin, - engine = engine, - ... + iter = control$iter, + burn = control$burn, + thin = control$thin, + engine = control$engine, + hit_and_run = control$hit_and_run, + adaptive = control$adaptive ) @@ -321,7 +315,7 @@ aglm <- function(model, data, family = poisson(), ) out$p.value.std.err <- c( - PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/iter) + PR = sqrt(mean(PRs <= PR)*(1-mean(PRs <= PR))/control$iter) ) out$mid.p.value <- c( @@ -329,9 +323,9 @@ aglm <- function(model, data, family = poisson(), ) - out$iter <- iter - out$burn <- burn - out$thin <- thin + out$iter <- control$iter + out$burn <- control$burn + out$thin <- control$thin out$statistic <- c(PR = PR) out$sampsStats <- list(PRs = PRs) out$cells <- nCells @@ -339,11 +333,28 @@ aglm <- function(model, data, family = poisson(), class(out) <- "aglm" out - - - - - - - +} + + + + + + + + + + + + +aglm.control <- function(iter = 10000, + burn = 10000, + thin = 100, + engine = c("C++", "R"), + hit_and_run = FALSE, + adaptive = FALSE +) { + engine <- match.arg(engine) +list(iter = iter, burn = burn, thin = thin, + engine = engine, hit_and_run = hit_and_run, + adaptive = adaptive) } \ No newline at end of file diff --git a/R/pmat.R b/R/pmat.R index e85a93c..59c73d6 100644 --- a/R/pmat.R +++ b/R/pmat.R @@ -23,48 +23,55 @@ #' @export pmat pmat <- function(levels, facets) { + #Small function to make single covariate configuration matrix func <- function(x) unname(rbind(rep(1, length(x)), x)) if (!is.list(levels)) { numCovariates <- 1 - fullMat <- func(levels) + full_mat_segre <- func(levels) } else{ numCovariates <- length(levels) #Make single covariate configuration matrix for each covariate matList <- lapply(levels, func) #Full heirarchicial config matrix with all interactions included - fullMat <- do.call(kprod, matList) - # fullMat <- do.call(segre, matList) - } - expCov <- 1:numCovariates - - #Checking heirarchical sturcture of facets - if (any(sapply(facets, length) > 1)) { - longListElts <- facets[which(sapply(facets, length) > 1)] - - uniqueVals <- unique(unlist(longListElts)) - - heirarc <- as.list(c(uniqueVals, longListElts)) - - facets <- union(heirarc, facets) + full_mat_kprod <- do.call(kprod, matList) + full_mat_segre <- do.call(segre, matList) } - #All possible combinations of covariates (powerset like) to be compared to facets - if (length(expCov) == 1) { - facetList <- list(expCov) + if(any(lengths(facets) > 1)) { + + expCov <- 1:numCovariates + + #Checking heirarchical sturcture of facets + if (any(sapply(facets, length) > 1)) { + longListElts <- facets[which(sapply(facets, length) > 1)] + + uniqueVals <- unique(unlist(longListElts)) + + heirarc <- as.list(c(uniqueVals, longListElts)) + + facets <- union(heirarc, facets) + } + + #All possible combinations of covariates (powerset like) to be compared to facets + if (length(expCov) == 1) { + facetList <- list(expCov) + } else{ + facetList <- list(integer(0)) + for (i in seq_along(expCov)) { + facetList <- + c(facetList, lapply(facetList, function(x) + c(x, expCov[i]))) + } + facetList <- facetList[-1] + } + #return the configuration matrix which includes only the elements need for the heirarchical model + full_mat_kprod[c(TRUE, facetList %in% type.convert(facets)), ] } else{ - facetList <- list(integer(0)) - for (i in seq_along(expCov)) { - facetList <- - c(facetList, lapply(facetList, function(x) - c(x, expCov[i]))) - } - facetList <- facetList[-1] + full_mat_segre } - #return the configuration matrix which includes only the elements need for the heirarchical model - fullMat[c(TRUE, facetList %in% type.convert(facets)), ] } diff --git a/man/Aglm.Rd b/man/Aglm.Rd index fbcccd1..fae5552 100644 --- a/man/Aglm.Rd +++ b/man/Aglm.Rd @@ -4,8 +4,7 @@ \alias{aglm} \title{Fitting generalized linear models with algebraic methods} \usage{ -aglm(model, data, family = poisson(), iter = 10000, burn = 10000, - thin = 100, engine = c("Cpp", "R"), moves, ...) +aglm(model, data, family = poisson(), control = list(...), moves, ...) } \arguments{ \item{model}{model specification, either in terms of a configuration matrix or a symbolic @@ -15,13 +14,7 @@ description of the model to be fitted} \item{family}{a description of the error distirbution and link function used in the model} -\item{iter}{number of chain iterations} - -\item{burn}{burn-in} - -\item{thin}{thinning} - -\item{engine}{C++ or R? (C++ yields roughly a 20-25x speedup)} +\item{control}{a list of arguments that control the MCMC algorithm} \item{moves}{the markov moves for the mcmc (as columns of a matrix).} @@ -43,10 +36,13 @@ statistics computing matrix. \item \code{p.value}: the exact p-values of individual tests, accurate to Monte-Carlo error. these are computed as the proportion of samples with statistics equal to or larger than -the oberved statistic. \item \code{mid.p.value}: the mid -p.values, see Agresti pp.20--21. \item \code{sampsStats}: +the oberved statistic. +\item \code{mid.p.value}: the mid +p.values, see Agresti pp.20--21. +\item \code{sampsStats}: the statistics computed for each mcmc -sample. \item \code{cells}: the number of cells in the table. } +sample. +\item \code{cells}: the number of cells in the table. } } \description{ Fitting generalized linear models with algebraic methods @@ -69,7 +65,7 @@ Fitting generalized linear models with algebraic methods ) # function output - out <- aglm(y ~ x, data = df, family = poisson(), thin = 2000) + out <- aglm(y ~ x, data = df, family = poisson(), control = list(thin = 2000)) # check convergence through trace plot qplot(1:10000, out$sampsStats$PRs, geom = "line") @@ -111,7 +107,7 @@ Fitting generalized linear models with algebraic methods ) # aglm - out <- aglm(y ~ x, data = df, family = binomial(), thin = 500) + out <- aglm(y ~ x, data = df, family = binomial(), control(thin = 500)) # check convergence through trace plot qplot(1:10000, out$sampsStats$PRs, geom = "line") From 6e8c9c56c2d38af62ce420f64212e89a1ea0232b Mon Sep 17 00:00:00 2001 From: GrantInnerst Date: Thu, 29 Aug 2019 11:39:20 -0400 Subject: [PATCH 52/53] improving examples in aglm, remove c++ sis for now --- DESCRIPTION | 2 +- NAMESPACE | 5 ++ R/Aglm.r | 146 ++++++++++++++++++++++--------------------- R/algstat-package.R | 4 +- R/metropolis.r | 12 +--- R/sis_tbl.R | 32 ---------- man/Aglm.Rd | 149 ++++++++++++++++++++++---------------------- 7 files changed, 158 insertions(+), 192 deletions(-) delete mode 100644 R/sis_tbl.R diff --git a/DESCRIPTION b/DESCRIPTION index 7ae4901..41b19b9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,7 @@ Description: Functions enabling the use of algebraic methods on statistical problems. Prominent application domains are discrete multivariate analysis, discrete and Gaussian graphical models, algebraic pattern recognition, latent variable models, phylogenetics, and more. -LinkingTo: Rcpp, rcdd +LinkingTo: Rcpp Depends: R (>= 2.10), mpoly (>= 1.1.0), diff --git a/NAMESPACE b/NAMESPACE index f0f3127..aa7411a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -87,11 +87,15 @@ importFrom(purrr,cross_df) importFrom(purrr,map) importFrom(purrr,set_names) importFrom(reshape2,melt) +importFrom(stats,acf) importFrom(stats,deriv) importFrom(stats,dmultinom) importFrom(stats,loglin) +importFrom(stats,model.frame) +importFrom(stats,poisson) importFrom(stats,runif) importFrom(stats,sd) +importFrom(stats,terms) importFrom(stringr,"str_sub<-") importFrom(stringr,str_c) importFrom(stringr,str_detect) @@ -105,4 +109,5 @@ importFrom(tibble,tibble) importFrom(tidyr,gather) importFrom(utils,combn) importFrom(utils,download.file) +importFrom(utils,type.convert) useDynLib(algstat) diff --git a/R/Aglm.r b/R/Aglm.r index d7fa683..2d99770 100644 --- a/R/Aglm.r +++ b/R/Aglm.r @@ -1,6 +1,14 @@ #' Fitting generalized linear models with algebraic methods #' -#' +#'[aglm()] fits a generalized linear model to a dataset (typically a +#' contingency table) and performs an exact conditional test on the fitted model. +#' The exact test, which is a goodness-of-fit test, is performed via +#' Monte-Carlo sampling from the conditional distribution of the table +#' given the sufficient statistics of the model. In short, inference +#' is drawn by comparing the statistic of the observed table (the unnormalized log-likelihood) +#' to those of the samples. The proportion of sampled tables with equal to or +#' more extreme values than the observed table is the resulting p-value. +#' #' @param model model specification, either in terms of a configuration matrix or a symbolic #' description of the model to be fitted #' @param data data, as a data frame of raw data with ordinal discrete covariates @@ -32,81 +40,75 @@ #' \item \code{cells}: the number of cells in the table. } #' @examples #' -#' library(ggplot2);theme_set(theme_bw()) +#' # simple poisson regression example from Diaconis et. al. 1998 +#' +#' data <- data.frame(x = c(0,1,2,3,4), y = c(44,25,21,19,11)) +#' +#' # perform the fitting +#' fit_aglm <- aglm(y ~ x, data = data, family = poisson()) +#' +#' # aglm gof p-value +#' fit_aglm$p.value +#' +#' # compare with glm +#' fit_glm <- glm(y ~ x, data = data, family = poisson()) +#' +#' # glm gof p-value +#' 1 - pchisq(fit_glm$deviance, fit_glm$df.residual) +#' +#' +#' # logistic regression example with data from Haberman 1974 +#' +#' # data in elongated format +#' x <- c(rep(0,6),rep(1,2),rep(2,4),rep(3,9),rep(4,10),rep(5,20), +#' rep(6,34),rep(7,42),rep(8,124),rep(9,58),rep(10,77),rep(11,95), +#' rep(12,360)) #' -#' # generating data and running a poisson regression model -#' # pick beta 0 and beta 1 -#' b0 <- 1; b1 <- 0.3 +#' y <- c(rep(0,2),rep(1,10),rep(0,3),rep(1,6),rep(0,5),rep(1,5), +#' rep(0,7),rep(1,13),rep(0,9),rep(1,25),rep(0,15),rep(1,27),rep(0,49), +#' rep(1,75),rep(0,29),rep(1,29),rep(0,45),rep(1,32),rep(0,59), +#' rep(1,36),rep(0,245),rep(1,115)) +#' +#' data_aglm <- data.frame(x=x,y=y) +#' data_glm <- data.frame(x = 0:12) +#' data_glm$y <- cbind(c(4, 2, 4, 6, 5, 13, 25, 27, 75, 29, 32, 36, 115), +#' c(2, 0, 0, 3, 5, 7, 9, 15, 49, 29, 45, 59, 245)) #' -#' # generate data -#' n <- 5000 -#' x <- sample(1:5, n, replace = T) -#' y <- rpois(n, lambda = exp(b0 + b1*x)) -#' df <- data.frame( -#' x = x, -#' y = y -#' ) -#' -#' # function output -#' out <- aglm(y ~ x, data = df, family = poisson(), control = list(thin = 2000)) -#' -#' # check convergence through trace plot -#' qplot(1:10000, out$sampsStats$PRs, geom = "line") -#' -#' # compare aglm and glm predictions with the truth -#' -#' # model fitting with glm -#' mod <- glm(y ~ x, data = df, family = poisson()) -#' -#' # truth -#' exp(b0 + b1*(1:5)) -#' -#' # glm predictions -#' predict(mod, data.frame(x = 1:5), type = "response") -#' -#' # aglm predictions -#' rowMeans(out$steps) / plyr::ddply(df, "x", nrow)$V1 -#' -#' -#' -#' # generating data and running a logistic regression model +#' fit_aglm <- aglm(y ~ x, data = data_aglm, family = binomial()) +#' fit_aglm$p.value +#' +#' # compare to glm +#' +#' # special formatting of data needed for correct analysis +#' data_glm <- data.frame(x = 0:12) +#' data_glm$y <- cbind(c(4, 2, 4, 6, 5, 13, 25, 27, 75, 29, 32, 36, 115), +#' c(2, 0, 0, 3, 5, 7, 9, 15, 49, 29, 45, 59, 245)) +#' +#' fit_glm <- glm(y ~ x, data = data_glm, family = binomial()) +#' 1 - pchisq(fit_glm$deviance, fit_glm$df.residual) +#' +#' +#' # multiple poisson regression +#' +#' # create a fake dataset +#' b0 <- 1; b1 <- 0.5; b2 <- 0.5 +#' +#' covariates <- expand.grid(1:3,1:4) +#' +#' data <- data.frame( +#' x1 = covariates[,1], +#' x2 = covariates[,2], +#' y = rpois(12,exp(b0 + b1*covariates[,1] + b2*covariates[,2])) +#' ) +#' +#' # compare to glm +#' +#' fit_aglm <- aglm(y ~ x1 + x2, data = data, family = poisson()) +#' fit_glm <- glm(y ~ x1 + x2, data = data, family = poisson()) #' -#' # helper functions -#' link <- function(p) log(p/(1-p)) -#' invlink <- function(x) 1 / (1+exp(-x)) -#' -#' -#' # create a fake data set -#' -#' # one covariate -#' b0 <- 0.5; b1 <- 0.2 -#' -#' n <- 100 -#' x <- sample(1:5, n, replace = T) -#' y <- rbinom(n = n, size = 1, prob = invlink(b0 + b1*x)) -#' df <- data.frame( -#' x = x, -#' y = y -#' ) -#' -#' # aglm -#' out <- aglm(y ~ x, data = df, family = binomial(), control(thin = 500)) +#' fit_aglm$p.value +#' with(fit_glm, 1 - pchisq(deviance, df.residual)) #' -#' # check convergence through trace plot -#' qplot(1:10000, out$sampsStats$PRs, geom = "line") -#' -#' # using glm -#' mod <- glm(y ~ x, data = df, family = binomial()) -#' -#' # truth -#' invlink(b0 + b1*out$obs[,1]) -#' -#' # glm predictions -#' predict(mod, data.frame(x = c(1:5)), type = "response") -#' -#' # aglm predictions -#' rowMeans(out$steps) / plyr::ddply(df, "x", nrow)$V1 -#' #' @export diff --git a/R/algstat-package.R b/R/algstat-package.R index 851d393..9c2dc27 100644 --- a/R/algstat-package.R +++ b/R/algstat-package.R @@ -22,8 +22,8 @@ #' @importFrom magrittr %>% #' @importFrom tibble tibble as_tibble #' @importFrom tidyr gather -#' @importFrom stats deriv dmultinom loglin runif sd -#' @importFrom utils combn download.file +#' @importFrom stats deriv dmultinom loglin runif sd acf model.frame poisson terms +#' @importFrom utils combn download.file type.convert #' @useDynLib algstat #' @docType package #' @name algstat diff --git a/R/metropolis.r b/R/metropolis.r index d49571e..1e08d01 100644 --- a/R/metropolis.r +++ b/R/metropolis.r @@ -123,17 +123,7 @@ #' #' ggplot(data = data) + geom_line(aes(steps, baseStats)) + #' geom_line(aes(steps, harStats), color = "red") + -#' labs(x = "Steps", y = "UNLP value", title = "Base Algorithm vs. Algorithm with Hit and Run option in red") -#' -#' -#' -#' -#' -#' -#' -#' -#' -#' +#' labs(x = "Steps", y = "UNLP value", title = "Base Algorithm vs. Algorithm with Hit and Run Option in Red") #' } #' #' diff --git a/R/sis_tbl.R b/R/sis_tbl.R deleted file mode 100644 index 7a524a2..0000000 --- a/R/sis_tbl.R +++ /dev/null @@ -1,32 +0,0 @@ -sis_table <- function(configMat, suffStatistics){ - # need to check if configMat is a mat, suffStatistics is vector/mat, - # length of suffStatistics is same as number of rows of configMat - - # matricies and vectors to work with! - workA <- configMat - workSuff <- suffStatistics - tblElts <- ncol(configMat) - numConstraints <- nrow(configMat) - tbl <- vector(mode = "numeric", length = tblElts) - - for(i in 1:tblElts){ - constr <- unname(rbind(cbind(rep(1,numConstraints), workSuff, workA), - cbind(rep(0,tblElts), rep(0,tblElts), diag(-1,tblElts)))) - objfun <- vector(mode = "numeric", length = tblElts) - objfun[i] <- -1 - minLp <- lpcdd(constr, objfun) - maxLp <- lpcdd(constr, objfun, minimize = FALSE) - - if(minLp[1] == "Optimal" && maxLp[1] == "Optimal"){ - minimum <- as.numeric(unname(minLp[4])) - maximum <- as.numeric(unname(maxLp[4])) - tbl[i] <- if(isTRUE(all.equal(minimum, maximum))){minimum} - else{sample(minimum:maximum, 1)} - } else { tbl[i] <- 0 } - # update constraints and sufficient statistics - index <- which(workA[,i] == 1) - workA[index,i] <- 0 - workSuff[index] <- workSuff[index] - tbl[i] - } - return(tbl) -} \ No newline at end of file diff --git a/man/Aglm.Rd b/man/Aglm.Rd index fae5552..371e55e 100644 --- a/man/Aglm.Rd +++ b/man/Aglm.Rd @@ -45,83 +45,84 @@ sample. \item \code{cells}: the number of cells in the table. } } \description{ -Fitting generalized linear models with algebraic methods +\code{\link[=aglm]{aglm()}} fits a generalized linear model to a dataset (typically a +contingency table) and performs an exact conditional test on the fitted model. +The exact test, which is a goodness-of-fit test, is performed via +Monte-Carlo sampling from the conditional distribution of the table +given the sufficient statistics of the model. In short, inference +is drawn by comparing the statistic of the observed table (the unnormalized log-likelihood) +to those of the samples. The proportion of sampled tables with equal to or +more extreme values than the observed table is the resulting p-value. } \examples{ - library(ggplot2);theme_set(theme_bw()) + # simple poisson regression example from Diaconis et. al. 1998 - # generating data and running a poisson regression model - # pick beta 0 and beta 1 - b0 <- 1; b1 <- 0.3 + data <- data.frame(x = c(0,1,2,3,4), y = c(44,25,21,19,11)) - # generate data - n <- 5000 - x <- sample(1:5, n, replace = T) - y <- rpois(n, lambda = exp(b0 + b1*x)) - df <- data.frame( - x = x, - y = y - ) - - # function output - out <- aglm(y ~ x, data = df, family = poisson(), control = list(thin = 2000)) - - # check convergence through trace plot - qplot(1:10000, out$sampsStats$PRs, geom = "line") - - # compare aglm and glm predictions with the truth - - # model fitting with glm - mod <- glm(y ~ x, data = df, family = poisson()) - - # truth - exp(b0 + b1*(1:5)) - - # glm predictions - predict(mod, data.frame(x = 1:5), type = "response") - - # aglm predictions - rowMeans(out$steps) / plyr::ddply(df, "x", nrow)$V1 - - - - # generating data and running a logistic regression model - - # helper functions - link <- function(p) log(p/(1-p)) - invlink <- function(x) 1 / (1+exp(-x)) - - - # create a fake data set - - # one covariate - b0 <- 0.5; b1 <- 0.2 - - n <- 100 - x <- sample(1:5, n, replace = T) - y <- rbinom(n = n, size = 1, prob = invlink(b0 + b1*x)) - df <- data.frame( - x = x, - y = y - ) - - # aglm - out <- aglm(y ~ x, data = df, family = binomial(), control(thin = 500)) - - # check convergence through trace plot - qplot(1:10000, out$sampsStats$PRs, geom = "line") - - # using glm - mod <- glm(y ~ x, data = df, family = binomial()) - - # truth - invlink(b0 + b1*out$obs[,1]) - - # glm predictions - predict(mod, data.frame(x = c(1:5)), type = "response") - - # aglm predictions - rowMeans(out$steps) / plyr::ddply(df, "x", nrow)$V1 - + # perform the fitting + fit_aglm <- aglm(y ~ x, data = data, family = poisson()) + + # aglm gof p-value + fit_aglm$p.value + + # compare with glm + fit_glm <- glm(y ~ x, data = data, family = poisson()) + + # glm gof p-value + 1 - pchisq(fit_glm$deviance, fit_glm$df.residual) + + + # logistic regression example with data from Haberman 1974 + + # data in elongated format + x <- c(rep(0,6),rep(1,2),rep(2,4),rep(3,9),rep(4,10),rep(5,20), + rep(6,34),rep(7,42),rep(8,124),rep(9,58),rep(10,77),rep(11,95), + rep(12,360)) + + y <- c(rep(0,2),rep(1,10),rep(0,3),rep(1,6),rep(0,5),rep(1,5), + rep(0,7),rep(1,13),rep(0,9),rep(1,25),rep(0,15),rep(1,27),rep(0,49), + rep(1,75),rep(0,29),rep(1,29),rep(0,45),rep(1,32),rep(0,59), + rep(1,36),rep(0,245),rep(1,115)) + + data_aglm <- data.frame(x=x,y=y) + data_glm <- data.frame(x = 0:12) + data_glm$y <- cbind(c(4, 2, 4, 6, 5, 13, 25, 27, 75, 29, 32, 36, 115), + c(2, 0, 0, 3, 5, 7, 9, 15, 49, 29, 45, 59, 245)) + + fit_aglm <- aglm(y ~ x, data = data_aglm, family = binomial()) + fit_aglm$p.value + + # compare to glm + + # special formatting of data needed for correct analysis + data_glm <- data.frame(x = 0:12) + data_glm$y <- cbind(c(4, 2, 4, 6, 5, 13, 25, 27, 75, 29, 32, 36, 115), + c(2, 0, 0, 3, 5, 7, 9, 15, 49, 29, 45, 59, 245)) + + fit_glm <- glm(y ~ x, data = data_glm, family = binomial()) + 1 - pchisq(fit_glm$deviance, fit_glm$df.residual) + + + # multiple poisson regression + + # create a fake dataset + b0 <- 1; b1 <- 0.5; b2 <- 0.5 + + covariates <- expand.grid(1:3,1:4) + + data <- data.frame( + x1 = covariates[,1], + x2 = covariates[,2], + y = rpois(12,exp(b0 + b1*covariates[,1] + b2*covariates[,2])) + ) + + # compare to glm + + fit_aglm <- aglm(y ~ x1 + x2, data = data, family = poisson()) + fit_glm <- glm(y ~ x1 + x2, data = data, family = poisson()) + + fit_aglm$p.value + with(fit_glm, 1 - pchisq(deviance, df.residual)) + } From 71be736cf20972dfaa78b868cbc76b70d41862b1 Mon Sep 17 00:00:00 2001 From: GrantInnerst Date: Fri, 30 Aug 2019 17:42:38 -0400 Subject: [PATCH 53/53] change in reexports file --- man/reexports.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/reexports.Rd b/man/reexports.Rd index 903d46c..d281e18 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -15,10 +15,10 @@ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ - \item{Rcpp}{\code{\link[Rcpp]{cpp_object_initializer}}} - \item{latte}{\code{\link[latte]{tab2vec}}, \code{\link[latte]{vec2tab}}, \code{\link[latte]{plot_matrix}}, \code{\link[latte]{plot_matrix}}} \item{magrittr}{\code{\link[magrittr]{\%>\%}}} + + \item{Rcpp}{\code{\link[Rcpp]{cpp_object_initializer}}} }}