diff --git a/R/dpseudohuber_median_d2x.R b/R/dpseudohuber_median_d2x.R
new file mode 100644
index 0000000..f859595
--- /dev/null
+++ b/R/dpseudohuber_median_d2x.R
@@ -0,0 +1,63 @@
+#' Calculate the matrix of second derivatives of the pseudo-Huber smoothed median
+#'
+#' @param x A vector at which to evaluate second derivatives of the pseudo-Huber smoothed median.
+#' @param d Smoothing parameter, by default set to \code{0.1}. As \code{d} approaches
+#' \code{0} the pseudo-Huber median function approaches the median and as \code{d} approaches
+#' infinity this function approaches the mean.
+#' @param na.rm Passed to \code{pseudohuber_median}, default is FALSE; if FALSE then when
+#' \code{x} includes at least one NA value then NA is returned, if TRUE then when \code{x}
+#' includes at least one NA value then that value is removed and the pseudo-Huber median
+#' is computed without it.
+#'
+#' @return The matrix of second derivatives (Hessian) of the pseudo-Huber smoothed median
+#' with respect to the entries of \code{x}, with smoothing parameter \code{d}.
+#'
+#' @note Algebra derived by ChatGPT based on our supplementary materials.
+#'
+#' @examples
+#' dpseudohuber_median_d2x(x = rnorm(10), d = 0.1)
+#'
+#' @aliases dpsuedohuber_median_d2x
+#'
+#' @export
+#'
+dpseudohuber_median_d2x <- function(x,
+                                    d = 0.1,
+                                    na.rm = FALSE) {
+  # pseudo-Huber center
+  ps_center <- pseudohuber_median(x, d, na.rm = na.rm)
+
+  # basic quantities
+  y <- (x - ps_center) / d
+  w <- 1 / sqrt(1 + y^2) # w_j
+  w3 <- w^3 # a_j = w_j^3
+  S <- sum(w3, na.rm = na.rm) # S = sum_j w_j^3
+
+  # t_j = (x_j - c*)/d^2 * w_j^5
+  v <- (x - ps_center) / d^2
+  w5 <- w^5
+  t <- v * w5 # t_j
+  T <- sum(t, na.rm = na.rm) # T = sum_j t_j
+
+  n <- length(x)
+  H <- matrix(NA_real_, n, n)
+
+  # Hessian entries:
+  # H_{pq} = -3/S * [ t_p (delta_{pq} - w3_q/S) - (t_q - w3_q*T/S) * w3_p/S ]
+  for (p in seq_len(n)) {
+    a_p <- w3[p]
+    t_p <- t[p]
+    for (q in seq_len(n)) {
+      a_q <- w3[q]
+      t_q <- t[q]
+      delta_pq <- as.numeric(p == q)
+
+      H[p, q] <- -(3 / S) * (
+        t_p * (delta_pq - a_q / S) -
+          (t_q - a_q * T / S) * a_p / S
+      )
+    }
+  }
+
+  H
+}
\ No newline at end of file
diff --git a/R/f_info.R b/R/f_info.R
index 5876f57..1696056 100644
--- a/R/f_info.R
+++ b/R/f_info.R
@@ -1,69 +1,69 @@
 #' @importFrom methods as
 # fisher information for multinomial model
-f_info <- function(Y,
-                   B_cup,
-                   B,
-                   X,
-                   X_cup,
-                   omit = NULL,
-                   compute_together = FALSE,
-                   return_separate = FALSE){
+f_info <- function(
+  Y,
+  B_cup,
+  B,
+  X,
+  X_cup,
+  omit = NULL,
+  compute_together = FALSE,
+  return_separate = FALSE
+) {
   n <- nrow(Y)
   J <- ncol(Y)
   p <- ncol(X)
-  if(is.null(omit)){
-    f_info <- matrix(0,ncol = p*J,nrow = p*J)
-  } else{
-    if(compute_together){
-      f_info <- matrix(0,ncol = p*(J - 1), nrow = p*(J - 1))
+  if (is.null(omit)) {
+    f_info <- matrix(0, ncol = p * J, nrow = p * J)
+  } else {
+    if (compute_together) {
+      f_info <- matrix(0, ncol = p * (J - 1), nrow = p * (J - 1))
     }
   }
-  z <- update_z(Matrix::Matrix(Y),Matrix::Matrix(X),B)
-
-
-
-  if(compute_together){ #older, slower way of computing info
-    #used only to test equality with newer way
-    ident_mat <- Matrix::Diagonal(x = rep(1,J))
-    one_mat <- Matrix::Matrix(1,nrow = J,ncol=1)
-    for(i in 1:n){
-      which_rows <- 1:J + (i - 1)*J
-      mu_i <- exp(X_cup[which_rows,]%*% B_cup + z[i])
-      p_i <- mu_i/sum(mu_i)
-
-      M_iX_i <- (ident_mat - Matrix::tcrossprod(one_mat,p_i))%*%X_cup[which_rows,]
-
-
-      f_info <- f_info + Matrix::crossprod(M_iX_i,Matrix::Diagonal(x= as.numeric(mu_i)))%*%M_iX_i
-    }
-  } else{ #newer, faster way to compute info
-    mus <- exp(X_cup%*%B_cup + rep(z,each = J))
+  z <- update_z(Matrix::Matrix(Y), Matrix::Matrix(X), B)
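+
+  # both branches below compute the same expected information,
+  #   sum_i X_i' (diag(mu_i) - mu_i mu_i' / N_i) X_i,  with N_i = rowSums(Y)[i],
+  # where mu_i holds the fitted means for sample i; the newer branch assembles
+  # it from two sparse matrix products instead of looping over samples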
+  if (compute_together) {
+    #older, slower way of computing info
+    #used only to test equality with newer way
+    ident_mat <- Matrix::Diagonal(x = rep(1, J))
+    one_mat <- Matrix::Matrix(1, nrow = J, ncol = 1)
+    for (i in 1:n) {
+      which_rows <- 1:J + (i - 1) * J
+      mu_i <- exp(X_cup[which_rows, ] %*% B_cup + z[i])
+      p_i <- mu_i / sum(mu_i)
+
+      M_iX_i <- (ident_mat - Matrix::tcrossprod(one_mat, p_i)) %*%
+        X_cup[which_rows, ]
+
+      f_info <- f_info +
+        Matrix::crossprod(M_iX_i, Matrix::Diagonal(x = as.numeric(mu_i))) %*%
+        M_iX_i
+    }
+  } else {
+    #newer, faster way to compute info
+    mus <- exp(X_cup %*% B_cup + rep(z, each = J))
     mu_diag <- Matrix::Diagonal(x = mus@x)
-    ps <- lapply(1:n,
-                 function(i){
-                   p_i <- mus@x[(i-1)*J +1:J];
-                   return(p_i/sum(p_i))})
-    ps_mat <- Matrix::sparseMatrix(i = 1:(n*J),
-                                   j = rep(1:n,each = J),
-                                   x = do.call(c,ps))
-    X_cup_p <- Matrix::crossprod(X_cup,ps_mat)
+    ps <- lapply(1:n, function(i) {
+      p_i <- mus@x[(i - 1) * J + 1:J]
+      return(p_i / sum(p_i))
+    })
+    ps_mat <- Matrix::sparseMatrix(
+      i = 1:(n * J),
+      j = rep(1:n, each = J),
+      x = do.call(c, ps)
+    )
+    X_cup_p <- Matrix::crossprod(X_cup, ps_mat)
     X_cup_p <- as.matrix(X_cup_p)
-    X_cup_p <- methods::as(X_cup_p,"sparseMatrix")
+    X_cup_p <- methods::as(X_cup_p, "sparseMatrix")
     sum_diag <- Matrix::Diagonal(x = rowSums(Y))
-    f_info <- Matrix::crossprod(X_cup,mu_diag)%*%X_cup -
-      X_cup_p%*%
-      Matrix::tcrossprod(sum_diag,X_cup_p)
-
-
-
-  }
-
-
+    f_info <- Matrix::crossprod(X_cup, mu_diag) %*%
+      X_cup -
+      X_cup_p %*%
+      Matrix::tcrossprod(sum_diag, X_cup_p)
+  }
   return(f_info)
-
-
 }
diff --git a/R/fit_null.R b/R/fit_null.R
index b6f6b65..98b736c 100644
--- a/R/fit_null.R
+++ b/R/fit_null.R
@@ -1,5 +1,5 @@
 #' fits model with B_kj constrained to equal g(B_k) for constraint fn g
-#' 
+#'
 #' @param B description
 #' @param Y Y (with augmentations)
 #' @param X design matrix
@@ -23,102 +23,98 @@
 #' @param trackB track value of beta across iterations and return?
 #' @param ignore_stop whether to ignore stopping criteria and run `maxit` iterations (could be helpful for diagnostic plots).
 #' @param null_diagnostic_plots logical: should diagnostic plots be made for estimation under the null hypothesis? Default is \code{FALSE}.
-#' 
+#'
 #' @return A list containing elements `B`, `k_constr`, `j_constr`, `niter`
-#' `gap`, `u`, `rho`, and `Bs`. `B` is a matrix containing parameter estimates 
+#' `gap`, `u`, `rho`, and `Bs`. `B` is a matrix containing parameter estimates
 #' under the null (obtained by maximum likelihood on augmented observations Y),
-#' `k_constr`, and `j_constr` give row and column indexes of the parameter 
-#' fixed to be equal to the constraint function \eqn{g()} under the null. `niter` is a 
-#' scalar giving total number of outer iterations used to fit the null model, 
-#' `gap` gives the final value of \eqn{g(B_{k constr}) - B_{k constr, j constr}}, 
-#' `u` and `rho` are final values of augmented Lagrangian parameters, and 
-#' `Bs` is a data frame containing values of B by iteration if `trackB` was set 
+#' `k_constr`, and `j_constr` give row and column indexes of the parameter
+#' fixed to be equal to the constraint function \eqn{g()} under the null. 
`niter` is a +#' scalar giving total number of outer iterations used to fit the null model, +#' `gap` gives the final value of \eqn{g(B_{k constr}) - B_{k constr, j constr}}, +#' `u` and `rho` are final values of augmented Lagrangian parameters, and +#' `Bs` is a data frame containing values of B by iteration if `trackB` was set #' equal to TRUE (otherwise it contains a NULL value). -#' -fit_null <- function(B, - Y, - X, - X_cup = NULL, - k_constr, - j_constr, - j_ref, - constraint_fn, - constraint_grad_fn, - rho_init = 1, - tau = 1.2, - kappa = 0.8, - B_tol = 1e-2, - inner_tol = 0.01, - constraint_tol = 1e-4, - max_step = 5, - c1 = 1e-4, - maxit = 1000, - inner_maxit = 25, - verbose = FALSE, - trackB = FALSE, - ignore_stop = FALSE, - null_diagnostic_plots = FALSE +#' +fit_null <- function( + B, + Y, + X, + X_cup = NULL, + k_constr, + j_constr, + j_ref, + constraint_fn, + constraint_grad_fn, + rho_init = 1, + tau = 1.2, + kappa = 0.8, + B_tol = 1e-2, + inner_tol = 0.01, + constraint_tol = 1e-4, + max_step = 5, + c1 = 1e-4, + maxit = 1000, + inner_maxit = 25, + verbose = FALSE, + trackB = FALSE, + ignore_stop = FALSE, + null_diagnostic_plots = FALSE ) { - J <- ncol(Y) n <- nrow(Y) p <- ncol(X) - - it_df <- data.frame(it = 1:maxit, - lik = NA, - max_abs_B = NA, - constraint_diff = NA) + + it_df <- data.frame( + it = 1:maxit, + lik = NA, + max_abs_B = NA, + constraint_diff = NA + ) if (null_diagnostic_plots) { it_df$test_stat <- NA } - - #store cols of B to update in a vector - j_to_update <- (1:J)[(1:J) != j_ref] + #in vector format, which indexes to remove (bc B^j_constr = 0 id. constr.) - indexes_to_remove <- (j_ref - 1)*p + 1:p - - + indexes_to_remove <- (j_ref - 1) * p + 1:p + #change id. constr. by subtracting approp. col of B from others - for(k in 1:p) { - B[k,] <- B[k,] - B[k,j_ref] + for (k in 1:p) { + B[k, ] <- B[k, ] - B[k, j_ref] } - + #update z z <- update_z(X = X, Y = Y, B = B) - - log_mean <- X%*%B + - matrix(z,ncol = 1)%*%matrix(1,ncol = J, nrow = 1) - - ll <- sum(log_mean*Y - exp(log_mean)) - + + log_mean <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + + ll <- sum(log_mean * Y - exp(log_mean)) + if (trackB) { - Bs <- data.frame(k = rep(1:p,each = J), - j = rep(1:J,p), - B = NA) - } else{ - Bs <- NULL + Bs <- data.frame(k = rep(1:p, each = J), j = rep(1:J, p), B = NA) + } else { + Bs <- NULL } - + #get X_cup for later use if (is.null(X_cup)) { - X_cup = X_cup_from_X(X,J) + X_cup = X_cup_from_X(X, J) } - + #set iteration to zero iter <- 0 - + converged <- FALSE - + #compute gap (i.e. 
g(B_k) - B_kj) - gap <- constraint_fn[[k_constr]](B[k_constr,]) - B[k_constr,j_constr] + gap <- constraint_fn[[k_constr]](B[k_constr, ]) - B[k_constr, j_constr] init_gap <- abs(gap) - + #set rho equal to initial value rho <- rho_init - + #initiate u - u <- rho*gap - + u <- rho * gap + if (trackB) { Bs$iter <- 0 Bs$inner_iter <- 0 @@ -126,179 +122,169 @@ fit_null <- function(B, Bs$u <- u Bs$gap <- gap - # Bs$score_stat <- 0 + # Bs$score_stat <- 0 lilBs <- Bs - # - for(k in 1:p) { - Bs$B[1:J + J*(k - 1)] <- B[k,] + # + for (k in 1:p) { + Bs$B[1:J + J * (k - 1)] <- B[k, ] } } - - loop_j_to_update <- j_to_update - + use_max_inner_it <- FALSE B_diff <- Inf - + criteria <- TRUE - - while(criteria) { - - iter <- iter + 1 #increment iteration - inner_iter <- 0 #initiate internal iteration - + + while (criteria) { + iter <- iter + 1 #increment iteration + inner_iter <- 0 #initiate internal iteration + #evaluate augmented Lagrangian - + # get current value of augmented lagrangian - curr_lag_val <- -ll + u*gap + (rho/2)*(gap^2) - + curr_lag_val <- -ll + u * gap + (rho / 2) * (gap^2) + old_gap <- gap # old value of g(B_k) - B_kj old_B <- B #old value of B inner_diff <- Inf #initiate "observed" max abs val diff in B between inner iterations - #at Inf so inner loop runs at least once + #at Inf so inner loop runs at least once #inner loop: - while(((inner_diff > inner_tol | use_max_inner_it) & inner_iter <= inner_maxit)| inner_iter == 0) { - + while ( + ((inner_diff > inner_tol | use_max_inner_it) & + inner_iter <= inner_maxit) | + inner_iter == 0 + ) { inner_old_B <- B - + #perform block update to B as described in null estimation section of Clausen & Willis (2024) - update <- macro_fisher_null(X = X, - Y = Y, - B = B, - z = z, - J = J, - p = p, - k_constr = k_constr, - j_constr = j_constr, - j_ref = j_ref, - rho = rho, - u = u, - max_step = max_step, - constraint_fn = constraint_fn, - constraint_grad_fn = constraint_grad_fn, - verbose = verbose, - c1 = 1e-4) + update <- macro_fisher_null( + X = X, + Y = Y, + B = B, + z = z, + J = J, + p = p, + k_constr = k_constr, + j_constr = j_constr, + j_ref = j_ref, + rho = rho, + u = u, + max_step = max_step, + constraint_fn = constraint_fn, + constraint_grad_fn = constraint_grad_fn, + verbose = verbose, + c1 = 1e-4 + ) + B <- B + update$update gap <- update$gap - z <- update_z(Y,X,B) - + z <- update_z(Y, X, B) + if (trackB) { - # - # ### AW notes 1/15/24 that dlag_lamhat_dB is not computed in this function. - # temp_Bs <- lilBs - for(k in 1:p) { - temp_Bs$B[1:J + J*(k - 1)] <- B[k,] + for (k in 1:p) { + temp_Bs$B[1:J + J * (k - 1)] <- B[k, ] } temp_Bs$iter <- iter temp_Bs$inner_iter <- inner_iter temp_Bs$rho <- rho temp_Bs$u <- u - # temp_Bs$Q <- NA temp_Bs$gap <- gap - # if (iter > 1 | inner_iter > 1) { - # jcounter <- 1 - # for(j in 1:J) { - # if (j == j_constr) { - # temp_Bs$dlag_dB[temp_Bs$j ==j] <- 0 - # } else{ - # - # ### maybe in - # - # # dlag_lamhat_dB <- dldB + lambda_hat*dgdB - # # temp_Bs$dlag_dB[temp_Bs$j ==j] <- dlag_lamhat_dB[(jcounter - 1)*p + 1:p,] - # # jcounter <- jcounter + 1 - # } - # } - # } - # - Bs <- rbind(Bs,temp_Bs) - # + Bs <- rbind(Bs, temp_Bs) } - + #did we move much? inner_diff <- max(abs(B - inner_old_B)) - + #increment inner iteration counter inner_iter <- inner_iter + 1 - } #did we move much since last *outer* loop iteration? 
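  # (B_diff below, together with the constraint gap, drives the outer stopping
  # rule; the dual update that follows is the standard augmented Lagrangian
  # step: u increases by rho*gap, and rho is inflated by tau whenever the gap
  # has not shrunk by at least a factor of kappa)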
B_diff <- max(abs(B - old_B)) - + if (verbose) { - message("Max absolute difference in B since last augmented Lagrangian outer step: ", signif (B_diff,3)) + message( + "Max absolute difference in B since last augmented Lagrangian outer step: ", + signif(B_diff, 3) + ) } - + #update u and rho - if ( abs(gap) > constraint_tol) { - u <- u + rho*gap - if (abs(gap/old_gap) > kappa) { - rho <- rho*tau - } + if (abs(gap) > constraint_tol) { + u <- u + rho * gap + if (abs(gap / old_gap) > kappa) { + rho <- rho * tau + } } - + if (verbose) { - message("Estimate deviates from feasibility by ",signif (gap,3)) - message("Parameter u set to ", signif (u,3)) - message("Parameter rho set to ", signif (rho,3)) + message("Estimate deviates from feasibility by ", signif(gap, 3)) + message("Parameter u set to ", signif(u, 3)) + message("Parameter rho set to ", signif(rho, 3)) } - + if (abs(gap) < constraint_tol) { use_max_inner_it <- TRUE - # message("Score stat: ",signif (check$score_stat*(n/(n - 1)),3)) - # message("Influence stat: ",signif (check$influence_stat,3)) - # proj_score <- as.numeric(as.matrix(check$influence_stat)) - } else{ + } else { use_max_inner_it <- FALSE } - + if (ignore_stop) { criteria <- (iter < maxit & !is.infinite(rho)) } else { - criteria <- (abs(gap) > constraint_tol | B_diff> B_tol) & iter < maxit & !is.infinite(rho) + criteria <- (abs(gap) > constraint_tol | B_diff > B_tol) & + iter < maxit & + !is.infinite(rho) } - - log_means <- X %*% B + - matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) - + + log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + ll <- sum(Y * log_means - exp(log_means)) it_df$lik[iter] <- ll it_df$max_abs_B[iter] <- B_diff it_df$constraint_diff[iter] <- abs(gap) - + if (verbose) { - message("Log likelihood: ", signif (ll,3)) + message("Log likelihood: ", signif(ll, 3)) } - + if (null_diagnostic_plots) { - score_res <- try(get_score_stat(X = X, Y = Y, X_cup = X_cup, B = B, k_constr = k_constr, - j_constr = j_constr, constraint_grad_fn = constraint_grad_fn, - indexes_to_remove = (j_ref - 1)*p + 1:p, j_ref = j_ref, J = J, - n = n, p = p)) + score_res <- try(get_score_stat( + X = X, + Y = Y, + X_cup = X_cup, + B = B, + k_constr = k_constr, + j_constr = j_constr, + constraint_grad_fn = constraint_grad_fn, + indexes_to_remove = (j_ref - 1) * p + 1:p, + j_ref = j_ref, + J = J, + n = n, + p = p + )) if (!inherits(score_res, "try-error")) { it_df$test_stat[iter] <- score_res$score_stat } } - + if (abs(gap) <= constraint_tol & B_diff <= B_tol) { converged <- TRUE } - } - + it_df <- it_df[1:iter, ] - - return(list("B" = B, - "k_constr" = k_constr, - "j_constr" = j_constr, - # "score_stats" = list(score_stats), - # "proj_score" = proj_score, - "niter" = iter, - "gap" = gap, - "u" = u, - "rho" = rho, - "Bs" = Bs, - "it_df" = it_df, - "converged" = converged)) -} + return(list( + "B" = B, + "k_constr" = k_constr, + "j_constr" = j_constr, + "niter" = iter, + "gap" = gap, + "u" = u, + "rho" = rho, + "Bs" = Bs, + "it_df" = it_df, + "converged" = converged + )) +} diff --git a/R/fit_null_discrete.R b/R/fit_null_discrete.R new file mode 100644 index 0000000..b036996 --- /dev/null +++ b/R/fit_null_discrete.R @@ -0,0 +1,84 @@ +#' @note +#' Needs higher tolerance than macro_fisher_null to get superior LL + +fit_null_discrete_pseudohuber <- function( + Y, + X, + k_constr, + j_constr, + j_ref, + trackB = TRUE, + maxit = 1000, + tol = 1e-8, + ls_max = 20, + ls_rho = 0.5, + max_step = 1 +) { + + n <- nrow(Y) + J <- ncol(Y) + p <- ncol(X) + + ### check 
we're in the right paradigm + distinct_xx <- unique(X) + stopifnot(ncol(X) == nrow(distinct_xx)) + stopifnot(all(X[,1] == 1)) ### need a baseline category to simplify the parametrization + X_wo_1s <- distinct_xx[, -1] + + + ## Rearrange to have j_ref be the last (J-th) column + ## Setup: + ## put j_ref to be the J-th column + ## put j_constr to be the 1st column + ## Later, permute cols of B-hat + Y_orig <- Y + new_order <- c(j_constr, setdiff(1:J, c(j_constr, j_ref)), j_ref) + Y <- Y_orig[, new_order] + + groups <- split( + seq_len(nrow(X)), # row indices of original data + apply(X, 1, function(r) + paste(r, collapse = "_")) # grouping key by row contents + ) + + groups <- groups[order(sapply(groups, min))] + + totals <- lapply(groups, function(x) { + apply(Y[x, , drop = FALSE], 2, sum) + }) + + Y_sum <- do.call(rbind, totals) + + ## this should be equivalent to fit_null with j_ref=J + out <- fit_null_discrete_micro_fs( + Y = Y_sum, + X = X_wo_1s, + k_constr=k_constr, + j_constr=1, ## because it's been moved to the first column + constraint_fn=function(x) { pseudohuber_median(c(x, 0)) }, + constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, + maxit = maxit, + tol = tol, + ls_max = ls_max, + ls_rho = ls_rho, + max_step = max_step + ) + + B <- out$B + + z <- update_z(Y, X, B) + log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + ll_new <- sum(Y * log_means - exp(log_means)) + + ### now reorder B to match what was input + return(list( + "B" = B[, order(new_order)], + "k_constr" = k_constr, + "j_constr" = j_constr, + "niter" = out$iter, + "loglik" = ll_new, + "it_df" = out$history, + "converged" = (out$iter < maxit) + )) + +} \ No newline at end of file diff --git a/R/fit_null_discrete_micro_fs.R b/R/fit_null_discrete_micro_fs.R new file mode 100644 index 0000000..f0cc27c --- /dev/null +++ b/R/fit_null_discrete_micro_fs.R @@ -0,0 +1,221 @@ +## this is equivalent to fit_null with j_ref=J +fit_null_discrete_micro_fs <- function(Y, X, + k_constr, + j_constr, # indices: row k* in 1..(p-1), column j* in 1..(J-1) + constraint_fn, constraint_grad_fn, # g: R^{m-1}->R on the constrained row; grad: R^{m-1}->R^{m-1} + maxit = 1000, + tol = 1e-8, + ls_max = 20, + ls_rho = 0.5, + ridge_base = 1e-4, + max_step_norm = 5, + max_step = 1) { + + ## using `alpha + beta*X` parametrisation, so decrease k_constr_non_intercept accordingly + k_constr_non_intercept <- k_constr - 1 + + Y <- as.matrix(Y); X <- as.matrix(X) + p <- nrow(Y); J <- ncol(Y); m <- J - 1L + if (nrow(X) != p || ncol(X) != p - 1L) stop("X must be p x (p-1).") + if (J < 3) stop("Need J >= 3 (one baseline).") + if (k_constr_non_intercept < 1L || k_constr_non_intercept > (p - 1L)) stop("k_constr_non_intercept must be in 1..(p-1).") + if (j_constr < 1L || j_constr > m) stop("j_constr must be in 1..(J-1).") + + n <- rowSums(Y) + + # Parameters + alpha <- rep(0, m) + beta <- matrix(0, nrow = p - 1L, ncol = m) + # initialize constrained element coherently (assume g excludes j_constr) + free_idx <- setdiff(seq_len(m), j_constr) + beta[k_constr_non_intercept, j_constr] <- constraint_fn(beta[k_constr_non_intercept, free_idx]) + + softmax_nb <- function(eta) { e <- exp(eta); e / (1 + sum(e)) } + + # Log-likelihood + loglik_fun <- function(alpha, beta) { + ll <- 0; #eps <- 1e-12 + for (i in 1:p) { + eta <- alpha + drop(X[i, , drop = FALSE] %*% beta) # length m + pnb <- softmax_nb(eta) + pfull <- c(pnb, 1 - sum(pnb)) + # pfull <- pmin(pmax(c(pnb, 1 - sum(pnb)), eps), 1 - eps) + ll <- ll + 
sum(Y[i, ] * log(pfull)) + } + ll + } + + # History + param_hist <- list() + record <- function(iter) { + rbind( + data.frame(iter = iter, type = "alpha", row = NA_integer_, col = seq_len(m), value = alpha), + data.frame(iter = iter, type = "beta", row = rep(seq_len(p - 1L), each = m), + col = rep(seq_len(m), times = p - 1L), value = as.vector(beta)) + ) + } + param_hist[[1]] <- record(0L) + + ll_old <- loglik_fun(alpha, beta); final_iter <- 0L + + for (iter in 1:maxit) { + # probs and M_i + p_list <- vector("list", p) + M_list <- vector("list", p) + for (i in 1:p) { + eta <- alpha + drop(X[i, , drop = FALSE] %*% beta) + pnb <- softmax_nb(eta) + p_list[[i]] <- pnb + M_list[[i]] <- diag(pnb) - tcrossprod(pnb) + } + + # scores + s_alpha <- rep(0, m) + s_beta <- matrix(0, nrow = p - 1L, ncol = m) + for (i in 1:p) { + s_i <- Y[i, 1:m] - n[i] * p_list[[i]] # length m + s_alpha <- s_alpha + s_i + # add outer product X[i,]^T * s_i + s_beta <- s_beta + tcrossprod(X[i, ], s_i) # (p-1) x m + } + + # reduced score: theta = (alpha, rows k != k*, full m; row k* free indices only) + s_theta <- c( + s_alpha, + as.vector(t(s_beta[setdiff(seq_len(p - 1L), k_constr_non_intercept), , drop = FALSE])), + # constrained row (chain rule) + { + grad_g <- constraint_grad_fn(beta[k_constr_non_intercept, free_idx]) + s_beta[k_constr_non_intercept, free_idx] + s_beta[k_constr_non_intercept, j_constr] * grad_g + } + ) + + if (sqrt(sum(s_theta^2)) < tol) { final_iter <- iter - 1L; break } + + # Fisher J in full phi = (alpha, vec(beta rows 1..p-1 each length m)) + q_dim <- m + (p - 1L) * m + Jmat <- matrix(0, q_dim, q_dim) + + # indices + alpha_idx <- 1:m + row_block_idx <- function(k) { # k in 1..(p-1), returns indices for row k in vec(beta) + start <- m + (k - 1L) * m + 1L + start:(start + m - 1L) + } + + for (i in 1:p) { + Mi <- M_list[[i]]; ni <- n[i]; Xi <- X[i, , drop = FALSE] # 1 x (p-1) + # alpha-alpha + Jmat[alpha_idx, alpha_idx] <- Jmat[alpha_idx, alpha_idx] + ni * Mi + # alpha-beta rows + for (k in 1:(p - 1L)) { + rb <- row_block_idx(k) + Jmat[alpha_idx, rb] <- Jmat[alpha_idx, rb] + ni * (Mi * Xi[1, k]) + Jmat[rb, alpha_idx] <- t(Jmat[alpha_idx, rb]) + } + # beta-beta rows + for (k in 1:(p - 1L)) { + for (l in 1:(p - 1L)) { + rbk <- row_block_idx(k); rbl <- row_block_idx(l) + Jmat[rbk, rbl] <- Jmat[rbk, rbl] + ni * (Mi * (Xi[1, k] * Xi[1, l])) + } + } + } + + # Transformation T: reduced theta -> full phi + # theta packs: alpha, then rows k != k* (full m), then row k* (m-1 free entries) + p_dim <- m + (p - 2L) * m + (m - 1L) + T <- matrix(0, nrow = q_dim, ncol = p_dim) + # alpha identity + T[alpha_idx, 1:m] <- diag(m) + col_ptr <- m + 1L + # rows k != k* + for (k in setdiff(seq_len(p - 1L), k_constr_non_intercept)) { + rb <- row_block_idx(k) + T[rb, col_ptr:(col_ptr + m - 1L)] <- diag(m) + col_ptr <- col_ptr + m + } + # row k* mapping B (m x (m-1)) + grad_g <- constraint_grad_fn(beta[k_constr_non_intercept, free_idx]) + Bmap <- matrix(0, nrow = m, ncol = m - 1L) + # identity on free indices + for (pos in seq_along(free_idx)) { + Bmap[free_idx[pos], pos] <- 1 + } + # constrained element row = grad_g^T + Bmap[j_constr, ] <- grad_g + T[row_block_idx(k_constr_non_intercept), col_ptr:(col_ptr + (m - 2L))] <- Bmap + + I_theta <- crossprod(T, Jmat %*% T) + + # Adaptive ridge solve + ridge_mult <- c(1, 10, 100, 1000, 1e4) + step_theta <- rep(NA_real_, p_dim); solved <- FALSE + for (rm in ridge_mult) { + Ir <- I_theta + (ridge_base * rm) * diag(p_dim) + st <- tryCatch(solve(Ir, s_theta), error = function(e) NA_real_) + 
if (all(is.finite(st))) { step_theta <- st; solved <- TRUE; break } + } + if (!solved) { final_iter <- iter - 1L; break } + + # Step capping + sn <- sqrt(sum(step_theta^2)) + if (sn > max_step_norm) step_theta <- step_theta * (max_step_norm / sn) + if (sqrt(sum(step_theta^2)) < tol) { final_iter <- iter - 1L; break } + + # Pack current reduced theta + pack_theta <- function(alpha, beta) { + c( + alpha, + as.vector(t(beta[setdiff(seq_len(p - 1L), k_constr_non_intercept), , drop = FALSE])), + beta[k_constr_non_intercept, free_idx] + ) + } + unpack_theta <- function(vec) { + wp <- 1L + a_new <- vec[wp:(wp + m - 1L)]; wp <- wp + m + beta_new <- beta + # rows k != k* + for (k in setdiff(seq_len(p - 1L), k_constr_non_intercept)) { + beta_new[k, ] <- vec[wp:(wp + m - 1L)]; wp <- wp + m + } + # row k* free + bfree <- vec[wp:(wp + (m - 2L))] + beta_new[k_constr_non_intercept, free_idx] <- bfree + beta_new[k_constr_non_intercept, j_constr] <- constraint_fn(bfree) + list(alpha = a_new, beta = beta_new) + } + + # Line search (monotone ℓ) + theta_old <- pack_theta(alpha, beta) + step_scale <- 1.0; accepted <- FALSE; ll_new <- ll_old + for (ls in 1:ls_max) { + prop <- unpack_theta(theta_old + step_scale * step_theta) + alpha_new <- prop$alpha + beta_new <- prop$beta + ll_cand <- loglik_fun(alpha_new, beta_new) + if (is.finite(ll_cand) && ll_cand >= ll_old) { + alpha <- alpha_new; beta <- beta_new; ll_new <- ll_cand; accepted <- TRUE; break + } + step_scale <- step_scale * ls_rho + } + if (!accepted) { final_iter <- iter - 1L; break } + + # record and check convergence + param_hist[[length(param_hist) + 1L]] <- record(iter) + if (abs(ll_new - ll_old) < tol) { ll_old <- ll_new; final_iter <- iter; break } + ll_old <- ll_new; final_iter <- iter + } + + B <- cbind(rbind(alpha, beta), 0) + rownames(B) <- NULL + + history <- do.call(rbind, param_hist) + history$k <- ifelse(is.na(history$row), 0, history$row) + 1 + history$j <- history$col + + list(B = B, + logLik = ll_old, + iter = final_iter, + history = history) +} diff --git a/R/fit_null_discrete_optim.R b/R/fit_null_discrete_optim.R new file mode 100644 index 0000000..84397b6 --- /dev/null +++ b/R/fit_null_discrete_optim.R @@ -0,0 +1,165 @@ +# ## n0: counts for x = 0, length J +# ## n1: counts for x = 1, length J +# ## gfun: function(beta_free2) -> scalar beta_1 +# ## beta_free2 is c(beta_2, ..., beta_{J-1}) +# ## gprimefun: function(beta_free2) -> numeric(J-2) +# ## gradient d beta_1 / d (beta_2,...,beta_{J-1}) +# +# loglik_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { +# J <- length(n0) +# Jm1 <- J - 1 +# +# ## Split parameters: +# ## alpha_1,...,alpha_{J-1}, beta_2,...,beta_{J-1} +# alpha <- par[1:Jm1] +# beta_free2 <- par[(J):(2*J - 3)] # length J-2: beta_2..beta_{J-1} +# +# ## Constrained beta_1 via g +# beta1 <- gfun(beta_free2) +# +# ## Full beta for categories 1..J-1 (baseline J has beta_J = 0) +# beta_full <- numeric(Jm1) +# beta_full[1] <- beta1 +# beta_full[2:Jm1] <- beta_free2 +# +# ## Group sizes +# N0 <- sum(n0) +# N1 <- sum(n1) +# +# ## x = 0: eta0_j = alpha_j, eta0_J = 0 +# eta0 <- alpha # length J-1 +# m0 <- max(c(0, eta0)) +# exp_eta0 <- exp(eta0 - m0) +# denom0 <- exp(0 - m0) + sum(exp_eta0) +# p0 <- exp_eta0 / denom0 # probs for j=1..J-1 +# p0J <- exp(0 - m0) / denom0 # prob for baseline J +# +# ## x = 1: eta1_j = alpha_j + beta_j, eta1_J = 0 +# eta1 <- alpha + beta_full +# m1 <- max(c(0, eta1)) +# exp_eta1 <- exp(eta1 - m1) +# denom1 <- exp(0 - m1) + sum(exp_eta1) +# p1 <- exp_eta1 / denom1 # probs for j=1..J-1 +# p1J 
<- exp(0 - m1) / denom1 +# +# ## Log-likelihood: +# ## ℓ = sum_j n0_j log p0_j + sum_j n1_j log p1_j +# ## with p0_J, p1_J included for category J +# ll0 <- sum(n0[1:Jm1] * log(p0)) + n0[J] * log(p0J) +# ll1 <- sum(n1[1:Jm1] * log(p1)) + n1[J] * log(p1J) +# +# ll0 + ll1 +# } +# +# grad_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { +# J <- length(n0) +# Jm1 <- J - 1 +# +# alpha <- par[1:Jm1] +# beta_free2 <- par[(J):(2*J - 3)] +# +# beta1 <- gfun(beta_free2) +# beta_full <- numeric(Jm1) +# beta_full[1] <- beta1 +# beta_full[2:Jm1] <- beta_free2 +# +# gprime <- gprimefun(beta_free2) # length J-2 +# +# N0 <- sum(n0) +# N1 <- sum(n1) +# +# ## x = 0 +# eta0 <- alpha +# m0 <- max(c(0, eta0)) +# exp_eta0 <- exp(eta0 - m0) +# denom0 <- exp(0 - m0) + sum(exp_eta0) +# p0 <- exp_eta0 / denom0 # j = 1..J-1 +# p0J <- exp(0 - m0) / denom0 +# +# ## x = 1 +# eta1 <- alpha + beta_full +# m1 <- max(c(0, eta1)) +# exp_eta1 <- exp(eta1 - m1) +# denom1 <- exp(0 - m1) + sum(exp_eta1) +# p1 <- exp_eta1 / denom1 # j = 1..J-1 +# p1J <- exp(0 - m1) / denom1 +# +# ## Gradient w.r.t. alpha_j (j=1..J-1): +# ## ∂ℓ/∂alpha_j = (n0_j - N0 p0_j) + (n1_j - N1 p1_j) +# grad_alpha <- (n0[1:Jm1] - N0 * p0) + (n1[1:Jm1] - N1 * p1) +# +# ## Unconstrained slope scores: +# ## ∂ℓ/∂beta_j = n1_j - N1 p1_j, j = 1..J-1 +# score_beta_full <- n1[1:Jm1] - N1 * p1 +# +# ## ∂ℓ/∂beta_1: +# S1 <- score_beta_full[1] +# +# ## For free beta_2..beta_{J-1}: +# ## ∂ℓ/∂beta_j_free = (n1_j - N1 p1_j) + S1 * ∂beta1/∂beta_j +# ## j = 2..J-1 +# S_direct <- score_beta_full[2:Jm1] # length J-2 +# grad_beta_free2 <- S_direct + S1 * gprime +# +# c(grad_alpha, grad_beta_free2) +# } +# +# fit_multinom_constrained_counts <- function(n0, n1, gfun, gprimefun, +# start = NULL) { +# J <- length(n0) +# if (length(n1) != J) stop("n0 and n1 must have the same length") +# if (J < 3) stop("Need at least 3 categories (J >= 3).") +# +# Jm1 <- J - 1 +# n_par <- (Jm1) + (Jm1 - 1) # alpha_1..alpha_{J-1}, beta_2..beta_{J-1} +# +# if (is.null(start)) { +# start <- rep(0, n_par) +# } +# +# fn <- function(par) { +# -loglik_counts_constrained(par, n0, n1, gfun, gprimefun) # optim minimizes +# } +# +# gr <- function(par) { +# -grad_counts_constrained(par, n0, n1, gfun, gprimefun) +# } +# +# opt <- optim(start, fn, gr, method = "BFGS", control = list(maxit = 10)) +# +# par_hat <- opt$par +# +# ## Reconstruct alpha_1..alpha_J and beta_1..beta_J +# alpha_hat <- par_hat[1:Jm1] +# beta_free2_hat <- par_hat[(J):(2*J - 3)] +# beta1_hat <- gfun(beta_free2_hat) +# +# beta_hat <- numeric(J) +# beta_hat[1] <- beta1_hat +# beta_hat[2:Jm1] <- beta_free2_hat +# beta_hat[J] <- 0 +# +# alpha_hat_full <- numeric(J) +# alpha_hat_full[1:Jm1] <- alpha_hat +# alpha_hat_full[J] <- 0 +# +# list( +# alpha = alpha_hat_full, +# beta = beta_hat, +# logLik = -opt$value, +# convergence = opt$convergence, +# optim = opt +# ) +# } +# +# # system.time({ +# # out_test3 <- +# fit_multinom_constrained_counts(n0=Y[which(X[,2] == 0), ] %>% colSums, +# n1 = Y[which(X[,2] == 1), ] %>% colSums, +# gfun=function(x) { pseudohuber_median(c(x, 0)) }, +# gprimefun= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} +# ) +# # }) +# out_test2$alpha +# out_test2$beta +# out_test3 diff --git a/R/macro_fisher_null.R b/R/macro_fisher_null.R index 29f7ecb..09cdb43 100644 --- a/R/macro_fisher_null.R +++ b/R/macro_fisher_null.R @@ -1,162 +1,172 @@ - -#block approximate Newton update to B used in +#block approximate Newton update to B used in #optimization under null -macro_fisher_null <- 
function(X,
-           Y,
-           B,
-           z,
-           J,
-           p,
-           k_constr,
-           j_constr,
-           j_ref,
-           rho,
-           u,
-           constraint_fn,
-           constraint_grad_fn,
-           stepsize = 0.5,
-           c1 = 1e-4,
-           regularization = 0.1,
-           max_step = 1,
-           verbose = FALSE,
-           debug = FALSE #return more information useful for
-           #debugging
-  ){
+macro_fisher_null <- function(
+  X,
+  Y,
+  B,
+  z,
+  J,
+  p,
+  k_constr,
+  j_constr,
+  j_ref,
+  rho,
+  u,
+  constraint_fn,
+  constraint_grad_fn,
+  stepsize = 0.5,
+  c1 = 1e-4,
+  regularization = 0.1,
+  max_step = 1,
+  verbose = FALSE,
+  debug = FALSE #return more information useful for
+  #debugging
+) {
   #list of log means... by taxon
   log_means_by_j <-
-    lapply(1:J,function(j)
-      X%*%B[,j,drop = FALSE] + z)
-
+    lapply(1:J, function(j) {
+      X %*% B[, j, drop = FALSE] + z
+    })
+
   #compute information matrix of *Poisson* log likelihood
   #for fixed z
-  info_bits <- lapply(log_means_by_j,
-                      function(log_mu){
-                        crossprod(X,diag(as.numeric(exp(log_mu))))%*%X
-                      })
-  info_diag <- do.call(c,lapply(info_bits,diag))
+  info_bits <- lapply(log_means_by_j, function(log_mu) {
+    crossprod(X, diag(as.numeric(exp(log_mu)))) %*% X
+  })
+  info_diag <- do.call(c, lapply(info_bits, diag))
   mean_diag <- mean(info_diag)
-
-  info_bits <- lapply(info_bits,
-                      function(x){
-                        diag(x) <- diag(x) + regularization*mean_diag;
-                        return(x)
-                      })
+
+  info_bits <- lapply(info_bits, function(x) {
+    diag(x) <- diag(x) + regularization * mean_diag
+    return(x)
+  })
   #invert info by inverting each block diagonal element
-  inv_info_by_j <- lapply(info_bits,
-                          function(x)
-                            qr.solve(x,tol = 1e-20))
+  inv_info_by_j <- lapply(info_bits, function(x) {
+    qr.solve(x, tol = 1e-20)
+  })
   #remove diagonal block corresponding to convenience constraint
   inv_info_by_j <- inv_info_by_j[-j_ref]
-
-  #stick the block diagonal elements together into one matrix to get 
+
+  #stick the block diagonal elements together into one matrix to get
   #inverse as an actual matrix
   info_inverse <- Matrix::bdiag(inv_info_by_j)
-
+
   #compute gradient of constraint function
-  cg <- constraint_grad_fn[[k_constr]](B[k_constr,])
+  cg <- constraint_grad_fn[[k_constr]](B[k_constr, ])
   cg[j_constr] <- cg[j_constr] - 1
   cg <- cg[-j_ref]
-
+
   #vector of p elements whose k_constr-th element is 1, all else 0
-  e_k_constr <- Matrix(0,nrow = p,ncol = 1)
-  e_k_constr[k_constr,] <- 1
-  #take kronecker producr with gradient of constraint to
+  e_k_constr <- Matrix(0, nrow = p, ncol = 1)
+  e_k_constr[k_constr, ] <- 1
+  #take kronecker product with gradient of constraint to
   #get a gradient in long format B (i.e., B_cup)
-  cg_expanded <- Matrix::kronecker(Matrix(cg,ncol = 1),e_k_constr)
-  v <- sqrt(rho)*cg_expanded
+  cg_expanded <- Matrix::kronecker(Matrix(cg, ncol = 1), e_k_constr)
+  v <- sqrt(rho) * cg_expanded
   #do sherman-morrison rank-1 update in a couple steps
-  sm_denom <- as.numeric(as.matrix(1 + Matrix::crossprod(v,info_inverse)%*%v))
-  sm_half_num <- info_inverse%*%v
-
-  #strictly speaking, info_inverse isn't the inverse info -- its the inverse of
+  sm_denom <- as.numeric(as.matrix(
+    1 + Matrix::crossprod(v, info_inverse) %*% v
+  ))
+  sm_half_num <- info_inverse %*% v
+
+  #strictly speaking, info_inverse isn't the inverse info -- it's the inverse of
   # (an approximation to) the hessian of the augmented lagrangian
-
+
   #compute derivative of augmented lagrangian
-  lag_deriv <- lapply(1:J,
-                      function(j){
-
-                        -t(X)%*%(Y[,j,drop = FALSE] - exp(log_means_by_j[[j]]))
-                      })
-  lag_deriv <- do.call(rbind,lag_deriv[-j_ref])
-  gap <- constraint_fn[[k_constr]](B[k_constr,]) - B[k_constr,j_constr]
-  lag_deriv <- lag_deriv + (u + rho*gap)*cg_expanded
-
-
+  lag_deriv <- lapply(1:J, function(j) {
+    -t(X) %*% (Y[, j, drop = FALSE] - exp(log_means_by_j[[j]]))
+  })
+  lag_deriv <- do.call(rbind, lag_deriv[-j_ref])
+  gap <- constraint_fn[[k_constr]](B[k_constr, ]) - B[k_constr, j_constr]
+  lag_deriv <- lag_deriv + (u + rho * gap) * cg_expanded
+
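+  # the (approximate) Hessian of the augmented Lagrangian is
+  # blockdiag(info_j) + rho * cg_expanded %*% t(cg_expanded), so its inverse
+  # is available from the block inverses via the Sherman-Morrison identity
+  # (A + v v')^{-1} = A^{-1} - (A^{-1} v)(A^{-1} v)' / (1 + v' A^{-1} v)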
+  #direction for (approximate) Newton step
-  update_dir <- -info_inverse %*%lag_deriv +
-    sm_half_num%*% Matrix::crossprod(sm_half_num,lag_deriv)/sm_denom
+  update_dir <- -info_inverse %*%
+    lag_deriv +
+    sm_half_num %*% Matrix::crossprod(sm_half_num, lag_deriv) / sm_denom
   #shorten step if any of its elements are larger than allowed by max_step
-  if(max(abs(update_dir), na.rm = TRUE)>max_step){
-    update_dir <- update_dir/max(abs(update_dir), na.rm = TRUE)
+  if (max(abs(update_dir), na.rm = TRUE) > max_step) {
+    update_dir <- update_dir / max(abs(update_dir), na.rm = TRUE)
   }
-
+
   # make sure update direction is never missing
   update_dir[is.na(update_dir)] <- 0
-
-  armijo_term <- c1*sum(update_dir*lag_deriv)
-
+
+  armijo_term <- c1 * sum(update_dir * lag_deriv)
+
   #update direction in long format
-  update_dir <- B_from_B_cup(update_dir,J - 1, p)
+  update_dir <- B_from_B_cup(update_dir, J - 1, p)
   #don't update betas for j_ref
-  update_dir <- do.call(cbind,list(update_dir[,(1:(J - 1))<j_ref,drop = FALSE],0,update_dir[,(1:(J - 1))>=j_ref,drop = FALSE]))
-
-
+  update_dir <- do.call(
+    cbind,
+    list(
+      update_dir[, (1:(J - 1)) < j_ref, drop = FALSE],
+      0,
+      update_dir[, (1:(J - 1)) >= j_ref, drop = FALSE]
+    )
+  )
+
   #in debug mode, compute full inverse info for backward compatibility
-  if(debug){
-    info_inverse <- info_inverse - Matrix::tcrossprod(sm_half_num)/sm_denom
+  if (debug) {
+    info_inverse <- info_inverse - Matrix::tcrossprod(sm_half_num) / sm_denom
   }
-
-
-
-
-
+
   #find stepsize that satisfies armijo rule
   accept <- FALSE
-
-  curr_lag_val <- do.call(sum,
-                          lapply(1:J, function(j)
-                            -Y[,j,drop = FALSE]*log_means_by_j[[j]] + exp(log_means_by_j[[j]]))) +
-    u*gap + (rho/2)*gap^2
-
-  while(!accept){
-    prop_B <- B + stepsize*update_dir
-    prop_gap <- constraint_fn[[k_constr]](prop_B[k_constr,]) - prop_B[k_constr,j_constr]
-
+
+  curr_lag_val <- do.call(
+    sum,
+    lapply(1:J, function(j) {
+      -Y[, j, drop = FALSE] * log_means_by_j[[j]] + exp(log_means_by_j[[j]])
+    })
+  ) +
+    u * gap +
+    (rho / 2) * gap^2
+
+  while (!accept) {
+    prop_B <- B + stepsize * update_dir
+    prop_gap <- constraint_fn[[k_constr]](prop_B[k_constr, ]) -
+      prop_B[k_constr, j_constr]
+
     prop_log_means_by_j <-
-      lapply(1:J,function(j)
-        X%*%prop_B[,j,drop = FALSE] + z)
-
-    prop_lag_val <- do.call(sum,
-                            lapply(1:J, function(j)
-                              -Y[,j,drop = FALSE]*prop_log_means_by_j[[j]] + exp(prop_log_means_by_j[[j]]))) +
-      u*prop_gap + (rho/2)*prop_gap^2
-
-    accept <- prop_lag_val <= curr_lag_val + armijo_term*stepsize
-    stepsize <- stepsize/2
+      lapply(1:J, function(j) {
+        X %*% prop_B[, j, drop = FALSE] + z
+      })
+
+    prop_lag_val <- do.call(
+      sum,
+      lapply(1:J, function(j) {
+        -Y[, j, drop = FALSE] *
+          prop_log_means_by_j[[j]] +
+          exp(prop_log_means_by_j[[j]])
+      })
+    ) +
+      u * prop_gap +
+      (rho / 2) * prop_gap^2
+
+    accept <- prop_lag_val <= curr_lag_val + armijo_term * stepsize
+    stepsize <- stepsize / 2
     # stepsize <- stepsize*0.9
-
+
     # steps <- c(steps,stepsize)
     # props <- c(props,prop_lag_val)
   }
-  stepsize <- stepsize*2
-  if (stepsize < 1e-2 & verbose) { 
+  stepsize <- stepsize * 2
+  if (stepsize < 1e-2 & verbose) {
     #ideally, stepsize doesn't get super small and we never see this message
     #but it can be helpful if something is going wrong
-    message("Small stepsize: 
",signif(stepsize,2)) + message("Small stepsize: ", signif(stepsize, 2)) } if (debug) { - return(list(update = update_dir*stepsize,gap = prop_gap, - lag_deriv = lag_deriv, - info_inverse = info_inverse, - stepsize = stepsize)) - } else{ - return(list(update = update_dir*stepsize,gap = prop_gap)) - + return(list( + update = update_dir * stepsize, + gap = prop_gap, + lag_deriv = lag_deriv, + info_inverse = info_inverse, + stepsize = stepsize + )) + } else { + return(list(update = update_dir * stepsize, gap = prop_gap)) } - - } diff --git a/R/my_fs_stable_two_groups.R b/R/my_fs_stable_two_groups.R new file mode 100644 index 0000000..25a9c33 --- /dev/null +++ b/R/my_fs_stable_two_groups.R @@ -0,0 +1,240 @@ +## this is equivalent to fit_null with k_constr=2, j_constr=1, j_ref=J, +my_fs_stable_two_groups <- function(n0, + n1, + g_beta, + g_beta_grad, + maxit = 1000, + tol = 1e-8, + ls_max = 20, # max step-halving + ls_rho = 0.5) { # shrink factor for step-halving + + K <- length(n0) + Km <- K - 1 # m + + if (K < 3) { + stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") + } + + # Parameters: categories 1..K-1 (baseline K is 0) + alpha <- rep(0, Km) + beta <- rep(0, Km) + + # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1}) + beta[1] <- g_beta(beta[2:Km]) + + n0p <- sum(n0) + n1p <- sum(n1) + + # Softmax probabilities for x = 0 and x = 1 (excluding baseline) + p0fun <- function(a) { + ea <- exp(a) + ea / (1 + sum(ea)) + } + p1fun <- function(a, b) { + eb <- exp(a + b) + eb / (1 + sum(eb)) + } + + # Log-likelihood given alpha, beta + loglik_fun <- function(alpha, beta) { + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + p0_full <- c(p0, 1 - sum(p0)) + p1_full <- c(p1, 1 - sum(p1)) + + eps <- 1e-12 + p0_full <- pmin(pmax(p0_full, eps), 1 - eps) + p1_full <- pmin(pmax(p1_full, eps), 1 - eps) + + sum(n0 * log(p0_full)) + sum(n1 * log(p1_full)) + } + + # ---- history tracking ---- + param_history <- list() + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = 0L, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = 0L, + type = "beta", + k = seq_len(Km), + value = beta) + ) + # --------------------------- + + ll_old <- loglik_fun(alpha, beta) + final_iter <- 0L + + # tuning constants for stability + ridge_base <- 1e-4 # base ridge + max_step_norm <- 5 # max Euclidean norm of step in θ + clip_logit_max <- 15 # clip for alpha/beta components + + for (iter in 1:maxit) { + + # Probabilities for non-baseline categories 1..K-1 + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + # ----- SCORES (gradients) ----- + # Unconstrained score wrt alpha[1:Km] and beta[1:Km] + s_alpha_full <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1 + s_beta_full <- n1[1:Km] - n1p * p1 + + # Beta free gradient via chain rule (k = 2..Km) + grad_g_beta <- g_beta_grad(beta[2:Km]) # length Km-1 + s_beta_free <- s_beta_full[2:Km] + s_beta_full[1] * grad_g_beta + + # Score vector for reduced parameters θ = (alpha[1:Km], beta_free[1:(Km-1)]) + score_theta <- c(s_alpha_full, s_beta_free) + + # Convergence: score norm + score_norm <- sqrt(sum(score_theta^2)) + if (score_norm < tol) { + final_iter <- iter - 1L + break + } + + # ----- FULL FISHER INFORMATION (α, β_full) ----- + # M0, M1 (m x m) + M0 <- diag(p0) - tcrossprod(p0) + M1 <- diag(p1) - tcrossprod(p1) + + Iaa <- n0p * M0 + n1p * M1 # alpha-alpha + Iab <- n1p * M1 # alpha-beta + Ibb <- n1p * M1 # beta-beta + + # Full J (2m x 2m) + J_top <- cbind(Iaa, Iab) + J_bottom <- cbind(t(Iab), Ibb) + J <- 
rbind(J_top, J_bottom) + + # ----- TRANSFORMATION T from θ to φ = (alpha, beta_full) ----- + m <- Km + p <- 2 * m - 1 # dim(θ) + q <- 2 * m # dim(φ) + + T <- matrix(0, nrow = q, ncol = p) + # alpha-part: identity + T[1:m, 1:m] <- diag(m) + + # beta-part: B (m x (m-1)) at rows (m+1)..(2m), cols (m+1)..(2m-1) + B <- matrix(0, nrow = m, ncol = m - 1) + B[1, ] <- grad_g_beta # row for beta1 + if (m > 1) { + B[2:m, ] <- diag(m - 1) # rows beta2..betam + } + T[(m + 1):(2 * m), (m + 1):p] <- B + + # Fisher info for θ + I_theta <- crossprod(T, J %*% T) # T^T J T + + # ----- solve for step with adaptive ridge ----- + ridge_multipliers <- c(1, 10, 100, 1000, 1e4) + step_theta <- rep(NA_real_, length(score_theta)) + solved <- FALSE + + for (rm in ridge_multipliers) { + ridge <- ridge_base * rm + I_theta_r <- I_theta + ridge * diag(p) + step_try <- tryCatch( + solve(I_theta_r, score_theta), + error = function(e) NA_real_ + ) + if (all(is.finite(step_try))) { + step_theta <- step_try + solved <- TRUE + break + } + } + + if (!solved) { + final_iter <- iter - 1L + break + } + + # Cap step length + step_norm <- sqrt(sum(step_theta^2)) + if (step_norm > max_step_norm) { + step_theta <- step_theta * (max_step_norm / step_norm) + } + + # If step is essentially zero, we're done + if (sqrt(sum(step_theta^2)) < tol) { + final_iter <- iter - 1L + break + } + + # ----- Step-halving line search for robustness ----- + step_scale <- 1.0 + ll_new <- ll_old + accepted <- FALSE + + for (ls_iter in 1:ls_max) { + + theta_old <- c(alpha, beta[2:Km]) + theta_new <- theta_old + step_scale * step_theta + + alpha_new <- theta_new[1:Km] + beta_free_new <- theta_new[(Km + 1):(2 * Km - 1)] + + beta_new <- beta + beta_new[2:Km] <- beta_free_new + beta_new[1] <- g_beta(beta_new[2:Km]) + + # clip logits to avoid numeric blowups + alpha_new <- pmax(pmin(alpha_new, clip_logit_max), -clip_logit_max) + beta_new <- pmax(pmin(beta_new, clip_logit_max), -clip_logit_max) + + ll_candidate <- loglik_fun(alpha_new, beta_new) + + if (is.finite(ll_candidate) && ll_candidate >= ll_old) { + alpha <- alpha_new + beta <- beta_new + ll_new <- ll_candidate + accepted <- TRUE + break + } else { + step_scale <- step_scale * ls_rho + } + } + + if (!accepted) { + final_iter <- iter - 1L + break + } + + # record parameters after accepted update + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = iter, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = iter, + type = "beta", + k = seq_len(Km), + value = beta) + ) + + if (abs(ll_new - ll_old) < tol) { + ll_old <- ll_new + final_iter <- iter + break + } + + ll_old <- ll_new + final_iter <- iter + } + + param_df <- do.call(rbind, param_history) + + list( + alpha = alpha, + beta = beta, + logLik = ll_old, + iter = final_iter, + history = param_df + ) +} diff --git a/R/my_gd_fs.R b/R/my_gd_fs.R new file mode 100644 index 0000000..1666255 --- /dev/null +++ b/R/my_gd_fs.R @@ -0,0 +1,214 @@ +my_gd_fs <- function(n0, n1, + g_beta, g_beta_grad, + eta_alpha = 1e-3, # kept for signature; not used + eta_beta = 1e-3, # kept for signature; not used + maxit = 1000, + B_tol = 1e-8, + ls_max = 20, # step-halving max + ls_rho = 0.5) { # step shrink factor + + K <- length(n0) + Km <- K - 1 # m + + if (K < 3) { + stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") + } + + # Parameters: categories 1..K-1 (baseline K is 0) + alpha <- rep(0, Km) + beta <- rep(0, Km) + + # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1}) + beta[1] <- 
g_beta(beta[2:Km])
+
+  n0p <- sum(n0)
+  n1p <- sum(n1)
+
+  # Softmax probabilities for x = 0 and x = 1 (excluding baseline)
+  p0fun <- function(a) {
+    ea <- exp(a)
+    ea / (1 + sum(ea))
+  }
+  p1fun <- function(a, b) {
+    eb <- exp(a + b)
+    eb / (1 + sum(eb))
+  }
+
+  # Log-likelihood given alpha, beta
+  loglik_fun <- function(alpha, beta) {
+    p0 <- p0fun(alpha)
+    p1 <- p1fun(alpha, beta)
+
+    p0_full <- c(p0, 1 - sum(p0))
+    p1_full <- c(p1, 1 - sum(p1))
+
+    eps <- 1e-12
+    p0_full <- pmin(pmax(p0_full, eps), 1 - eps)
+    p1_full <- pmin(pmax(p1_full, eps), 1 - eps)
+
+    sum(n0 * log(p0_full)) + sum(n1 * log(p1_full))
+  }
+
+  # ---- history tracking ----
+  param_history <- list()
+  param_history[[length(param_history) + 1]] <- rbind(
+    data.frame(iter = 0L,
+               type = "alpha",
+               k = seq_len(Km),
+               value = alpha),
+    data.frame(iter = 0L,
+               type = "beta",
+               k = seq_len(Km),
+               value = beta)
+  )
+  # ---------------------------
+
+  ll_old <- loglik_fun(alpha, beta)
+  final_iter <- 0L
+
+  for (iter in 1:maxit) {
+
+    # Probabilities for non-baseline categories 1..K-1
+    p0 <- p0fun(alpha)
+    p1 <- p1fun(alpha, beta)
+
+    # ----- SCORES (gradients) -----
+    # Unconstrained score wrt alpha[1:Km] and beta[1:Km]
+    s_alpha_full <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1
+    s_beta_full <- n1[1:Km] - n1p * p1
+
+    # Beta free gradient via chain rule (k = 2..Km)
+    grad_g_beta <- g_beta_grad(beta[2:Km]) # length Km-1
+    s_beta_free <- s_beta_full[2:Km] + s_beta_full[1] * grad_g_beta
+
+    # Score vector for reduced parameters θ = (alpha[1:Km], beta_free[1:(Km-1)])
+    score_theta <- c(s_alpha_full, s_beta_free)
+
+    # Convergence: score norm
+    score_norm <- sqrt(sum(score_theta^2))
+    # if (score_norm < B_tol) {
+    #   final_iter <- iter - 1L
+    #   break
+    # }
+
+    # ----- FULL FISHER INFORMATION (α, β_full) -----
+    # M0, M1 (m x m)
+    M0 <- diag(p0) - tcrossprod(p0)
+    M1 <- diag(p1) - tcrossprod(p1)
+
+    Iaa <- n0p * M0 + n1p * M1 # alpha-alpha
+    Iab <- n1p * M1 # alpha-beta
+    Ibb <- n1p * M1 # beta-beta
+
+    # Full J (2m x 2m)
+    J_top <- cbind(Iaa, Iab)
+    J_bottom <- cbind(t(Iab), Ibb)
+    J <- rbind(J_top, J_bottom)
+
+    # ----- TRANSFORMATION T from θ to φ = (alpha, beta_full) -----
+    m <- Km
+    p <- 2 * m - 1 # dim(θ)
+    q <- 2 * m # dim(φ)
+
+    T <- matrix(0, nrow = q, ncol = p)
+    # alpha-part: identity
+    T[1:m, 1:m] <- diag(m)
+
+    # beta-part: B (m x (m-1)) at rows (m+1)..(2m), cols (m+1)..(2m-1)
+    B <- matrix(0, nrow = m, ncol = m - 1)
+    B[1, ] <- grad_g_beta # row for beta1
+    if (m > 1) {
+      B[2:m, ] <- diag(m - 1) # rows beta2..betam
+    }
+    T[(m + 1):(2 * m), (m + 1):p] <- B
+
+    # Fisher info for θ
+    I_theta <- crossprod(T, J %*% T) # T^T J T
+
+    # Add tiny ridge for numerical stability if needed
+    ridge <- 1e-8
+    I_theta_r <- I_theta + ridge * diag(p)
+
+    # Fisher scoring step: θ_new = θ_old + I^{-1} S
+    step_theta <- tryCatch(
+      solve(I_theta_r, score_theta),
+      error = function(e) {
+        # In case of numerical issues, break
+        rep(0, length(score_theta))
+      }
+    )
+
+    # If step is essentially zero, we're done
+    # if (sqrt(sum(step_theta^2)) < B_tol) {
+    if (max(abs(step_theta)) < B_tol) {
+      final_iter <- iter - 1L
+      break
+    }
+
+    # ----- Step-halving line search for robustness -----
+    step_scale <- 1.0
+    ll_new <- ll_old
+    accepted <- FALSE
+
+    for (ls_iter in 1:ls_max) {
+
+      theta_new <- c(alpha, beta[2:Km]) + step_scale * step_theta
+      alpha_new <- theta_new[1:Km]
+      beta_free_new <- theta_new[(Km + 1):(2 * Km - 1)]
+
+      beta_new <- beta
+      beta_new[2:Km] <- beta_free_new
+      beta_new[1] <- g_beta(beta_new[2:Km])
+
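+      # every trial point stays feasible: only the free coordinates move along
+      # the scaled step, and beta1 is recomputed from the constraint g before
+      # the candidate log-likelihood is evaluated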
+      ll_candidate <- loglik_fun(alpha_new, beta_new)
+
+      if (ll_candidate >= ll_old) {
+        alpha <- alpha_new
+        beta <- beta_new
+        ll_new <- ll_candidate
+        accepted <- TRUE
+        break
+      } else {
+        step_scale <- step_scale * ls_rho
+      }
+    }
+
+    if (!accepted) {
+      final_iter <- iter - 1L
+      break
+    }
+
+    # record parameters after accepted update
+    param_history[[length(param_history) + 1]] <- rbind(
+      data.frame(iter = iter,
+                 type = "alpha",
+                 k = seq_len(Km),
+                 value = alpha),
+      data.frame(iter = iter,
+                 type = "beta",
+                 k = seq_len(Km),
+                 value = beta)
+    )
+
+    if (abs(ll_new - ll_old) < B_tol) {
+      ll_old <- ll_new
+      final_iter <- iter
+      break
+    }
+
+    ll_old <- ll_new
+    final_iter <- iter
+  }
+
+  param_df <- do.call(rbind, param_history)
+
+  list(
+    alpha = alpha,
+    beta = beta,
+    logLik = ll_old,
+    iter = final_iter,
+    history = param_df
+  )
+}
diff --git a/R/my_gd_ls.R b/R/my_gd_ls.R
new file mode 100644
index 0000000..3c89289
--- /dev/null
+++ b/R/my_gd_ls.R
@@ -0,0 +1,224 @@
+my_gd_ls <- function(n0, n1,
+                     g_beta, g_beta_grad,
+                     eta_alpha = 1e-3,
+                     eta_beta = 1e-3,
+                     maxit = 1000,
+                     tol = 1e-8,
+                     ls_max = 30, # max backtracking steps
+                     ls_c = 1e-4, # Armijo constant
+                     ls_rho = 0.5) { # backtracking shrink factor
+
+  K <- length(n0)
+  Km <- K - 1
+
+  if (K < 3) {
+    stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.")
+  }
+
+  # Parameters for categories 1..K-1 (baseline K is 0)
+  alpha <- rep(0, Km)
+  beta <- rep(0, Km)
+
+  # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1})
+  beta[1] <- g_beta(beta[2:Km])
+
+  n0p <- sum(n0)
+  n1p <- sum(n1)
+
+  # Softmax probabilities for x = 0 and x = 1 (excluding baseline)
+  p0fun <- function(a) {
+    ea <- exp(a)
+    ea / (1 + sum(ea))
+  }
+  p1fun <- function(a, b) {
+    eb <- exp(a + b)
+    eb / (1 + sum(eb))
+  }
+
+  # Log-likelihood given alpha, beta
+  loglik_fun <- function(alpha, beta) {
+    p0 <- p0fun(alpha)
+    p1 <- p1fun(alpha, beta)
+
+    p0_full <- c(p0, 1 - sum(p0))
+    p1_full <- c(p1, 1 - sum(p1))
+
+    # clip probs to avoid log(0)
+    eps <- 1e-12
+    p0_full <- pmin(pmax(p0_full, eps), 1 - eps)
+    p1_full <- pmin(pmax(p1_full, eps), 1 - eps)
+
+    sum(n0 * log(p0_full)) + sum(n1 * log(p1_full))
+  }
+
+  # ---- history tracking ----
+  param_history <- list()
+  # iteration 0 (initial)
+  param_history[[length(param_history) + 1]] <- rbind(
+    data.frame(iter = 0L,
+               type = "alpha",
+               k = seq_len(Km),
+               value = alpha),
+    data.frame(iter = 0L,
+               type = "beta",
+               k = seq_len(Km),
+               value = beta)
+  )
+  # ---------------------------
+
+  # Initial log-likelihood
+  ll_old <- loglik_fun(alpha, beta)
+
+  final_iter <- 0L
+
+  for (iter in 1:maxit) {
+
+    # Probabilities for non-baseline categories 1..K-1
+    p0 <- p0fun(alpha)
+    p1 <- p1fun(alpha, beta)
+
+    # Unconstrained gradients wrt alpha[1:Km] and beta[1:Km]
+    dlda <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1
+    dldb <- n1[1:Km] - n1p * p1
+
+    # Gradient for beta free coordinates k = 2..Km (chain rule)
+    grad_g_beta <- g_beta_grad(beta[2:Km]) # length K-2
+    grad_beta_free <- dldb[2:Km] + dldb[1] * grad_g_beta # length K-2
+
+    # Gradient norm for convergence check (use α and free β)
+    grad_norm <- sqrt(sum(dlda^2) + sum(grad_beta_free^2))
+    if (grad_norm < tol) {
+      final_iter <- iter - 1L
+      break
+    }
+
+    # Search directions (gradient ascent)
+    dir_alpha <- eta_alpha * dlda
+    dir_beta_free <- eta_beta * grad_beta_free
+
+    # Directional magnitude for Armijo:
+    # D = η_alpha * ||grad_alpha||^2 + η_beta * ||grad_beta_free||^2
+    D <- eta_alpha * sum(dlda^2) + eta_beta * 
sum(grad_beta_free^2) + if (D <= 0) { + final_iter <- iter - 1L + break # gradients essentially zero + } + + # Backtracking line search + step_scale <- 1.0 + ll_new <- ll_old + accepted <- FALSE + + for (ls_iter in 1:ls_max) { + + # Candidate alpha + alpha_new <- alpha + step_scale * dir_alpha + + # Candidate beta free coords + beta_free_new <- beta[2:Km] + step_scale * dir_beta_free + + # Re-enforce beta constraint + beta_new <- beta + beta_new[2:Km] <- beta_free_new + beta_new[1] <- g_beta(beta_new[2:Km]) + + ll_candidate <- loglik_fun(alpha_new, beta_new) + + # Armijo condition for maximization: + # ℓ(new) >= ℓ(old) + c * step_scale * D + if (ll_candidate >= ll_old + ls_c * step_scale * D) { + ll_new <- ll_candidate + alpha <- alpha_new + beta <- beta_new + accepted <- TRUE + break + } else { + step_scale <- step_scale * ls_rho + } + } + + # If no step accepted, treat as convergence/stall + if (!accepted) { + final_iter <- iter - 1L + break + } + + # record parameters *after* this accepted update + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = iter, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = iter, + type = "beta", + k = seq_len(Km), + value = beta) + ) + + # Monotone increase in log-likelihood at accepted steps + if (abs(ll_new - ll_old) < tol) { + ll_old <- ll_new + final_iter <- iter + break + } + + ll_old <- ll_new + final_iter <- iter + } + + # Bind history into a single data.frame + param_df <- do.call(rbind, param_history) + + list( + alpha = alpha, + beta = beta, + logLik = ll_old, + iter = final_iter, + history = param_df + ) +} + +# fn3_1_J <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, +# constraint_fn=list(pseudohuber_median, pseudohuber_median), +# constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), +# B_tol=1e-8, constraint_tol=1e-8) +# +# out_test2 <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, +# n1 = Y[which(X[,2] == 1), ] %>% colSums, +# g_beta=function(x) { pseudohuber_median(c(x, 0)) }, +# g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, +# eta_alpha = 1e-3, +# eta_beta = 1e-3, +# maxit = 1000, +# tol = 1e-6) +# +# out_test$alpha +# out_test$beta +# fn3_1_J$B # j_constr=1, j_ref=J +# +# n0s <- Y[which(X[,2] == 0), ] %>% colSums +# n1s <- Y[which(X[,2] == 1), ] %>% colSums +# n0s_mod <- n0s; n0s_mod[1] <- n0s[1] + n1s[1] +# +# +# Y +# logit <- function(x) log(x/(1-x)) +# lg <- logit(n0s/sum(n0s)) +# lg +# lg - lg[J] +# +# logit(n0s_mod/sum(n0s_mod)) - logit(n0s_mod/sum(n0s_mod))[J] +# +# +# pseudohuber_median_mod(out_test$alpha[-1]) +# pseudohuber_median_mod(out_test$beta[-1]) +# +# +# out_test +# fn3_1_J$B +# +# pseudohuber_median_mod(out_test$alpha[-1]) +# pseudohuber_median_mod(out_test$beta[-1]) + + + diff --git a/tests/testthat/test-fit_null_discrete.R b/tests/testthat/test-fit_null_discrete.R new file mode 100644 index 0000000..6ffddd4 --- /dev/null +++ b/tests/testthat/test-fit_null_discrete.R @@ -0,0 +1,225 @@ + +test_that("new discrete is correct", { + + # set.seed(1) + n <- 40 + J <- 30 + p <- 5 + beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J) + X <- cbind(1, + c(rep(0, n/2), rep(1, n/2)), + c(rep(0, 5*n/8), rep(1, 3*n/8)), + c(rep(0, 3*n/4), rep(1, n/4)), + c(rep(0, 7*n/8), rep(1, n/8))) + Y <- matrix(rpois(n=n*J, lambda=exp(1 + X %*% beta)), ncol = J) + Y <- pmax(Y, 1) + + my_jstar <- 9 + my_kstar <- 4 + my_jref <- J + + t_discrete <- system.time({ + out_discrete <- 
fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+  })
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete$B[my_kstar, -my_jstar]),
+               out_discrete$B[my_kstar, my_jstar])
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete$B[, my_jref],
+               rep(0, p))
+
+  ## check permutation invariant
+  X2 <- X[, c(1, (p:2))]
+  new_order <- c(sample(1:(J-1), J-1), J)
+  new_j_star <- which(new_order == my_jstar)
+  Y2 <- Y[, new_order]
+  t_discrete2 <- system.time({
+    out_discrete2 <- fit_null_discrete_pseudohuber(Y = Y2, X = X2, k_constr = which(c(1, (p:2)) == my_kstar),
+                                                   j_constr = new_j_star, j_ref = my_jref,
+                                                   tol = 1e-12)
+  })
+
+  z_discrete <- update_z(Y, X, out_discrete$B)
+  log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete))
+
+  z_discrete2 <- update_z(Y2, X2, out_discrete2$B)
+  log_means_discrete2 <- X2 %*% out_discrete2$B + matrix(z_discrete2, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete2 <- sum(Y2 * log_means_discrete2 - exp(log_means_discrete2))
+
+  ## expect them to be close in estimates and likelihood
+  expect_true(max(abs(out_discrete$B[, new_order] - out_discrete2$B[c(1, (p:2)), ])) < 1e-4)
+  expect_equal(ll_new_discrete2, ll_new_discrete)
+
+})
+
+
+
+test_that("new discrete is correct with flexible jref and jstar", {
+
+  # set.seed(3)
+  n <- 24
+  J <- 10
+  p <- 5
+  beta <- matrix(rnorm(p*J, mean = 1, sd=1), ncol = J)
+  X <- cbind(1,
+             c(rep(0, n/2), rep(1, n/2)),
+             c(rep(0, 5*n/8), rep(1, 3*n/8)),
+             c(rep(0, 3*n/4), rep(1, n/4)),
+             c(rep(0, 7*n/8), rep(1, n/8)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(1 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
+
+  my_jstar <- 9
+  my_jref <- 1
+  my_kstar <- 4
+
+  out_discrete91 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete91$B[my_kstar, -my_jstar]),
+               out_discrete91$B[my_kstar, my_jstar])
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete91$B[, my_jref],
+               rep(0, p))
+
+  my_jstar <- 4
+  my_jref <- 3
+
+  out_discrete43 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete43$B[my_kstar, -my_jstar]),
+               out_discrete43$B[my_kstar, my_jstar])
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete43$B[, my_jref],
+               rep(0, p))
+
+  my_jstar <- 4
+  my_jref <- 6
+
+  out_discrete46 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## changing j_ref should shift each row of B by a constant, so the row-wise
+  ## differences should have zero spread
+  expect_equal((out_discrete43$B - out_discrete46$B) %>% round(4) %>% apply(1, sd) %>% max, 0)
+
+})
+
+
+
+
+test_that("new discrete is fast", {
+
+  skip("Too slow for automated testing; 20 mins on laptop")
+
+  set.seed(2)
+  n <- 12
+  J <- 10
+  p <- 3
+  beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J)
+  X <- cbind(1, c(rep(0, n/2), rep(1, n/2)), c(rep(0, 3*n/4), rep(1, n/4)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
+
+  my_jstar <- J-1
+  my_kstar <- 2
+  my_jref <- J
+
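+  ## note: fit_null_discrete_pseudohuber collapses Y to one vector of column
+  ## totals per distinct covariate pattern before fitting, which is the main
+  ## source of the speedup measured below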
+  t_discrete <- system.time({
+    out_discrete <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+  })
+
+  t_orig <- system.time({
+    out_orig <- fit_null(B=matrix(0, nrow = p, ncol = J),
+                         Y=Y, X = X,
+                         k_constr=my_kstar, j_constr=my_jstar, j_ref=my_jref,
+                         constraint_fn=list(pseudohuber_median, pseudohuber_median),
+                         constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx),
+                         B_tol=1e-8, constraint_tol=1e-10, maxit = 1e8)
+  }) # 816.931 489.308 1317.577
+
+  ## check faster
+  expect_lt(t_discrete[3], t_orig[3])
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete$B[my_kstar, -my_jstar]),
+               out_discrete$B[my_kstar, my_jstar])
+
+  expect_equal(pseudohuber_median(out_orig$B[my_kstar, -my_jstar]),
+               out_orig$B[my_kstar, my_jstar])
+
+  expect_lte(max(abs(out_discrete$B - out_orig$B)), 1e-3)
+
+  z_discrete <- update_z(Y, X, out_discrete$B)
+  log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete))
+
+  z_orig <- update_z(Y, X, out_orig$B)
+  log_means_orig <- X %*% out_orig$B + matrix(z_orig, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_orig <- sum(Y * log_means_orig - exp(log_means_orig))
+
+  ## expect the new likelihood to be no more than 1% lower
+  expect_true(ll_new_discrete > ll_new_orig*0.99)
+
+})
+
+
+test_that("new discrete null aligns with older code", {
+
+  skip("Too slow for automated testing; 3 mins on laptop")
+
+  set.seed(4)
+  n <- 12
+  J <- 5
+  p <- 3
+  beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J)
+  X <- cbind(1, c(rep(0, n/3), rep(1, n/3), rep(0, n/3)), c(rep(0, n/3), rep(0, n/3), rep(1, n/3)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
+
+  my_jstar <- J-2
+  my_jref <- J-1
+  my_kstar <- 2
+
+  t_discrete <- system.time({
+    out_discrete <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+  })
+
+  t_orig <- system.time({
+    out_orig <- fit_null(B=matrix(0, nrow = p, ncol = J),
+                         Y=Y, X = X,
+                         k_constr=my_kstar, j_constr=my_jstar, j_ref=my_jref,
+                         constraint_fn=list(pseudohuber_median, pseudohuber_median),
+                         constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx),
+                         B_tol=1e-3, constraint_tol=1e-6, maxit = 1e8)
+  }) # 94.249 66.621 163.194
+
+  ## check faster
+  expect_lt(t_discrete[3], t_orig[3])
+
+  ## check align
+  expect_lte(max(abs(out_discrete$B - out_orig$B)), 0.02)
+
+  ## check no worse
+  z_discrete <- update_z(Y, X, out_discrete$B)
+  log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete))
+
+  z_orig <- update_z(Y, X, out_orig$B)
+  log_means_orig <- X %*% out_orig$B + matrix(z_orig, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_orig <- sum(Y * log_means_orig - exp(log_means_orig))
+
+  ## expect the new likelihood to be no more than 1% lower
+  expect_true(ll_new_discrete > ll_new_orig*0.99)
+
+})
\ No newline at end of file
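
A quick numerical sanity check for dpseudohuber_median_d2x (an illustrative
sketch, assuming pseudohuber_median, dpseudohuber_median_dx, and
dpseudohuber_median_d2x are all loaded): the returned Hessian should agree
with a central finite difference of the gradient dpseudohuber_median_dx.

x <- rnorm(8)
h <- 1e-6
H_analytic <- dpseudohuber_median_d2x(x, d = 0.1)
# column q: finite-difference derivative of the gradient with respect to x[q]
H_numeric <- sapply(seq_along(x), function(q) {
  e_q <- replace(numeric(length(x)), q, h)
  (dpseudohuber_median_dx(x + e_q, d = 0.1) -
     dpseudohuber_median_dx(x - e_q, d = 0.1)) / (2 * h)
})
max(abs(H_analytic - H_numeric)) # should be near zero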