From 05c937172111c47e1a36125ad66e55250d8c1124 Mon Sep 17 00:00:00 2001 From: amy Date: Wed, 19 Nov 2025 16:48:08 -0800 Subject: [PATCH 1/7] gradient descent + line search, sigh --- R/f_info.R | 106 +++--- R/fit_null.R | 350 +++++++++--------- R/fit_null_discrete.R | 130 +++++++ ...t_null_discrete_grad_descent_line_search.R | 264 +++++++++++++ 4 files changed, 615 insertions(+), 235 deletions(-) create mode 100644 R/fit_null_discrete.R create mode 100644 R/fit_null_discrete_grad_descent_line_search.R diff --git a/R/f_info.R b/R/f_info.R index 5876f57..1696056 100644 --- a/R/f_info.R +++ b/R/f_info.R @@ -1,69 +1,69 @@ #' @importFrom methods as # fisher information for multinomial model -f_info <- function(Y, - B_cup, - B, - X, - X_cup, - omit = NULL, - compute_together = FALSE, - return_separate = FALSE){ +f_info <- function( + Y, + B_cup, + B, + X, + X_cup, + omit = NULL, + compute_together = FALSE, + return_separate = FALSE +) { n <- nrow(Y) J <- ncol(Y) p <- ncol(X) - if(is.null(omit)){ - f_info <- matrix(0,ncol = p*J,nrow = p*J) - } else{ - if(compute_together){ - f_info <- matrix(0,ncol = p*(J - 1), nrow = p*(J - 1)) + if (is.null(omit)) { + f_info <- matrix(0, ncol = p * J, nrow = p * J) + } else { + if (compute_together) { + f_info <- matrix(0, ncol = p * (J - 1), nrow = p * (J - 1)) } } - z <- update_z(Matrix::Matrix(Y),Matrix::Matrix(X),B) - - - - if(compute_together){ #older, slower way of computing info - #used only to test equality with newer way - ident_mat <- Matrix::Diagonal(x = rep(1,J)) - one_mat <- Matrix::Matrix(1,nrow = J,ncol=1) - for(i in 1:n){ - which_rows <- 1:J + (i - 1)*J - mu_i <- exp(X_cup[which_rows,]%*% B_cup + z[i]) - p_i <- mu_i/sum(mu_i) - - M_iX_i <- (ident_mat - Matrix::tcrossprod(one_mat,p_i))%*%X_cup[which_rows,] - - - f_info <- f_info + Matrix::crossprod(M_iX_i,Matrix::Diagonal(x= as.numeric(mu_i)))%*%M_iX_i - } - } else{ #newer, faster way to compute info - mus <- exp(X_cup%*%B_cup + rep(z,each = J)) + z <- update_z(Matrix::Matrix(Y), Matrix::Matrix(X), B) + + if (compute_together) { + #older, slower way of computing info + #used only to test equality with newer way + ident_mat <- Matrix::Diagonal(x = rep(1, J)) + one_mat <- Matrix::Matrix(1, nrow = J, ncol = 1) + for (i in 1:n) { + which_rows <- 1:J + (i - 1) * J + mu_i <- exp(X_cup[which_rows, ] %*% B_cup + z[i]) + p_i <- mu_i / sum(mu_i) + + M_iX_i <- (ident_mat - Matrix::tcrossprod(one_mat, p_i)) %*% + X_cup[which_rows, ] + + f_info <- f_info + + Matrix::crossprod(M_iX_i, Matrix::Diagonal(x = as.numeric(mu_i))) %*% + M_iX_i + } + } else { + #newer, faster way to compute info + mus <- exp(X_cup %*% B_cup + rep(z, each = J)) mu_diag <- Matrix::Diagonal(x = mus@x) - ps <- lapply(1:n, - function(i){ - p_i <- mus@x[(i-1)*J +1:J]; - return(p_i/sum(p_i))}) - ps_mat <- Matrix::sparseMatrix(i = 1:(n*J), - j = rep(1:n,each = J), - x = do.call(c,ps)) - X_cup_p <- Matrix::crossprod(X_cup,ps_mat) + ps <- lapply(1:n, function(i) { + p_i <- mus@x[(i - 1) * J + 1:J] + return(p_i / sum(p_i)) + }) + ps_mat <- Matrix::sparseMatrix( + i = 1:(n * J), + j = rep(1:n, each = J), + x = do.call(c, ps) + ) + X_cup_p <- Matrix::crossprod(X_cup, ps_mat) X_cup_p <- as.matrix(X_cup_p) - X_cup_p <- methods::as(X_cup_p,"sparseMatrix") + X_cup_p <- methods::as(X_cup_p, "sparseMatrix") sum_diag <- Matrix::Diagonal(x = rowSums(Y)) - f_info <- Matrix::crossprod(X_cup,mu_diag)%*%X_cup - - X_cup_p%*% - Matrix::tcrossprod(sum_diag,X_cup_p) - - - - } - - + f_info <- Matrix::crossprod(X_cup, mu_diag) %*% + X_cup - + X_cup_p %*% + 
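+    # (equivalently, with P = ps_mat the (n*J) x n matrix of per-sample
+    #  category probabilities: I(B) = X_cup' diag(mu) X_cup
+    #    - X_cup' P diag(rowSums(Y)) P' X_cup)
+    # Hypothetical equality check against the slow branch above, assuming
+    # B_cup is the vectorization of B matching X_cup's row order (not run):
+    # all.equal(as.matrix(f_info(Y, B_cup, B, X, X_cup, compute_together = TRUE)),
+    #           as.matrix(f_info(Y, B_cup, B, X, X_cup)))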
Matrix::tcrossprod(sum_diag, X_cup_p) + } return(f_info) - - } diff --git a/R/fit_null.R b/R/fit_null.R index b6f6b65..9957347 100644 --- a/R/fit_null.R +++ b/R/fit_null.R @@ -1,5 +1,5 @@ #' fits model with B_kj constrained to equal g(B_k) for constraint fn g -#' +#' #' @param B description #' @param Y Y (with augmentations) #' @param X design matrix @@ -23,102 +23,98 @@ #' @param trackB track value of beta across iterations and return? #' @param ignore_stop whether to ignore stopping criteria and run `maxit` iterations (could be helpful for diagnostic plots). #' @param null_diagnostic_plots logical: should diagnostic plots be made for estimation under the null hypothesis? Default is \code{FALSE}. -#' +#' #' @return A list containing elements `B`, `k_constr`, `j_constr`, `niter` -#' `gap`, `u`, `rho`, and `Bs`. `B` is a matrix containing parameter estimates +#' `gap`, `u`, `rho`, and `Bs`. `B` is a matrix containing parameter estimates #' under the null (obtained by maximum likelihood on augmented observations Y), -#' `k_constr`, and `j_constr` give row and column indexes of the parameter -#' fixed to be equal to the constraint function \eqn{g()} under the null. `niter` is a -#' scalar giving total number of outer iterations used to fit the null model, -#' `gap` gives the final value of \eqn{g(B_{k constr}) - B_{k constr, j constr}}, -#' `u` and `rho` are final values of augmented Lagrangian parameters, and -#' `Bs` is a data frame containing values of B by iteration if `trackB` was set +#' `k_constr`, and `j_constr` give row and column indexes of the parameter +#' fixed to be equal to the constraint function \eqn{g()} under the null. `niter` is a +#' scalar giving total number of outer iterations used to fit the null model, +#' `gap` gives the final value of \eqn{g(B_{k constr}) - B_{k constr, j constr}}, +#' `u` and `rho` are final values of augmented Lagrangian parameters, and +#' `Bs` is a data frame containing values of B by iteration if `trackB` was set #' equal to TRUE (otherwise it contains a NULL value). -#' -fit_null <- function(B, - Y, - X, - X_cup = NULL, - k_constr, - j_constr, - j_ref, - constraint_fn, - constraint_grad_fn, - rho_init = 1, - tau = 1.2, - kappa = 0.8, - B_tol = 1e-2, - inner_tol = 0.01, - constraint_tol = 1e-4, - max_step = 5, - c1 = 1e-4, - maxit = 1000, - inner_maxit = 25, - verbose = FALSE, - trackB = FALSE, - ignore_stop = FALSE, - null_diagnostic_plots = FALSE +#' +fit_null <- function( + B, + Y, + X, + X_cup = NULL, + k_constr, + j_constr, + j_ref, + constraint_fn, + constraint_grad_fn, + rho_init = 1, + tau = 1.2, + kappa = 0.8, + B_tol = 1e-2, + inner_tol = 0.01, + constraint_tol = 1e-4, + max_step = 5, + c1 = 1e-4, + maxit = 1000, + inner_maxit = 25, + verbose = FALSE, + trackB = FALSE, + ignore_stop = FALSE, + null_diagnostic_plots = FALSE ) { - J <- ncol(Y) n <- nrow(Y) p <- ncol(X) - - it_df <- data.frame(it = 1:maxit, - lik = NA, - max_abs_B = NA, - constraint_diff = NA) + + it_df <- data.frame( + it = 1:maxit, + lik = NA, + max_abs_B = NA, + constraint_diff = NA + ) if (null_diagnostic_plots) { it_df$test_stat <- NA } - - #store cols of B to update in a vector - j_to_update <- (1:J)[(1:J) != j_ref] + #in vector format, which indexes to remove (bc B^j_constr = 0 id. constr.) - indexes_to_remove <- (j_ref - 1)*p + 1:p - - + indexes_to_remove <- (j_ref - 1) * p + 1:p + #change id. constr. by subtracting approp. 
col of B from others - for(k in 1:p) { - B[k,] <- B[k,] - B[k,j_ref] + for (k in 1:p) { + B[k, ] <- B[k, ] - B[k, j_ref] } - + #update z z <- update_z(X = X, Y = Y, B = B) - - log_mean <- X%*%B + - matrix(z,ncol = 1)%*%matrix(1,ncol = J, nrow = 1) - - ll <- sum(log_mean*Y - exp(log_mean)) - + + log_mean <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + + ll <- sum(log_mean * Y - exp(log_mean)) + if (trackB) { - Bs <- data.frame(k = rep(1:p,each = J), - j = rep(1:J,p), - B = NA) - } else{ - Bs <- NULL + Bs <- data.frame(k = rep(1:p, each = J), j = rep(1:J, p), B = NA) + } else { + Bs <- NULL } - + #get X_cup for later use if (is.null(X_cup)) { - X_cup = X_cup_from_X(X,J) + X_cup = X_cup_from_X(X, J) } - + #set iteration to zero iter <- 0 - + converged <- FALSE - + #compute gap (i.e. g(B_k) - B_kj) - gap <- constraint_fn[[k_constr]](B[k_constr,]) - B[k_constr,j_constr] + gap <- constraint_fn[[k_constr]](B[k_constr, ]) - B[k_constr, j_constr] init_gap <- abs(gap) - + #set rho equal to initial value rho <- rho_init - + #initiate u - u <- rho*gap - + u <- rho * gap + if (trackB) { Bs$iter <- 0 Bs$inner_iter <- 0 @@ -126,179 +122,169 @@ fit_null <- function(B, Bs$u <- u Bs$gap <- gap - # Bs$score_stat <- 0 + # Bs$score_stat <- 0 lilBs <- Bs - # - for(k in 1:p) { - Bs$B[1:J + J*(k - 1)] <- B[k,] + # + for (k in 1:p) { + Bs$B[1:J + J * (k - 1)] <- B[k, ] } } - - loop_j_to_update <- j_to_update - + use_max_inner_it <- FALSE B_diff <- Inf - + criteria <- TRUE - - while(criteria) { - - iter <- iter + 1 #increment iteration - inner_iter <- 0 #initiate internal iteration - + + while (criteria) { + iter <- iter + 1 #increment iteration + inner_iter <- 0 #initiate internal iteration + #evaluate augmented Lagrangian - + # get current value of augmented lagrangian - curr_lag_val <- -ll + u*gap + (rho/2)*(gap^2) - + curr_lag_val <- -ll + u * gap + (rho / 2) * (gap^2) + old_gap <- gap # old value of g(B_k) - B_kj old_B <- B #old value of B inner_diff <- Inf #initiate "observed" max abs val diff in B between inner iterations - #at Inf so inner loop runs at least once + #at Inf so inner loop runs at least once #inner loop: - while(((inner_diff > inner_tol | use_max_inner_it) & inner_iter <= inner_maxit)| inner_iter == 0) { - + while ( + ((inner_diff > inner_tol | use_max_inner_it) & + inner_iter <= inner_maxit) | + inner_iter == 0 + ) { inner_old_B <- B - + #perform block update to B as described in null estimation section of Clausen & Willis (2024) - update <- macro_fisher_null(X = X, - Y = Y, - B = B, - z = z, - J = J, - p = p, - k_constr = k_constr, - j_constr = j_constr, - j_ref = j_ref, - rho = rho, - u = u, - max_step = max_step, - constraint_fn = constraint_fn, - constraint_grad_fn = constraint_grad_fn, - verbose = verbose, - c1 = 1e-4) + update <- macro_fisher_null( + X = X, + Y = Y, + B = B, + z = z, + J = J, + p = p, + k_constr = k_constr, + j_constr = j_constr, + j_ref = j_ref, + rho = rho, + u = u, + max_step = max_step, + constraint_fn = constraint_fn, + constraint_grad_fn = constraint_grad_fn, + verbose = verbose, + c1 = 1e-4 + ) + print(update) B <- B + update$update gap <- update$gap - z <- update_z(Y,X,B) - + z <- update_z(Y, X, B) + if (trackB) { - # - # ### AW notes 1/15/24 that dlag_lamhat_dB is not computed in this function. 
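+        # (temp_Bs snapshots the full B matrix at each inner step, along
+        #  with the iteration counters and the current augmented-Lagrangian
+        #  state rho, u, and gap)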
- # temp_Bs <- lilBs - for(k in 1:p) { - temp_Bs$B[1:J + J*(k - 1)] <- B[k,] + for (k in 1:p) { + temp_Bs$B[1:J + J * (k - 1)] <- B[k, ] } temp_Bs$iter <- iter temp_Bs$inner_iter <- inner_iter temp_Bs$rho <- rho temp_Bs$u <- u - # temp_Bs$Q <- NA temp_Bs$gap <- gap - # if (iter > 1 | inner_iter > 1) { - # jcounter <- 1 - # for(j in 1:J) { - # if (j == j_constr) { - # temp_Bs$dlag_dB[temp_Bs$j ==j] <- 0 - # } else{ - # - # ### maybe in - # - # # dlag_lamhat_dB <- dldB + lambda_hat*dgdB - # # temp_Bs$dlag_dB[temp_Bs$j ==j] <- dlag_lamhat_dB[(jcounter - 1)*p + 1:p,] - # # jcounter <- jcounter + 1 - # } - # } - # } - # - Bs <- rbind(Bs,temp_Bs) - # + Bs <- rbind(Bs, temp_Bs) } - + #did we move much? inner_diff <- max(abs(B - inner_old_B)) - + #increment inner iteration counter inner_iter <- inner_iter + 1 - } #did we move much since last *outer* loop iteration? B_diff <- max(abs(B - old_B)) - + if (verbose) { - message("Max absolute difference in B since last augmented Lagrangian outer step: ", signif (B_diff,3)) + message( + "Max absolute difference in B since last augmented Lagrangian outer step: ", + signif(B_diff, 3) + ) } - + #update u and rho - if ( abs(gap) > constraint_tol) { - u <- u + rho*gap - if (abs(gap/old_gap) > kappa) { - rho <- rho*tau - } + if (abs(gap) > constraint_tol) { + u <- u + rho * gap + if (abs(gap / old_gap) > kappa) { + rho <- rho * tau + } } - + if (verbose) { - message("Estimate deviates from feasibility by ",signif (gap,3)) - message("Parameter u set to ", signif (u,3)) - message("Parameter rho set to ", signif (rho,3)) + message("Estimate deviates from feasibility by ", signif(gap, 3)) + message("Parameter u set to ", signif(u, 3)) + message("Parameter rho set to ", signif(rho, 3)) } - + if (abs(gap) < constraint_tol) { use_max_inner_it <- TRUE - # message("Score stat: ",signif (check$score_stat*(n/(n - 1)),3)) - # message("Influence stat: ",signif (check$influence_stat,3)) - # proj_score <- as.numeric(as.matrix(check$influence_stat)) - } else{ + } else { use_max_inner_it <- FALSE } - + if (ignore_stop) { criteria <- (iter < maxit & !is.infinite(rho)) } else { - criteria <- (abs(gap) > constraint_tol | B_diff> B_tol) & iter < maxit & !is.infinite(rho) + criteria <- (abs(gap) > constraint_tol | B_diff > B_tol) & + iter < maxit & + !is.infinite(rho) } - - log_means <- X %*% B + - matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) - + + log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + ll <- sum(Y * log_means - exp(log_means)) it_df$lik[iter] <- ll it_df$max_abs_B[iter] <- B_diff it_df$constraint_diff[iter] <- abs(gap) - + if (verbose) { - message("Log likelihood: ", signif (ll,3)) + message("Log likelihood: ", signif(ll, 3)) } - + if (null_diagnostic_plots) { - score_res <- try(get_score_stat(X = X, Y = Y, X_cup = X_cup, B = B, k_constr = k_constr, - j_constr = j_constr, constraint_grad_fn = constraint_grad_fn, - indexes_to_remove = (j_ref - 1)*p + 1:p, j_ref = j_ref, J = J, - n = n, p = p)) + score_res <- try(get_score_stat( + X = X, + Y = Y, + X_cup = X_cup, + B = B, + k_constr = k_constr, + j_constr = j_constr, + constraint_grad_fn = constraint_grad_fn, + indexes_to_remove = (j_ref - 1) * p + 1:p, + j_ref = j_ref, + J = J, + n = n, + p = p + )) if (!inherits(score_res, "try-error")) { it_df$test_stat[iter] <- score_res$score_stat } } - + if (abs(gap) <= constraint_tol & B_diff <= B_tol) { converged <- TRUE } - } - + it_df <- it_df[1:iter, ] - - return(list("B" = B, - "k_constr" = k_constr, - "j_constr" = j_constr, - # 
"score_stats" = list(score_stats), - # "proj_score" = proj_score, - "niter" = iter, - "gap" = gap, - "u" = u, - "rho" = rho, - "Bs" = Bs, - "it_df" = it_df, - "converged" = converged)) -} + return(list( + "B" = B, + "k_constr" = k_constr, + "j_constr" = j_constr, + "niter" = iter, + "gap" = gap, + "u" = u, + "rho" = rho, + "Bs" = Bs, + "it_df" = it_df, + "converged" = converged + )) +} diff --git a/R/fit_null_discrete.R b/R/fit_null_discrete.R new file mode 100644 index 0000000..a488c70 --- /dev/null +++ b/R/fit_null_discrete.R @@ -0,0 +1,130 @@ +# fit_null_discrete <- function( +# B, +# Y, +# X, +# k_constr, +# j_constr, +# j_ref, +# constraint_fn, +# constraint_grad_fn +# ) { +# +# if (is.null(j_ref)) { +# j_ref <- ifelse(j_constr == 1, 2, 1) +# } +# +# n <- nrow(Y) +# J <- ncol(Y) +# p <- ncol(X) +# +# distinct_xx <- unique(X) +# +# # fitted values eta = X %*% beta +# pihats <- matrix(NA, nrow = p, ncol = J) # p x J +# etahats <- matrix(NA, nrow = p, ncol = J) # p x J +# etahats[, j_ref] <- 0 +# +# groups <- split( +# seq_len(nrow(X)), # row indices of original data +# apply(X, 1, function(r) +# paste(r, collapse = "_")) # grouping key by row contents +# ) +# groups <- groups[order(sapply(groups, min))] +# +# for (the_cat in 1:length(groups)) { +# the_xs <- groups[[the_cat]] +# totals <- apply(Y[the_xs, , drop = FALSE], 2, sum) +# pihats[the_cat, ] <- totals / sum(totals) +# etahats[the_cat, setdiff(1:J, j_ref)] <- log(pihats[-j_ref] / pihats[j_ref]) +# } +# +# +# +# +# # beta = X^{-1} %*% eta +# betahats <- round(MASS::ginv(distinct_xx), 8) %*% etahats +# betahats +# +# } +# +# +# +# set.seed(1) +# n <- 10 +# J <- 5 +# p <- 2 +# beta <- matrix(rnorm(p*J, sd=5), ncol = J) +# X <- cbind(1, c(rep(0, n/2), rep(1, n/2))) +# Y <- matrix(rpois(n=n*J, lambda=exp(5 + X %*% beta)), ncol = J) +# Y +# +# # woah, that takes ages +# fit1 <- radEmu::emuFit(Y = Y, X = X, test_kj=data.frame("k" = 2, "j" = 3)) +# fit1 +# fit1 %>% names +# fit1$estimation_converged +# fit1$score_test_hyperparams +# fit1 +# +# fit2 <- radEmu::emuFit(Y = Y, X = X, test_kj=data.frame("k" = 2, "j" = 3), control=list("trackB" = TRUE), refit=F, fitted_model=fit1) +# fit2 +# fit2 %>% names +# library(tidyverse) +# final_bs <- fit2$trackB_list[[1]] %>% tibble %>% tail(p*J) +# final_bs %>% +# signif(3) +# fit2$coef +# +# +# final_bs %>% +# signif(3) +# Y +# X +# final_bs %>% pull(B) %>% matrix(nrow = 2, byrow=T) +# +# +# #### attempt 2 +# set.seed(1) +# n <- 10 +# J <- 5 +# p <- 2 +# beta <- matrix(rnorm(p*J, sd=2), ncol = J) +# X <- cbind(1, c(rep(0, n/2), rep(1, n/2))) +# Y <- matrix(rpois(n=n*J, lambda=exp(5 + X %*% beta)), ncol = J) +# Y +# +# fit3 <- radEmu::emuFit(Y = Y, X = X, test_kj=data.frame("k" = 2, "j" = 4), control=list("trackB" = TRUE)) +# fit3 +# fit3$trackB_list[[1]] %>% tibble %>% tail(p*J) +# +# fit3$trackB_list[[1]] %>% tibble %>% tail(J) %>% pull(B) %>% psuedohuber_median +# fit3$trackB_list[[1]] %>% tibble %>% tail(p*J) %>% head(J) %>% pull(B) %>% psuedohuber_median +# +# final_bs %>% +# signif(3) +# Y +# X +# final_bs %>% pull(B) %>% matrix(nrow = 2, byrow=T) +# +# +# fit3$B +# +# fn3b <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=4, j_ref=2, +# constraint_fn=list(pseudohuber_median, pseudohuber_median), +# constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx)) +# fn3a$B +# fn3b$B +# +# fn3a <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=4, j_ref=1, +# constraint_fn=list(pseudohuber_median, pseudohuber_median), +# 
constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), +# B_tol=1e-8, constraint_tol=1e-8) +# totest <- fn3a +# totest$B +# totest$B[totest$k_constr, setdiff(1:J, totest$j_constr)] %>% psuedohuber_median +# totest$B[totest$k_constr, totest$j_constr] +# +# +# +# aaa <- fit3$trackB_list[[1]] %>% tibble %>% tail(p*J) %>% tail(J) +# (aaa[c(1,2,3,5)] %>% psuedohuber_median) - aaa[4] diff --git a/R/fit_null_discrete_grad_descent_line_search.R b/R/fit_null_discrete_grad_descent_line_search.R new file mode 100644 index 0000000..de91b57 --- /dev/null +++ b/R/fit_null_discrete_grad_descent_line_search.R @@ -0,0 +1,264 @@ +my_gd_ls <- function(n0, n1, + g_beta, g_beta_grad, + eta_alpha = 1e-3, + eta_beta = 1e-3, + maxit = 1000, + tol = 1e-8, + ls_max = 30, # max backtracking steps + ls_c = 1e-4, # Armijo constant + ls_rho = 0.5) { # backtracking shrink factor + + K <- length(n0) + Km <- K - 1 + + if (K < 3) { + stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") + } + + # Parameters for categories 1..K-1 (baseline K is 0) + alpha <- rep(0, Km) + beta <- rep(0, Km) + + # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1}) + beta[1] <- g_beta(beta[2:Km]) + + n0p <- sum(n0) + n1p <- sum(n1) + + # Softmax probabilities for x = 0 and x = 1 (excluding baseline) + p0fun <- function(a) { + ea <- exp(a) + ea / (1 + sum(ea)) + } + p1fun <- function(a, b) { + eb <- exp(a + b) + eb / (1 + sum(eb)) + } + + # Log-likelihood given alpha, beta + loglik_fun <- function(alpha, beta) { + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + p0_full <- c(p0, 1 - sum(p0)) + p1_full <- c(p1, 1 - sum(p1)) + + # clip probs to avoid log(0) + eps <- 1e-12 + p0_full <- pmin(pmax(p0_full, eps), 1 - eps) + p1_full <- pmin(pmax(p1_full, eps), 1 - eps) + + sum(n0 * log(p0_full)) + sum(n1 * log(p1_full)) + } + + # ---- history tracking ---- + param_history <- list() + # iteration 0 (initial) + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = 0L, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = 0L, + type = "beta", + k = seq_len(Km), + value = beta) + ) + # --------------------------- + + # Initial log-likelihood + ll_old <- loglik_fun(alpha, beta) + + final_iter <- 0L + + for (iter in 1:maxit) { + + # Probabilities for non-baseline categories 1..K-1 + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + # Unconstrained gradients wrt alpha[1:Km] and beta[1:Km] + dlda <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1 + dldb <- n1[1:Km] - n1p * p1 + + # Gradient for beta free coordinates k = 2..Km (chain rule) + grad_g_beta <- g_beta_grad(beta[2:Km]) # length K-2 + grad_beta_free <- dldb[2:Km] + dldb[1] * grad_g_beta # length K-2 + + # Gradient norm for convergence check (use α and free β) + grad_norm <- sqrt(sum(dlda^2) + sum(grad_beta_free^2)) + if (grad_norm < tol) { + final_iter <- iter - 1L + break + } + + # Search directions (gradient ascent) + dir_alpha <- eta_alpha * dlda + dir_beta_free <- eta_beta * grad_beta_free + + # Directional magnitude for Armijo: + # D = η_alpha * ||grad_alpha||^2 + η_beta * ||grad_beta_free||^2 + D <- eta_alpha * sum(dlda^2) + eta_beta * sum(grad_beta_free^2) + if (D <= 0) { + final_iter <- iter - 1L + break # gradients essentially zero + } + + # Backtracking line search + step_scale <- 1.0 + ll_new <- ll_old + accepted <- FALSE + + for (ls_iter in 1:ls_max) { + + # Candidate alpha + alpha_new <- alpha + step_scale * dir_alpha + + # Candidate beta free coords + beta_free_new <- beta[2:Km] + 
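+      # (the free coordinates move first; beta_new[1] is then recomputed
+      #  from g_beta below, so the Armijo comparison is made on the
+      #  log-likelihood evaluated at the re-constrained beta)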
step_scale * dir_beta_free + + # Re-enforce beta constraint + beta_new <- beta + beta_new[2:Km] <- beta_free_new + beta_new[1] <- g_beta(beta_new[2:Km]) + + ll_candidate <- loglik_fun(alpha_new, beta_new) + + # Armijo condition for maximization: + # ℓ(new) >= ℓ(old) + c * step_scale * D + if (ll_candidate >= ll_old + ls_c * step_scale * D) { + ll_new <- ll_candidate + alpha <- alpha_new + beta <- beta_new + accepted <- TRUE + break + } else { + step_scale <- step_scale * ls_rho + } + } + + # If no step accepted, treat as convergence/stall + if (!accepted) { + final_iter <- iter - 1L + break + } + + # record parameters *after* this accepted update + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = iter, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = iter, + type = "beta", + k = seq_len(Km), + value = beta) + ) + + # Monotone increase in log-likelihood at accepted steps + if (abs(ll_new - ll_old) < tol) { + ll_old <- ll_new + final_iter <- iter + break + } + + ll_old <- ll_new + final_iter <- iter + } + + # Bind history into a single data.frame + param_df <- do.call(rbind, param_history) + + list( + alpha = alpha, + beta = beta, + logLik = ll_old, + iter = final_iter, + history = param_df + ) +} + + + +out_test <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, + n1 = Y[which(X[,2] == 1), ] %>% colSums, + g_beta=pseudohuber_median_mod, + g_beta_grad= dpseudohuber_median_dx_mod, + eta_alpha = 1e-3, + eta_beta = 1e-3, + maxit = 1000, + tol = 1e-6) +out_test$alpha +out_test$beta +fn3_1_J$B # j_constr=1, j_ref=J + +n0s <- Y[which(X[,2] == 0), ] %>% colSums +n1s <- Y[which(X[,2] == 1), ] %>% colSums +n0s_mod <- n0s; n0s_mod[1] <- n0s[1] + n1s[1] + + +Y +logit <- function(x) log(x/(1-x)) +lg <- logit(n0s/sum(n0s)) +lg +lg - lg[J] + +logit(n0s_mod/sum(n0s_mod)) - logit(n0s_mod/sum(n0s_mod))[J] + + +pseudohuber_median_mod(out_test$alpha[-1]) +pseudohuber_median_mod(out_test$beta[-1]) + + + +out_test <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, + n1 = Y[which(X[,2] == 1), ] %>% colSums, + g_alpha=pseudohuber_median_mod, + g_alpha_grad=dpseudohuber_median_dx_mod, + g_beta=pseudohuber_median_mod, + g_beta_grad= dpseudohuber_median_dx_mod, + eta_alpha = 1e-3, + eta_beta = 1e-3, + maxit = 1000, + tol = 1e-6) +out_test +fn3_1_J$B + +pseudohuber_median_mod(out_test$alpha[-1]) +pseudohuber_median_mod(out_test$beta[-1]) + + + + +fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, + constraint_fn=list(pseudohuber_median, pseudohuber_median), + constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), + B_tol=1e-8, constraint_tol=1e-8) + +fn3_1_J <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, + constraint_fn=list(pseudohuber_median, pseudohuber_median), + constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), + B_tol=1e-8, constraint_tol=1e-8) +# fn3_1_J$B +# +# fn3_1_J$B[fn3_1_J$k_constr, setdiff(1:J, fn3_1_J$j_constr)] %>% psuedohuber_median +# fn3_1_J$B[fn3_1_J$k_constr, fn3_1_J$j_constr] +# +# fn3_1_J$B[1, ] %>% psuedohuber_median +# fn3_1_J$B[1, ] %>% dput +# fn3_1_J$B[1, setdiff(1:J, c(1, J))] %>% psuedohuber_median +# fn3_1_J$B[1, setdiff(1:J, J)] %>% psuedohuber_median +# fn3_1_J$B[1, setdiff(1:J, 1)] %>% psuedohuber_median +# fn3_1_J$B[1, c(1, J)] %>% psuedohuber_median +# +# fn3_1_J$B[1, 1:5] %>% psuedohuber_median +# fn3_1_J$B[1, c(2,3,4)] %>% psuedohuber_median +# +# +# fn3_1_J$B[1, c(1,5)] %>% psuedohuber_median +# +# +# fn3_1_J$B[1, 
fn3_1_J$j_constr] +# fn3_1_J$B +# +# fn3_1_J$B[1, ] %>% psuedohuber_median From efcf5dd45f7a55a250f0101a7f5a21ae4dfa3fbf Mon Sep 17 00:00:00 2001 From: amy Date: Thu, 20 Nov 2025 09:07:56 -0800 Subject: [PATCH 2/7] a few attempts --- R/fit_null.R | 2 +- ...t_null_discrete_grad_descent_line_search.R | 32 +--- R/fit_null_discrete_optim.R | 165 ++++++++++++++++++ 3 files changed, 174 insertions(+), 25 deletions(-) create mode 100644 R/fit_null_discrete_optim.R diff --git a/R/fit_null.R b/R/fit_null.R index 9957347..98b736c 100644 --- a/R/fit_null.R +++ b/R/fit_null.R @@ -175,7 +175,7 @@ fit_null <- function( verbose = verbose, c1 = 1e-4 ) - print(update) + B <- B + update$update gap <- update$gap z <- update_z(Y, X, B) diff --git a/R/fit_null_discrete_grad_descent_line_search.R b/R/fit_null_discrete_grad_descent_line_search.R index de91b57..0c23698 100644 --- a/R/fit_null_discrete_grad_descent_line_search.R +++ b/R/fit_null_discrete_grad_descent_line_search.R @@ -178,16 +178,20 @@ my_gd_ls <- function(n0, n1, ) } +fn3_1_J <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, + constraint_fn=list(pseudohuber_median, pseudohuber_median), + constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), + B_tol=1e-8, constraint_tol=1e-8) - -out_test <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, +out_test2 <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=pseudohuber_median_mod, - g_beta_grad= dpseudohuber_median_dx_mod, + g_beta=function(x) { pseudohuber_median(c(x, 0)) }, + g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, eta_alpha = 1e-3, eta_beta = 1e-3, maxit = 1000, tol = 1e-6) + out_test$alpha out_test$beta fn3_1_J$B # j_constr=1, j_ref=J @@ -210,17 +214,6 @@ pseudohuber_median_mod(out_test$alpha[-1]) pseudohuber_median_mod(out_test$beta[-1]) - -out_test <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_alpha=pseudohuber_median_mod, - g_alpha_grad=dpseudohuber_median_dx_mod, - g_beta=pseudohuber_median_mod, - g_beta_grad= dpseudohuber_median_dx_mod, - eta_alpha = 1e-3, - eta_beta = 1e-3, - maxit = 1000, - tol = 1e-6) out_test fn3_1_J$B @@ -230,15 +223,6 @@ pseudohuber_median_mod(out_test$beta[-1]) -fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, - constraint_fn=list(pseudohuber_median, pseudohuber_median), - constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - B_tol=1e-8, constraint_tol=1e-8) - -fn3_1_J <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, - constraint_fn=list(pseudohuber_median, pseudohuber_median), - constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - B_tol=1e-8, constraint_tol=1e-8) # fn3_1_J$B # # fn3_1_J$B[fn3_1_J$k_constr, setdiff(1:J, fn3_1_J$j_constr)] %>% psuedohuber_median diff --git a/R/fit_null_discrete_optim.R b/R/fit_null_discrete_optim.R new file mode 100644 index 0000000..b29c85f --- /dev/null +++ b/R/fit_null_discrete_optim.R @@ -0,0 +1,165 @@ +## n0: counts for x = 0, length J +## n1: counts for x = 1, length J +## gfun: function(beta_free2) -> scalar beta_1 +## beta_free2 is c(beta_2, ..., beta_{J-1}) +## gprimefun: function(beta_free2) -> numeric(J-2) +## gradient d beta_1 / d (beta_2,...,beta_{J-1}) + +loglik_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { + J <- length(n0) + Jm1 <- J - 1 + + ## Split parameters: + ## alpha_1,...,alpha_{J-1}, 
beta_2,...,beta_{J-1} + alpha <- par[1:Jm1] + beta_free2 <- par[(J):(2*J - 3)] # length J-2: beta_2..beta_{J-1} + + ## Constrained beta_1 via g + beta1 <- gfun(beta_free2) + + ## Full beta for categories 1..J-1 (baseline J has beta_J = 0) + beta_full <- numeric(Jm1) + beta_full[1] <- beta1 + beta_full[2:Jm1] <- beta_free2 + + ## Group sizes + N0 <- sum(n0) + N1 <- sum(n1) + + ## x = 0: eta0_j = alpha_j, eta0_J = 0 + eta0 <- alpha # length J-1 + m0 <- max(c(0, eta0)) + exp_eta0 <- exp(eta0 - m0) + denom0 <- exp(0 - m0) + sum(exp_eta0) + p0 <- exp_eta0 / denom0 # probs for j=1..J-1 + p0J <- exp(0 - m0) / denom0 # prob for baseline J + + ## x = 1: eta1_j = alpha_j + beta_j, eta1_J = 0 + eta1 <- alpha + beta_full + m1 <- max(c(0, eta1)) + exp_eta1 <- exp(eta1 - m1) + denom1 <- exp(0 - m1) + sum(exp_eta1) + p1 <- exp_eta1 / denom1 # probs for j=1..J-1 + p1J <- exp(0 - m1) / denom1 + + ## Log-likelihood: + ## ℓ = sum_j n0_j log p0_j + sum_j n1_j log p1_j + ## with p0_J, p1_J included for category J + ll0 <- sum(n0[1:Jm1] * log(p0)) + n0[J] * log(p0J) + ll1 <- sum(n1[1:Jm1] * log(p1)) + n1[J] * log(p1J) + + ll0 + ll1 +} + +grad_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { + J <- length(n0) + Jm1 <- J - 1 + + alpha <- par[1:Jm1] + beta_free2 <- par[(J):(2*J - 3)] + + beta1 <- gfun(beta_free2) + beta_full <- numeric(Jm1) + beta_full[1] <- beta1 + beta_full[2:Jm1] <- beta_free2 + + gprime <- gprimefun(beta_free2) # length J-2 + + N0 <- sum(n0) + N1 <- sum(n1) + + ## x = 0 + eta0 <- alpha + m0 <- max(c(0, eta0)) + exp_eta0 <- exp(eta0 - m0) + denom0 <- exp(0 - m0) + sum(exp_eta0) + p0 <- exp_eta0 / denom0 # j = 1..J-1 + p0J <- exp(0 - m0) / denom0 + + ## x = 1 + eta1 <- alpha + beta_full + m1 <- max(c(0, eta1)) + exp_eta1 <- exp(eta1 - m1) + denom1 <- exp(0 - m1) + sum(exp_eta1) + p1 <- exp_eta1 / denom1 # j = 1..J-1 + p1J <- exp(0 - m1) / denom1 + + ## Gradient w.r.t. 
alpha_j (j=1..J-1): + ## ∂ℓ/∂alpha_j = (n0_j - N0 p0_j) + (n1_j - N1 p1_j) + grad_alpha <- (n0[1:Jm1] - N0 * p0) + (n1[1:Jm1] - N1 * p1) + + ## Unconstrained slope scores: + ## ∂ℓ/∂beta_j = n1_j - N1 p1_j, j = 1..J-1 + score_beta_full <- n1[1:Jm1] - N1 * p1 + + ## ∂ℓ/∂beta_1: + S1 <- score_beta_full[1] + + ## For free beta_2..beta_{J-1}: + ## ∂ℓ/∂beta_j_free = (n1_j - N1 p1_j) + S1 * ∂beta1/∂beta_j + ## j = 2..J-1 + S_direct <- score_beta_full[2:Jm1] # length J-2 + grad_beta_free2 <- S_direct + S1 * gprime + + c(grad_alpha, grad_beta_free2) +} + +fit_multinom_constrained_counts <- function(n0, n1, gfun, gprimefun, + start = NULL) { + J <- length(n0) + if (length(n1) != J) stop("n0 and n1 must have the same length") + if (J < 3) stop("Need at least 3 categories (J >= 3).") + + Jm1 <- J - 1 + n_par <- (Jm1) + (Jm1 - 1) # alpha_1..alpha_{J-1}, beta_2..beta_{J-1} + + if (is.null(start)) { + start <- rep(0, n_par) + } + + fn <- function(par) { + -loglik_counts_constrained(par, n0, n1, gfun, gprimefun) # optim minimizes + } + + gr <- function(par) { + -grad_counts_constrained(par, n0, n1, gfun, gprimefun) + } + + opt <- optim(start, fn, gr, method = "BFGS", control = list(maxit = 100)) + + par_hat <- opt$par + + ## Reconstruct alpha_1..alpha_J and beta_1..beta_J + alpha_hat <- par_hat[1:Jm1] + beta_free2_hat <- par_hat[(J):(2*J - 3)] + beta1_hat <- gfun(beta_free2_hat) + + beta_hat <- numeric(J) + beta_hat[1] <- beta1_hat + beta_hat[2:Jm1] <- beta_free2_hat + beta_hat[J] <- 0 + + alpha_hat_full <- numeric(J) + alpha_hat_full[1:Jm1] <- alpha_hat + alpha_hat_full[J] <- 0 + + list( + alpha = alpha_hat_full, + beta = beta_hat, + logLik = -opt$value, + convergence = opt$convergence, + optim = opt + ) +} + +# system.time({ +# out_test3 <- +fit_multinom_constrained_counts(n0=Y[which(X[,2] == 0), ] %>% colSums, + n1 = Y[which(X[,2] == 1), ] %>% colSums, + gfun=function(x) { pseudohuber_median(c(x, 0)) }, + gprimefun= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} +) +# }) +out_test2$alpha +out_test2$beta +out_test3 From 1cd6157f60e88a0ac0d2f1f4e3b0e18eb7fad325 Mon Sep 17 00:00:00 2001 From: amy Date: Thu, 20 Nov 2025 12:02:45 -0800 Subject: [PATCH 3/7] add rough preliminary test --- R/dpseudohuber_median_d2x.R | 63 ++++ R/fit_null_discrete.R | 114 +++--- R/fit_null_discrete_fisher_scoring.R | 214 ++++++++++++ R/fit_null_discrete_fisher_scoring_stable.R | 240 +++++++++++++ ...t_null_discrete_grad_descent_line_search.R | 94 ++--- R/fit_null_discrete_optim.R | 330 +++++++++--------- test_new_discrete.R | 189 ++++++++++ 7 files changed, 972 insertions(+), 272 deletions(-) create mode 100644 R/dpseudohuber_median_d2x.R create mode 100644 R/fit_null_discrete_fisher_scoring.R create mode 100644 R/fit_null_discrete_fisher_scoring_stable.R create mode 100644 test_new_discrete.R diff --git a/R/dpseudohuber_median_d2x.R b/R/dpseudohuber_median_d2x.R new file mode 100644 index 0000000..f859595 --- /dev/null +++ b/R/dpseudohuber_median_d2x.R @@ -0,0 +1,63 @@ +#' Calculate the second derivative of the pseudo-Huber smoothed median +#' +#' @param x A vector to calculate the derivative of the pseudo-Huber smoothed median for. +#' @param d Smoothing parameter, by default set to \code{0.1}. As \code{d} approaches +#' \code{0} the pseudo-Huber median function approaches the median and as \code{d} approaches +#' infinity this function approaches the mean. 
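+#'
+#' @details Returns the full Hessian: a dense \code{length(x)} by
+#'   \code{length(x)} matrix whose \code{(p, q)} entry is the second partial
+#'   derivative of the pseudo-Huber median with respect to \code{x[p]} and
+#'   \code{x[q]}.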
+#' @param na.rm Passed to \code{pseudohuber_median}, default is FALSE, if FALSE then when +#' \code{x} includes at least one NA value then NA is returned, if TRUE then when \code{x} +#' includes at least one NA value then that value is removed and the pseudo-Huber median +#' is computed without it. +#' +#' @return The second derivative of the calculated pseudo-Huber smoothed median over \code{x} with smoothing +#' parameter \code{d}. +#' +#' @note algebra derived by ChatGPT based on our supp +#' +#' @examples +#' dpseudohuber_median_dx(x = rnorm(10), d = 0.1) +#' +#' @aliases dpsuedohuber_median_dx +#' +#' @export +#' +dpseudohuber_median_d2x <- function(x, + d = 0.1, + na.rm = FALSE) { + # pseudo-Huber center + ps_center <- pseudohuber_median(x, d, na.rm = na.rm) + + # basic quantities + y <- (x - ps_center) / d + w <- 1 / sqrt(1 + y^2) # w_j + w3 <- w^3 # a_j = w_j^3 + S <- sum(w3, na.rm = na.rm) # S = sum_j w_j^3 + + # t_j = (x_j - c*)/d^2 * w_j^5 + v <- (x - ps_center) / d^2 + w5 <- w^5 + t <- v * w5 # t_j + T <- sum(t, na.rm = na.rm) # T = sum_j t_j + + n <- length(x) + H <- matrix(NA_real_, n, n) + + # Hessian entries: + # H_{pq} = -3/S * [ t_p (delta_{pq} - w3_q/S) - (t_q - w3_q*T/S) * w3_p/S ] + for (p in seq_len(n)) { + a_p <- w3[p] + t_p <- t[p] + for (q in seq_len(n)) { + a_q <- w3[q] + t_q <- t[q] + delta_pq <- as.numeric(p == q) + + H[p, q] <- -(3 / S) * ( + t_p * (delta_pq - a_q / S) - + (t_q - a_q * T / S) * a_p / S + ) + } + } + + H +} \ No newline at end of file diff --git a/R/fit_null_discrete.R b/R/fit_null_discrete.R index a488c70..c1750ac 100644 --- a/R/fit_null_discrete.R +++ b/R/fit_null_discrete.R @@ -1,51 +1,69 @@ -# fit_null_discrete <- function( -# B, -# Y, -# X, -# k_constr, -# j_constr, -# j_ref, -# constraint_fn, -# constraint_grad_fn -# ) { -# -# if (is.null(j_ref)) { -# j_ref <- ifelse(j_constr == 1, 2, 1) -# } -# -# n <- nrow(Y) -# J <- ncol(Y) -# p <- ncol(X) -# -# distinct_xx <- unique(X) -# -# # fitted values eta = X %*% beta -# pihats <- matrix(NA, nrow = p, ncol = J) # p x J -# etahats <- matrix(NA, nrow = p, ncol = J) # p x J -# etahats[, j_ref] <- 0 -# -# groups <- split( -# seq_len(nrow(X)), # row indices of original data -# apply(X, 1, function(r) -# paste(r, collapse = "_")) # grouping key by row contents -# ) -# groups <- groups[order(sapply(groups, min))] -# -# for (the_cat in 1:length(groups)) { -# the_xs <- groups[[the_cat]] -# totals <- apply(Y[the_xs, , drop = FALSE], 2, sum) -# pihats[the_cat, ] <- totals / sum(totals) -# etahats[the_cat, setdiff(1:J, j_ref)] <- log(pihats[-j_ref] / pihats[j_ref]) -# } -# -# -# -# -# # beta = X^{-1} %*% eta -# betahats <- round(MASS::ginv(distinct_xx), 8) %*% etahats -# betahats -# -# } +fit_null_discrete_pseudohuber <- function( + B, + Y, + X, + k_constr, + j_constr, + j_ref +) { + + if (is.null(j_ref)) { + j_ref <- ifelse(j_constr == 1, 2, 1) + } + + n <- nrow(Y) + J <- ncol(Y) + p <- ncol(X) + + distinct_xx <- unique(X) + + ## Currently this works for + #### two groups with X's (1, 0) and (1, 1) + #### testing the first column, last column also constrained + + #### it is equivalent to fit_null with k_constr=2, j_constr=1, j_ref=J, + + ## It needs to be generalized to consider + #### multiple categories (can we assume identity-like then back transform? is this constraint and likelihood-preserving?) 
+ #### different columns for testing + #### the same inputs as fit_null and comparable convergence statistics + + out <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, + n1 = Y[which(X[,2] == 1), ] %>% colSums, + g_beta=function(x) { pseudohuber_median(c(x, 0)) }, + g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, + maxit = 1e8, + tol = 1e-10) + + + + B <- cbind(rbind(out$alpha, out$beta), 0) + z <- update_z(Y, X, B) + log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + ll_new <- sum(Y * log_means - exp(log_means)) + + # beta = X^{-1} %*% eta + betahats <- round(MASS::ginv(distinct_xx), 8) %*% etahats + betahats + + + it_df <- it_df[1:iter, ] + + + return(list( + "B" = B, + "k_constr" = k_constr, + "j_constr" = j_constr, + "niter" = iter, + "gap" = gap, + "u" = u, + "rho" = rho, + "Bs" = Bs, + "it_df" = it_df, + "converged" = converged + )) + +} # # # diff --git a/R/fit_null_discrete_fisher_scoring.R b/R/fit_null_discrete_fisher_scoring.R new file mode 100644 index 0000000..1666255 --- /dev/null +++ b/R/fit_null_discrete_fisher_scoring.R @@ -0,0 +1,214 @@ +my_gd_fs <- function(n0, n1, + g_beta, g_beta_grad, + eta_alpha = 1e-3, # kept for signature; not used + eta_beta = 1e-3, # kept for signature; not used + maxit = 1000, + B_tol = 1e-8, + ls_max = 20, # step-halving max + ls_rho = 0.5) { # step shrink factor + + K <- length(n0) + Km <- K - 1 # m + + if (K < 3) { + stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") + } + + # Parameters: categories 1..K-1 (baseline K is 0) + alpha <- rep(0, Km) + beta <- rep(0, Km) + + # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1}) + beta[1] <- g_beta(beta[2:Km]) + + n0p <- sum(n0) + n1p <- sum(n1) + + # Softmax probabilities for x = 0 and x = 1 (excluding baseline) + p0fun <- function(a) { + ea <- exp(a) + ea / (1 + sum(ea)) + } + p1fun <- function(a, b) { + eb <- exp(a + b) + eb / (1 + sum(eb)) + } + + # Log-likelihood given alpha, beta + loglik_fun <- function(alpha, beta) { + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + p0_full <- c(p0, 1 - sum(p0)) + p1_full <- c(p1, 1 - sum(p1)) + + eps <- 1e-12 + p0_full <- pmin(pmax(p0_full, eps), 1 - eps) + p1_full <- pmin(pmax(p1_full, eps), 1 - eps) + + sum(n0 * log(p0_full)) + sum(n1 * log(p1_full)) + } + + # ---- history tracking ---- + param_history <- list() + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = 0L, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = 0L, + type = "beta", + k = seq_len(Km), + value = beta) + ) + # --------------------------- + + ll_old <- loglik_fun(alpha, beta) + final_iter <- 0L + + for (iter in 1:maxit) { + + # Probabilities for non-baseline categories 1..K-1 + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + # ----- SCORES (gradients) ----- + # Unconstrained score wrt alpha[1:Km] and beta[1:Km] + s_alpha_full <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1 + s_beta_full <- n1[1:Km] - n1p * p1 + + # Beta free gradient via chain rule (k = 2..Km) + grad_g_beta <- g_beta_grad(beta[2:Km]) # length Km-1 + s_beta_free <- s_beta_full[2:Km] + s_beta_full[1] * grad_g_beta + + # Score vector for reduced parameters θ = (alpha[1:Km], beta_free[1:(Km-1)]) + score_theta <- c(s_alpha_full, s_beta_free) + + # Convergence: score norm + score_norm <- sqrt(sum(score_theta^2)) + # if (score_norm < B_tol) { + # final_iter <- iter - 1L + # break + # } + + # ----- FULL FISHER INFORMATION (α, β_full) ----- + # M0, M1 (m x m) + 
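+    # (diag(p) - p %*% t(p) is the covariance of a single multinomial draw
+    #  over the non-baseline categories, so the blocks assembled below are
+    #  the expected information for (alpha, beta))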
M0 <- diag(p0) - tcrossprod(p0) + M1 <- diag(p1) - tcrossprod(p1) + + Iaa <- n0p * M0 + n1p * M1 # alpha-alpha + Iab <- n1p * M1 # alpha-beta + Ibb <- n1p * M1 # beta-beta + + # Full J (2m x 2m) + J_top <- cbind(Iaa, Iab) + J_bottom <- cbind(t(Iab), Ibb) + J <- rbind(J_top, J_bottom) + + # ----- TRANSFORMATION T from θ to φ = (alpha, beta_full) ----- + m <- Km + p <- 2 * m - 1 # dim(θ) + q <- 2 * m # dim(φ) + + T <- matrix(0, nrow = q, ncol = p) + # alpha-part: identity + T[1:m, 1:m] <- diag(m) + + # beta-part: B (m x (m-1)) at rows (m+1)..(2m), cols (m+1)..(2m-1) + B <- matrix(0, nrow = m, ncol = m - 1) + B[1, ] <- grad_g_beta # row for beta1 + if (m > 1) { + B[2:m, ] <- diag(m - 1) # rows beta2..betam + } + T[(m + 1):(2 * m), (m + 1):p] <- B + + # Fisher info for θ + I_theta <- crossprod(T, J %*% T) # T^T J T + + # Add tiny ridge for numerical stability if needed + ridge <- 1e-8 + I_theta_r <- I_theta + ridge * diag(p) + + # Fisher scoring step: θ_new = θ_old + I^{-1} S + step_theta <- tryCatch( + solve(I_theta_r, score_theta), + error = function(e) { + # In case of numerical issues, break + rep(0, length(score_theta)) + } + ) + + # If step is essentially zero, we're done + # if (sqrt(sum(step_theta^2)) < B_tol) { + if (max(abs(step_theta)) < B_tol) { + final_iter <- iter - 1L + break + } + + # ----- Step-halving line search for robustness ----- + step_scale <- 1.0 + ll_new <- ll_old + accepted <- FALSE + + for (ls_iter in 1:ls_max) { + + theta_new <- c(alpha, beta[2:Km]) + step_scale * step_theta + alpha_new <- theta_new[1:Km] + beta_free_new <- theta_new[(Km + 1):(2 * Km - 1)] + + beta_new <- beta + beta_new[2:Km] <- beta_free_new + beta_new[1] <- g_beta(beta_new[2:Km]) + + print(cbind(alpha_new, beta_new)) + ll_candidate <- loglik_fun(alpha_new, beta_new) + print(ll_candidate) + + if (ll_candidate >= ll_old) { + alpha <- alpha_new + beta <- beta_new + ll_new <- ll_candidate + accepted <- TRUE + break + } else { + step_scale <- step_scale * ls_rho + } + } + + if (!accepted) { + final_iter <- iter - 1L + break + } + + # record parameters after accepted update + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = iter, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = iter, + type = "beta", + k = seq_len(Km), + value = beta) + ) + + if (abs(ll_new - ll_old) < B_tol) { + ll_old <- ll_new + final_iter <- iter + break + } + + ll_old <- ll_new + final_iter <- iter + } + + param_df <- do.call(rbind, param_history) + + list( + alpha = alpha, + beta = beta, + logLik = ll_old, + iter = final_iter, + history = param_df + ) +} diff --git a/R/fit_null_discrete_fisher_scoring_stable.R b/R/fit_null_discrete_fisher_scoring_stable.R new file mode 100644 index 0000000..25a9c33 --- /dev/null +++ b/R/fit_null_discrete_fisher_scoring_stable.R @@ -0,0 +1,240 @@ +## this is equivalent to fit_null with k_constr=2, j_constr=1, j_ref=J, +my_fs_stable_two_groups <- function(n0, + n1, + g_beta, + g_beta_grad, + maxit = 1000, + tol = 1e-8, + ls_max = 20, # max step-halving + ls_rho = 0.5) { # shrink factor for step-halving + + K <- length(n0) + Km <- K - 1 # m + + if (K < 3) { + stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") + } + + # Parameters: categories 1..K-1 (baseline K is 0) + alpha <- rep(0, Km) + beta <- rep(0, Km) + + # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1}) + beta[1] <- g_beta(beta[2:Km]) + + n0p <- sum(n0) + n1p <- sum(n1) + + # Softmax probabilities for x = 0 and x = 1 (excluding baseline) + p0fun <- 
function(a) { + ea <- exp(a) + ea / (1 + sum(ea)) + } + p1fun <- function(a, b) { + eb <- exp(a + b) + eb / (1 + sum(eb)) + } + + # Log-likelihood given alpha, beta + loglik_fun <- function(alpha, beta) { + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + p0_full <- c(p0, 1 - sum(p0)) + p1_full <- c(p1, 1 - sum(p1)) + + eps <- 1e-12 + p0_full <- pmin(pmax(p0_full, eps), 1 - eps) + p1_full <- pmin(pmax(p1_full, eps), 1 - eps) + + sum(n0 * log(p0_full)) + sum(n1 * log(p1_full)) + } + + # ---- history tracking ---- + param_history <- list() + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = 0L, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = 0L, + type = "beta", + k = seq_len(Km), + value = beta) + ) + # --------------------------- + + ll_old <- loglik_fun(alpha, beta) + final_iter <- 0L + + # tuning constants for stability + ridge_base <- 1e-4 # base ridge + max_step_norm <- 5 # max Euclidean norm of step in θ + clip_logit_max <- 15 # clip for alpha/beta components + + for (iter in 1:maxit) { + + # Probabilities for non-baseline categories 1..K-1 + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + # ----- SCORES (gradients) ----- + # Unconstrained score wrt alpha[1:Km] and beta[1:Km] + s_alpha_full <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1 + s_beta_full <- n1[1:Km] - n1p * p1 + + # Beta free gradient via chain rule (k = 2..Km) + grad_g_beta <- g_beta_grad(beta[2:Km]) # length Km-1 + s_beta_free <- s_beta_full[2:Km] + s_beta_full[1] * grad_g_beta + + # Score vector for reduced parameters θ = (alpha[1:Km], beta_free[1:(Km-1)]) + score_theta <- c(s_alpha_full, s_beta_free) + + # Convergence: score norm + score_norm <- sqrt(sum(score_theta^2)) + if (score_norm < tol) { + final_iter <- iter - 1L + break + } + + # ----- FULL FISHER INFORMATION (α, β_full) ----- + # M0, M1 (m x m) + M0 <- diag(p0) - tcrossprod(p0) + M1 <- diag(p1) - tcrossprod(p1) + + Iaa <- n0p * M0 + n1p * M1 # alpha-alpha + Iab <- n1p * M1 # alpha-beta + Ibb <- n1p * M1 # beta-beta + + # Full J (2m x 2m) + J_top <- cbind(Iaa, Iab) + J_bottom <- cbind(t(Iab), Ibb) + J <- rbind(J_top, J_bottom) + + # ----- TRANSFORMATION T from θ to φ = (alpha, beta_full) ----- + m <- Km + p <- 2 * m - 1 # dim(θ) + q <- 2 * m # dim(φ) + + T <- matrix(0, nrow = q, ncol = p) + # alpha-part: identity + T[1:m, 1:m] <- diag(m) + + # beta-part: B (m x (m-1)) at rows (m+1)..(2m), cols (m+1)..(2m-1) + B <- matrix(0, nrow = m, ncol = m - 1) + B[1, ] <- grad_g_beta # row for beta1 + if (m > 1) { + B[2:m, ] <- diag(m - 1) # rows beta2..betam + } + T[(m + 1):(2 * m), (m + 1):p] <- B + + # Fisher info for θ + I_theta <- crossprod(T, J %*% T) # T^T J T + + # ----- solve for step with adaptive ridge ----- + ridge_multipliers <- c(1, 10, 100, 1000, 1e4) + step_theta <- rep(NA_real_, length(score_theta)) + solved <- FALSE + + for (rm in ridge_multipliers) { + ridge <- ridge_base * rm + I_theta_r <- I_theta + ridge * diag(p) + step_try <- tryCatch( + solve(I_theta_r, score_theta), + error = function(e) NA_real_ + ) + if (all(is.finite(step_try))) { + step_theta <- step_try + solved <- TRUE + break + } + } + + if (!solved) { + final_iter <- iter - 1L + break + } + + # Cap step length + step_norm <- sqrt(sum(step_theta^2)) + if (step_norm > max_step_norm) { + step_theta <- step_theta * (max_step_norm / step_norm) + } + + # If step is essentially zero, we're done + if (sqrt(sum(step_theta^2)) < tol) { + final_iter <- iter - 1L + break + } + + # ----- Step-halving line search for robustness ----- + step_scale <- 
1.0 + ll_new <- ll_old + accepted <- FALSE + + for (ls_iter in 1:ls_max) { + + theta_old <- c(alpha, beta[2:Km]) + theta_new <- theta_old + step_scale * step_theta + + alpha_new <- theta_new[1:Km] + beta_free_new <- theta_new[(Km + 1):(2 * Km - 1)] + + beta_new <- beta + beta_new[2:Km] <- beta_free_new + beta_new[1] <- g_beta(beta_new[2:Km]) + + # clip logits to avoid numeric blowups + alpha_new <- pmax(pmin(alpha_new, clip_logit_max), -clip_logit_max) + beta_new <- pmax(pmin(beta_new, clip_logit_max), -clip_logit_max) + + ll_candidate <- loglik_fun(alpha_new, beta_new) + + if (is.finite(ll_candidate) && ll_candidate >= ll_old) { + alpha <- alpha_new + beta <- beta_new + ll_new <- ll_candidate + accepted <- TRUE + break + } else { + step_scale <- step_scale * ls_rho + } + } + + if (!accepted) { + final_iter <- iter - 1L + break + } + + # record parameters after accepted update + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = iter, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = iter, + type = "beta", + k = seq_len(Km), + value = beta) + ) + + if (abs(ll_new - ll_old) < tol) { + ll_old <- ll_new + final_iter <- iter + break + } + + ll_old <- ll_new + final_iter <- iter + } + + param_df <- do.call(rbind, param_history) + + list( + alpha = alpha, + beta = beta, + logLik = ll_old, + iter = final_iter, + history = param_df + ) +} diff --git a/R/fit_null_discrete_grad_descent_line_search.R b/R/fit_null_discrete_grad_descent_line_search.R index 0c23698..3c89289 100644 --- a/R/fit_null_discrete_grad_descent_line_search.R +++ b/R/fit_null_discrete_grad_descent_line_search.R @@ -178,71 +178,47 @@ my_gd_ls <- function(n0, n1, ) } -fn3_1_J <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, - constraint_fn=list(pseudohuber_median, pseudohuber_median), - constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - B_tol=1e-8, constraint_tol=1e-8) - -out_test2 <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - eta_alpha = 1e-3, - eta_beta = 1e-3, - maxit = 1000, - tol = 1e-6) - -out_test$alpha -out_test$beta -fn3_1_J$B # j_constr=1, j_ref=J - -n0s <- Y[which(X[,2] == 0), ] %>% colSums -n1s <- Y[which(X[,2] == 1), ] %>% colSums -n0s_mod <- n0s; n0s_mod[1] <- n0s[1] + n1s[1] - - -Y -logit <- function(x) log(x/(1-x)) -lg <- logit(n0s/sum(n0s)) -lg -lg - lg[J] - -logit(n0s_mod/sum(n0s_mod)) - logit(n0s_mod/sum(n0s_mod))[J] - - -pseudohuber_median_mod(out_test$alpha[-1]) -pseudohuber_median_mod(out_test$beta[-1]) - - -out_test -fn3_1_J$B - -pseudohuber_median_mod(out_test$alpha[-1]) -pseudohuber_median_mod(out_test$beta[-1]) - - - - -# fn3_1_J$B +# fn3_1_J <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, +# constraint_fn=list(pseudohuber_median, pseudohuber_median), +# constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), +# B_tol=1e-8, constraint_tol=1e-8) +# +# out_test2 <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, +# n1 = Y[which(X[,2] == 1), ] %>% colSums, +# g_beta=function(x) { pseudohuber_median(c(x, 0)) }, +# g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, +# eta_alpha = 1e-3, +# eta_beta = 1e-3, +# maxit = 1000, +# tol = 1e-6) # -# fn3_1_J$B[fn3_1_J$k_constr, setdiff(1:J, fn3_1_J$j_constr)] %>% psuedohuber_median -# 
fn3_1_J$B[fn3_1_J$k_constr, fn3_1_J$j_constr] +# out_test$alpha +# out_test$beta +# fn3_1_J$B # j_constr=1, j_ref=J # -# fn3_1_J$B[1, ] %>% psuedohuber_median -# fn3_1_J$B[1, ] %>% dput -# fn3_1_J$B[1, setdiff(1:J, c(1, J))] %>% psuedohuber_median -# fn3_1_J$B[1, setdiff(1:J, J)] %>% psuedohuber_median -# fn3_1_J$B[1, setdiff(1:J, 1)] %>% psuedohuber_median -# fn3_1_J$B[1, c(1, J)] %>% psuedohuber_median +# n0s <- Y[which(X[,2] == 0), ] %>% colSums +# n1s <- Y[which(X[,2] == 1), ] %>% colSums +# n0s_mod <- n0s; n0s_mod[1] <- n0s[1] + n1s[1] # -# fn3_1_J$B[1, 1:5] %>% psuedohuber_median -# fn3_1_J$B[1, c(2,3,4)] %>% psuedohuber_median # +# Y +# logit <- function(x) log(x/(1-x)) +# lg <- logit(n0s/sum(n0s)) +# lg +# lg - lg[J] # -# fn3_1_J$B[1, c(1,5)] %>% psuedohuber_median +# logit(n0s_mod/sum(n0s_mod)) - logit(n0s_mod/sum(n0s_mod))[J] # # -# fn3_1_J$B[1, fn3_1_J$j_constr] +# pseudohuber_median_mod(out_test$alpha[-1]) +# pseudohuber_median_mod(out_test$beta[-1]) +# +# +# out_test # fn3_1_J$B # -# fn3_1_J$B[1, ] %>% psuedohuber_median +# pseudohuber_median_mod(out_test$alpha[-1]) +# pseudohuber_median_mod(out_test$beta[-1]) + + + diff --git a/R/fit_null_discrete_optim.R b/R/fit_null_discrete_optim.R index b29c85f..84397b6 100644 --- a/R/fit_null_discrete_optim.R +++ b/R/fit_null_discrete_optim.R @@ -1,165 +1,165 @@ -## n0: counts for x = 0, length J -## n1: counts for x = 1, length J -## gfun: function(beta_free2) -> scalar beta_1 -## beta_free2 is c(beta_2, ..., beta_{J-1}) -## gprimefun: function(beta_free2) -> numeric(J-2) -## gradient d beta_1 / d (beta_2,...,beta_{J-1}) - -loglik_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { - J <- length(n0) - Jm1 <- J - 1 - - ## Split parameters: - ## alpha_1,...,alpha_{J-1}, beta_2,...,beta_{J-1} - alpha <- par[1:Jm1] - beta_free2 <- par[(J):(2*J - 3)] # length J-2: beta_2..beta_{J-1} - - ## Constrained beta_1 via g - beta1 <- gfun(beta_free2) - - ## Full beta for categories 1..J-1 (baseline J has beta_J = 0) - beta_full <- numeric(Jm1) - beta_full[1] <- beta1 - beta_full[2:Jm1] <- beta_free2 - - ## Group sizes - N0 <- sum(n0) - N1 <- sum(n1) - - ## x = 0: eta0_j = alpha_j, eta0_J = 0 - eta0 <- alpha # length J-1 - m0 <- max(c(0, eta0)) - exp_eta0 <- exp(eta0 - m0) - denom0 <- exp(0 - m0) + sum(exp_eta0) - p0 <- exp_eta0 / denom0 # probs for j=1..J-1 - p0J <- exp(0 - m0) / denom0 # prob for baseline J - - ## x = 1: eta1_j = alpha_j + beta_j, eta1_J = 0 - eta1 <- alpha + beta_full - m1 <- max(c(0, eta1)) - exp_eta1 <- exp(eta1 - m1) - denom1 <- exp(0 - m1) + sum(exp_eta1) - p1 <- exp_eta1 / denom1 # probs for j=1..J-1 - p1J <- exp(0 - m1) / denom1 - - ## Log-likelihood: - ## ℓ = sum_j n0_j log p0_j + sum_j n1_j log p1_j - ## with p0_J, p1_J included for category J - ll0 <- sum(n0[1:Jm1] * log(p0)) + n0[J] * log(p0J) - ll1 <- sum(n1[1:Jm1] * log(p1)) + n1[J] * log(p1J) - - ll0 + ll1 -} - -grad_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { - J <- length(n0) - Jm1 <- J - 1 - - alpha <- par[1:Jm1] - beta_free2 <- par[(J):(2*J - 3)] - - beta1 <- gfun(beta_free2) - beta_full <- numeric(Jm1) - beta_full[1] <- beta1 - beta_full[2:Jm1] <- beta_free2 - - gprime <- gprimefun(beta_free2) # length J-2 - - N0 <- sum(n0) - N1 <- sum(n1) - - ## x = 0 - eta0 <- alpha - m0 <- max(c(0, eta0)) - exp_eta0 <- exp(eta0 - m0) - denom0 <- exp(0 - m0) + sum(exp_eta0) - p0 <- exp_eta0 / denom0 # j = 1..J-1 - p0J <- exp(0 - m0) / denom0 - - ## x = 1 - eta1 <- alpha + beta_full - m1 <- max(c(0, eta1)) - exp_eta1 <- exp(eta1 - m1) - denom1 <- 
exp(0 - m1) + sum(exp_eta1) - p1 <- exp_eta1 / denom1 # j = 1..J-1 - p1J <- exp(0 - m1) / denom1 - - ## Gradient w.r.t. alpha_j (j=1..J-1): - ## ∂ℓ/∂alpha_j = (n0_j - N0 p0_j) + (n1_j - N1 p1_j) - grad_alpha <- (n0[1:Jm1] - N0 * p0) + (n1[1:Jm1] - N1 * p1) - - ## Unconstrained slope scores: - ## ∂ℓ/∂beta_j = n1_j - N1 p1_j, j = 1..J-1 - score_beta_full <- n1[1:Jm1] - N1 * p1 - - ## ∂ℓ/∂beta_1: - S1 <- score_beta_full[1] - - ## For free beta_2..beta_{J-1}: - ## ∂ℓ/∂beta_j_free = (n1_j - N1 p1_j) + S1 * ∂beta1/∂beta_j - ## j = 2..J-1 - S_direct <- score_beta_full[2:Jm1] # length J-2 - grad_beta_free2 <- S_direct + S1 * gprime - - c(grad_alpha, grad_beta_free2) -} - -fit_multinom_constrained_counts <- function(n0, n1, gfun, gprimefun, - start = NULL) { - J <- length(n0) - if (length(n1) != J) stop("n0 and n1 must have the same length") - if (J < 3) stop("Need at least 3 categories (J >= 3).") - - Jm1 <- J - 1 - n_par <- (Jm1) + (Jm1 - 1) # alpha_1..alpha_{J-1}, beta_2..beta_{J-1} - - if (is.null(start)) { - start <- rep(0, n_par) - } - - fn <- function(par) { - -loglik_counts_constrained(par, n0, n1, gfun, gprimefun) # optim minimizes - } - - gr <- function(par) { - -grad_counts_constrained(par, n0, n1, gfun, gprimefun) - } - - opt <- optim(start, fn, gr, method = "BFGS", control = list(maxit = 100)) - - par_hat <- opt$par - - ## Reconstruct alpha_1..alpha_J and beta_1..beta_J - alpha_hat <- par_hat[1:Jm1] - beta_free2_hat <- par_hat[(J):(2*J - 3)] - beta1_hat <- gfun(beta_free2_hat) - - beta_hat <- numeric(J) - beta_hat[1] <- beta1_hat - beta_hat[2:Jm1] <- beta_free2_hat - beta_hat[J] <- 0 - - alpha_hat_full <- numeric(J) - alpha_hat_full[1:Jm1] <- alpha_hat - alpha_hat_full[J] <- 0 - - list( - alpha = alpha_hat_full, - beta = beta_hat, - logLik = -opt$value, - convergence = opt$convergence, - optim = opt - ) -} - -# system.time({ -# out_test3 <- -fit_multinom_constrained_counts(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - gfun=function(x) { pseudohuber_median(c(x, 0)) }, - gprimefun= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} -) -# }) -out_test2$alpha -out_test2$beta -out_test3 +# ## n0: counts for x = 0, length J +# ## n1: counts for x = 1, length J +# ## gfun: function(beta_free2) -> scalar beta_1 +# ## beta_free2 is c(beta_2, ..., beta_{J-1}) +# ## gprimefun: function(beta_free2) -> numeric(J-2) +# ## gradient d beta_1 / d (beta_2,...,beta_{J-1}) +# +# loglik_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { +# J <- length(n0) +# Jm1 <- J - 1 +# +# ## Split parameters: +# ## alpha_1,...,alpha_{J-1}, beta_2,...,beta_{J-1} +# alpha <- par[1:Jm1] +# beta_free2 <- par[(J):(2*J - 3)] # length J-2: beta_2..beta_{J-1} +# +# ## Constrained beta_1 via g +# beta1 <- gfun(beta_free2) +# +# ## Full beta for categories 1..J-1 (baseline J has beta_J = 0) +# beta_full <- numeric(Jm1) +# beta_full[1] <- beta1 +# beta_full[2:Jm1] <- beta_free2 +# +# ## Group sizes +# N0 <- sum(n0) +# N1 <- sum(n1) +# +# ## x = 0: eta0_j = alpha_j, eta0_J = 0 +# eta0 <- alpha # length J-1 +# m0 <- max(c(0, eta0)) +# exp_eta0 <- exp(eta0 - m0) +# denom0 <- exp(0 - m0) + sum(exp_eta0) +# p0 <- exp_eta0 / denom0 # probs for j=1..J-1 +# p0J <- exp(0 - m0) / denom0 # prob for baseline J +# +# ## x = 1: eta1_j = alpha_j + beta_j, eta1_J = 0 +# eta1 <- alpha + beta_full +# m1 <- max(c(0, eta1)) +# exp_eta1 <- exp(eta1 - m1) +# denom1 <- exp(0 - m1) + sum(exp_eta1) +# p1 <- exp_eta1 / denom1 # probs for j=1..J-1 +# p1J <- exp(0 - m1) / denom1 +# 
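+# ## (the max-subtraction above is the standard log-sum-exp guard: e.g.
+# ##  with eta1 = c(800, 0), exp(800) overflows to Inf, but
+# ##  exp(eta1 - 800) = c(1, exp(-800)) keeps p1 finite, ~ c(1, 0))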
+# ## Log-likelihood: +# ## ℓ = sum_j n0_j log p0_j + sum_j n1_j log p1_j +# ## with p0_J, p1_J included for category J +# ll0 <- sum(n0[1:Jm1] * log(p0)) + n0[J] * log(p0J) +# ll1 <- sum(n1[1:Jm1] * log(p1)) + n1[J] * log(p1J) +# +# ll0 + ll1 +# } +# +# grad_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { +# J <- length(n0) +# Jm1 <- J - 1 +# +# alpha <- par[1:Jm1] +# beta_free2 <- par[(J):(2*J - 3)] +# +# beta1 <- gfun(beta_free2) +# beta_full <- numeric(Jm1) +# beta_full[1] <- beta1 +# beta_full[2:Jm1] <- beta_free2 +# +# gprime <- gprimefun(beta_free2) # length J-2 +# +# N0 <- sum(n0) +# N1 <- sum(n1) +# +# ## x = 0 +# eta0 <- alpha +# m0 <- max(c(0, eta0)) +# exp_eta0 <- exp(eta0 - m0) +# denom0 <- exp(0 - m0) + sum(exp_eta0) +# p0 <- exp_eta0 / denom0 # j = 1..J-1 +# p0J <- exp(0 - m0) / denom0 +# +# ## x = 1 +# eta1 <- alpha + beta_full +# m1 <- max(c(0, eta1)) +# exp_eta1 <- exp(eta1 - m1) +# denom1 <- exp(0 - m1) + sum(exp_eta1) +# p1 <- exp_eta1 / denom1 # j = 1..J-1 +# p1J <- exp(0 - m1) / denom1 +# +# ## Gradient w.r.t. alpha_j (j=1..J-1): +# ## ∂ℓ/∂alpha_j = (n0_j - N0 p0_j) + (n1_j - N1 p1_j) +# grad_alpha <- (n0[1:Jm1] - N0 * p0) + (n1[1:Jm1] - N1 * p1) +# +# ## Unconstrained slope scores: +# ## ∂ℓ/∂beta_j = n1_j - N1 p1_j, j = 1..J-1 +# score_beta_full <- n1[1:Jm1] - N1 * p1 +# +# ## ∂ℓ/∂beta_1: +# S1 <- score_beta_full[1] +# +# ## For free beta_2..beta_{J-1}: +# ## ∂ℓ/∂beta_j_free = (n1_j - N1 p1_j) + S1 * ∂beta1/∂beta_j +# ## j = 2..J-1 +# S_direct <- score_beta_full[2:Jm1] # length J-2 +# grad_beta_free2 <- S_direct + S1 * gprime +# +# c(grad_alpha, grad_beta_free2) +# } +# +# fit_multinom_constrained_counts <- function(n0, n1, gfun, gprimefun, +# start = NULL) { +# J <- length(n0) +# if (length(n1) != J) stop("n0 and n1 must have the same length") +# if (J < 3) stop("Need at least 3 categories (J >= 3).") +# +# Jm1 <- J - 1 +# n_par <- (Jm1) + (Jm1 - 1) # alpha_1..alpha_{J-1}, beta_2..beta_{J-1} +# +# if (is.null(start)) { +# start <- rep(0, n_par) +# } +# +# fn <- function(par) { +# -loglik_counts_constrained(par, n0, n1, gfun, gprimefun) # optim minimizes +# } +# +# gr <- function(par) { +# -grad_counts_constrained(par, n0, n1, gfun, gprimefun) +# } +# +# opt <- optim(start, fn, gr, method = "BFGS", control = list(maxit = 10)) +# +# par_hat <- opt$par +# +# ## Reconstruct alpha_1..alpha_J and beta_1..beta_J +# alpha_hat <- par_hat[1:Jm1] +# beta_free2_hat <- par_hat[(J):(2*J - 3)] +# beta1_hat <- gfun(beta_free2_hat) +# +# beta_hat <- numeric(J) +# beta_hat[1] <- beta1_hat +# beta_hat[2:Jm1] <- beta_free2_hat +# beta_hat[J] <- 0 +# +# alpha_hat_full <- numeric(J) +# alpha_hat_full[1:Jm1] <- alpha_hat +# alpha_hat_full[J] <- 0 +# +# list( +# alpha = alpha_hat_full, +# beta = beta_hat, +# logLik = -opt$value, +# convergence = opt$convergence, +# optim = opt +# ) +# } +# +# # system.time({ +# # out_test3 <- +# fit_multinom_constrained_counts(n0=Y[which(X[,2] == 0), ] %>% colSums, +# n1 = Y[which(X[,2] == 1), ] %>% colSums, +# gfun=function(x) { pseudohuber_median(c(x, 0)) }, +# gprimefun= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} +# ) +# # }) +# out_test2$alpha +# out_test2$beta +# out_test3 diff --git a/test_new_discrete.R b/test_new_discrete.R new file mode 100644 index 0000000..52771c6 --- /dev/null +++ b/test_new_discrete.R @@ -0,0 +1,189 @@ +# js <- seq(from = 100, to = 1000, by = 100) +js <- seq(from = 20, to = 100, by = 20) +df_timing <- data.frame("j" = js, "fit_null" = NA, "new" = NA) +df_timing +set.seed(1) +ii <- 
1 +for (jj in js) { + df_timing$j[ii] <- jj + + n <- 20 + J <- jj + p <- 2 + beta <- matrix(rnorm(p*J, sd=5), ncol = J) + X <- cbind(1, c(rep(0, n/2), rep(1, n/2))) + Y <- matrix(rpois(n=n*J, lambda=exp(5 + X %*% beta)), ncol = J) + Y <- pmax(Y, 1) + + # fit3 <- radEmu::emuFit(Y = Y, X = X, run_score_tests=F) + # + # df_timing$fit_null[ii] <- system.time({ + # fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, + # constraint_fn=list(pseudohuber_median, pseudohuber_median), + # constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), + # B_tol=1e-8, constraint_tol=1e-8) + # })[3] + + + df_timing$fit_null[ii] <- system.time({ + fn <- fit_null(B=matrix(0, nrow = 2, ncol = J), Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, + constraint_fn=list(pseudohuber_median, pseudohuber_median), + constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), + B_tol=1e-5, constraint_tol=1e-8) + })[3] + + # df_timing$new[ii] <- system.time({ + # out_test4 <- my_gd_fs(n0=Y[which(X[,2] == 0), ] %>% colSums, + # n1 = Y[which(X[,2] == 1), ] %>% colSums, + # g_beta=function(x) { pseudohuber_median(c(x, 0)) }, + # g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, + # eta_alpha = 1e-3, + # eta_beta = 1e-3, + # maxit = 1e10, + # B_tol = 1e-5) + # })[3] + df_timing$new[ii] <- system.time({ + out_test4 <- my_fs_stable(n0=Y[which(X[,2] == 0), ] %>% colSums, + n1 = Y[which(X[,2] == 1), ] %>% colSums, + g_beta=function(x) { pseudohuber_median(c(x, 0)) }, + g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, + eta_alpha = 1e-3, + eta_beta = 1e-3, + maxit = 1e8, + tol = 1e-10) + })[3] + ii <- ii + 1 + print(df_timing) +} +df_timing + +out_test4 +rbind(out_test4$alpha, out_test4$beta) +fn$B +### why didn't they align??? 
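+
+## debugging sketch for the mismatch above: rather than eyeballing coefficients,
+## check (a) the constraint gap and (b) the log-likelihood of each fit --
+## solutions can look different yet both be valid, or one can simply be short
+## of convergence. Assumes out_test4 uses the same (alpha, beta)
+## parametrisation with j_ref = J as fit_null, so B = rbind(alpha, beta) with a
+## final reference column of zeros; ll_of is a throwaway helper, not a package fn
+B_new <- cbind(rbind(out_test4$alpha, out_test4$beta), 0)
+## gap g(B[2, ]) - B[2, 1] should be ~0 for both (k_constr = 2, j_constr = 1)
+c(pseudohuber_median(fn$B[2, ]) - fn$B[2, 1],
+  pseudohuber_median(B_new[2, ]) - B_new[2, 1])
+## profile log-likelihood at the fitted B, as computed later in this file
+ll_of <- function(B) {
+  z <- update_z(Y, X, B)
+  log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  sum(Y * log_means - exp(log_means))
+}
+c(ll_of(fn$B), ll_of(B_new))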
+
+
+set.seed(1)
+n <- 20
+J <- 100
+p <- 2
+beta <- matrix(rnorm(p*J, mean = 2, sd=2), ncol = J)
+X <- cbind(1, c(rep(0, n/2), rep(1, n/2)))
+Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
+Y <- pmax(Y, 1)
+Y
+Ysimple <- rbind(Y[which(X[,2] == 0), ] %>% colSums, Y[which(X[,2] == 1), ] %>% colSums)
+
+# the_fit <- radEmu::emuFit(Y = Y, X = X, run_score_tests=F)
+
+the_fit <- radEmu::emuFit(Y = Ysimple, X = X[10:11, ], run_score_tests=F)
+the_fit
+
+system.time({
+  fn3_1_J <- fit_null(B=the_fit$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J,
+                      constraint_fn=list(pseudohuber_median, pseudohuber_median),
+                      constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx),
+                      B_tol=1e-8, constraint_tol=1e-8)
+}) # 26.270 0.152 27.411
+
+system.time({
+  out_test2 <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums,
+                        n1 = Y[which(X[,2] == 1), ] %>% colSums,
+                        g_beta=function(x) { pseudohuber_median(c(x, 0)) },
+                        g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]},
+                        eta_alpha = 1e-3,
+                        eta_beta = 1e-3,
+                        maxit = 1000,
+                        tol = 1e-6)
+}) # 2.912 0.033 3.287
+
+
+system.time({
+  out_test4 <- my_gd_fs(n0=Y[which(X[,2] == 0), ] %>% colSums,
+                        n1 = Y[which(X[,2] == 1), ] %>% colSums,
+                        g_beta=function(x) { pseudohuber_median(c(x, 0)) },
+                        g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]},
+                        eta_alpha = 1e-3,
+                        eta_beta = 1e-3,
+                        maxit = 1000,
+                        B_tol = 1e-8)
+}) # 0.021 0.001 0.023
+
+
+system.time({
+  out_test5 <- my_fs_stable(n0=Y[which(X[,2] == 0), ] %>% colSums,
+                            n1 = Y[which(X[,2] == 1), ] %>% colSums,
+                            g_beta=function(x) { pseudohuber_median(c(x, 0)) },
+                            g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]},
+                            eta_alpha = 1e-3,
+                            eta_beta = 1e-3,
+                            maxit = 1000,
+                            tol = 1e-8)
+}) # 0.016 0.000 0.017
+
+
+
+# out_test4$history %>% filter(type == "beta" & k == 3)
+#
+fn3_1_J$B
+rbind(out_test2$alpha, out_test2$beta)
+rbind(out_test4$alpha, out_test4$beta)
+rbind(out_test5$alpha, out_test5$beta)
+
+####################################################
+#### Try larger sample
+
+
+set.seed(1)
+n <- 20
+J <- 100
+p <- 2
+beta <- matrix(rnorm(p*J, mean = 2, sd=2), ncol = J)
+X <- cbind(1, c(rep(0, n/2), rep(1, n/2)))
+Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
+Y <- pmax(Y, 1)
+Y
+Ysimple <- rbind(Y[which(X[,2] == 0), ] %>% colSums, Y[which(X[,2] == 1), ] %>% colSums)
+
+system.time({
+  fn3_1_J <- fit_null(B=matrix(0, nrow = p, ncol = J), Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J,
+                      constraint_fn=list(pseudohuber_median, pseudohuber_median),
+                      constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx),
+                      B_tol=1e-8, constraint_tol=1e-8)
+})
+
+system.time({
+  out_test5 <- my_fs_stable(n0=Y[which(X[,2] == 0), ] %>% colSums,
+                            n1 = Y[which(X[,2] == 1), ] %>% colSums,
+                            g_beta=function(x) { pseudohuber_median(c(x, 0)) },
+                            g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]},
+                            eta_alpha = 1e-3,
+                            eta_beta = 1e-3,
+                            maxit = 1000,
+                            tol = 1e-8)
+})
+
+orig <- fn3_1_J$B
+new <- cbind(rbind(out_test5$alpha, out_test5$beta), 0)
+
+## are the constraints satisfied?
+c(pseudohuber_median(orig[2,-1]), pseudohuber_median(orig[2,1]))
+c(pseudohuber_median(new[2,-1]), pseudohuber_median(new[2,1]))
+
+B <- orig
+z <- update_z(Y, X, B)
+log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+ll_orig <- sum(Y * log_means - exp(log_means))
+
+B <- new
+z <- 
update_z(Y, X, B) +log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) +ll_new <- sum(Y * log_means - exp(log_means)) + +B <- matrix(0, nrow = p, ncol = J) +z <- update_z(Y, X, B) +log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) +sum(Y * log_means - exp(log_means)) + +ll_orig < ll_new ### new is better + From b229694c64493dfe265ffb8ef1bed2ac78cd35a1 Mon Sep 17 00:00:00 2001 From: Amy Willis Date: Mon, 8 Dec 2025 08:26:38 +1000 Subject: [PATCH 4/7] wrapping my head around it; plan to pull from main --- R/my_fs_stable_g_groups.R | 327 ++++++++++++++++++ ...ing_stable.R => my_fs_stable_two_groups.R} | 0 ...l_discrete_fisher_scoring.R => my_gd_fs.R} | 0 ..._grad_descent_line_search.R => my_gd_ls.R} | 0 test_new_discrete.R | 28 +- 5 files changed, 351 insertions(+), 4 deletions(-) create mode 100644 R/my_fs_stable_g_groups.R rename R/{fit_null_discrete_fisher_scoring_stable.R => my_fs_stable_two_groups.R} (100%) rename R/{fit_null_discrete_fisher_scoring.R => my_gd_fs.R} (100%) rename R/{fit_null_discrete_grad_descent_line_search.R => my_gd_ls.R} (100%) diff --git a/R/my_fs_stable_g_groups.R b/R/my_fs_stable_g_groups.R new file mode 100644 index 0000000..fdbbf0a --- /dev/null +++ b/R/my_fs_stable_g_groups.R @@ -0,0 +1,327 @@ +my_fs_stable_g_groups <- function(n_list, + g_beta, + g_beta_grad, + maxit = 1000, + tol = 1e-8, + ls_max = 20, # max step-halving + ls_rho = 0.5) { # shrink factor for step-halving + + G <- length(n_list) # number of groups + if (G < 2) stop("Need at least 2 groups (baseline + at least 1 non-baseline).") + + K <- length(n_list[[1]]) + if (K < 3) stop("beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") + for (g in seq_len(G)) { + if (length(n_list[[g]]) != K) stop("All n_g must have same length K.") + } + + m <- K - 1 # non-baseline categories + + # Group totals + n_tot <- vapply(n_list, sum, numeric(1)) + + # Parameters: + # alpha: length m + # beta_mat: (G-1) x m, each row = beta^(g) for group g=2..G + alpha <- rep(0, m) + beta_mat <- matrix(0, nrow = G - 1, ncol = m) + + # Enforce beta constraint initially: for each non-baseline group + if (G > 1) { + for (r in 1:(G - 1)) { + beta_mat[r, 1] <- g_beta(beta_mat[r, 2:m]) + } + } + + # Softmax for non-baseline categories + softmax_nb <- function(eta) { + e <- exp(eta) + e / (1 + sum(e)) + } + + # Log-likelihood given alpha, beta_mat + loglik_fun <- function(alpha, beta_mat) { + ll <- 0 + eps <- 1e-12 + + for (g in seq_len(G)) { + n_g <- n_list[[g]] + if (g == 1) { + eta <- alpha + } else { + r <- g - 1 + eta <- alpha + beta_mat[r, ] + } + p_nb <- softmax_nb(eta) + p_full <- c(p_nb, 1 - sum(p_nb)) + p_full <- pmin(pmax(p_full, eps), 1 - eps) + ll <- ll + sum(n_g * log(p_full)) + } + ll + } + + # ---- history tracking ---- + param_history <- list() + # iteration 0 (initial) + { + df_alpha <- data.frame(iter = 0L, + type = "alpha", + group = NA_integer_, + k = seq_len(m), + value = alpha) + df_beta_list <- vector("list", G - 1) + for (r in 1:(G - 1)) { + df_beta_list[[r]] <- data.frame(iter = 0L, + type = "beta", + group = r + 1L, # group index in n_list (2..G) + k = seq_len(m), + value = beta_mat[r, ]) + } + param_history[[1]] <- rbind(df_alpha, do.call(rbind, df_beta_list)) + } + # --------------------------- + + ll_old <- loglik_fun(alpha, beta_mat) + final_iter <- 0L + + # stability parameters + ridge_base <- 1e-4 + max_step_norm <- 5 + clip_logit_max <- 15 + + for (iter in 1:maxit) { + + # --- compute p_g and M_g for each group --- + p_list <- vector("list", G) + 
M_list <- vector("list", G) + for (g in seq_len(G)) { + if (g == 1) { + eta <- alpha + } else { + r <- g - 1 + eta <- alpha + beta_mat[r, ] + } + p_nb <- softmax_nb(eta) + p_list[[g]] <- p_nb + M_list[[g]] <- diag(p_nb) - tcrossprod(p_nb) + } + + # --- SCORES --- + # alpha score + s_alpha <- numeric(m) + for (g in seq_len(G)) { + n_g <- n_list[[g]] + n_gp <- n_tot[g] + p_nb <- p_list[[g]] + s_alpha <- s_alpha + (n_g[1:m] - n_gp * p_nb) + } + + # beta scores: full and free for each non-baseline group + s_beta_full_mat <- matrix(0, nrow = G - 1, ncol = m) + s_beta_free_mat <- matrix(0, nrow = G - 1, ncol = m - 1) + grad_g_beta_mat <- matrix(0, nrow = G - 1, ncol = m - 1) + + if (G > 1) { + for (r in 1:(G - 1)) { + g <- r + 1 + n_g <- n_list[[g]] + n_gp <- n_tot[g] + p_nb <- p_list[[g]] + + s_beta_full <- n_g[1:m] - n_gp * p_nb + s_beta_full_mat[r, ] <- s_beta_full + + grad_g <- g_beta_grad(beta_mat[r, 2:m]) # length m-1 + grad_g_beta_mat[r, ] <- grad_g + + s_beta_free_mat[r, ] <- s_beta_full[2:m] + s_beta_full[1] * grad_g + } + } + + # Reduced score vector θ = (alpha, all beta_free) + score_theta <- c(s_alpha, as.vector(t(s_beta_free_mat))) + + score_norm <- sqrt(sum(score_theta^2)) + if (score_norm < tol) { + final_iter <- iter - 1L + break + } + + # --- FULL FISHER J for (alpha, all beta_full) --- + q_dim <- m + (G - 1) * m + J <- matrix(0, nrow = q_dim, ncol = q_dim) + + # indices for alpha in full φ + alpha_idx <- 1:m + + # helper to get beta block indices for group r (r=1..G-1) + beta_block_idx <- function(r) { + start <- m + (r - 1) * m + 1 + end <- m + r * m + start:end + } + + for (g in seq_len(G)) { + M_g <- M_list[[g]] + n_gp <- n_tot[g] + + # alpha-alpha block + J[alpha_idx, alpha_idx] <- J[alpha_idx, alpha_idx] + n_gp * M_g + + if (g > 1) { + r <- g - 1 + B_idx <- beta_block_idx(r) + + # alpha-beta and beta-beta blocks + J[alpha_idx, B_idx] <- J[alpha_idx, B_idx] + n_gp * M_g + J[B_idx, alpha_idx] <- t(J[alpha_idx, B_idx]) + J[B_idx, B_idx] <- J[B_idx, B_idx] + n_gp * M_g + } + } + + # --- TRANSFORMATION T from θ to φ --- + p_dim <- m + (G - 1) * (m - 1) + T <- matrix(0, nrow = q_dim, ncol = p_dim) + + # alpha part: identity + T[alpha_idx, 1:m] <- diag(m) + + # beta parts: each group r has a B^(r) + for (r in 1:(G - 1)) { + B_idx <- beta_block_idx(r) + C_start <- m + (r - 1) * (m - 1) + 1 + C_end <- m + r * (m - 1) + C_idx <- C_start:C_end + + B <- matrix(0, nrow = m, ncol = m - 1) + B[1, ] <- grad_g_beta_mat[r, ] + if (m > 1) { + B[2:m, ] <- diag(m - 1) + } + T[B_idx, C_idx] <- B + } + + I_theta <- crossprod(T, J %*% T) + + # --- solve for Fisher scoring step with adaptive ridge --- + ridge_base <- 1e-4 + ridge_multipliers <- c(1, 10, 100, 1000, 1e4) + step_theta <- rep(NA_real_, length(score_theta)) + solved <- FALSE + + for (rm in ridge_multipliers) { + ridge <- ridge_base * rm + I_theta_r <- I_theta + ridge * diag(p_dim) + step_try <- tryCatch( + solve(I_theta_r, score_theta), + error = function(e) NA_real_ + ) + if (all(is.finite(step_try))) { + step_theta <- step_try + solved <- TRUE + break + } + } + + if (!solved) { + final_iter <- iter - 1L + break + } + + # Cap step length + step_norm <- sqrt(sum(step_theta^2)) + if (step_norm > max_step_norm) { + step_theta <- step_theta * (max_step_norm / step_norm) + } + + if (sqrt(sum(step_theta^2)) < tol) { + final_iter <- iter - 1L + break + } + + # --- Step-halving line search --- + step_scale <- 1.0 + ll_new <- ll_old + accepted <- FALSE + + for (ls_iter in 1:ls_max) { + + theta_old <- c(alpha, as.vector(t(beta_mat[, 2:m]))) + 
theta_new <- theta_old + step_scale * step_theta + + # unpack theta_new + alpha_new <- theta_new[1:m] + if (G > 1) { + bfree_vec <- theta_new[(m + 1):p_dim] + beta_free_mat_new <- matrix(bfree_vec, nrow = G - 1, ncol = m - 1, byrow = TRUE) + } else { + beta_free_mat_new <- matrix(0, nrow = 0, ncol = m - 1) + } + + beta_mat_new <- beta_mat + if (G > 1) { + beta_mat_new[, 2:m] <- beta_free_mat_new + # re-enforce constraint for each non-baseline group + for (r in 1:(G - 1)) { + beta_mat_new[r, 1] <- g_beta(beta_mat_new[r, 2:m]) + } + } + + # clip logits to avoid blowups + alpha_new <- pmax(pmin(alpha_new, clip_logit_max), -clip_logit_max) + beta_mat_new <- pmax(pmin(beta_mat_new, clip_logit_max), -clip_logit_max) + + ll_candidate <- loglik_fun(alpha_new, beta_mat_new) + + if (is.finite(ll_candidate) && ll_candidate >= ll_old) { + alpha <- alpha_new + beta_mat <- beta_mat_new + ll_new <- ll_candidate + accepted <- TRUE + break + } else { + step_scale <- step_scale * ls_rho + } + } + + if (!accepted) { + final_iter <- iter - 1L + break + } + + # record parameters after accepted update + df_alpha <- data.frame(iter = iter, + type = "alpha", + group = NA_integer_, + k = seq_len(m), + value = alpha) + df_beta_list <- vector("list", G - 1) + for (r in 1:(G - 1)) { + df_beta_list[[r]] <- data.frame(iter = iter, + type = "beta", + group = r + 1L, + k = seq_len(m), + value = beta_mat[r, ]) + } + param_history[[length(param_history) + 1]] <- rbind(df_alpha, do.call(rbind, df_beta_list)) + + if (abs(ll_new - ll_old) < tol) { + ll_old <- ll_new + final_iter <- iter + break + } + + ll_old <- ll_new + final_iter <- iter + } + + param_df <- do.call(rbind, param_history) + + list( + alpha = alpha, + beta = beta_mat, + logLik = ll_old, + iter = final_iter, + history = param_df + ) +} diff --git a/R/fit_null_discrete_fisher_scoring_stable.R b/R/my_fs_stable_two_groups.R similarity index 100% rename from R/fit_null_discrete_fisher_scoring_stable.R rename to R/my_fs_stable_two_groups.R diff --git a/R/fit_null_discrete_fisher_scoring.R b/R/my_gd_fs.R similarity index 100% rename from R/fit_null_discrete_fisher_scoring.R rename to R/my_gd_fs.R diff --git a/R/fit_null_discrete_grad_descent_line_search.R b/R/my_gd_ls.R similarity index 100% rename from R/fit_null_discrete_grad_descent_line_search.R rename to R/my_gd_ls.R diff --git a/test_new_discrete.R b/test_new_discrete.R index 52771c6..192ddde 100644 --- a/test_new_discrete.R +++ b/test_new_discrete.R @@ -43,7 +43,7 @@ for (jj in js) { # B_tol = 1e-5) # })[3] df_timing$new[ii] <- system.time({ - out_test4 <- my_fs_stable(n0=Y[which(X[,2] == 0), ] %>% colSums, + out_test4 <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, n1 = Y[which(X[,2] == 1), ] %>% colSums, g_beta=function(x) { pseudohuber_median(c(x, 0)) }, g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, @@ -153,12 +153,10 @@ system.time({ }) system.time({ - out_test5 <- my_fs_stable(n0=Y[which(X[,2] == 0), ] %>% colSums, + out_test5 <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, n1 = Y[which(X[,2] == 1), ] %>% colSums, g_beta=function(x) { pseudohuber_median(c(x, 0)) }, g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - eta_alpha = 1e-3, - eta_beta = 1e-3, maxit = 1000, tol = 1e-8) }) @@ -187,3 +185,25 @@ sum(Y * log_means - exp(log_means)) ll_orig < ll_new ### new is better + +##### try g-dim version + +set.seed(1) +n <- 20 +J <- 30 +p <- 3 +beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = 
J)
+X <- cbind(1, c(rep(0, n/2), rep(1, n/2)), c(rep(0, 3*n/4), rep(1, n/4)))
+Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
+Y <- pmax(Y, 1)
+Y
+
+load_all()
+system.time({
+  out_test5 <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums,
+                                       n1 = Y[which(X[,2] == 1), ] %>% colSums,
+                                       g_beta=function(x) { pseudohuber_median(c(x, 0)) },
+                                       g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]},
+                                       maxit = 1000,
+                                       tol = 1e-8)
+})

From 5fb1561645dcc4c9085c5aaa4d447b41c35f7e9e Mon Sep 17 00:00:00 2001
From: Amy Willis
Date: Mon, 8 Dec 2025 10:55:37 +1000
Subject: [PATCH 5/7] generalise from two to multiple groups

---
 R/fit_null_discrete.R                   |  60 ++++-
 R/fit_null_discrete_micro_fs.R          | 214 ++++++++++++++++
 R/my_fs_stable_g_groups.R               | 327 ------------------------
 test_new_discrete.R                     |  70 +++--
 tests/testthat/test-fit_null_discrete.R |  53 ++++
 5 files changed, 366 insertions(+), 358 deletions(-)
 create mode 100644 R/fit_null_discrete_micro_fs.R
 delete mode 100644 R/my_fs_stable_g_groups.R
 create mode 100644 tests/testthat/test-fit_null_discrete.R

diff --git a/R/fit_null_discrete.R b/R/fit_null_discrete.R
index c1750ac..69514f5 100644
--- a/R/fit_null_discrete.R
+++ b/R/fit_null_discrete.R
@@ -4,19 +4,52 @@ fit_null_discrete_pseudohuber <- function(
   X,
   k_constr,
   j_constr,
-  j_ref
+  j_ref,
+  maxit = 1000
 ) {
-  if (is.null(j_ref)) {
-    j_ref <- ifelse(j_constr == 1, 2, 1)
-  }
+  # if (is.null(j_ref)) {
+  #   j_ref <- ifelse(j_constr == 1, 2, 1)
+  # }
 
   n <- nrow(Y)
   J <- ncol(Y)
   p <- ncol(X)
 
+  ## TODO generalise
+  stopifnot(j_ref == J)
+
   distinct_xx <- unique(X)
+  stopifnot(ncol(X) == nrow(distinct_xx))
+  stopifnot(all(X[,1] == 1)) ### need a baseline category to simplify the parametrization
+  X_wo_1s <- distinct_xx[, 1]
+
+  groups <- split(
+    seq_len(nrow(X)),          # row indices of original data
+    apply(X, 1, function(r)
+      paste(r, collapse = "_")) # grouping key by row contents
+  )
+
+  groups <- groups[order(sapply(groups, min))]
+
+  totals <- lapply(groups, function(x) {
+    apply(Y[x, , drop = FALSE], 2, sum)
+  })
+
+  Y_sum <- do.call(rbind, totals)
+
+  ## this should be equivalent to fit_null with j_ref=J
+  out <- fit_null_discrete_micro_fs(
+    Y = Y_sum,
+    X = X_wo_1s,
+    k_constr=k_constr,
+    j_constr=j_constr,
+    constraint_fn=function(x) { pseudohuber_median(c(x, 0)) },
+    constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}
+    maxit = maxit,
+  )
+
   ## Currently this works for
   #### two groups with X's (1, 0) and (1, 1)
   #### testing the first column, last column also constrained
@@ -28,16 +61,18 @@
   #### different columns for testing
   #### the same inputs as fit_null and comparable convergence statistics
 
-  out <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums,
-                        n1 = Y[which(X[,2] == 1), ] %>% colSums,
-                        g_beta=function(x) { pseudohuber_median(c(x, 0)) },
-                        g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]},
-                        maxit = 1e8,
-                        tol = 1e-10)
+  # out <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums,
+  #                       n1 = Y[which(X[,2] == 1), ] %>% colSums,
+  #                       g_beta=function(x) { pseudohuber_median(c(x, 0)) },
+  #                       g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]},
+  #                       maxit = 1000, tol = 1e-8,
+  #                       ls_max = 20, ls_rho = 0.5,
+  #                       ridge_base = 1e-4,
+  #                       max_step_norm = 5,
+  #                       clip_logit_max = 15)
+
+  B <- out$B
 
-
-  B <- cbind(rbind(out$alpha, out$beta), 0)
   z <- update_z(Y, X, B)
   log_means 
<- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) ll_new <- sum(Y * log_means - exp(log_means)) @@ -49,7 +84,6 @@ fit_null_discrete_pseudohuber <- function( it_df <- it_df[1:iter, ] - return(list( "B" = B, "k_constr" = k_constr, diff --git a/R/fit_null_discrete_micro_fs.R b/R/fit_null_discrete_micro_fs.R new file mode 100644 index 0000000..c7c80f5 --- /dev/null +++ b/R/fit_null_discrete_micro_fs.R @@ -0,0 +1,214 @@ +## this is equivalent to fit_null with j_ref=J +fit_null_discrete_micro_fs <- function(Y, X, + k_constr, j_constr, # indices: row k* in 1..(p-1), column j* in 1..(J-1) + constraint_fn, constraint_grad_fn, # g: R^{m-1}->R on the constrained row; grad: R^{m-1}->R^{m-1} + maxit = 1000, + tol = 1e-8, + ls_max = 20, + ls_rho = 0.5, + ridge_base = 1e-4, + max_step_norm = 5, + clip_logit_max = 15) { + Y <- as.matrix(Y); X <- as.matrix(X) + p <- nrow(Y); J <- ncol(Y); m <- J - 1L + if (nrow(X) != p || ncol(X) != p - 1L) stop("X must be p x (p-1).") + if (J < 3) stop("Need J >= 3 (one baseline).") + if (k_constr < 1L || k_constr > (p - 1L)) stop("k_constr must be in 1..(p-1).") + if (j_constr < 1L || j_constr > m) stop("j_constr must be in 1..(J-1).") + + n <- rowSums(Y) + + # Parameters + alpha <- rep(0, m) + beta <- matrix(0, nrow = p - 1L, ncol = m) + # initialize constrained element coherently (assume g excludes j_constr) + free_idx <- setdiff(seq_len(m), j_constr) + beta[k_constr, j_constr] <- constraint_fn(beta[k_constr, free_idx]) + + softmax_nb <- function(eta) { e <- exp(eta); e / (1 + sum(e)) } + + # Log-likelihood + loglik_fun <- function(alpha, beta) { + ll <- 0; eps <- 1e-12 + for (i in 1:p) { + eta <- alpha + drop(X[i, , drop = FALSE] %*% beta) # length m + pnb <- softmax_nb(eta) + pfull <- pmin(pmax(c(pnb, 1 - sum(pnb)), eps), 1 - eps) + ll <- ll + sum(Y[i, ] * log(pfull)) + } + ll + } + + # History + param_hist <- list() + record <- function(iter) { + rbind( + data.frame(iter = iter, type = "alpha", row = NA_integer_, col = seq_len(m), value = alpha), + data.frame(iter = iter, type = "beta", row = rep(seq_len(p - 1L), each = m), + col = rep(seq_len(m), times = p - 1L), value = as.vector(beta)) + ) + } + param_hist[[1]] <- record(0L) + + ll_old <- loglik_fun(alpha, beta); final_iter <- 0L + + for (iter in 1:maxit) { + # probs and M_i + p_list <- vector("list", p) + M_list <- vector("list", p) + for (i in 1:p) { + eta <- alpha + drop(X[i, , drop = FALSE] %*% beta) + pnb <- softmax_nb(eta) + p_list[[i]] <- pnb + M_list[[i]] <- diag(pnb) - tcrossprod(pnb) + } + + # scores + s_alpha <- rep(0, m) + s_beta <- matrix(0, nrow = p - 1L, ncol = m) + for (i in 1:p) { + s_i <- Y[i, 1:m] - n[i] * p_list[[i]] # length m + s_alpha <- s_alpha + s_i + # add outer product X[i,]^T * s_i + s_beta <- s_beta + tcrossprod(X[i, ], s_i) # (p-1) x m + } + + # reduced score: theta = (alpha, rows k != k*, full m; row k* free indices only) + s_theta <- c( + s_alpha, + as.vector(t(s_beta[setdiff(seq_len(p - 1L), k_constr), , drop = FALSE])), + # constrained row (chain rule) + { + grad_g <- constraint_grad_fn(beta[k_constr, free_idx]) + s_beta[k_constr, free_idx] + s_beta[k_constr, j_constr] * grad_g + } + ) + + if (sqrt(sum(s_theta^2)) < tol) { final_iter <- iter - 1L; break } + + # Fisher J in full phi = (alpha, vec(beta rows 1..p-1 each length m)) + q_dim <- m + (p - 1L) * m + Jmat <- matrix(0, q_dim, q_dim) + + # indices + alpha_idx <- 1:m + row_block_idx <- function(k) { # k in 1..(p-1), returns indices for row k in vec(beta) + start <- m + (k - 1L) * m + 1L + start:(start + m - 
1L) + } + + for (i in 1:p) { + Mi <- M_list[[i]]; ni <- n[i]; Xi <- X[i, , drop = FALSE] # 1 x (p-1) + # alpha-alpha + Jmat[alpha_idx, alpha_idx] <- Jmat[alpha_idx, alpha_idx] + ni * Mi + # alpha-beta rows + for (k in 1:(p - 1L)) { + rb <- row_block_idx(k) + Jmat[alpha_idx, rb] <- Jmat[alpha_idx, rb] + ni * (Mi * Xi[1, k]) + Jmat[rb, alpha_idx] <- t(Jmat[alpha_idx, rb]) + } + # beta-beta rows + for (k in 1:(p - 1L)) { + for (l in 1:(p - 1L)) { + rbk <- row_block_idx(k); rbl <- row_block_idx(l) + Jmat[rbk, rbl] <- Jmat[rbk, rbl] + ni * (Mi * (Xi[1, k] * Xi[1, l])) + } + } + } + + # Transformation T: reduced theta -> full phi + # theta packs: alpha, then rows k != k* (full m), then row k* (m-1 free entries) + p_dim <- m + (p - 2L) * m + (m - 1L) + T <- matrix(0, nrow = q_dim, ncol = p_dim) + # alpha identity + T[alpha_idx, 1:m] <- diag(m) + col_ptr <- m + 1L + # rows k != k* + for (k in setdiff(seq_len(p - 1L), k_constr)) { + rb <- row_block_idx(k) + T[rb, col_ptr:(col_ptr + m - 1L)] <- diag(m) + col_ptr <- col_ptr + m + } + # row k* mapping B (m x (m-1)) + grad_g <- constraint_grad_fn(beta[k_constr, free_idx]) + Bmap <- matrix(0, nrow = m, ncol = m - 1L) + # identity on free indices + for (pos in seq_along(free_idx)) { + Bmap[free_idx[pos], pos] <- 1 + } + # constrained element row = grad_g^T + Bmap[j_constr, ] <- grad_g + T[row_block_idx(k_constr), col_ptr:(col_ptr + (m - 2L))] <- Bmap + + I_theta <- crossprod(T, Jmat %*% T) + + # Adaptive ridge solve + ridge_mult <- c(1, 10, 100, 1000, 1e4) + step_theta <- rep(NA_real_, p_dim); solved <- FALSE + for (rm in ridge_mult) { + Ir <- I_theta + (ridge_base * rm) * diag(p_dim) + st <- tryCatch(solve(Ir, s_theta), error = function(e) NA_real_) + if (all(is.finite(st))) { step_theta <- st; solved <- TRUE; break } + } + if (!solved) { final_iter <- iter - 1L; break } + + # Step capping + sn <- sqrt(sum(step_theta^2)) + if (sn > max_step_norm) step_theta <- step_theta * (max_step_norm / sn) + if (sqrt(sum(step_theta^2)) < tol) { final_iter <- iter - 1L; break } + + # Pack current reduced theta + pack_theta <- function(alpha, beta) { + c( + alpha, + as.vector(t(beta[setdiff(seq_len(p - 1L), k_constr), , drop = FALSE])), + beta[k_constr, free_idx] + ) + } + unpack_theta <- function(vec) { + wp <- 1L + a_new <- vec[wp:(wp + m - 1L)]; wp <- wp + m + beta_new <- beta + # rows k != k* + for (k in setdiff(seq_len(p - 1L), k_constr)) { + beta_new[k, ] <- vec[wp:(wp + m - 1L)]; wp <- wp + m + } + # row k* free + bfree <- vec[wp:(wp + (m - 2L))] + beta_new[k_constr, free_idx] <- bfree + beta_new[k_constr, j_constr] <- constraint_fn(bfree) + list(alpha = a_new, beta = beta_new) + } + + # Line search (monotone ℓ) + theta_old <- pack_theta(alpha, beta) + step_scale <- 1.0; accepted <- FALSE; ll_new <- ll_old + for (ls in 1:ls_max) { + prop <- unpack_theta(theta_old + step_scale * step_theta) + alpha_new <- prop$alpha + beta_new <- prop$beta + # clip logits (keep numerics sane) + alpha_new <- pmax(pmin(alpha_new, clip_logit_max), -clip_logit_max) + beta_new <- pmax(pmin(beta_new, clip_logit_max), -clip_logit_max) + ll_cand <- loglik_fun(alpha_new, beta_new) + if (is.finite(ll_cand) && ll_cand >= ll_old) { + alpha <- alpha_new; beta <- beta_new; ll_new <- ll_cand; accepted <- TRUE; break + } + step_scale <- step_scale * ls_rho + } + if (!accepted) { final_iter <- iter - 1L; break } + + # record and check convergence + param_hist[[length(param_hist) + 1L]] <- record(iter) + if (abs(ll_new - ll_old) < tol) { ll_old <- ll_new; final_iter <- iter; break } + ll_old <- 
ll_new; final_iter <- iter + } + + B <- cbind(rbind(alpha, beta), 0) + + history <- do.call(rbind, param_hist) + list(B = B, + logLik = ll_old, + iter = final_iter, + history = history) +} diff --git a/R/my_fs_stable_g_groups.R b/R/my_fs_stable_g_groups.R deleted file mode 100644 index fdbbf0a..0000000 --- a/R/my_fs_stable_g_groups.R +++ /dev/null @@ -1,327 +0,0 @@ -my_fs_stable_g_groups <- function(n_list, - g_beta, - g_beta_grad, - maxit = 1000, - tol = 1e-8, - ls_max = 20, # max step-halving - ls_rho = 0.5) { # shrink factor for step-halving - - G <- length(n_list) # number of groups - if (G < 2) stop("Need at least 2 groups (baseline + at least 1 non-baseline).") - - K <- length(n_list[[1]]) - if (K < 3) stop("beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") - for (g in seq_len(G)) { - if (length(n_list[[g]]) != K) stop("All n_g must have same length K.") - } - - m <- K - 1 # non-baseline categories - - # Group totals - n_tot <- vapply(n_list, sum, numeric(1)) - - # Parameters: - # alpha: length m - # beta_mat: (G-1) x m, each row = beta^(g) for group g=2..G - alpha <- rep(0, m) - beta_mat <- matrix(0, nrow = G - 1, ncol = m) - - # Enforce beta constraint initially: for each non-baseline group - if (G > 1) { - for (r in 1:(G - 1)) { - beta_mat[r, 1] <- g_beta(beta_mat[r, 2:m]) - } - } - - # Softmax for non-baseline categories - softmax_nb <- function(eta) { - e <- exp(eta) - e / (1 + sum(e)) - } - - # Log-likelihood given alpha, beta_mat - loglik_fun <- function(alpha, beta_mat) { - ll <- 0 - eps <- 1e-12 - - for (g in seq_len(G)) { - n_g <- n_list[[g]] - if (g == 1) { - eta <- alpha - } else { - r <- g - 1 - eta <- alpha + beta_mat[r, ] - } - p_nb <- softmax_nb(eta) - p_full <- c(p_nb, 1 - sum(p_nb)) - p_full <- pmin(pmax(p_full, eps), 1 - eps) - ll <- ll + sum(n_g * log(p_full)) - } - ll - } - - # ---- history tracking ---- - param_history <- list() - # iteration 0 (initial) - { - df_alpha <- data.frame(iter = 0L, - type = "alpha", - group = NA_integer_, - k = seq_len(m), - value = alpha) - df_beta_list <- vector("list", G - 1) - for (r in 1:(G - 1)) { - df_beta_list[[r]] <- data.frame(iter = 0L, - type = "beta", - group = r + 1L, # group index in n_list (2..G) - k = seq_len(m), - value = beta_mat[r, ]) - } - param_history[[1]] <- rbind(df_alpha, do.call(rbind, df_beta_list)) - } - # --------------------------- - - ll_old <- loglik_fun(alpha, beta_mat) - final_iter <- 0L - - # stability parameters - ridge_base <- 1e-4 - max_step_norm <- 5 - clip_logit_max <- 15 - - for (iter in 1:maxit) { - - # --- compute p_g and M_g for each group --- - p_list <- vector("list", G) - M_list <- vector("list", G) - for (g in seq_len(G)) { - if (g == 1) { - eta <- alpha - } else { - r <- g - 1 - eta <- alpha + beta_mat[r, ] - } - p_nb <- softmax_nb(eta) - p_list[[g]] <- p_nb - M_list[[g]] <- diag(p_nb) - tcrossprod(p_nb) - } - - # --- SCORES --- - # alpha score - s_alpha <- numeric(m) - for (g in seq_len(G)) { - n_g <- n_list[[g]] - n_gp <- n_tot[g] - p_nb <- p_list[[g]] - s_alpha <- s_alpha + (n_g[1:m] - n_gp * p_nb) - } - - # beta scores: full and free for each non-baseline group - s_beta_full_mat <- matrix(0, nrow = G - 1, ncol = m) - s_beta_free_mat <- matrix(0, nrow = G - 1, ncol = m - 1) - grad_g_beta_mat <- matrix(0, nrow = G - 1, ncol = m - 1) - - if (G > 1) { - for (r in 1:(G - 1)) { - g <- r + 1 - n_g <- n_list[[g]] - n_gp <- n_tot[g] - p_nb <- p_list[[g]] - - s_beta_full <- n_g[1:m] - n_gp * p_nb - s_beta_full_mat[r, ] <- s_beta_full - - grad_g <- g_beta_grad(beta_mat[r, 2:m]) # length m-1 
- grad_g_beta_mat[r, ] <- grad_g - - s_beta_free_mat[r, ] <- s_beta_full[2:m] + s_beta_full[1] * grad_g - } - } - - # Reduced score vector θ = (alpha, all beta_free) - score_theta <- c(s_alpha, as.vector(t(s_beta_free_mat))) - - score_norm <- sqrt(sum(score_theta^2)) - if (score_norm < tol) { - final_iter <- iter - 1L - break - } - - # --- FULL FISHER J for (alpha, all beta_full) --- - q_dim <- m + (G - 1) * m - J <- matrix(0, nrow = q_dim, ncol = q_dim) - - # indices for alpha in full φ - alpha_idx <- 1:m - - # helper to get beta block indices for group r (r=1..G-1) - beta_block_idx <- function(r) { - start <- m + (r - 1) * m + 1 - end <- m + r * m - start:end - } - - for (g in seq_len(G)) { - M_g <- M_list[[g]] - n_gp <- n_tot[g] - - # alpha-alpha block - J[alpha_idx, alpha_idx] <- J[alpha_idx, alpha_idx] + n_gp * M_g - - if (g > 1) { - r <- g - 1 - B_idx <- beta_block_idx(r) - - # alpha-beta and beta-beta blocks - J[alpha_idx, B_idx] <- J[alpha_idx, B_idx] + n_gp * M_g - J[B_idx, alpha_idx] <- t(J[alpha_idx, B_idx]) - J[B_idx, B_idx] <- J[B_idx, B_idx] + n_gp * M_g - } - } - - # --- TRANSFORMATION T from θ to φ --- - p_dim <- m + (G - 1) * (m - 1) - T <- matrix(0, nrow = q_dim, ncol = p_dim) - - # alpha part: identity - T[alpha_idx, 1:m] <- diag(m) - - # beta parts: each group r has a B^(r) - for (r in 1:(G - 1)) { - B_idx <- beta_block_idx(r) - C_start <- m + (r - 1) * (m - 1) + 1 - C_end <- m + r * (m - 1) - C_idx <- C_start:C_end - - B <- matrix(0, nrow = m, ncol = m - 1) - B[1, ] <- grad_g_beta_mat[r, ] - if (m > 1) { - B[2:m, ] <- diag(m - 1) - } - T[B_idx, C_idx] <- B - } - - I_theta <- crossprod(T, J %*% T) - - # --- solve for Fisher scoring step with adaptive ridge --- - ridge_base <- 1e-4 - ridge_multipliers <- c(1, 10, 100, 1000, 1e4) - step_theta <- rep(NA_real_, length(score_theta)) - solved <- FALSE - - for (rm in ridge_multipliers) { - ridge <- ridge_base * rm - I_theta_r <- I_theta + ridge * diag(p_dim) - step_try <- tryCatch( - solve(I_theta_r, score_theta), - error = function(e) NA_real_ - ) - if (all(is.finite(step_try))) { - step_theta <- step_try - solved <- TRUE - break - } - } - - if (!solved) { - final_iter <- iter - 1L - break - } - - # Cap step length - step_norm <- sqrt(sum(step_theta^2)) - if (step_norm > max_step_norm) { - step_theta <- step_theta * (max_step_norm / step_norm) - } - - if (sqrt(sum(step_theta^2)) < tol) { - final_iter <- iter - 1L - break - } - - # --- Step-halving line search --- - step_scale <- 1.0 - ll_new <- ll_old - accepted <- FALSE - - for (ls_iter in 1:ls_max) { - - theta_old <- c(alpha, as.vector(t(beta_mat[, 2:m]))) - theta_new <- theta_old + step_scale * step_theta - - # unpack theta_new - alpha_new <- theta_new[1:m] - if (G > 1) { - bfree_vec <- theta_new[(m + 1):p_dim] - beta_free_mat_new <- matrix(bfree_vec, nrow = G - 1, ncol = m - 1, byrow = TRUE) - } else { - beta_free_mat_new <- matrix(0, nrow = 0, ncol = m - 1) - } - - beta_mat_new <- beta_mat - if (G > 1) { - beta_mat_new[, 2:m] <- beta_free_mat_new - # re-enforce constraint for each non-baseline group - for (r in 1:(G - 1)) { - beta_mat_new[r, 1] <- g_beta(beta_mat_new[r, 2:m]) - } - } - - # clip logits to avoid blowups - alpha_new <- pmax(pmin(alpha_new, clip_logit_max), -clip_logit_max) - beta_mat_new <- pmax(pmin(beta_mat_new, clip_logit_max), -clip_logit_max) - - ll_candidate <- loglik_fun(alpha_new, beta_mat_new) - - if (is.finite(ll_candidate) && ll_candidate >= ll_old) { - alpha <- alpha_new - beta_mat <- beta_mat_new - ll_new <- ll_candidate - accepted <- TRUE - 
break - } else { - step_scale <- step_scale * ls_rho - } - } - - if (!accepted) { - final_iter <- iter - 1L - break - } - - # record parameters after accepted update - df_alpha <- data.frame(iter = iter, - type = "alpha", - group = NA_integer_, - k = seq_len(m), - value = alpha) - df_beta_list <- vector("list", G - 1) - for (r in 1:(G - 1)) { - df_beta_list[[r]] <- data.frame(iter = iter, - type = "beta", - group = r + 1L, - k = seq_len(m), - value = beta_mat[r, ]) - } - param_history[[length(param_history) + 1]] <- rbind(df_alpha, do.call(rbind, df_beta_list)) - - if (abs(ll_new - ll_old) < tol) { - ll_old <- ll_new - final_iter <- iter - break - } - - ll_old <- ll_new - final_iter <- iter - } - - param_df <- do.call(rbind, param_history) - - list( - alpha = alpha, - beta = beta_mat, - logLik = ll_old, - iter = final_iter, - history = param_df - ) -} diff --git a/test_new_discrete.R b/test_new_discrete.R index 192ddde..a1af0da 100644 --- a/test_new_discrete.R +++ b/test_new_discrete.R @@ -44,13 +44,13 @@ for (jj in js) { # })[3] df_timing$new[ii] <- system.time({ out_test4 <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - eta_alpha = 1e-3, - eta_beta = 1e-3, - maxit = 1e8, - tol = 1e-10) + n1 = Y[which(X[,2] == 1), ] %>% colSums, + g_beta=function(x) { pseudohuber_median(c(x, 0)) }, + g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, + eta_alpha = 1e-3, + eta_beta = 1e-3, + maxit = 1e8, + tol = 1e-10) })[3] ii <- ii + 1 print(df_timing) @@ -154,11 +154,11 @@ system.time({ system.time({ out_test5 <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - maxit = 1000, - tol = 1e-8) + n1 = Y[which(X[,2] == 1), ] %>% colSums, + g_beta=function(x) { pseudohuber_median(c(x, 0)) }, + g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, + maxit = 1000, + tol = 1e-8) }) orig <- fn3_1_J$B @@ -198,12 +198,46 @@ Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J) Y <- pmax(Y, 1) Y +# radEmu:::emuFit_micro_discrete(X, Y) +# radEmu:::emuFit_micro_discrete(X[n:1, ], Y[n:1, ]) + +unique(X) + +my_n_list <- list(Y[apply(X, 1, function(x) all(x == unique(X)[1,])), ] %>% colSums, + Y[apply(X, 1, function(x) all(x == unique(X)[2,])), ] %>% colSums, + Y[apply(X, 1, function(x) all(x == unique(X)[3,])), ] %>% colSums) + + load_all() +my_jstar = 9 system.time({ - out_test5 <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - maxit = 1000, - tol = 1e-8) + out_test6 <- fs_direct_beta_constraint( + Y = do.call(rbind, my_n_list), + X = unique(X)[, -1], k_constr=2, j_constr=my_jstar, + constraint_fn=function(x) { pseudohuber_median(c(x, 0)) }, # f(z2..zm) + constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} + ) }) +out_test6 + +out_test6$iter +out_test6$alpha %>% c(0) %>% pseudohuber_median + +(out_test6$beta[,-my_jstar] %>% apply(1, function(x) { pseudohuber_median(c(x, 0)) }))[2] 
+out_test6$beta[2,my_jstar] +### ok, promising + + + +system.time({ + out_test6_orig <- fit_null(B=matrix(0, nrow = p, ncol = J), + Y=Y, X = X, + k_constr=2, j_constr=my_jstar, j_ref=J, + constraint_fn=list(pseudohuber_median, pseudohuber_median), + constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), + B_tol=1e-8, constraint_tol=1e-8) +}) # 246.458 0.906 247.997 +out_test6_orig$B +cbind(rbind(out_test6$alpha, out_test6$beta), 0) +## excellent +## now check constraints diff --git a/tests/testthat/test-fit_null_discrete.R b/tests/testthat/test-fit_null_discrete.R new file mode 100644 index 0000000..766d975 --- /dev/null +++ b/tests/testthat/test-fit_null_discrete.R @@ -0,0 +1,53 @@ + +test_that("new discrete is correct", { + + set.seed(1) + n <- 20 + J <- 10 + p <- 3 + beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J) + X <- cbind(1, c(rep(0, n/2), rep(1, n/2)), c(rep(0, 3*n/4), rep(1, n/4))) + Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J) + Y <- pmax(Y, 1) + + my_n_list <- list(Y[apply(X, 1, function(x) all(x == unique(X)[1,])), ] %>% colSums, + Y[apply(X, 1, function(x) all(x == unique(X)[2,])), ] %>% colSums, + Y[apply(X, 1, function(x) all(x == unique(X)[3,])), ] %>% colSums) + + + my_jstar = 9 + system.time({ + out_test6 <- fs_direct_beta_constraint( + Y = do.call(rbind, my_n_list), + X = unique(X)[, -1], k_constr=2, j_constr=my_jstar, + constraint_fn=function(x) { pseudohuber_median(c(x, 0)) }, # f(z2..zm) + constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} + ) + }) + out_test6 + + out_test6$iter + out_test6$alpha %>% c(0) %>% pseudohuber_median + + (out_test6$beta[,-my_jstar] %>% apply(1, function(x) { pseudohuber_median(c(x, 0)) }))[2] + out_test6$beta[2,my_jstar] + ### ok, promising + + + + system.time({ + out_test6_orig <- fit_null(B=matrix(0, nrow = p, ncol = J), + Y=Y, X = X, + k_constr=2, j_constr=my_jstar, j_ref=J, + constraint_fn=list(pseudohuber_median, pseudohuber_median), + constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), + B_tol=1e-8, constraint_tol=1e-8) + }) # 246.458 0.906 247.997 + out_test6_orig$B + cbind(rbind(out_test6$alpha, out_test6$beta), 0) + ## excellent + ## now check constraints + + + +}) \ No newline at end of file From 82b383238b0539f86b17ce203a1730215a9f98a2 Mon Sep 17 00:00:00 2001 From: Amy Willis Date: Mon, 8 Dec 2025 20:26:25 +1000 Subject: [PATCH 6/7] generalise j_ref and j_constr --- R/fit_null_discrete.R | 170 ++++------------ R/fit_null_discrete_micro_fs.R | 47 +++-- R/macro_fisher_null.R | 250 ++++++++++++------------ tests/testthat/test-fit_null_discrete.R | 236 +++++++++++++++++++--- 4 files changed, 397 insertions(+), 306 deletions(-) diff --git a/R/fit_null_discrete.R b/R/fit_null_discrete.R index 69514f5..b036996 100644 --- a/R/fit_null_discrete.R +++ b/R/fit_null_discrete.R @@ -1,29 +1,39 @@ +#' @note +#' Needs higher tolerance than macro_fisher_null to get superior LL + fit_null_discrete_pseudohuber <- function( - B, Y, X, k_constr, j_constr, j_ref, - maxit = 1000 + trackB = TRUE, + maxit = 1000, + tol = 1e-8, + ls_max = 20, + ls_rho = 0.5, + max_step = 1 ) { - # if (is.null(j_ref)) { - # j_ref <- ifelse(j_constr == 1, 2, 1) - # } - n <- nrow(Y) J <- ncol(Y) p <- ncol(X) - ## TODO generalise - stopifnot(j_ref == J) - + ### check we're in the right paradigm distinct_xx <- unique(X) - stopifnot(ncol(X) == nrow(distinct_xx)) stopifnot(all(X[,1] == 1)) ### need a baseline category to simplify the 
parametrization - X_wo_1s <- distinct_xx[, 1] + X_wo_1s <- distinct_xx[, -1] + + + ## Rearrange to have j_ref be the last (J-th) column + ## Setup: + ## put j_ref to be the J-th column + ## put j_constr to be the 1st column + ## Later, permute cols of B-hat + Y_orig <- Y + new_order <- c(j_constr, setdiff(1:J, c(j_constr, j_ref)), j_ref) + Y <- Y_orig[, new_order] groups <- split( seq_len(nrow(X)), # row indices of original data @@ -44,139 +54,31 @@ fit_null_discrete_pseudohuber <- function( Y = Y_sum, X = X_wo_1s, k_constr=k_constr, - j_constr=j_constr, + j_constr=1, ## because it's been moved to the first column constraint_fn=function(x) { pseudohuber_median(c(x, 0)) }, - constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} - maxit = maxit, + constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, + maxit = maxit, + tol = tol, + ls_max = ls_max, + ls_rho = ls_rho, + max_step = max_step ) - - ## Currently this works for - #### two groups with X's (1, 0) and (1, 1) - #### testing the first column, last column also constrained - - #### it is equivalent to fit_null with k_constr=2, j_constr=1, j_ref=J, - - ## It needs to be generalized to consider - #### multiple categories (can we assume identity-like then back transform? is this constraint and likelihood-preserving?) - #### different columns for testing - #### the same inputs as fit_null and comparable convergence statistics - - # out <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, - # n1 = Y[which(X[,2] == 1), ] %>% colSums, - # g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - # g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - # maxit = 1000, tol = 1e-8, - # ls_max = 20, ls_rho = 0.5, - # ridge_base = 1e-4, - # max_step_norm = 5, - # clip_logit_max = 15) - + B <- out$B z <- update_z(Y, X, B) log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) ll_new <- sum(Y * log_means - exp(log_means)) - # beta = X^{-1} %*% eta - betahats <- round(MASS::ginv(distinct_xx), 8) %*% etahats - betahats - - - it_df <- it_df[1:iter, ] - + ### now reorder B to match what was input return(list( - "B" = B, + "B" = B[, order(new_order)], "k_constr" = k_constr, "j_constr" = j_constr, - "niter" = iter, - "gap" = gap, - "u" = u, - "rho" = rho, - "Bs" = Bs, - "it_df" = it_df, - "converged" = converged + "niter" = out$iter, + "loglik" = ll_new, + "it_df" = out$history, + "converged" = (out$iter < maxit) )) -} -# -# -# -# set.seed(1) -# n <- 10 -# J <- 5 -# p <- 2 -# beta <- matrix(rnorm(p*J, sd=5), ncol = J) -# X <- cbind(1, c(rep(0, n/2), rep(1, n/2))) -# Y <- matrix(rpois(n=n*J, lambda=exp(5 + X %*% beta)), ncol = J) -# Y -# -# # woah, that takes ages -# fit1 <- radEmu::emuFit(Y = Y, X = X, test_kj=data.frame("k" = 2, "j" = 3)) -# fit1 -# fit1 %>% names -# fit1$estimation_converged -# fit1$score_test_hyperparams -# fit1 -# -# fit2 <- radEmu::emuFit(Y = Y, X = X, test_kj=data.frame("k" = 2, "j" = 3), control=list("trackB" = TRUE), refit=F, fitted_model=fit1) -# fit2 -# fit2 %>% names -# library(tidyverse) -# final_bs <- fit2$trackB_list[[1]] %>% tibble %>% tail(p*J) -# final_bs %>% -# signif(3) -# fit2$coef -# -# -# final_bs %>% -# signif(3) -# Y -# X -# final_bs %>% pull(B) %>% matrix(nrow = 2, byrow=T) -# -# -# #### attempt 2 -# set.seed(1) -# n <- 10 -# J <- 5 -# p <- 2 -# beta <- matrix(rnorm(p*J, sd=2), ncol = J) -# X <- cbind(1, c(rep(0, n/2), rep(1, n/2))) -# Y <- matrix(rpois(n=n*J, 
lambda=exp(5 + X %*% beta)), ncol = J) -# Y -# -# fit3 <- radEmu::emuFit(Y = Y, X = X, test_kj=data.frame("k" = 2, "j" = 4), control=list("trackB" = TRUE)) -# fit3 -# fit3$trackB_list[[1]] %>% tibble %>% tail(p*J) -# -# fit3$trackB_list[[1]] %>% tibble %>% tail(J) %>% pull(B) %>% psuedohuber_median -# fit3$trackB_list[[1]] %>% tibble %>% tail(p*J) %>% head(J) %>% pull(B) %>% psuedohuber_median -# -# final_bs %>% -# signif(3) -# Y -# X -# final_bs %>% pull(B) %>% matrix(nrow = 2, byrow=T) -# -# -# fit3$B -# -# fn3b <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=4, j_ref=2, -# constraint_fn=list(pseudohuber_median, pseudohuber_median), -# constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx)) -# fn3a$B -# fn3b$B -# -# fn3a <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=4, j_ref=1, -# constraint_fn=list(pseudohuber_median, pseudohuber_median), -# constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), -# B_tol=1e-8, constraint_tol=1e-8) -# totest <- fn3a -# totest$B -# totest$B[totest$k_constr, setdiff(1:J, totest$j_constr)] %>% psuedohuber_median -# totest$B[totest$k_constr, totest$j_constr] -# -# -# -# aaa <- fit3$trackB_list[[1]] %>% tibble %>% tail(p*J) %>% tail(J) -# (aaa[c(1,2,3,5)] %>% psuedohuber_median) - aaa[4] +} \ No newline at end of file diff --git a/R/fit_null_discrete_micro_fs.R b/R/fit_null_discrete_micro_fs.R index c7c80f5..f0cc27c 100644 --- a/R/fit_null_discrete_micro_fs.R +++ b/R/fit_null_discrete_micro_fs.R @@ -1,6 +1,7 @@ ## this is equivalent to fit_null with j_ref=J fit_null_discrete_micro_fs <- function(Y, X, - k_constr, j_constr, # indices: row k* in 1..(p-1), column j* in 1..(J-1) + k_constr, + j_constr, # indices: row k* in 1..(p-1), column j* in 1..(J-1) constraint_fn, constraint_grad_fn, # g: R^{m-1}->R on the constrained row; grad: R^{m-1}->R^{m-1} maxit = 1000, tol = 1e-8, @@ -8,12 +9,16 @@ fit_null_discrete_micro_fs <- function(Y, X, ls_rho = 0.5, ridge_base = 1e-4, max_step_norm = 5, - clip_logit_max = 15) { + max_step = 1) { + + ## using `alpha + beta*X` parametrisation, so decrease k_constr_non_intercept accordingly + k_constr_non_intercept <- k_constr - 1 + Y <- as.matrix(Y); X <- as.matrix(X) p <- nrow(Y); J <- ncol(Y); m <- J - 1L if (nrow(X) != p || ncol(X) != p - 1L) stop("X must be p x (p-1).") if (J < 3) stop("Need J >= 3 (one baseline).") - if (k_constr < 1L || k_constr > (p - 1L)) stop("k_constr must be in 1..(p-1).") + if (k_constr_non_intercept < 1L || k_constr_non_intercept > (p - 1L)) stop("k_constr_non_intercept must be in 1..(p-1).") if (j_constr < 1L || j_constr > m) stop("j_constr must be in 1..(J-1).") n <- rowSums(Y) @@ -23,17 +28,18 @@ fit_null_discrete_micro_fs <- function(Y, X, beta <- matrix(0, nrow = p - 1L, ncol = m) # initialize constrained element coherently (assume g excludes j_constr) free_idx <- setdiff(seq_len(m), j_constr) - beta[k_constr, j_constr] <- constraint_fn(beta[k_constr, free_idx]) + beta[k_constr_non_intercept, j_constr] <- constraint_fn(beta[k_constr_non_intercept, free_idx]) softmax_nb <- function(eta) { e <- exp(eta); e / (1 + sum(e)) } # Log-likelihood loglik_fun <- function(alpha, beta) { - ll <- 0; eps <- 1e-12 + ll <- 0; #eps <- 1e-12 for (i in 1:p) { eta <- alpha + drop(X[i, , drop = FALSE] %*% beta) # length m pnb <- softmax_nb(eta) - pfull <- pmin(pmax(c(pnb, 1 - sum(pnb)), eps), 1 - eps) + pfull <- c(pnb, 1 - sum(pnb)) + # pfull <- pmin(pmax(c(pnb, 1 - sum(pnb)), eps), 1 - eps) ll <- ll + sum(Y[i, ] * log(pfull)) } ll @@ 
-76,11 +82,11 @@ fit_null_discrete_micro_fs <- function(Y, X, # reduced score: theta = (alpha, rows k != k*, full m; row k* free indices only) s_theta <- c( s_alpha, - as.vector(t(s_beta[setdiff(seq_len(p - 1L), k_constr), , drop = FALSE])), + as.vector(t(s_beta[setdiff(seq_len(p - 1L), k_constr_non_intercept), , drop = FALSE])), # constrained row (chain rule) { - grad_g <- constraint_grad_fn(beta[k_constr, free_idx]) - s_beta[k_constr, free_idx] + s_beta[k_constr, j_constr] * grad_g + grad_g <- constraint_grad_fn(beta[k_constr_non_intercept, free_idx]) + s_beta[k_constr_non_intercept, free_idx] + s_beta[k_constr_non_intercept, j_constr] * grad_g } ) @@ -124,13 +130,13 @@ fit_null_discrete_micro_fs <- function(Y, X, T[alpha_idx, 1:m] <- diag(m) col_ptr <- m + 1L # rows k != k* - for (k in setdiff(seq_len(p - 1L), k_constr)) { + for (k in setdiff(seq_len(p - 1L), k_constr_non_intercept)) { rb <- row_block_idx(k) T[rb, col_ptr:(col_ptr + m - 1L)] <- diag(m) col_ptr <- col_ptr + m } # row k* mapping B (m x (m-1)) - grad_g <- constraint_grad_fn(beta[k_constr, free_idx]) + grad_g <- constraint_grad_fn(beta[k_constr_non_intercept, free_idx]) Bmap <- matrix(0, nrow = m, ncol = m - 1L) # identity on free indices for (pos in seq_along(free_idx)) { @@ -138,7 +144,7 @@ fit_null_discrete_micro_fs <- function(Y, X, } # constrained element row = grad_g^T Bmap[j_constr, ] <- grad_g - T[row_block_idx(k_constr), col_ptr:(col_ptr + (m - 2L))] <- Bmap + T[row_block_idx(k_constr_non_intercept), col_ptr:(col_ptr + (m - 2L))] <- Bmap I_theta <- crossprod(T, Jmat %*% T) @@ -161,8 +167,8 @@ fit_null_discrete_micro_fs <- function(Y, X, pack_theta <- function(alpha, beta) { c( alpha, - as.vector(t(beta[setdiff(seq_len(p - 1L), k_constr), , drop = FALSE])), - beta[k_constr, free_idx] + as.vector(t(beta[setdiff(seq_len(p - 1L), k_constr_non_intercept), , drop = FALSE])), + beta[k_constr_non_intercept, free_idx] ) } unpack_theta <- function(vec) { @@ -170,13 +176,13 @@ fit_null_discrete_micro_fs <- function(Y, X, a_new <- vec[wp:(wp + m - 1L)]; wp <- wp + m beta_new <- beta # rows k != k* - for (k in setdiff(seq_len(p - 1L), k_constr)) { + for (k in setdiff(seq_len(p - 1L), k_constr_non_intercept)) { beta_new[k, ] <- vec[wp:(wp + m - 1L)]; wp <- wp + m } # row k* free bfree <- vec[wp:(wp + (m - 2L))] - beta_new[k_constr, free_idx] <- bfree - beta_new[k_constr, j_constr] <- constraint_fn(bfree) + beta_new[k_constr_non_intercept, free_idx] <- bfree + beta_new[k_constr_non_intercept, j_constr] <- constraint_fn(bfree) list(alpha = a_new, beta = beta_new) } @@ -187,9 +193,6 @@ fit_null_discrete_micro_fs <- function(Y, X, prop <- unpack_theta(theta_old + step_scale * step_theta) alpha_new <- prop$alpha beta_new <- prop$beta - # clip logits (keep numerics sane) - alpha_new <- pmax(pmin(alpha_new, clip_logit_max), -clip_logit_max) - beta_new <- pmax(pmin(beta_new, clip_logit_max), -clip_logit_max) ll_cand <- loglik_fun(alpha_new, beta_new) if (is.finite(ll_cand) && ll_cand >= ll_old) { alpha <- alpha_new; beta <- beta_new; ll_new <- ll_cand; accepted <- TRUE; break @@ -205,8 +208,12 @@ fit_null_discrete_micro_fs <- function(Y, X, } B <- cbind(rbind(alpha, beta), 0) + rownames(B) <- NULL history <- do.call(rbind, param_hist) + history$k <- ifelse(is.na(history$row), 0, history$row) + 1 + history$j <- history$col + list(B = B, logLik = ll_old, iter = final_iter, diff --git a/R/macro_fisher_null.R b/R/macro_fisher_null.R index 29f7ecb..09cdb43 100644 --- a/R/macro_fisher_null.R +++ b/R/macro_fisher_null.R @@ -1,162 +1,172 @@ - 
-#block approximate Newton update to B used in 
+#block approximate Newton update to B used in
 #optimization under null
-macro_fisher_null <- function(X,
-                              Y,
-                              B,
-                              z,
-                              J,
-                              p,
-                              k_constr,
-                              j_constr,
-                              j_ref,
-                              rho,
-                              u,
-                              constraint_fn,
-                              constraint_grad_fn,
-                              stepsize = 0.5,
-                              c1 = 1e-4,
-                              regularization = 0.1,
-                              max_step = 1,
-                              verbose = FALSE,
-                              debug = FALSE #return more information useful for
-                              #debugging
-                              ){
+macro_fisher_null <- function(
+  X,
+  Y,
+  B,
+  z,
+  J,
+  p,
+  k_constr,
+  j_constr,
+  j_ref,
+  rho,
+  u,
+  constraint_fn,
+  constraint_grad_fn,
+  stepsize = 0.5,
+  c1 = 1e-4,
+  regularization = 0.1,
+  max_step = 1,
+  verbose = FALSE,
+  debug = FALSE #return more information useful for
+  #debugging
+) {
   #list of log means... by taxon
   log_means_by_j <-
-    lapply(1:J,function(j)
-      X%*%B[,j,drop = FALSE] + z)
-
+    lapply(1:J, function(j) {
+      X %*% B[, j, drop = FALSE] + z
+    })
+
   #compute information matrix of *Poisson* log likelihood
   #for fixed z
-  info_bits <- lapply(log_means_by_j,
-                      function(log_mu){
-                        crossprod(X,diag(as.numeric(exp(log_mu))))%*%X
-                      })
-  info_diag <- do.call(c,lapply(info_bits,diag))
+  info_bits <- lapply(log_means_by_j, function(log_mu) {
+    crossprod(X, diag(as.numeric(exp(log_mu)))) %*% X
+  })
+  info_diag <- do.call(c, lapply(info_bits, diag))
   mean_diag <- mean(info_diag)
-
-  info_bits <- lapply(info_bits,
-                      function(x){
-                        diag(x) <- diag(x) + regularization*mean_diag;
-                        return(x)
-                      })
+
+  info_bits <- lapply(info_bits, function(x) {
+    diag(x) <- diag(x) + regularization * mean_diag
+    return(x)
+  })
   #invert info by inverting each block diagonal element
-  inv_info_by_j <- lapply(info_bits,
-                          function(x)
-                            qr.solve(x,tol = 1e-20))
+  inv_info_by_j <- lapply(info_bits, function(x) {
+    qr.solve(x, tol = 1e-20)
+  })
   #remove diagonal block corresponding to convenience constraint
   inv_info_by_j <- inv_info_by_j[-j_ref]
-
-  #stick the block diagonal elements together into one matrix to get
+
+  #stick the block diagonal elements together into one matrix to get
   #inverse as an actual matrix
   info_inverse <- Matrix::bdiag(inv_info_by_j)
-
+
   #compute gradient of constraint function
-  cg <- constraint_grad_fn[[k_constr]](B[k_constr,])
+  cg <- constraint_grad_fn[[k_constr]](B[k_constr, ])
   cg[j_constr] <- cg[j_constr] - 1
   cg <- cg[-j_ref]
-
+
   #vector of p elements whose k_constr-th element is 1, all else 0
-  e_k_constr <- Matrix(0,nrow = p,ncol = 1)
-  e_k_constr[k_constr,] <- 1
-  #take kronecker producr with gradient of constraint to
+  e_k_constr <- Matrix(0, nrow = p, ncol = 1)
+  e_k_constr[k_constr, ] <- 1
+  #take kronecker product with gradient of constraint to
   #get a gradient in long format B (i.e., B_cup)
-  cg_expanded <- Matrix::kronecker(Matrix(cg,ncol = 1),e_k_constr)
-  v <- sqrt(rho)*cg_expanded
+  cg_expanded <- Matrix::kronecker(Matrix(cg, ncol = 1), e_k_constr)
+  v <- sqrt(rho) * cg_expanded
   #do sherman-morrison rank-1 update in a couple steps
-  sm_denom <- as.numeric(as.matrix(1 + Matrix::crossprod(v,info_inverse)%*%v))
-  sm_half_num <- info_inverse%*%v
-
-  #strictly speaking, info_inverse isn't the inverse info -- its the inverse of
+  sm_denom <- as.numeric(as.matrix(
+    1 + Matrix::crossprod(v, info_inverse) %*% v
+  ))
+  sm_half_num <- info_inverse %*% v
+
+  #strictly speaking, info_inverse isn't the inverse info -- it's the inverse of
   # (an approximation to) the hessian of the augmented lagrangian
-
+
   #compute derivative of augmented lagrangian
-  lag_deriv <- lapply(1:J,
-                      function(j){
-
-                        -t(X)%*%(Y[,j,drop = FALSE] - exp(log_means_by_j[[j]]))
-                      })
-  lag_deriv <- do.call(rbind,lag_deriv[-j_ref])
-  gap <- constraint_fn[[k_constr]](B[k_constr,]) - B[k_constr,j_constr]
-  lag_deriv <- lag_deriv + (u + rho*gap)*cg_expanded
-
-
+  lag_deriv <- lapply(1:J, function(j) {
+    -t(X) %*% (Y[, j, drop = FALSE] - exp(log_means_by_j[[j]]))
+  })
+  lag_deriv <- do.call(rbind, lag_deriv[-j_ref])
+  gap <- constraint_fn[[k_constr]](B[k_constr, ]) - B[k_constr, j_constr]
+  lag_deriv <- lag_deriv + (u + rho * gap) * cg_expanded
+
   #direction for (approximate) Newton step
-  update_dir <- -info_inverse %*%lag_deriv +
-    sm_half_num%*% Matrix::crossprod(sm_half_num,lag_deriv)/sm_denom
+  update_dir <- -info_inverse %*%
+    lag_deriv +
+    sm_half_num %*% Matrix::crossprod(sm_half_num, lag_deriv) / sm_denom
   #shorten step if any of its elements are larger than allowed by max_step
-  if(max(abs(update_dir), na.rm = TRUE)>max_step){
-    update_dir <- update_dir/max(abs(update_dir), na.rm = TRUE)
+  if (max(abs(update_dir), na.rm = TRUE) > max_step) {
+    update_dir <- update_dir / max(abs(update_dir), na.rm = TRUE)
   }
-
+
   # make sure update direction is never missing
   update_dir[is.na(update_dir)] <- 0
-
-  armijo_term <- c1*sum(update_dir*lag_deriv)
-
+
+  armijo_term <- c1 * sum(update_dir * lag_deriv)
+
   #update direction in long format
-  update_dir <- B_from_B_cup(update_dir,J - 1, p)
+  update_dir <- B_from_B_cup(update_dir, J - 1, p)
   #don't update betas for j_ref
-  update_dir <- do.call(cbind,list(update_dir[,(1:(J - 1))<j_ref,drop = FALSE], 0, update_dir[,(1:(J - 1))>=j_ref,drop = FALSE]))
-
-
+  update_dir <- do.call(
+    cbind,
+    list(
+      update_dir[, (1:(J - 1)) < j_ref, drop = FALSE],
+      0,
+      update_dir[, (1:(J - 1)) >= j_ref, drop = FALSE]
+    )
+  )
+
   #in debug mode, compute full inverse info for backward compatibility
-  if(debug){
-    info_inverse <- info_inverse - Matrix::tcrossprod(sm_half_num)/sm_denom
+  if (debug) {
+    info_inverse <- info_inverse - Matrix::tcrossprod(sm_half_num) / sm_denom
   }
-
-
-
-
-
+
   #find stepsize that satisfies armijo rule
   accept <- FALSE
-
-  curr_lag_val <- do.call(sum,
-                          lapply(1:J, function(j)
-                            -Y[,j,drop = FALSE]*log_means_by_j[[j]] + exp(log_means_by_j[[j]]))) +
-    u*gap + (rho/2)*gap^2
-
-  while(!accept){
-    prop_B <- B + stepsize*update_dir
-    prop_gap <- constraint_fn[[k_constr]](prop_B[k_constr,]) - prop_B[k_constr,j_constr]
-
+
+  curr_lag_val <- do.call(
+    sum,
+    lapply(1:J, function(j) {
+      -Y[, j, drop = FALSE] * log_means_by_j[[j]] + exp(log_means_by_j[[j]])
+    })
+  ) +
+    u * gap +
+    (rho / 2) * gap^2
+
+  while (!accept) {
+    prop_B <- B + stepsize * update_dir
+    prop_gap <- constraint_fn[[k_constr]](prop_B[k_constr, ]) -
+      prop_B[k_constr, j_constr]
+
     prop_log_means_by_j <-
-      lapply(1:J,function(j)
-        X%*%prop_B[,j,drop = FALSE] + z)
-
-    prop_lag_val <- do.call(sum,
-                            lapply(1:J, function(j)
-                              -Y[,j,drop = FALSE]*prop_log_means_by_j[[j]] + exp(prop_log_means_by_j[[j]]))) +
-      u*prop_gap + (rho/2)*prop_gap^2
-
-    accept <- prop_lag_val <= curr_lag_val + armijo_term*stepsize
-    stepsize <- stepsize/2
+      lapply(1:J, function(j) {
+        X %*% prop_B[, j, drop = FALSE] + z
+      })
+
+    prop_lag_val <- do.call(
+      sum,
+      lapply(1:J, function(j) {
+        -Y[, j, drop = FALSE] *
+          prop_log_means_by_j[[j]] +
+          exp(prop_log_means_by_j[[j]])
+      })
+    ) +
+      u * prop_gap +
+      (rho / 2) * prop_gap^2
+
+    accept <- prop_lag_val <= curr_lag_val + armijo_term * stepsize
+    stepsize <- stepsize / 2
     # stepsize <- stepsize*0.9
-
+
     # steps <- c(steps,stepsize)
     # props <- c(props,prop_lag_val)
-
   }
-  stepsize <- stepsize*2
-  if (stepsize < 1e-2 & verbose) {
+  stepsize <- stepsize * 2
+  if (stepsize < 1e-2 & verbose) {
    #ideally, stepsize doesn't get 
super small and we never see this message
     #but it can be helpful if something is going wrong
-    message("Small stepsize: ",signif(stepsize,2))
+    message("Small stepsize: ", signif(stepsize, 2))
   }
   if (debug) {
-    return(list(update = update_dir*stepsize,gap = prop_gap,
-                lag_deriv = lag_deriv,
-                info_inverse = info_inverse,
-                stepsize = stepsize))
-  } else{
-    return(list(update = update_dir*stepsize,gap = prop_gap))
-
+    return(list(
+      update = update_dir * stepsize,
+      gap = prop_gap,
+      lag_deriv = lag_deriv,
+      info_inverse = info_inverse,
+      stepsize = stepsize
+    ))
+  } else {
+    return(list(update = update_dir * stepsize, gap = prop_gap))
   }
-
-
 }
diff --git a/tests/testthat/test-fit_null_discrete.R b/tests/testthat/test-fit_null_discrete.R
index 766d975..e6f96ce 100644
--- a/tests/testthat/test-fit_null_discrete.R
+++ b/tests/testthat/test-fit_null_discrete.R
@@ -1,8 +1,127 @@
 test_that("new discrete is correct", {
-  set.seed(1)
-  n <- 20
+  # set.seed(1)
+  n <- 40
+  J <- 30
+  p <- 5
+  beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J)
+  X <- cbind(1,
+             c(rep(0, n/2), rep(1, n/2)),
+             c(rep(0, 5*n/8), rep(1, 3*n/8)),
+             c(rep(0, 3*n/4), rep(1, n/4)),
+             c(rep(0, 7*n/8), rep(1, n/8)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(1 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
+
+  my_jstar <- 9
+  my_kstar <- 4
+  my_jref <- J
+
+  t_discrete <- system.time({
+    out_discrete <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+  })
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete$B[my_kstar, -my_jstar]),
+               out_discrete$B[my_kstar, my_jstar] )
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete$B[, my_jref],
+               rep(0, p))
+
+  ## check permutation invariance
+  X2 <- X[, c(1, (p:2))]
+  new_order <- c(sample(1:(J-1), J-1), J)
+  new_j_star <- which(new_order == my_jstar)
+  Y2 <- Y[, new_order]
+  t_discrete2 <- system.time({
+    out_discrete2 <- fit_null_discrete_pseudohuber(Y = Y2, X = X2, k_constr = which(c(1, (p:2)) == my_kstar),
+                                                   j_constr = new_j_star, j_ref = my_jref,
+                                                   tol = 1e-12)
+  })
+
+  z_discrete <- update_z(Y, X, out_discrete$B)
+  log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete))
+
+  z_discrete2 <- update_z(Y2, X2, out_discrete2$B)
+  log_means_discrete2 <- X2 %*% out_discrete2$B + matrix(z_discrete2, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete2 <- sum(Y2 * log_means_discrete2 - exp(log_means_discrete2))
+
+  ## expect them to be close in estimates and likelihood
+  expect_true(max(abs(out_discrete$B[, new_order] - out_discrete2$B[c(1, (p:2)), ])) < 1e-4)
+  expect_equal(ll_new_discrete2, ll_new_discrete)
+
+})
+
+
+
+test_that("new discrete is correct with flexible jref and jstar", {
+
+  # set.seed(3)
+  n <- 24
+  J <- 10
+  p <- 5
+  beta <- matrix(rnorm(p*J, mean = 1, sd=1), ncol = J)
+  X <- cbind(1,
+             c(rep(0, n/2), rep(1, n/2)),
+             c(rep(0, 5*n/8), rep(1, 3*n/8)),
+             c(rep(0, 3*n/4), rep(1, n/4)),
+             c(rep(0, 7*n/8), rep(1, n/8)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(1 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
+
+  my_jstar <- 9
+  my_jref <- 1
+  my_kstar <- 4
+
+  out_discrete91 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete91$B[my_kstar, -my_jstar]),
+               out_discrete91$B[my_kstar, my_jstar] )
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete91$B[, my_jref],
+               rep(0, p))
+
+  my_jstar <- 4
+  my_jref <- 3
+
+  out_discrete43 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete43$B[my_kstar, -my_jstar]),
+               out_discrete43$B[my_kstar, my_jstar] )
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete43$B[, my_jref],
+               rep(0, p))
+
+  my_jstar <- 4
+  my_jref <- 6
+
+  out_discrete46 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## fits with different j_ref should agree up to a constant shift within
+  ## each row, so row-wise SDs of the difference should all be zero
+  expect_equal((out_discrete43$B - out_discrete46$B) %>% round(4) %>% apply(1, sd) %>% max, 0)
+
+})
+
+
+
+
+test_that("new discrete is fast", {
+
+  skip("Too slow for automated testing; 20 mins on laptop")
+
+  set.seed(2)
+  n <- 12
   J <- 10
   p <- 3
   beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J)
@@ -10,44 +129,97 @@ test_that("new discrete is correct", {
   Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
   Y <- pmax(Y, 1)
 
-  my_n_list <- list(Y[apply(X, 1, function(x) all(x == unique(X)[1,])), ] %>% colSums,
-                    Y[apply(X, 1, function(x) all(x == unique(X)[2,])), ] %>% colSums,
-                    Y[apply(X, 1, function(x) all(x == unique(X)[3,])), ] %>% colSums)
+  my_jstar <- J-1
+  my_kstar <- 2
+  my_jref <- J
+
+  t_discrete <- system.time({
+    out_discrete <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+  })
+
+  t_orig <- system.time({
+    out_orig <- fit_null(B=matrix(0, nrow = p, ncol = J),
+                         Y=Y, X = X,
+                         k_constr=my_kstar, j_constr=my_jstar, j_ref=my_jref,
+                         constraint_fn=list(pseudohuber_median, pseudohuber_median),
+                         constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx),
+                         B_tol=1e-8, constraint_tol=1e-10, maxit = 1e8)
+  }) # 816.931 489.308 1317.577
+
+  ## check faster
+  expect_lt(t_discrete[3], t_orig[3])
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete$B[my_kstar, -my_jstar]),
+               out_discrete$B[my_kstar, my_jstar] )
+  expect_equal(pseudohuber_median(out_orig$B[my_kstar, -my_jstar]),
+               out_orig$B[my_kstar, my_jstar] )
+
+  expect_lte(max(abs(out_discrete$B - out_orig$B)), 1e-3)
+
+  z_discrete <- update_z(Y, X, out_discrete$B)
+  log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete))
+
+  z_orig <- update_z(Y, X, out_orig$B)
+  log_means_orig <- X %*% out_orig$B + matrix(z_orig, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_orig <- sum(Y * log_means_orig - exp(log_means_orig))
+
+  ## expect the new likelihood is no worse than 1% lower
+  expect_true(ll_new_discrete > ll_new_orig*0.99)
+
+})
+
+
+test_that("new discrete is fast", {
+
+  skip("Too slow for automated testing; X mins on laptop")
+
+  set.seed(4)
+  n <- 12
+  J <- 5
+  p <- 3
+  beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J)
+  X <- cbind(1, c(rep(0, n/3), rep(1, n/3), rep(0, n/3)), c(rep(0, n/3), rep(0, n/3), rep(1, n/3)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
 
-  my_jstar = 9
-  system.time({
-    out_test6 
<- fs_direct_beta_constraint( - Y = do.call(rbind, my_n_list), - X = unique(X)[, -1], k_constr=2, j_constr=my_jstar, - constraint_fn=function(x) { pseudohuber_median(c(x, 0)) }, # f(z2..zm) - constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} - ) - }) - out_test6 + my_jstar <- J-2 + my_jref <- J-1 + my_kstar <- 2 - out_test6$iter - out_test6$alpha %>% c(0) %>% pseudohuber_median + t_discrete <- system.time({ + out_discrete <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref, + tol = 1e-12) + }) - (out_test6$beta[,-my_jstar] %>% apply(1, function(x) { pseudohuber_median(c(x, 0)) }))[2] - out_test6$beta[2,my_jstar] - ### ok, promising + t_orig <- system.time({ + out_orig <- fit_null(B=out_discrete$B, + Y=Y, X = X, + k_constr=my_kstar, j_constr=my_jstar, j_ref=my_jref, + constraint_fn=list(pseudohuber_median, pseudohuber_median), + constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), + B_tol=1e-3, constraint_tol=1e-6, maxit = 1e8) + }) # 816.931 489.308 1317.577 + ## check faster + expect_lt(t_discrete[3], t_orig[3]) + ## check align + expect_lte(max(abs(out_discrete$B - out_orig$B)), 1e-3) - system.time({ - out_test6_orig <- fit_null(B=matrix(0, nrow = p, ncol = J), - Y=Y, X = X, - k_constr=2, j_constr=my_jstar, j_ref=J, - constraint_fn=list(pseudohuber_median, pseudohuber_median), - constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - B_tol=1e-8, constraint_tol=1e-8) - }) # 246.458 0.906 247.997 - out_test6_orig$B - cbind(rbind(out_test6$alpha, out_test6$beta), 0) - ## excellent - ## now check constraints + ## check no worse + z_discrete <- update_z(Y, X, out_discrete$B) + log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete)) + z_orig <- update_z(Y, X, out_orig$B) + log_means_orig <- X %*% out_orig$B + matrix(z_orig, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + ll_new_orig <- sum(Y * log_means_orig - exp(log_means_orig)) + ## expect the new likelihood is no worse than 1% lower + expect_true(ll_new_discrete > ll_new_orig*0.99) }) \ No newline at end of file From 7c08edc42d52d1a10922ec5686e77ef6dcb9cc37 Mon Sep 17 00:00:00 2001 From: Amy Willis Date: Mon, 8 Dec 2025 20:30:12 +1000 Subject: [PATCH 7/7] discrete fitting ready for ST to review --- test_new_discrete.R | 243 ------------------------ tests/testthat/test-fit_null_discrete.R | 10 +- 2 files changed, 5 insertions(+), 248 deletions(-) delete mode 100644 test_new_discrete.R diff --git a/test_new_discrete.R b/test_new_discrete.R deleted file mode 100644 index a1af0da..0000000 --- a/test_new_discrete.R +++ /dev/null @@ -1,243 +0,0 @@ -# js <- seq(from = 100, to = 1000, by = 100) -js <- seq(from = 20, to = 100, by = 20) -df_timing <- data.frame("j" = js, "fit_null" = NA, "new" = NA) -df_timing -set.seed(1) -ii <- 1 -for (jj in js) { - df_timing$j[ii] <- jj - - n <- 20 - J <- jj - p <- 2 - beta <- matrix(rnorm(p*J, sd=5), ncol = J) - X <- cbind(1, c(rep(0, n/2), rep(1, n/2))) - Y <- matrix(rpois(n=n*J, lambda=exp(5 + X %*% beta)), ncol = J) - Y <- pmax(Y, 1) - - # fit3 <- radEmu::emuFit(Y = Y, X = X, run_score_tests=F) - # - # df_timing$fit_null[ii] <- system.time({ - # fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, - # constraint_fn=list(pseudohuber_median, pseudohuber_median), - # 
constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - # B_tol=1e-8, constraint_tol=1e-8) - # })[3] - - - df_timing$fit_null[ii] <- system.time({ - fn <- fit_null(B=matrix(0, nrow = 2, ncol = J), Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, - constraint_fn=list(pseudohuber_median, pseudohuber_median), - constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - B_tol=1e-5, constraint_tol=1e-8) - })[3] - - # df_timing$new[ii] <- system.time({ - # out_test4 <- my_gd_fs(n0=Y[which(X[,2] == 0), ] %>% colSums, - # n1 = Y[which(X[,2] == 1), ] %>% colSums, - # g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - # g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - # eta_alpha = 1e-3, - # eta_beta = 1e-3, - # maxit = 1e10, - # B_tol = 1e-5) - # })[3] - df_timing$new[ii] <- system.time({ - out_test4 <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - eta_alpha = 1e-3, - eta_beta = 1e-3, - maxit = 1e8, - tol = 1e-10) - })[3] - ii <- ii + 1 - print(df_timing) -} -df_timing - -out_test4 -rbind(out_test4$alpha, out_test4$beta) -fn$B -### why didn't they align??? - - -set.seed(1) -n <- 20 -J <- 100 -p <- 2 -beta <- matrix(rnorm(p*J, mean = 2, sd=2), ncol = J) -X <- cbind(1, c(rep(0, n/2), rep(1, n/2))) -Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J) -Y <- pmax(Y, 1) -Y -Ysimple <- rbind(Y[which(X[,2] == 0), ] %>% colSums, Y[which(X[,2] == 1), ] %>% colSums) - -# the_fit <- radEmu::emuFit(Y = Y, X = X, run_score_tests=F) - -the_fit <- radEmu::emuFit(Y = Ysimple, X = X[10:11, ], run_score_tests=F) -the_fit - -system.time({ - fn3_1_J <- fit_null(B=the_fit$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, - constraint_fn=list(pseudohuber_median, pseudohuber_median), - constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - B_tol=1e-8, constraint_tol=1e-8) -}) # 26.270 0.152 27.411 - -system.time({ - out_test2 <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - eta_alpha = 1e-3, - eta_beta = 1e-3, - maxit = 1000, - tol = 1e-6) -}) # 2.912 0.033 3.287 - - -system.time({ - out_test4 <- my_gd_fs(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - eta_alpha = 1e-3, - eta_beta = 1e-3, - maxit = 1000, - B_tol = 1e-8) -}) # 0.021 0.001 0.023 - - -system.time({ - out_test5 <- my_fs_stable(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - eta_alpha = 1e-3, - eta_beta = 1e-3, - maxit = 1000, - tol = 1e-8) -}) # 0.016 0.000 0.017 - - - -# out_test4$history %>% filter(type == "beta" & k == 3) -# -fn3_1_J$B -rbind(out_test2$alpha, out_test2$beta) -rbind(out_test4$alpha, out_test4$beta) -rbind(out_test5$alpha, out_test5$beta) - -#################################################### -#### Try larger sample - - -set.seed(1) -n <- 20 
-J <- 100 -p <- 2 -beta <- matrix(rnorm(p*J, mean = 2, sd=2), ncol = J) -X <- cbind(1, c(rep(0, n/2), rep(1, n/2))) -Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J) -Y <- pmax(Y, 1) -Y -Ysimple <- rbind(Y[which(X[,2] == 0), ] %>% colSums, Y[which(X[,2] == 1), ] %>% colSums) - -system.time({ - fn3_1_J <- fit_null(B=matrix(0, nrow = p, ncol = J), Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, - constraint_fn=list(pseudohuber_median, pseudohuber_median), - constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - B_tol=1e-8, constraint_tol=1e-8) -}) - -system.time({ - out_test5 <- my_fs_stable_two_groups(n0=Y[which(X[,2] == 0), ] %>% colSums, - n1 = Y[which(X[,2] == 1), ] %>% colSums, - g_beta=function(x) { pseudohuber_median(c(x, 0)) }, - g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, - maxit = 1000, - tol = 1e-8) -}) - -orig <- fn3_1_J$B -new <- cbind(rbind(out_test5$alpha, out_test5$beta), 0) - -## are the constraints satisfied -c(psuedohuber_median(orig[2,-1]), psuedohuber_median(orig[2,1])) -c(psuedohuber_median(new[2,-1]), psuedohuber_median(new[2,1])) - -B <- orig -z <- update_z(Y, X, B) -log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) -ll_orig <- sum(Y * log_means - exp(log_means)) - -B <- new -z <- update_z(Y, X, B) -log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) -ll_new <- sum(Y * log_means - exp(log_means)) - -B <- matrix(0, nrow = p, ncol = J) -z <- update_z(Y, X, B) -log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) -sum(Y * log_means - exp(log_means)) - -ll_orig < ll_new ### new is better - - -##### try g-dim version - -set.seed(1) -n <- 20 -J <- 30 -p <- 3 -beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J) -X <- cbind(1, c(rep(0, n/2), rep(1, n/2)), c(rep(0, 3*n/4), rep(1, n/4))) -Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J) -Y <- pmax(Y, 1) -Y - -# radEmu:::emuFit_micro_discrete(X, Y) -# radEmu:::emuFit_micro_discrete(X[n:1, ], Y[n:1, ]) - -unique(X) - -my_n_list <- list(Y[apply(X, 1, function(x) all(x == unique(X)[1,])), ] %>% colSums, - Y[apply(X, 1, function(x) all(x == unique(X)[2,])), ] %>% colSums, - Y[apply(X, 1, function(x) all(x == unique(X)[3,])), ] %>% colSums) - - -load_all() -my_jstar = 9 -system.time({ - out_test6 <- fs_direct_beta_constraint( - Y = do.call(rbind, my_n_list), - X = unique(X)[, -1], k_constr=2, j_constr=my_jstar, - constraint_fn=function(x) { pseudohuber_median(c(x, 0)) }, # f(z2..zm) - constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} - ) -}) -out_test6 - -out_test6$iter -out_test6$alpha %>% c(0) %>% pseudohuber_median - -(out_test6$beta[,-my_jstar] %>% apply(1, function(x) { pseudohuber_median(c(x, 0)) }))[2] -out_test6$beta[2,my_jstar] -### ok, promising - - - -system.time({ - out_test6_orig <- fit_null(B=matrix(0, nrow = p, ncol = J), - Y=Y, X = X, - k_constr=2, j_constr=my_jstar, j_ref=J, - constraint_fn=list(pseudohuber_median, pseudohuber_median), - constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), - B_tol=1e-8, constraint_tol=1e-8) -}) # 246.458 0.906 247.997 -out_test6_orig$B -cbind(rbind(out_test6$alpha, out_test6$beta), 0) -## excellent -## now check constraints diff --git a/tests/testthat/test-fit_null_discrete.R b/tests/testthat/test-fit_null_discrete.R index e6f96ce..6ffddd4 100644 --- a/tests/testthat/test-fit_null_discrete.R +++ 
b/tests/testthat/test-fit_null_discrete.R @@ -173,9 +173,9 @@ test_that("new discrete is fast", { }) -test_that("new discrete is fast", { +test_that("new discrete null aligns with older code", { - skip("Too slow for automated testing; X mins on laptop") + skip("Too slow for automated testing; 3 mins on laptop") set.seed(4) n <- 12 @@ -196,19 +196,19 @@ test_that("new discrete is fast", { }) t_orig <- system.time({ - out_orig <- fit_null(B=out_discrete$B, + out_orig <- fit_null(B=matrix(0, nrow = p, ncol = J), Y=Y, X = X, k_constr=my_kstar, j_constr=my_jstar, j_ref=my_jref, constraint_fn=list(pseudohuber_median, pseudohuber_median), constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), B_tol=1e-3, constraint_tol=1e-6, maxit = 1e8) - }) # 816.931 489.308 1317.577 + }) # 94.249 66.621 163.194 ## check faster expect_lt(t_discrete[3], t_orig[3]) ## check align - expect_lte(max(abs(out_discrete$B - out_orig$B)), 1e-3) + expect_lte(max(abs(out_discrete$B - out_orig$B)), 0.02) ## check no worse z_discrete <- update_z(Y, X, out_discrete$B)
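
---

Reviewer notes (not part of any patch above). These are standalone sketches of the techniques the patches use; every object name in them is invented for illustration and none of this is radEmu code.

1) In fit_null_discrete_micro_fs(), the reduced score s_theta handles the constrained row by the chain rule: with the constrained entry tied to the free entries via b[j_constr] = g(b_free), the gradient with respect to the free entries is s_free + s[j_constr] * grad_g. A two-parameter check of that reduction, with a made-up loss and constraint:

loss <- function(b1, b2) (b1 - 3)^2 + (b1 * b2 - 1)^2   # toy loss in (b1, b2)
g <- function(b1) b1^2                                  # toy constraint: b2 = g(b1)

b1 <- 0.7
s_b1 <- 2 * (b1 - 3) + 2 * (b1 * g(b1) - 1) * g(b1)     # dloss/db1 at fixed b2
s_b2 <- 2 * (b1 * g(b1) - 1) * b1                       # dloss/db2
grad_g <- 2 * b1                                        # g'(b1)

reduced <- s_b1 + s_b2 * grad_g                         # chain rule, as in the patch

# compare with a finite difference of the profiled loss b1 -> loss(b1, g(b1))
h <- 1e-6
fd <- (loss(b1 + h, g(b1 + h)) - loss(b1 - h, g(b1 - h))) / (2 * h)
all.equal(reduced, fd, tolerance = 1e-4)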
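
2) In macro_fisher_null(), the Newton direction is taken against the regularized block-diagonal Poisson information plus the rank-1 augmented Lagrangian term rho * cg cg^T. The code avoids re-inverting that sum by the Sherman-Morrison identity

  (A + v v^T)^{-1} = A^{-1} - (A^{-1} v)(v^T A^{-1}) / (1 + v^T A^{-1} v)

with v = sqrt(rho) * cg_expanded, which is what sm_half_num and sm_denom implement. A minimal numerical check of the identity:

set.seed(1)
A <- crossprod(matrix(rnorm(25), 5, 5)) + diag(5)       # symmetric positive definite
v <- rnorm(5)

A_inv <- solve(A)
sm_half_num <- A_inv %*% v                              # A^{-1} v
sm_denom <- as.numeric(1 + crossprod(v, A_inv) %*% v)   # 1 + v^T A^{-1} v
sm_inv <- A_inv - tcrossprod(sm_half_num) / sm_denom    # rank-1 corrected inverse

max(abs(sm_inv - solve(A + tcrossprod(v))))             # ~ 1e-15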
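
3) The stepsize loop in macro_fisher_null() is a backtracking line search: halve the stepsize until the Armijo condition holds, where armijo_term is c1 times the directional derivative of the objective (negative for a descent direction), and the final halving is undone once a step is accepted. A toy version of the same control flow, on an invented quadratic:

f      <- function(x) sum((x - c(1, -2))^2)   # toy objective
grad_f <- function(x) 2 * (x - c(1, -2))      # its gradient

x0 <- c(5, 5)
dir <- -grad_f(x0)                            # steepest-descent direction
c1 <- 1e-4                                    # same Armijo constant as the patch
armijo_term <- c1 * sum(dir * grad_f(x0))     # c1 * directional derivative (< 0)

stepsize <- 0.5
accept <- FALSE
while (!accept) {
  accept <- f(x0 + stepsize * dir) <= f(x0) + armijo_term * stepsize
  stepsize <- stepsize / 2                    # halves once more even on accept...
}
stepsize <- stepsize * 2                      # ...so undo the last halving
x1 <- x0 + stepsize * dir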
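
4) The tests repeatedly assert that B[k_constr, j_constr] equals the pseudo-Huber median of the other entries in row k_constr. For intuition, here is a standalone version of such a smoothed median, assuming the usual pseudo-Huber loss with smoothing parameter d; the package's pseudohuber_median() and its default d are the authority on the exact form used in the tests:

pseudohuber_loss <- function(r, d) d^2 * (sqrt(1 + (r / d)^2) - 1)

pseudohuber_median_sketch <- function(x, d = 0.1) {
  # convex in m, so a 1-d optimize over the data range finds the minimizer
  optimize(function(m) sum(pseudohuber_loss(x - m, d)), interval = range(x))$minimum
}

x <- c(-3, -1, 0, 2, 50)       # one large outlier
pseudohuber_median_sketch(x)   # close to median(x) = 0 for small d
mean(x)                        # dragged to 9.6 by the outlier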