diff --git a/R/dpseudohuber_median_d2x.R b/R/dpseudohuber_median_d2x.R
new file mode 100644
index 0000000..f859595
--- /dev/null
+++ b/R/dpseudohuber_median_d2x.R
@@ -0,0 +1,63 @@
+#' Calculate the matrix of second derivatives of the pseudo-Huber smoothed median
+#'
+#' @param x A vector at which to evaluate second derivatives of the pseudo-Huber smoothed median.
+#' @param d Smoothing parameter, by default set to \code{0.1}. As \code{d} approaches
+#' \code{0} the pseudo-Huber median function approaches the median and as \code{d} approaches
+#' infinity this function approaches the mean.
+#' @param na.rm Passed to \code{pseudohuber_median}, default is FALSE; if FALSE then when
+#' \code{x} includes at least one NA value then NA is returned, if TRUE then when \code{x}
+#' includes at least one NA value then that value is removed and the pseudo-Huber median
+#' is computed without it.
+#'
+#' @return The matrix of second derivatives (Hessian) of the pseudo-Huber smoothed median
+#' with respect to the entries of \code{x}, with smoothing parameter \code{d}.
+#'
+#' @note Algebra derived by ChatGPT based on our supplementary materials.
+#'
+#' @examples
+#' dpseudohuber_median_d2x(x = rnorm(10), d = 0.1)
+#'
+#' @aliases dpsuedohuber_median_d2x
+#'
+#' @export
+#'
+dpseudohuber_median_d2x <- function(x,
+                                    d = 0.1,
+                                    na.rm = FALSE) {
+  # pseudo-Huber center
+  ps_center <- pseudohuber_median(x, d, na.rm = na.rm)
+
+  # basic quantities
+  y <- (x - ps_center) / d
+  w <- 1 / sqrt(1 + y^2) # w_j
+  w3 <- w^3 # a_j = w_j^3
+  S <- sum(w3, na.rm = na.rm) # S = sum_j w_j^3
+
+  # t_j = (x_j - c*)/d^2 * w_j^5
+  v <- (x - ps_center) / d^2
+  w5 <- w^5
+  t <- v * w5 # t_j
+  T <- sum(t, na.rm = na.rm) # T = sum_j t_j
+
+  n <- length(x)
+  H <- matrix(NA_real_, n, n)
+
+  # Hessian entries:
+  # H_{pq} = -3/S * [ t_p (delta_{pq} - w3_q/S) - (t_q - w3_q*T/S) * w3_p/S ]
+  for (p in seq_len(n)) {
+    a_p <- w3[p]
+    t_p <- t[p]
+    for (q in seq_len(n)) {
+      a_q <- w3[q]
+      t_q <- t[q]
+      delta_pq <- as.numeric(p == q)
+
+      H[p, q] <- -(3 / S) * (
+        t_p * (delta_pq - a_q / S) -
+          (t_q - a_q * T / S) * a_p / S
+      )
+    }
+  }
+
+  H
+}
\ No newline at end of file
diff --git a/R/f_info.R b/R/f_info.R
index 5876f57..1696056 100644
--- a/R/f_info.R
+++ b/R/f_info.R
@@ -1,69 +1,69 @@
 #' @importFrom methods as
 # fisher information for multinomial model
-f_info <- function(Y,
-                   B_cup,
-                   B,
-                   X,
-                   X_cup,
-                   omit = NULL,
-                   compute_together = FALSE,
-                   return_separate = FALSE){
+f_info <- function(
+  Y,
+  B_cup,
+  B,
+  X,
+  X_cup,
+  omit = NULL,
+  compute_together = FALSE,
+  return_separate = FALSE
+) {
   n <- nrow(Y)
   J <- ncol(Y)
   p <- ncol(X)
-  if(is.null(omit)){
-    f_info <- matrix(0,ncol = p*J,nrow = p*J)
-  } else{
-    if(compute_together){
-      f_info <- matrix(0,ncol = p*(J - 1), nrow = p*(J - 1))
+  if (is.null(omit)) {
+    f_info <- matrix(0, ncol = p * J, nrow = p * J)
+  } else {
+    if (compute_together) {
+      f_info <- matrix(0, ncol = p * (J - 1), nrow = p * (J - 1))
     }
   }
-  z <- update_z(Matrix::Matrix(Y),Matrix::Matrix(X),B)
-
-
-
-  if(compute_together){ #older, slower way of computing info
-    #used only to test equality with newer way
-    ident_mat <- Matrix::Diagonal(x = rep(1,J))
-    one_mat <- Matrix::Matrix(1,nrow = J,ncol=1)
-    for(i in 1:n){
-      which_rows <- 1:J + (i - 1)*J
-      mu_i <- exp(X_cup[which_rows,]%*% B_cup + z[i])
-      p_i <- mu_i/sum(mu_i)
-
-      M_iX_i <- (ident_mat - Matrix::tcrossprod(one_mat,p_i))%*%X_cup[which_rows,]
-
-
-      f_info <- f_info + Matrix::crossprod(M_iX_i,Matrix::Diagonal(x= as.numeric(mu_i)))%*%M_iX_i
-    }
-  } else{ #newer, faster way to compute info
-    mus <- exp(X_cup%*%B_cup + rep(z,each = J))
+  z <- update_z(Matrix::Matrix(Y), Matrix::Matrix(X), B)
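+
+  # both branches below compute the same expected information,
+  #   sum_i X_i' (diag(mu_i) - mu_i mu_i' / N_i) X_i,  with N_i = rowSums(Y)[i],
+  # where mu_i holds the fitted means for sample i; the newer branch assembles
+  # it from two sparse matrix products instead of looping over samples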
+  if (compute_together) {
+    #older, slower way of computing info
+    #used only to test equality with newer way
+    ident_mat <- Matrix::Diagonal(x = rep(1, J))
+    one_mat <- Matrix::Matrix(1, nrow = J, ncol = 1)
+    for (i in 1:n) {
+      which_rows <- 1:J + (i - 1) * J
+      mu_i <- exp(X_cup[which_rows, ] %*% B_cup + z[i])
+      p_i <- mu_i / sum(mu_i)
+
+      M_iX_i <- (ident_mat - Matrix::tcrossprod(one_mat, p_i)) %*%
+        X_cup[which_rows, ]
+
+      f_info <- f_info +
+        Matrix::crossprod(M_iX_i, Matrix::Diagonal(x = as.numeric(mu_i))) %*%
+        M_iX_i
+    }
+  } else {
+    #newer, faster way to compute info
+    mus <- exp(X_cup %*% B_cup + rep(z, each = J))
     mu_diag <- Matrix::Diagonal(x = mus@x)
-    ps <- lapply(1:n,
-                 function(i){
-                   p_i <- mus@x[(i-1)*J +1:J];
-                   return(p_i/sum(p_i))})
-    ps_mat <- Matrix::sparseMatrix(i = 1:(n*J),
-                                   j = rep(1:n,each = J),
-                                   x = do.call(c,ps))
-    X_cup_p <- Matrix::crossprod(X_cup,ps_mat)
+    ps <- lapply(1:n, function(i) {
+      p_i <- mus@x[(i - 1) * J + 1:J]
+      return(p_i / sum(p_i))
+    })
+    ps_mat <- Matrix::sparseMatrix(
+      i = 1:(n * J),
+      j = rep(1:n, each = J),
+      x = do.call(c, ps)
+    )
+    X_cup_p <- Matrix::crossprod(X_cup, ps_mat)
     X_cup_p <- as.matrix(X_cup_p)
-    X_cup_p <- methods::as(X_cup_p,"sparseMatrix")
+    X_cup_p <- methods::as(X_cup_p, "sparseMatrix")
     sum_diag <- Matrix::Diagonal(x = rowSums(Y))
-    f_info <- Matrix::crossprod(X_cup,mu_diag)%*%X_cup -
-      X_cup_p%*%
-      Matrix::tcrossprod(sum_diag,X_cup_p)
-
-
-
-  }
-
-
+    f_info <- Matrix::crossprod(X_cup, mu_diag) %*%
+      X_cup -
+      X_cup_p %*%
+      Matrix::tcrossprod(sum_diag, X_cup_p)
+  }
   return(f_info)
-
-
 }
diff --git a/R/fit_null.R b/R/fit_null.R
index b6f6b65..98b736c 100644
--- a/R/fit_null.R
+++ b/R/fit_null.R
@@ -1,5 +1,5 @@
 #' fits model with B_kj constrained to equal g(B_k) for constraint fn g
-#' 
+#'
 #' @param B description
 #' @param Y Y (with augmentations)
 #' @param X design matrix
@@ -23,102 +23,98 @@
 #' @param trackB track value of beta across iterations and return?
 #' @param ignore_stop whether to ignore stopping criteria and run `maxit` iterations (could be helpful for diagnostic plots).
 #' @param null_diagnostic_plots logical: should diagnostic plots be made for estimation under the null hypothesis? Default is \code{FALSE}.
-#' 
+#'
 #' @return A list containing elements `B`, `k_constr`, `j_constr`, `niter`
-#' `gap`, `u`, `rho`, and `Bs`. `B` is a matrix containing parameter estimates 
+#' `gap`, `u`, `rho`, and `Bs`. `B` is a matrix containing parameter estimates
 #' under the null (obtained by maximum likelihood on augmented observations Y),
-#' `k_constr`, and `j_constr` give row and column indexes of the parameter 
-#' fixed to be equal to the constraint function \eqn{g()} under the null. `niter` is a 
-#' scalar giving total number of outer iterations used to fit the null model, 
-#' `gap` gives the final value of \eqn{g(B_{k constr}) - B_{k constr, j constr}}, 
-#' `u` and `rho` are final values of augmented Lagrangian parameters, and 
-#' `Bs` is a data frame containing values of B by iteration if `trackB` was set 
+#' `k_constr`, and `j_constr` give row and column indexes of the parameter
+#' fixed to be equal to the constraint function \eqn{g()} under the null. 
`niter` is a +#' scalar giving total number of outer iterations used to fit the null model, +#' `gap` gives the final value of \eqn{g(B_{k constr}) - B_{k constr, j constr}}, +#' `u` and `rho` are final values of augmented Lagrangian parameters, and +#' `Bs` is a data frame containing values of B by iteration if `trackB` was set #' equal to TRUE (otherwise it contains a NULL value). -#' -fit_null <- function(B, - Y, - X, - X_cup = NULL, - k_constr, - j_constr, - j_ref, - constraint_fn, - constraint_grad_fn, - rho_init = 1, - tau = 1.2, - kappa = 0.8, - B_tol = 1e-2, - inner_tol = 0.01, - constraint_tol = 1e-4, - max_step = 5, - c1 = 1e-4, - maxit = 1000, - inner_maxit = 25, - verbose = FALSE, - trackB = FALSE, - ignore_stop = FALSE, - null_diagnostic_plots = FALSE +#' +fit_null <- function( + B, + Y, + X, + X_cup = NULL, + k_constr, + j_constr, + j_ref, + constraint_fn, + constraint_grad_fn, + rho_init = 1, + tau = 1.2, + kappa = 0.8, + B_tol = 1e-2, + inner_tol = 0.01, + constraint_tol = 1e-4, + max_step = 5, + c1 = 1e-4, + maxit = 1000, + inner_maxit = 25, + verbose = FALSE, + trackB = FALSE, + ignore_stop = FALSE, + null_diagnostic_plots = FALSE ) { - J <- ncol(Y) n <- nrow(Y) p <- ncol(X) - - it_df <- data.frame(it = 1:maxit, - lik = NA, - max_abs_B = NA, - constraint_diff = NA) + + it_df <- data.frame( + it = 1:maxit, + lik = NA, + max_abs_B = NA, + constraint_diff = NA + ) if (null_diagnostic_plots) { it_df$test_stat <- NA } - - #store cols of B to update in a vector - j_to_update <- (1:J)[(1:J) != j_ref] + #in vector format, which indexes to remove (bc B^j_constr = 0 id. constr.) - indexes_to_remove <- (j_ref - 1)*p + 1:p - - + indexes_to_remove <- (j_ref - 1) * p + 1:p + #change id. constr. by subtracting approp. col of B from others - for(k in 1:p) { - B[k,] <- B[k,] - B[k,j_ref] + for (k in 1:p) { + B[k, ] <- B[k, ] - B[k, j_ref] } - + #update z z <- update_z(X = X, Y = Y, B = B) - - log_mean <- X%*%B + - matrix(z,ncol = 1)%*%matrix(1,ncol = J, nrow = 1) - - ll <- sum(log_mean*Y - exp(log_mean)) - + + log_mean <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + + ll <- sum(log_mean * Y - exp(log_mean)) + if (trackB) { - Bs <- data.frame(k = rep(1:p,each = J), - j = rep(1:J,p), - B = NA) - } else{ - Bs <- NULL + Bs <- data.frame(k = rep(1:p, each = J), j = rep(1:J, p), B = NA) + } else { + Bs <- NULL } - + #get X_cup for later use if (is.null(X_cup)) { - X_cup = X_cup_from_X(X,J) + X_cup = X_cup_from_X(X, J) } - + #set iteration to zero iter <- 0 - + converged <- FALSE - + #compute gap (i.e. 
g(B_k) - B_kj) - gap <- constraint_fn[[k_constr]](B[k_constr,]) - B[k_constr,j_constr] + gap <- constraint_fn[[k_constr]](B[k_constr, ]) - B[k_constr, j_constr] init_gap <- abs(gap) - + #set rho equal to initial value rho <- rho_init - + #initiate u - u <- rho*gap - + u <- rho * gap + if (trackB) { Bs$iter <- 0 Bs$inner_iter <- 0 @@ -126,179 +122,169 @@ fit_null <- function(B, Bs$u <- u Bs$gap <- gap - # Bs$score_stat <- 0 + # Bs$score_stat <- 0 lilBs <- Bs - # - for(k in 1:p) { - Bs$B[1:J + J*(k - 1)] <- B[k,] + # + for (k in 1:p) { + Bs$B[1:J + J * (k - 1)] <- B[k, ] } } - - loop_j_to_update <- j_to_update - + use_max_inner_it <- FALSE B_diff <- Inf - + criteria <- TRUE - - while(criteria) { - - iter <- iter + 1 #increment iteration - inner_iter <- 0 #initiate internal iteration - + + while (criteria) { + iter <- iter + 1 #increment iteration + inner_iter <- 0 #initiate internal iteration + #evaluate augmented Lagrangian - + # get current value of augmented lagrangian - curr_lag_val <- -ll + u*gap + (rho/2)*(gap^2) - + curr_lag_val <- -ll + u * gap + (rho / 2) * (gap^2) + old_gap <- gap # old value of g(B_k) - B_kj old_B <- B #old value of B inner_diff <- Inf #initiate "observed" max abs val diff in B between inner iterations - #at Inf so inner loop runs at least once + #at Inf so inner loop runs at least once #inner loop: - while(((inner_diff > inner_tol | use_max_inner_it) & inner_iter <= inner_maxit)| inner_iter == 0) { - + while ( + ((inner_diff > inner_tol | use_max_inner_it) & + inner_iter <= inner_maxit) | + inner_iter == 0 + ) { inner_old_B <- B - + #perform block update to B as described in null estimation section of Clausen & Willis (2024) - update <- macro_fisher_null(X = X, - Y = Y, - B = B, - z = z, - J = J, - p = p, - k_constr = k_constr, - j_constr = j_constr, - j_ref = j_ref, - rho = rho, - u = u, - max_step = max_step, - constraint_fn = constraint_fn, - constraint_grad_fn = constraint_grad_fn, - verbose = verbose, - c1 = 1e-4) + update <- macro_fisher_null( + X = X, + Y = Y, + B = B, + z = z, + J = J, + p = p, + k_constr = k_constr, + j_constr = j_constr, + j_ref = j_ref, + rho = rho, + u = u, + max_step = max_step, + constraint_fn = constraint_fn, + constraint_grad_fn = constraint_grad_fn, + verbose = verbose, + c1 = 1e-4 + ) + B <- B + update$update gap <- update$gap - z <- update_z(Y,X,B) - + z <- update_z(Y, X, B) + if (trackB) { - # - # ### AW notes 1/15/24 that dlag_lamhat_dB is not computed in this function. - # temp_Bs <- lilBs - for(k in 1:p) { - temp_Bs$B[1:J + J*(k - 1)] <- B[k,] + for (k in 1:p) { + temp_Bs$B[1:J + J * (k - 1)] <- B[k, ] } temp_Bs$iter <- iter temp_Bs$inner_iter <- inner_iter temp_Bs$rho <- rho temp_Bs$u <- u - # temp_Bs$Q <- NA temp_Bs$gap <- gap - # if (iter > 1 | inner_iter > 1) { - # jcounter <- 1 - # for(j in 1:J) { - # if (j == j_constr) { - # temp_Bs$dlag_dB[temp_Bs$j ==j] <- 0 - # } else{ - # - # ### maybe in - # - # # dlag_lamhat_dB <- dldB + lambda_hat*dgdB - # # temp_Bs$dlag_dB[temp_Bs$j ==j] <- dlag_lamhat_dB[(jcounter - 1)*p + 1:p,] - # # jcounter <- jcounter + 1 - # } - # } - # } - # - Bs <- rbind(Bs,temp_Bs) - # + Bs <- rbind(Bs, temp_Bs) } - + #did we move much? inner_diff <- max(abs(B - inner_old_B)) - + #increment inner iteration counter inner_iter <- inner_iter + 1 - } #did we move much since last *outer* loop iteration? 
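  # (B_diff below, together with the constraint gap, drives the outer stopping
  # rule; the dual update that follows is the standard augmented Lagrangian
  # step: u increases by rho*gap, and rho is inflated by tau whenever the gap
  # has not shrunk by at least a factor of kappa)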
B_diff <- max(abs(B - old_B)) - + if (verbose) { - message("Max absolute difference in B since last augmented Lagrangian outer step: ", signif (B_diff,3)) + message( + "Max absolute difference in B since last augmented Lagrangian outer step: ", + signif(B_diff, 3) + ) } - + #update u and rho - if ( abs(gap) > constraint_tol) { - u <- u + rho*gap - if (abs(gap/old_gap) > kappa) { - rho <- rho*tau - } + if (abs(gap) > constraint_tol) { + u <- u + rho * gap + if (abs(gap / old_gap) > kappa) { + rho <- rho * tau + } } - + if (verbose) { - message("Estimate deviates from feasibility by ",signif (gap,3)) - message("Parameter u set to ", signif (u,3)) - message("Parameter rho set to ", signif (rho,3)) + message("Estimate deviates from feasibility by ", signif(gap, 3)) + message("Parameter u set to ", signif(u, 3)) + message("Parameter rho set to ", signif(rho, 3)) } - + if (abs(gap) < constraint_tol) { use_max_inner_it <- TRUE - # message("Score stat: ",signif (check$score_stat*(n/(n - 1)),3)) - # message("Influence stat: ",signif (check$influence_stat,3)) - # proj_score <- as.numeric(as.matrix(check$influence_stat)) - } else{ + } else { use_max_inner_it <- FALSE } - + if (ignore_stop) { criteria <- (iter < maxit & !is.infinite(rho)) } else { - criteria <- (abs(gap) > constraint_tol | B_diff> B_tol) & iter < maxit & !is.infinite(rho) + criteria <- (abs(gap) > constraint_tol | B_diff > B_tol) & + iter < maxit & + !is.infinite(rho) } - - log_means <- X %*% B + - matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) - + + log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + ll <- sum(Y * log_means - exp(log_means)) it_df$lik[iter] <- ll it_df$max_abs_B[iter] <- B_diff it_df$constraint_diff[iter] <- abs(gap) - + if (verbose) { - message("Log likelihood: ", signif (ll,3)) + message("Log likelihood: ", signif(ll, 3)) } - + if (null_diagnostic_plots) { - score_res <- try(get_score_stat(X = X, Y = Y, X_cup = X_cup, B = B, k_constr = k_constr, - j_constr = j_constr, constraint_grad_fn = constraint_grad_fn, - indexes_to_remove = (j_ref - 1)*p + 1:p, j_ref = j_ref, J = J, - n = n, p = p)) + score_res <- try(get_score_stat( + X = X, + Y = Y, + X_cup = X_cup, + B = B, + k_constr = k_constr, + j_constr = j_constr, + constraint_grad_fn = constraint_grad_fn, + indexes_to_remove = (j_ref - 1) * p + 1:p, + j_ref = j_ref, + J = J, + n = n, + p = p + )) if (!inherits(score_res, "try-error")) { it_df$test_stat[iter] <- score_res$score_stat } } - + if (abs(gap) <= constraint_tol & B_diff <= B_tol) { converged <- TRUE } - } - + it_df <- it_df[1:iter, ] - - return(list("B" = B, - "k_constr" = k_constr, - "j_constr" = j_constr, - # "score_stats" = list(score_stats), - # "proj_score" = proj_score, - "niter" = iter, - "gap" = gap, - "u" = u, - "rho" = rho, - "Bs" = Bs, - "it_df" = it_df, - "converged" = converged)) -} + return(list( + "B" = B, + "k_constr" = k_constr, + "j_constr" = j_constr, + "niter" = iter, + "gap" = gap, + "u" = u, + "rho" = rho, + "Bs" = Bs, + "it_df" = it_df, + "converged" = converged + )) +} diff --git a/R/fit_null_discrete.R b/R/fit_null_discrete.R new file mode 100644 index 0000000..b036996 --- /dev/null +++ b/R/fit_null_discrete.R @@ -0,0 +1,84 @@ +#' @note +#' Needs higher tolerance than macro_fisher_null to get superior LL + +fit_null_discrete_pseudohuber <- function( + Y, + X, + k_constr, + j_constr, + j_ref, + trackB = TRUE, + maxit = 1000, + tol = 1e-8, + ls_max = 20, + ls_rho = 0.5, + max_step = 1 +) { + + n <- nrow(Y) + J <- ncol(Y) + p <- ncol(X) + + ### check 
we're in the right paradigm + distinct_xx <- unique(X) + stopifnot(ncol(X) == nrow(distinct_xx)) + stopifnot(all(X[,1] == 1)) ### need a baseline category to simplify the parametrization + X_wo_1s <- distinct_xx[, -1] + + + ## Rearrange to have j_ref be the last (J-th) column + ## Setup: + ## put j_ref to be the J-th column + ## put j_constr to be the 1st column + ## Later, permute cols of B-hat + Y_orig <- Y + new_order <- c(j_constr, setdiff(1:J, c(j_constr, j_ref)), j_ref) + Y <- Y_orig[, new_order] + + groups <- split( + seq_len(nrow(X)), # row indices of original data + apply(X, 1, function(r) + paste(r, collapse = "_")) # grouping key by row contents + ) + + groups <- groups[order(sapply(groups, min))] + + totals <- lapply(groups, function(x) { + apply(Y[x, , drop = FALSE], 2, sum) + }) + + Y_sum <- do.call(rbind, totals) + + ## this should be equivalent to fit_null with j_ref=J + out <- fit_null_discrete_micro_fs( + Y = Y_sum, + X = X_wo_1s, + k_constr=k_constr, + j_constr=1, ## because it's been moved to the first column + constraint_fn=function(x) { pseudohuber_median(c(x, 0)) }, + constraint_grad_fn= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, + maxit = maxit, + tol = tol, + ls_max = ls_max, + ls_rho = ls_rho, + max_step = max_step + ) + + B <- out$B + + z <- update_z(Y, X, B) + log_means <- X %*% B + matrix(z, ncol = 1) %*% matrix(1, ncol = J, nrow = 1) + ll_new <- sum(Y * log_means - exp(log_means)) + + ### now reorder B to match what was input + return(list( + "B" = B[, order(new_order)], + "k_constr" = k_constr, + "j_constr" = j_constr, + "niter" = out$iter, + "loglik" = ll_new, + "it_df" = out$history, + "converged" = (out$iter < maxit) + )) + +} \ No newline at end of file diff --git a/R/fit_null_discrete_micro_fs.R b/R/fit_null_discrete_micro_fs.R new file mode 100644 index 0000000..f0cc27c --- /dev/null +++ b/R/fit_null_discrete_micro_fs.R @@ -0,0 +1,221 @@ +## this is equivalent to fit_null with j_ref=J +fit_null_discrete_micro_fs <- function(Y, X, + k_constr, + j_constr, # indices: row k* in 1..(p-1), column j* in 1..(J-1) + constraint_fn, constraint_grad_fn, # g: R^{m-1}->R on the constrained row; grad: R^{m-1}->R^{m-1} + maxit = 1000, + tol = 1e-8, + ls_max = 20, + ls_rho = 0.5, + ridge_base = 1e-4, + max_step_norm = 5, + max_step = 1) { + + ## using `alpha + beta*X` parametrisation, so decrease k_constr_non_intercept accordingly + k_constr_non_intercept <- k_constr - 1 + + Y <- as.matrix(Y); X <- as.matrix(X) + p <- nrow(Y); J <- ncol(Y); m <- J - 1L + if (nrow(X) != p || ncol(X) != p - 1L) stop("X must be p x (p-1).") + if (J < 3) stop("Need J >= 3 (one baseline).") + if (k_constr_non_intercept < 1L || k_constr_non_intercept > (p - 1L)) stop("k_constr_non_intercept must be in 1..(p-1).") + if (j_constr < 1L || j_constr > m) stop("j_constr must be in 1..(J-1).") + + n <- rowSums(Y) + + # Parameters + alpha <- rep(0, m) + beta <- matrix(0, nrow = p - 1L, ncol = m) + # initialize constrained element coherently (assume g excludes j_constr) + free_idx <- setdiff(seq_len(m), j_constr) + beta[k_constr_non_intercept, j_constr] <- constraint_fn(beta[k_constr_non_intercept, free_idx]) + + softmax_nb <- function(eta) { e <- exp(eta); e / (1 + sum(e)) } + + # Log-likelihood + loglik_fun <- function(alpha, beta) { + ll <- 0; #eps <- 1e-12 + for (i in 1:p) { + eta <- alpha + drop(X[i, , drop = FALSE] %*% beta) # length m + pnb <- softmax_nb(eta) + pfull <- c(pnb, 1 - sum(pnb)) + # pfull <- pmin(pmax(c(pnb, 1 - sum(pnb)), eps), 1 - eps) + ll <- ll + 
sum(Y[i, ] * log(pfull)) + } + ll + } + + # History + param_hist <- list() + record <- function(iter) { + rbind( + data.frame(iter = iter, type = "alpha", row = NA_integer_, col = seq_len(m), value = alpha), + data.frame(iter = iter, type = "beta", row = rep(seq_len(p - 1L), each = m), + col = rep(seq_len(m), times = p - 1L), value = as.vector(beta)) + ) + } + param_hist[[1]] <- record(0L) + + ll_old <- loglik_fun(alpha, beta); final_iter <- 0L + + for (iter in 1:maxit) { + # probs and M_i + p_list <- vector("list", p) + M_list <- vector("list", p) + for (i in 1:p) { + eta <- alpha + drop(X[i, , drop = FALSE] %*% beta) + pnb <- softmax_nb(eta) + p_list[[i]] <- pnb + M_list[[i]] <- diag(pnb) - tcrossprod(pnb) + } + + # scores + s_alpha <- rep(0, m) + s_beta <- matrix(0, nrow = p - 1L, ncol = m) + for (i in 1:p) { + s_i <- Y[i, 1:m] - n[i] * p_list[[i]] # length m + s_alpha <- s_alpha + s_i + # add outer product X[i,]^T * s_i + s_beta <- s_beta + tcrossprod(X[i, ], s_i) # (p-1) x m + } + + # reduced score: theta = (alpha, rows k != k*, full m; row k* free indices only) + s_theta <- c( + s_alpha, + as.vector(t(s_beta[setdiff(seq_len(p - 1L), k_constr_non_intercept), , drop = FALSE])), + # constrained row (chain rule) + { + grad_g <- constraint_grad_fn(beta[k_constr_non_intercept, free_idx]) + s_beta[k_constr_non_intercept, free_idx] + s_beta[k_constr_non_intercept, j_constr] * grad_g + } + ) + + if (sqrt(sum(s_theta^2)) < tol) { final_iter <- iter - 1L; break } + + # Fisher J in full phi = (alpha, vec(beta rows 1..p-1 each length m)) + q_dim <- m + (p - 1L) * m + Jmat <- matrix(0, q_dim, q_dim) + + # indices + alpha_idx <- 1:m + row_block_idx <- function(k) { # k in 1..(p-1), returns indices for row k in vec(beta) + start <- m + (k - 1L) * m + 1L + start:(start + m - 1L) + } + + for (i in 1:p) { + Mi <- M_list[[i]]; ni <- n[i]; Xi <- X[i, , drop = FALSE] # 1 x (p-1) + # alpha-alpha + Jmat[alpha_idx, alpha_idx] <- Jmat[alpha_idx, alpha_idx] + ni * Mi + # alpha-beta rows + for (k in 1:(p - 1L)) { + rb <- row_block_idx(k) + Jmat[alpha_idx, rb] <- Jmat[alpha_idx, rb] + ni * (Mi * Xi[1, k]) + Jmat[rb, alpha_idx] <- t(Jmat[alpha_idx, rb]) + } + # beta-beta rows + for (k in 1:(p - 1L)) { + for (l in 1:(p - 1L)) { + rbk <- row_block_idx(k); rbl <- row_block_idx(l) + Jmat[rbk, rbl] <- Jmat[rbk, rbl] + ni * (Mi * (Xi[1, k] * Xi[1, l])) + } + } + } + + # Transformation T: reduced theta -> full phi + # theta packs: alpha, then rows k != k* (full m), then row k* (m-1 free entries) + p_dim <- m + (p - 2L) * m + (m - 1L) + T <- matrix(0, nrow = q_dim, ncol = p_dim) + # alpha identity + T[alpha_idx, 1:m] <- diag(m) + col_ptr <- m + 1L + # rows k != k* + for (k in setdiff(seq_len(p - 1L), k_constr_non_intercept)) { + rb <- row_block_idx(k) + T[rb, col_ptr:(col_ptr + m - 1L)] <- diag(m) + col_ptr <- col_ptr + m + } + # row k* mapping B (m x (m-1)) + grad_g <- constraint_grad_fn(beta[k_constr_non_intercept, free_idx]) + Bmap <- matrix(0, nrow = m, ncol = m - 1L) + # identity on free indices + for (pos in seq_along(free_idx)) { + Bmap[free_idx[pos], pos] <- 1 + } + # constrained element row = grad_g^T + Bmap[j_constr, ] <- grad_g + T[row_block_idx(k_constr_non_intercept), col_ptr:(col_ptr + (m - 2L))] <- Bmap + + I_theta <- crossprod(T, Jmat %*% T) + + # Adaptive ridge solve + ridge_mult <- c(1, 10, 100, 1000, 1e4) + step_theta <- rep(NA_real_, p_dim); solved <- FALSE + for (rm in ridge_mult) { + Ir <- I_theta + (ridge_base * rm) * diag(p_dim) + st <- tryCatch(solve(Ir, s_theta), error = function(e) NA_real_) + 
if (all(is.finite(st))) { step_theta <- st; solved <- TRUE; break } + } + if (!solved) { final_iter <- iter - 1L; break } + + # Step capping + sn <- sqrt(sum(step_theta^2)) + if (sn > max_step_norm) step_theta <- step_theta * (max_step_norm / sn) + if (sqrt(sum(step_theta^2)) < tol) { final_iter <- iter - 1L; break } + + # Pack current reduced theta + pack_theta <- function(alpha, beta) { + c( + alpha, + as.vector(t(beta[setdiff(seq_len(p - 1L), k_constr_non_intercept), , drop = FALSE])), + beta[k_constr_non_intercept, free_idx] + ) + } + unpack_theta <- function(vec) { + wp <- 1L + a_new <- vec[wp:(wp + m - 1L)]; wp <- wp + m + beta_new <- beta + # rows k != k* + for (k in setdiff(seq_len(p - 1L), k_constr_non_intercept)) { + beta_new[k, ] <- vec[wp:(wp + m - 1L)]; wp <- wp + m + } + # row k* free + bfree <- vec[wp:(wp + (m - 2L))] + beta_new[k_constr_non_intercept, free_idx] <- bfree + beta_new[k_constr_non_intercept, j_constr] <- constraint_fn(bfree) + list(alpha = a_new, beta = beta_new) + } + + # Line search (monotone ℓ) + theta_old <- pack_theta(alpha, beta) + step_scale <- 1.0; accepted <- FALSE; ll_new <- ll_old + for (ls in 1:ls_max) { + prop <- unpack_theta(theta_old + step_scale * step_theta) + alpha_new <- prop$alpha + beta_new <- prop$beta + ll_cand <- loglik_fun(alpha_new, beta_new) + if (is.finite(ll_cand) && ll_cand >= ll_old) { + alpha <- alpha_new; beta <- beta_new; ll_new <- ll_cand; accepted <- TRUE; break + } + step_scale <- step_scale * ls_rho + } + if (!accepted) { final_iter <- iter - 1L; break } + + # record and check convergence + param_hist[[length(param_hist) + 1L]] <- record(iter) + if (abs(ll_new - ll_old) < tol) { ll_old <- ll_new; final_iter <- iter; break } + ll_old <- ll_new; final_iter <- iter + } + + B <- cbind(rbind(alpha, beta), 0) + rownames(B) <- NULL + + history <- do.call(rbind, param_hist) + history$k <- ifelse(is.na(history$row), 0, history$row) + 1 + history$j <- history$col + + list(B = B, + logLik = ll_old, + iter = final_iter, + history = history) +} diff --git a/R/fit_null_discrete_optim.R b/R/fit_null_discrete_optim.R new file mode 100644 index 0000000..84397b6 --- /dev/null +++ b/R/fit_null_discrete_optim.R @@ -0,0 +1,165 @@ +# ## n0: counts for x = 0, length J +# ## n1: counts for x = 1, length J +# ## gfun: function(beta_free2) -> scalar beta_1 +# ## beta_free2 is c(beta_2, ..., beta_{J-1}) +# ## gprimefun: function(beta_free2) -> numeric(J-2) +# ## gradient d beta_1 / d (beta_2,...,beta_{J-1}) +# +# loglik_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { +# J <- length(n0) +# Jm1 <- J - 1 +# +# ## Split parameters: +# ## alpha_1,...,alpha_{J-1}, beta_2,...,beta_{J-1} +# alpha <- par[1:Jm1] +# beta_free2 <- par[(J):(2*J - 3)] # length J-2: beta_2..beta_{J-1} +# +# ## Constrained beta_1 via g +# beta1 <- gfun(beta_free2) +# +# ## Full beta for categories 1..J-1 (baseline J has beta_J = 0) +# beta_full <- numeric(Jm1) +# beta_full[1] <- beta1 +# beta_full[2:Jm1] <- beta_free2 +# +# ## Group sizes +# N0 <- sum(n0) +# N1 <- sum(n1) +# +# ## x = 0: eta0_j = alpha_j, eta0_J = 0 +# eta0 <- alpha # length J-1 +# m0 <- max(c(0, eta0)) +# exp_eta0 <- exp(eta0 - m0) +# denom0 <- exp(0 - m0) + sum(exp_eta0) +# p0 <- exp_eta0 / denom0 # probs for j=1..J-1 +# p0J <- exp(0 - m0) / denom0 # prob for baseline J +# +# ## x = 1: eta1_j = alpha_j + beta_j, eta1_J = 0 +# eta1 <- alpha + beta_full +# m1 <- max(c(0, eta1)) +# exp_eta1 <- exp(eta1 - m1) +# denom1 <- exp(0 - m1) + sum(exp_eta1) +# p1 <- exp_eta1 / denom1 # probs for j=1..J-1 +# p1J 
<- exp(0 - m1) / denom1 +# +# ## Log-likelihood: +# ## ℓ = sum_j n0_j log p0_j + sum_j n1_j log p1_j +# ## with p0_J, p1_J included for category J +# ll0 <- sum(n0[1:Jm1] * log(p0)) + n0[J] * log(p0J) +# ll1 <- sum(n1[1:Jm1] * log(p1)) + n1[J] * log(p1J) +# +# ll0 + ll1 +# } +# +# grad_counts_constrained <- function(par, n0, n1, gfun, gprimefun) { +# J <- length(n0) +# Jm1 <- J - 1 +# +# alpha <- par[1:Jm1] +# beta_free2 <- par[(J):(2*J - 3)] +# +# beta1 <- gfun(beta_free2) +# beta_full <- numeric(Jm1) +# beta_full[1] <- beta1 +# beta_full[2:Jm1] <- beta_free2 +# +# gprime <- gprimefun(beta_free2) # length J-2 +# +# N0 <- sum(n0) +# N1 <- sum(n1) +# +# ## x = 0 +# eta0 <- alpha +# m0 <- max(c(0, eta0)) +# exp_eta0 <- exp(eta0 - m0) +# denom0 <- exp(0 - m0) + sum(exp_eta0) +# p0 <- exp_eta0 / denom0 # j = 1..J-1 +# p0J <- exp(0 - m0) / denom0 +# +# ## x = 1 +# eta1 <- alpha + beta_full +# m1 <- max(c(0, eta1)) +# exp_eta1 <- exp(eta1 - m1) +# denom1 <- exp(0 - m1) + sum(exp_eta1) +# p1 <- exp_eta1 / denom1 # j = 1..J-1 +# p1J <- exp(0 - m1) / denom1 +# +# ## Gradient w.r.t. alpha_j (j=1..J-1): +# ## ∂ℓ/∂alpha_j = (n0_j - N0 p0_j) + (n1_j - N1 p1_j) +# grad_alpha <- (n0[1:Jm1] - N0 * p0) + (n1[1:Jm1] - N1 * p1) +# +# ## Unconstrained slope scores: +# ## ∂ℓ/∂beta_j = n1_j - N1 p1_j, j = 1..J-1 +# score_beta_full <- n1[1:Jm1] - N1 * p1 +# +# ## ∂ℓ/∂beta_1: +# S1 <- score_beta_full[1] +# +# ## For free beta_2..beta_{J-1}: +# ## ∂ℓ/∂beta_j_free = (n1_j - N1 p1_j) + S1 * ∂beta1/∂beta_j +# ## j = 2..J-1 +# S_direct <- score_beta_full[2:Jm1] # length J-2 +# grad_beta_free2 <- S_direct + S1 * gprime +# +# c(grad_alpha, grad_beta_free2) +# } +# +# fit_multinom_constrained_counts <- function(n0, n1, gfun, gprimefun, +# start = NULL) { +# J <- length(n0) +# if (length(n1) != J) stop("n0 and n1 must have the same length") +# if (J < 3) stop("Need at least 3 categories (J >= 3).") +# +# Jm1 <- J - 1 +# n_par <- (Jm1) + (Jm1 - 1) # alpha_1..alpha_{J-1}, beta_2..beta_{J-1} +# +# if (is.null(start)) { +# start <- rep(0, n_par) +# } +# +# fn <- function(par) { +# -loglik_counts_constrained(par, n0, n1, gfun, gprimefun) # optim minimizes +# } +# +# gr <- function(par) { +# -grad_counts_constrained(par, n0, n1, gfun, gprimefun) +# } +# +# opt <- optim(start, fn, gr, method = "BFGS", control = list(maxit = 10)) +# +# par_hat <- opt$par +# +# ## Reconstruct alpha_1..alpha_J and beta_1..beta_J +# alpha_hat <- par_hat[1:Jm1] +# beta_free2_hat <- par_hat[(J):(2*J - 3)] +# beta1_hat <- gfun(beta_free2_hat) +# +# beta_hat <- numeric(J) +# beta_hat[1] <- beta1_hat +# beta_hat[2:Jm1] <- beta_free2_hat +# beta_hat[J] <- 0 +# +# alpha_hat_full <- numeric(J) +# alpha_hat_full[1:Jm1] <- alpha_hat +# alpha_hat_full[J] <- 0 +# +# list( +# alpha = alpha_hat_full, +# beta = beta_hat, +# logLik = -opt$value, +# convergence = opt$convergence, +# optim = opt +# ) +# } +# +# # system.time({ +# # out_test3 <- +# fit_multinom_constrained_counts(n0=Y[which(X[,2] == 0), ] %>% colSums, +# n1 = Y[which(X[,2] == 1), ] %>% colSums, +# gfun=function(x) { pseudohuber_median(c(x, 0)) }, +# gprimefun= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]} +# ) +# # }) +# out_test2$alpha +# out_test2$beta +# out_test3 diff --git a/R/macro_fisher_null.R b/R/macro_fisher_null.R index 29f7ecb..09cdb43 100644 --- a/R/macro_fisher_null.R +++ b/R/macro_fisher_null.R @@ -1,162 +1,172 @@ - -#block approximate Newton update to B used in +#block approximate Newton update to B used in #optimization under null -macro_fisher_null <- 
function(X,
-           Y,
-           B,
-           z,
-           J,
-           p,
-           k_constr,
-           j_constr,
-           j_ref,
-           rho,
-           u,
-           constraint_fn,
-           constraint_grad_fn,
-           stepsize = 0.5,
-           c1 = 1e-4,
-           regularization = 0.1,
-           max_step = 1,
-           verbose = FALSE,
-           debug = FALSE #return more information useful for
-           #debugging
-  ){
+macro_fisher_null <- function(
+  X,
+  Y,
+  B,
+  z,
+  J,
+  p,
+  k_constr,
+  j_constr,
+  j_ref,
+  rho,
+  u,
+  constraint_fn,
+  constraint_grad_fn,
+  stepsize = 0.5,
+  c1 = 1e-4,
+  regularization = 0.1,
+  max_step = 1,
+  verbose = FALSE,
+  debug = FALSE #return more information useful for
+  #debugging
+) {
   #list of log means... by taxon
   log_means_by_j <-
-    lapply(1:J,function(j)
-      X%*%B[,j,drop = FALSE] + z)
-
+    lapply(1:J, function(j) {
+      X %*% B[, j, drop = FALSE] + z
+    })
+
   #compute information matrix of *Poisson* log likelihood
   #for fixed z
-  info_bits <- lapply(log_means_by_j,
-                      function(log_mu){
-                        crossprod(X,diag(as.numeric(exp(log_mu))))%*%X
-                      })
-  info_diag <- do.call(c,lapply(info_bits,diag))
+  info_bits <- lapply(log_means_by_j, function(log_mu) {
+    crossprod(X, diag(as.numeric(exp(log_mu)))) %*% X
+  })
+  info_diag <- do.call(c, lapply(info_bits, diag))
   mean_diag <- mean(info_diag)
-
-  info_bits <- lapply(info_bits,
-                      function(x){
-                        diag(x) <- diag(x) + regularization*mean_diag;
-                        return(x)
-                      })
+
+  info_bits <- lapply(info_bits, function(x) {
+    diag(x) <- diag(x) + regularization * mean_diag
+    return(x)
+  })
   #invert info by inverting each block diagonal element
-  inv_info_by_j <- lapply(info_bits,
-                          function(x)
-                            qr.solve(x,tol = 1e-20))
+  inv_info_by_j <- lapply(info_bits, function(x) {
+    qr.solve(x, tol = 1e-20)
+  })
   #remove diagonal block corresponding to convenience constraint
   inv_info_by_j <- inv_info_by_j[-j_ref]
-
-  #stick the block diagonal elements together into one matrix to get 
+
+  #stick the block diagonal elements together into one matrix to get
   #inverse as an actual matrix
   info_inverse <- Matrix::bdiag(inv_info_by_j)
-
+
   #compute gradient of constraint function
-  cg <- constraint_grad_fn[[k_constr]](B[k_constr,])
+  cg <- constraint_grad_fn[[k_constr]](B[k_constr, ])
   cg[j_constr] <- cg[j_constr] - 1
   cg <- cg[-j_ref]
-
+
   #vector of p elements whose k_constr-th element is 1, all else 0
-  e_k_constr <- Matrix(0,nrow = p,ncol = 1)
-  e_k_constr[k_constr,] <- 1
-  #take kronecker producr with gradient of constraint to
+  e_k_constr <- Matrix(0, nrow = p, ncol = 1)
+  e_k_constr[k_constr, ] <- 1
+  #take kronecker product with gradient of constraint to
   #get a gradient in long format B (i.e., B_cup)
-  cg_expanded <- Matrix::kronecker(Matrix(cg,ncol = 1),e_k_constr)
-  v <- sqrt(rho)*cg_expanded
+  cg_expanded <- Matrix::kronecker(Matrix(cg, ncol = 1), e_k_constr)
+  v <- sqrt(rho) * cg_expanded
   #do sherman-morrison rank-1 update in a couple steps
-  sm_denom <- as.numeric(as.matrix(1 + Matrix::crossprod(v,info_inverse)%*%v))
-  sm_half_num <- info_inverse%*%v
-
-  #strictly speaking, info_inverse isn't the inverse info -- its the inverse of
+  sm_denom <- as.numeric(as.matrix(
+    1 + Matrix::crossprod(v, info_inverse) %*% v
+  ))
+  sm_half_num <- info_inverse %*% v
+
+  #strictly speaking, info_inverse isn't the inverse info -- it's the inverse of
   # (an approximation to) the hessian of the augmented lagrangian
-
+
   #compute derivative of augmented lagrangian
-  lag_deriv <- lapply(1:J,
-                      function(j){
-
-                        -t(X)%*%(Y[,j,drop = FALSE] - exp(log_means_by_j[[j]]))
-                      })
-  lag_deriv <- do.call(rbind,lag_deriv[-j_ref])
-  gap <- constraint_fn[[k_constr]](B[k_constr,]) - B[k_constr,j_constr]
-  lag_deriv <- lag_deriv + (u + rho*gap)*cg_expanded
-
-
+  lag_deriv <- lapply(1:J, function(j) {
+    -t(X) %*% (Y[, j, drop = FALSE] - exp(log_means_by_j[[j]]))
+  })
+  lag_deriv <- do.call(rbind, lag_deriv[-j_ref])
+  gap <- constraint_fn[[k_constr]](B[k_constr, ]) - B[k_constr, j_constr]
+  lag_deriv <- lag_deriv + (u + rho * gap) * cg_expanded
+
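+  # the (approximate) Hessian of the augmented Lagrangian is
+  # blockdiag(info_j) + rho * cg_expanded %*% t(cg_expanded), so its inverse
+  # is available from the block inverses via the Sherman-Morrison identity
+  # (A + v v')^{-1} = A^{-1} - (A^{-1} v)(A^{-1} v)' / (1 + v' A^{-1} v)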
+  #direction for (approximate) Newton step
-  update_dir <- -info_inverse %*%lag_deriv +
-    sm_half_num%*% Matrix::crossprod(sm_half_num,lag_deriv)/sm_denom
+  update_dir <- -info_inverse %*%
+    lag_deriv +
+    sm_half_num %*% Matrix::crossprod(sm_half_num, lag_deriv) / sm_denom
   #shorten step if any of its elements are larger than allowed by max_step
-  if(max(abs(update_dir), na.rm = TRUE)>max_step){
-    update_dir <- update_dir/max(abs(update_dir), na.rm = TRUE)
+  if (max(abs(update_dir), na.rm = TRUE) > max_step) {
+    update_dir <- update_dir / max(abs(update_dir), na.rm = TRUE)
   }
-
+
   # make sure update direction is never missing
   update_dir[is.na(update_dir)] <- 0
-
-  armijo_term <- c1*sum(update_dir*lag_deriv)
-
+
+  armijo_term <- c1 * sum(update_dir * lag_deriv)
+
   #update direction in long format
-  update_dir <- B_from_B_cup(update_dir,J - 1, p)
+  update_dir <- B_from_B_cup(update_dir, J - 1, p)
   #don't update betas for j_ref
-  update_dir <- do.call(cbind,list(update_dir[,(1:(J - 1))<j_ref,drop = FALSE],0,update_dir[,(1:(J - 1))>=j_ref,drop = FALSE]))
-
-
+  update_dir <- do.call(
+    cbind,
+    list(
+      update_dir[, (1:(J - 1)) < j_ref, drop = FALSE],
+      0,
+      update_dir[, (1:(J - 1)) >= j_ref, drop = FALSE]
+    )
+  )
+
   #in debug mode, compute full inverse info for backward compatibility
-  if(debug){
-    info_inverse <- info_inverse - Matrix::tcrossprod(sm_half_num)/sm_denom
+  if (debug) {
+    info_inverse <- info_inverse - Matrix::tcrossprod(sm_half_num) / sm_denom
   }
-
-
-
-
-
+
   #find stepsize that satisfies armijo rule
   accept <- FALSE
-
-  curr_lag_val <- do.call(sum,
-                          lapply(1:J, function(j)
-                            -Y[,j,drop = FALSE]*log_means_by_j[[j]] + exp(log_means_by_j[[j]]))) +
-    u*gap + (rho/2)*gap^2
-
-  while(!accept){
-    prop_B <- B + stepsize*update_dir
-    prop_gap <- constraint_fn[[k_constr]](prop_B[k_constr,]) - prop_B[k_constr,j_constr]
-
+
+  curr_lag_val <- do.call(
+    sum,
+    lapply(1:J, function(j) {
+      -Y[, j, drop = FALSE] * log_means_by_j[[j]] + exp(log_means_by_j[[j]])
+    })
+  ) +
+    u * gap +
+    (rho / 2) * gap^2
+
+  while (!accept) {
+    prop_B <- B + stepsize * update_dir
+    prop_gap <- constraint_fn[[k_constr]](prop_B[k_constr, ]) -
+      prop_B[k_constr, j_constr]
+
     prop_log_means_by_j <-
-      lapply(1:J,function(j)
-        X%*%prop_B[,j,drop = FALSE] + z)
-
-    prop_lag_val <- do.call(sum,
-                            lapply(1:J, function(j)
-                              -Y[,j,drop = FALSE]*prop_log_means_by_j[[j]] + exp(prop_log_means_by_j[[j]]))) +
-      u*prop_gap + (rho/2)*prop_gap^2
-
-    accept <- prop_lag_val <= curr_lag_val + armijo_term*stepsize
-    stepsize <- stepsize/2
+      lapply(1:J, function(j) {
+        X %*% prop_B[, j, drop = FALSE] + z
+      })
+
+    prop_lag_val <- do.call(
+      sum,
+      lapply(1:J, function(j) {
+        -Y[, j, drop = FALSE] *
+          prop_log_means_by_j[[j]] +
+          exp(prop_log_means_by_j[[j]])
+      })
+    ) +
+      u * prop_gap +
+      (rho / 2) * prop_gap^2
+
+    accept <- prop_lag_val <= curr_lag_val + armijo_term * stepsize
+    stepsize <- stepsize / 2
     # stepsize <- stepsize*0.9
-
+
     # steps <- c(steps,stepsize)
     # props <- c(props,prop_lag_val)
   }
-  stepsize <- stepsize*2
-  if (stepsize < 1e-2 & verbose) { 
+  stepsize <- stepsize * 2
+  if (stepsize < 1e-2 & verbose) {
     #ideally, stepsize doesn't get super small and we never see this message
     #but it can be helpful if something is going wrong
-    message("Small stepsize: 
",signif(stepsize,2)) + message("Small stepsize: ", signif(stepsize, 2)) } if (debug) { - return(list(update = update_dir*stepsize,gap = prop_gap, - lag_deriv = lag_deriv, - info_inverse = info_inverse, - stepsize = stepsize)) - } else{ - return(list(update = update_dir*stepsize,gap = prop_gap)) - + return(list( + update = update_dir * stepsize, + gap = prop_gap, + lag_deriv = lag_deriv, + info_inverse = info_inverse, + stepsize = stepsize + )) + } else { + return(list(update = update_dir * stepsize, gap = prop_gap)) } - - } diff --git a/R/my_fs_stable_two_groups.R b/R/my_fs_stable_two_groups.R new file mode 100644 index 0000000..25a9c33 --- /dev/null +++ b/R/my_fs_stable_two_groups.R @@ -0,0 +1,240 @@ +## this is equivalent to fit_null with k_constr=2, j_constr=1, j_ref=J, +my_fs_stable_two_groups <- function(n0, + n1, + g_beta, + g_beta_grad, + maxit = 1000, + tol = 1e-8, + ls_max = 20, # max step-halving + ls_rho = 0.5) { # shrink factor for step-halving + + K <- length(n0) + Km <- K - 1 # m + + if (K < 3) { + stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") + } + + # Parameters: categories 1..K-1 (baseline K is 0) + alpha <- rep(0, Km) + beta <- rep(0, Km) + + # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1}) + beta[1] <- g_beta(beta[2:Km]) + + n0p <- sum(n0) + n1p <- sum(n1) + + # Softmax probabilities for x = 0 and x = 1 (excluding baseline) + p0fun <- function(a) { + ea <- exp(a) + ea / (1 + sum(ea)) + } + p1fun <- function(a, b) { + eb <- exp(a + b) + eb / (1 + sum(eb)) + } + + # Log-likelihood given alpha, beta + loglik_fun <- function(alpha, beta) { + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + p0_full <- c(p0, 1 - sum(p0)) + p1_full <- c(p1, 1 - sum(p1)) + + eps <- 1e-12 + p0_full <- pmin(pmax(p0_full, eps), 1 - eps) + p1_full <- pmin(pmax(p1_full, eps), 1 - eps) + + sum(n0 * log(p0_full)) + sum(n1 * log(p1_full)) + } + + # ---- history tracking ---- + param_history <- list() + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = 0L, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = 0L, + type = "beta", + k = seq_len(Km), + value = beta) + ) + # --------------------------- + + ll_old <- loglik_fun(alpha, beta) + final_iter <- 0L + + # tuning constants for stability + ridge_base <- 1e-4 # base ridge + max_step_norm <- 5 # max Euclidean norm of step in θ + clip_logit_max <- 15 # clip for alpha/beta components + + for (iter in 1:maxit) { + + # Probabilities for non-baseline categories 1..K-1 + p0 <- p0fun(alpha) + p1 <- p1fun(alpha, beta) + + # ----- SCORES (gradients) ----- + # Unconstrained score wrt alpha[1:Km] and beta[1:Km] + s_alpha_full <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1 + s_beta_full <- n1[1:Km] - n1p * p1 + + # Beta free gradient via chain rule (k = 2..Km) + grad_g_beta <- g_beta_grad(beta[2:Km]) # length Km-1 + s_beta_free <- s_beta_full[2:Km] + s_beta_full[1] * grad_g_beta + + # Score vector for reduced parameters θ = (alpha[1:Km], beta_free[1:(Km-1)]) + score_theta <- c(s_alpha_full, s_beta_free) + + # Convergence: score norm + score_norm <- sqrt(sum(score_theta^2)) + if (score_norm < tol) { + final_iter <- iter - 1L + break + } + + # ----- FULL FISHER INFORMATION (α, β_full) ----- + # M0, M1 (m x m) + M0 <- diag(p0) - tcrossprod(p0) + M1 <- diag(p1) - tcrossprod(p1) + + Iaa <- n0p * M0 + n1p * M1 # alpha-alpha + Iab <- n1p * M1 # alpha-beta + Ibb <- n1p * M1 # beta-beta + + # Full J (2m x 2m) + J_top <- cbind(Iaa, Iab) + J_bottom <- cbind(t(Iab), Ibb) + J <- 
rbind(J_top, J_bottom) + + # ----- TRANSFORMATION T from θ to φ = (alpha, beta_full) ----- + m <- Km + p <- 2 * m - 1 # dim(θ) + q <- 2 * m # dim(φ) + + T <- matrix(0, nrow = q, ncol = p) + # alpha-part: identity + T[1:m, 1:m] <- diag(m) + + # beta-part: B (m x (m-1)) at rows (m+1)..(2m), cols (m+1)..(2m-1) + B <- matrix(0, nrow = m, ncol = m - 1) + B[1, ] <- grad_g_beta # row for beta1 + if (m > 1) { + B[2:m, ] <- diag(m - 1) # rows beta2..betam + } + T[(m + 1):(2 * m), (m + 1):p] <- B + + # Fisher info for θ + I_theta <- crossprod(T, J %*% T) # T^T J T + + # ----- solve for step with adaptive ridge ----- + ridge_multipliers <- c(1, 10, 100, 1000, 1e4) + step_theta <- rep(NA_real_, length(score_theta)) + solved <- FALSE + + for (rm in ridge_multipliers) { + ridge <- ridge_base * rm + I_theta_r <- I_theta + ridge * diag(p) + step_try <- tryCatch( + solve(I_theta_r, score_theta), + error = function(e) NA_real_ + ) + if (all(is.finite(step_try))) { + step_theta <- step_try + solved <- TRUE + break + } + } + + if (!solved) { + final_iter <- iter - 1L + break + } + + # Cap step length + step_norm <- sqrt(sum(step_theta^2)) + if (step_norm > max_step_norm) { + step_theta <- step_theta * (max_step_norm / step_norm) + } + + # If step is essentially zero, we're done + if (sqrt(sum(step_theta^2)) < tol) { + final_iter <- iter - 1L + break + } + + # ----- Step-halving line search for robustness ----- + step_scale <- 1.0 + ll_new <- ll_old + accepted <- FALSE + + for (ls_iter in 1:ls_max) { + + theta_old <- c(alpha, beta[2:Km]) + theta_new <- theta_old + step_scale * step_theta + + alpha_new <- theta_new[1:Km] + beta_free_new <- theta_new[(Km + 1):(2 * Km - 1)] + + beta_new <- beta + beta_new[2:Km] <- beta_free_new + beta_new[1] <- g_beta(beta_new[2:Km]) + + # clip logits to avoid numeric blowups + alpha_new <- pmax(pmin(alpha_new, clip_logit_max), -clip_logit_max) + beta_new <- pmax(pmin(beta_new, clip_logit_max), -clip_logit_max) + + ll_candidate <- loglik_fun(alpha_new, beta_new) + + if (is.finite(ll_candidate) && ll_candidate >= ll_old) { + alpha <- alpha_new + beta <- beta_new + ll_new <- ll_candidate + accepted <- TRUE + break + } else { + step_scale <- step_scale * ls_rho + } + } + + if (!accepted) { + final_iter <- iter - 1L + break + } + + # record parameters after accepted update + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = iter, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = iter, + type = "beta", + k = seq_len(Km), + value = beta) + ) + + if (abs(ll_new - ll_old) < tol) { + ll_old <- ll_new + final_iter <- iter + break + } + + ll_old <- ll_new + final_iter <- iter + } + + param_df <- do.call(rbind, param_history) + + list( + alpha = alpha, + beta = beta, + logLik = ll_old, + iter = final_iter, + history = param_df + ) +} diff --git a/R/my_gd_fs.R b/R/my_gd_fs.R new file mode 100644 index 0000000..1666255 --- /dev/null +++ b/R/my_gd_fs.R @@ -0,0 +1,214 @@ +my_gd_fs <- function(n0, n1, + g_beta, g_beta_grad, + eta_alpha = 1e-3, # kept for signature; not used + eta_beta = 1e-3, # kept for signature; not used + maxit = 1000, + B_tol = 1e-8, + ls_max = 20, # step-halving max + ls_rho = 0.5) { # step shrink factor + + K <- length(n0) + Km <- K - 1 # m + + if (K < 3) { + stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.") + } + + # Parameters: categories 1..K-1 (baseline K is 0) + alpha <- rep(0, Km) + beta <- rep(0, Km) + + # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1}) + beta[1] <- 
g_beta(beta[2:Km])
+
+  n0p <- sum(n0)
+  n1p <- sum(n1)
+
+  # Softmax probabilities for x = 0 and x = 1 (excluding baseline)
+  p0fun <- function(a) {
+    ea <- exp(a)
+    ea / (1 + sum(ea))
+  }
+  p1fun <- function(a, b) {
+    eb <- exp(a + b)
+    eb / (1 + sum(eb))
+  }
+
+  # Log-likelihood given alpha, beta
+  loglik_fun <- function(alpha, beta) {
+    p0 <- p0fun(alpha)
+    p1 <- p1fun(alpha, beta)
+
+    p0_full <- c(p0, 1 - sum(p0))
+    p1_full <- c(p1, 1 - sum(p1))
+
+    eps <- 1e-12
+    p0_full <- pmin(pmax(p0_full, eps), 1 - eps)
+    p1_full <- pmin(pmax(p1_full, eps), 1 - eps)
+
+    sum(n0 * log(p0_full)) + sum(n1 * log(p1_full))
+  }
+
+  # ---- history tracking ----
+  param_history <- list()
+  param_history[[length(param_history) + 1]] <- rbind(
+    data.frame(iter = 0L,
+               type = "alpha",
+               k = seq_len(Km),
+               value = alpha),
+    data.frame(iter = 0L,
+               type = "beta",
+               k = seq_len(Km),
+               value = beta)
+  )
+  # ---------------------------
+
+  ll_old <- loglik_fun(alpha, beta)
+  final_iter <- 0L
+
+  for (iter in 1:maxit) {
+
+    # Probabilities for non-baseline categories 1..K-1
+    p0 <- p0fun(alpha)
+    p1 <- p1fun(alpha, beta)
+
+    # ----- SCORES (gradients) -----
+    # Unconstrained score wrt alpha[1:Km] and beta[1:Km]
+    s_alpha_full <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1
+    s_beta_full <- n1[1:Km] - n1p * p1
+
+    # Beta free gradient via chain rule (k = 2..Km)
+    grad_g_beta <- g_beta_grad(beta[2:Km]) # length Km-1
+    s_beta_free <- s_beta_full[2:Km] + s_beta_full[1] * grad_g_beta
+
+    # Score vector for reduced parameters θ = (alpha[1:Km], beta_free[1:(Km-1)])
+    score_theta <- c(s_alpha_full, s_beta_free)
+
+    # Convergence: score norm
+    score_norm <- sqrt(sum(score_theta^2))
+    # if (score_norm < B_tol) {
+    #   final_iter <- iter - 1L
+    #   break
+    # }
+
+    # ----- FULL FISHER INFORMATION (α, β_full) -----
+    # M0, M1 (m x m)
+    M0 <- diag(p0) - tcrossprod(p0)
+    M1 <- diag(p1) - tcrossprod(p1)
+
+    Iaa <- n0p * M0 + n1p * M1 # alpha-alpha
+    Iab <- n1p * M1 # alpha-beta
+    Ibb <- n1p * M1 # beta-beta
+
+    # Full J (2m x 2m)
+    J_top <- cbind(Iaa, Iab)
+    J_bottom <- cbind(t(Iab), Ibb)
+    J <- rbind(J_top, J_bottom)
+
+    # ----- TRANSFORMATION T from θ to φ = (alpha, beta_full) -----
+    m <- Km
+    p <- 2 * m - 1 # dim(θ)
+    q <- 2 * m # dim(φ)
+
+    T <- matrix(0, nrow = q, ncol = p)
+    # alpha-part: identity
+    T[1:m, 1:m] <- diag(m)
+
+    # beta-part: B (m x (m-1)) at rows (m+1)..(2m), cols (m+1)..(2m-1)
+    B <- matrix(0, nrow = m, ncol = m - 1)
+    B[1, ] <- grad_g_beta # row for beta1
+    if (m > 1) {
+      B[2:m, ] <- diag(m - 1) # rows beta2..betam
+    }
+    T[(m + 1):(2 * m), (m + 1):p] <- B
+
+    # Fisher info for θ
+    I_theta <- crossprod(T, J %*% T) # T^T J T
+
+    # Add tiny ridge for numerical stability if needed
+    ridge <- 1e-8
+    I_theta_r <- I_theta + ridge * diag(p)
+
+    # Fisher scoring step: θ_new = θ_old + I^{-1} S
+    step_theta <- tryCatch(
+      solve(I_theta_r, score_theta),
+      error = function(e) {
+        # In case of numerical issues, break
+        rep(0, length(score_theta))
+      }
+    )
+
+    # If step is essentially zero, we're done
+    # if (sqrt(sum(step_theta^2)) < B_tol) {
+    if (max(abs(step_theta)) < B_tol) {
+      final_iter <- iter - 1L
+      break
+    }
+
+    # ----- Step-halving line search for robustness -----
+    step_scale <- 1.0
+    ll_new <- ll_old
+    accepted <- FALSE
+
+    for (ls_iter in 1:ls_max) {
+
+      theta_new <- c(alpha, beta[2:Km]) + step_scale * step_theta
+      alpha_new <- theta_new[1:Km]
+      beta_free_new <- theta_new[(Km + 1):(2 * Km - 1)]
+
+      beta_new <- beta
+      beta_new[2:Km] <- beta_free_new
+      beta_new[1] <- g_beta(beta_new[2:Km])
+
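+      # every trial point stays feasible: only the free coordinates move along
+      # the scaled step, and beta1 is recomputed from the constraint g before
+      # the candidate log-likelihood is evaluated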
+      ll_candidate <- loglik_fun(alpha_new, beta_new)
+
+      if (ll_candidate >= ll_old) {
+        alpha <- alpha_new
+        beta <- beta_new
+        ll_new <- ll_candidate
+        accepted <- TRUE
+        break
+      } else {
+        step_scale <- step_scale * ls_rho
+      }
+    }
+
+    if (!accepted) {
+      final_iter <- iter - 1L
+      break
+    }
+
+    # record parameters after accepted update
+    param_history[[length(param_history) + 1]] <- rbind(
+      data.frame(iter = iter,
+                 type = "alpha",
+                 k = seq_len(Km),
+                 value = alpha),
+      data.frame(iter = iter,
+                 type = "beta",
+                 k = seq_len(Km),
+                 value = beta)
+    )
+
+    if (abs(ll_new - ll_old) < B_tol) {
+      ll_old <- ll_new
+      final_iter <- iter
+      break
+    }
+
+    ll_old <- ll_new
+    final_iter <- iter
+  }
+
+  param_df <- do.call(rbind, param_history)
+
+  list(
+    alpha = alpha,
+    beta = beta,
+    logLik = ll_old,
+    iter = final_iter,
+    history = param_df
+  )
+}
diff --git a/R/my_gd_ls.R b/R/my_gd_ls.R
new file mode 100644
index 0000000..3c89289
--- /dev/null
+++ b/R/my_gd_ls.R
@@ -0,0 +1,224 @@
+my_gd_ls <- function(n0, n1,
+                     g_beta, g_beta_grad,
+                     eta_alpha = 1e-3,
+                     eta_beta = 1e-3,
+                     maxit = 1000,
+                     tol = 1e-8,
+                     ls_max = 30, # max backtracking steps
+                     ls_c = 1e-4, # Armijo constant
+                     ls_rho = 0.5) { # backtracking shrink factor
+
+  K <- length(n0)
+  Km <- K - 1
+
+  if (K < 3) {
+    stop("beta constraint beta1 = g(beta2,...,beta_{K-1}) requires K >= 3.")
+  }
+
+  # Parameters for categories 1..K-1 (baseline K is 0)
+  alpha <- rep(0, Km)
+  beta <- rep(0, Km)
+
+  # Enforce beta constraint initially: beta1 = g(beta2..beta_{K-1})
+  beta[1] <- g_beta(beta[2:Km])
+
+  n0p <- sum(n0)
+  n1p <- sum(n1)
+
+  # Softmax probabilities for x = 0 and x = 1 (excluding baseline)
+  p0fun <- function(a) {
+    ea <- exp(a)
+    ea / (1 + sum(ea))
+  }
+  p1fun <- function(a, b) {
+    eb <- exp(a + b)
+    eb / (1 + sum(eb))
+  }
+
+  # Log-likelihood given alpha, beta
+  loglik_fun <- function(alpha, beta) {
+    p0 <- p0fun(alpha)
+    p1 <- p1fun(alpha, beta)
+
+    p0_full <- c(p0, 1 - sum(p0))
+    p1_full <- c(p1, 1 - sum(p1))
+
+    # clip probs to avoid log(0)
+    eps <- 1e-12
+    p0_full <- pmin(pmax(p0_full, eps), 1 - eps)
+    p1_full <- pmin(pmax(p1_full, eps), 1 - eps)
+
+    sum(n0 * log(p0_full)) + sum(n1 * log(p1_full))
+  }
+
+  # ---- history tracking ----
+  param_history <- list()
+  # iteration 0 (initial)
+  param_history[[length(param_history) + 1]] <- rbind(
+    data.frame(iter = 0L,
+               type = "alpha",
+               k = seq_len(Km),
+               value = alpha),
+    data.frame(iter = 0L,
+               type = "beta",
+               k = seq_len(Km),
+               value = beta)
+  )
+  # ---------------------------
+
+  # Initial log-likelihood
+  ll_old <- loglik_fun(alpha, beta)
+
+  final_iter <- 0L
+
+  for (iter in 1:maxit) {
+
+    # Probabilities for non-baseline categories 1..K-1
+    p0 <- p0fun(alpha)
+    p1 <- p1fun(alpha, beta)
+
+    # Unconstrained gradients wrt alpha[1:Km] and beta[1:Km]
+    dlda <- (n0[1:Km] + n1[1:Km]) - n0p * p0 - n1p * p1
+    dldb <- n1[1:Km] - n1p * p1
+
+    # Gradient for beta free coordinates k = 2..Km (chain rule)
+    grad_g_beta <- g_beta_grad(beta[2:Km]) # length K-2
+    grad_beta_free <- dldb[2:Km] + dldb[1] * grad_g_beta # length K-2
+
+    # Gradient norm for convergence check (use α and free β)
+    grad_norm <- sqrt(sum(dlda^2) + sum(grad_beta_free^2))
+    if (grad_norm < tol) {
+      final_iter <- iter - 1L
+      break
+    }
+
+    # Search directions (gradient ascent)
+    dir_alpha <- eta_alpha * dlda
+    dir_beta_free <- eta_beta * grad_beta_free
+
+    # Directional magnitude for Armijo:
+    # D = η_alpha * ||grad_alpha||^2 + η_beta * ||grad_beta_free||^2
+    D <- eta_alpha * sum(dlda^2) + eta_beta * 
sum(grad_beta_free^2) + if (D <= 0) { + final_iter <- iter - 1L + break # gradients essentially zero + } + + # Backtracking line search + step_scale <- 1.0 + ll_new <- ll_old + accepted <- FALSE + + for (ls_iter in 1:ls_max) { + + # Candidate alpha + alpha_new <- alpha + step_scale * dir_alpha + + # Candidate beta free coords + beta_free_new <- beta[2:Km] + step_scale * dir_beta_free + + # Re-enforce beta constraint + beta_new <- beta + beta_new[2:Km] <- beta_free_new + beta_new[1] <- g_beta(beta_new[2:Km]) + + ll_candidate <- loglik_fun(alpha_new, beta_new) + + # Armijo condition for maximization: + # ℓ(new) >= ℓ(old) + c * step_scale * D + if (ll_candidate >= ll_old + ls_c * step_scale * D) { + ll_new <- ll_candidate + alpha <- alpha_new + beta <- beta_new + accepted <- TRUE + break + } else { + step_scale <- step_scale * ls_rho + } + } + + # If no step accepted, treat as convergence/stall + if (!accepted) { + final_iter <- iter - 1L + break + } + + # record parameters *after* this accepted update + param_history[[length(param_history) + 1]] <- rbind( + data.frame(iter = iter, + type = "alpha", + k = seq_len(Km), + value = alpha), + data.frame(iter = iter, + type = "beta", + k = seq_len(Km), + value = beta) + ) + + # Monotone increase in log-likelihood at accepted steps + if (abs(ll_new - ll_old) < tol) { + ll_old <- ll_new + final_iter <- iter + break + } + + ll_old <- ll_new + final_iter <- iter + } + + # Bind history into a single data.frame + param_df <- do.call(rbind, param_history) + + list( + alpha = alpha, + beta = beta, + logLik = ll_old, + iter = final_iter, + history = param_df + ) +} + +# fn3_1_J <- fit_null(B=fit3$B, Y=Y, X = X, k_constr=2, j_constr=1, j_ref=J, +# constraint_fn=list(pseudohuber_median, pseudohuber_median), +# constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx), +# B_tol=1e-8, constraint_tol=1e-8) +# +# out_test2 <- my_gd_ls(n0=Y[which(X[,2] == 0), ] %>% colSums, +# n1 = Y[which(X[,2] == 1), ] %>% colSums, +# g_beta=function(x) { pseudohuber_median(c(x, 0)) }, +# g_beta_grad= function(x) { x <- radEmu::dpseudohuber_median_dx(c(x, 0)); x[-length(x)]}, +# eta_alpha = 1e-3, +# eta_beta = 1e-3, +# maxit = 1000, +# tol = 1e-6) +# +# out_test$alpha +# out_test$beta +# fn3_1_J$B # j_constr=1, j_ref=J +# +# n0s <- Y[which(X[,2] == 0), ] %>% colSums +# n1s <- Y[which(X[,2] == 1), ] %>% colSums +# n0s_mod <- n0s; n0s_mod[1] <- n0s[1] + n1s[1] +# +# +# Y +# logit <- function(x) log(x/(1-x)) +# lg <- logit(n0s/sum(n0s)) +# lg +# lg - lg[J] +# +# logit(n0s_mod/sum(n0s_mod)) - logit(n0s_mod/sum(n0s_mod))[J] +# +# +# pseudohuber_median_mod(out_test$alpha[-1]) +# pseudohuber_median_mod(out_test$beta[-1]) +# +# +# out_test +# fn3_1_J$B +# +# pseudohuber_median_mod(out_test$alpha[-1]) +# pseudohuber_median_mod(out_test$beta[-1]) + + + diff --git a/tests/testthat/test-fit_null_discrete.R b/tests/testthat/test-fit_null_discrete.R new file mode 100644 index 0000000..6ffddd4 --- /dev/null +++ b/tests/testthat/test-fit_null_discrete.R @@ -0,0 +1,225 @@ + +test_that("new discrete is correct", { + + # set.seed(1) + n <- 40 + J <- 30 + p <- 5 + beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J) + X <- cbind(1, + c(rep(0, n/2), rep(1, n/2)), + c(rep(0, 5*n/8), rep(1, 3*n/8)), + c(rep(0, 3*n/4), rep(1, n/4)), + c(rep(0, 7*n/8), rep(1, n/8))) + Y <- matrix(rpois(n=n*J, lambda=exp(1 + X %*% beta)), ncol = J) + Y <- pmax(Y, 1) + + my_jstar <- 9 + my_kstar <- 4 + my_jref <- J + + t_discrete <- system.time({ + out_discrete <- 
fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+  })
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete$B[my_kstar, -my_jstar]),
+               out_discrete$B[my_kstar, my_jstar])
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete$B[, my_jref],
+               rep(0, p))
+
+  ## check permutation invariant
+  X2 <- X[, c(1, (p:2))]
+  new_order <- c(sample(1:(J-1), J-1), J)
+  new_j_star <- which(new_order == my_jstar)
+  Y2 <- Y[, new_order]
+  t_discrete2 <- system.time({
+    out_discrete2 <- fit_null_discrete_pseudohuber(Y = Y2, X = X2, k_constr = which(c(1, (p:2)) == my_kstar),
+                                                   j_constr = new_j_star, j_ref = my_jref,
+                                                   tol = 1e-12)
+  })
+
+  z_discrete <- update_z(Y, X, out_discrete$B)
+  log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete))
+
+  z_discrete2 <- update_z(Y2, X2, out_discrete2$B)
+  log_means_discrete2 <- X2 %*% out_discrete2$B + matrix(z_discrete2, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete2 <- sum(Y2 * log_means_discrete2 - exp(log_means_discrete2))
+
+  ## expect them to be close in estimates and likelihood
+  expect_true(max(abs(out_discrete$B[, new_order] - out_discrete2$B[c(1, (p:2)), ])) < 1e-4)
+  expect_equal(ll_new_discrete2, ll_new_discrete)
+
+})
+
+
+
+test_that("new discrete is correct with flexible jref and jstar", {
+
+  # set.seed(3)
+  n <- 24
+  J <- 10
+  p <- 5
+  beta <- matrix(rnorm(p*J, mean = 1, sd=1), ncol = J)
+  X <- cbind(1,
+             c(rep(0, n/2), rep(1, n/2)),
+             c(rep(0, 5*n/8), rep(1, 3*n/8)),
+             c(rep(0, 3*n/4), rep(1, n/4)),
+             c(rep(0, 7*n/8), rep(1, n/8)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(1 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
+
+  my_jstar <- 9
+  my_jref <- 1
+  my_kstar <- 4
+
+  out_discrete91 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete91$B[my_kstar, -my_jstar]),
+               out_discrete91$B[my_kstar, my_jstar])
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete91$B[, my_jref],
+               rep(0, p))
+
+  my_jstar <- 4
+  my_jref <- 3
+
+  out_discrete43 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete43$B[my_kstar, -my_jstar]),
+               out_discrete43$B[my_kstar, my_jstar])
+
+  ## check that the j_ref constraint is satisfied
+  expect_equal(out_discrete43$B[, my_jref],
+               rep(0, p))
+
+  my_jstar <- 4
+  my_jref <- 6
+
+  out_discrete46 <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+
+  ## changing j_ref should shift each row of B by a constant, so the row-wise
+  ## differences should have zero spread
+  expect_equal((out_discrete43$B - out_discrete46$B) %>% round(4) %>% apply(1, sd) %>% max, 0)
+
+})
+
+
+
+
+test_that("new discrete is fast", {
+
+  skip("Too slow for automated testing; 20 mins on laptop")
+
+  set.seed(2)
+  n <- 12
+  J <- 10
+  p <- 3
+  beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J)
+  X <- cbind(1, c(rep(0, n/2), rep(1, n/2)), c(rep(0, 3*n/4), rep(1, n/4)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
+
+  my_jstar <- J-1
+  my_kstar <- 2
+  my_jref <- J
+
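+  ## note: fit_null_discrete_pseudohuber collapses Y to one vector of column
+  ## totals per distinct covariate pattern before fitting, which is the main
+  ## source of the speedup measured below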
+  t_discrete <- system.time({
+    out_discrete <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+  })
+
+  t_orig <- system.time({
+    out_orig <- fit_null(B=matrix(0, nrow = p, ncol = J),
+                         Y=Y, X = X,
+                         k_constr=my_kstar, j_constr=my_jstar, j_ref=my_jref,
+                         constraint_fn=list(pseudohuber_median, pseudohuber_median),
+                         constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx),
+                         B_tol=1e-8, constraint_tol=1e-10, maxit = 1e8)
+  }) # 816.931 489.308 1317.577
+
+  ## check faster
+  expect_lt(t_discrete[3], t_orig[3])
+
+  ## check that the (my_kstar, my_jstar) constraint is satisfied
+  expect_equal(pseudohuber_median(out_discrete$B[my_kstar, -my_jstar]),
+               out_discrete$B[my_kstar, my_jstar])
+
+  expect_equal(pseudohuber_median(out_orig$B[my_kstar, -my_jstar]),
+               out_orig$B[my_kstar, my_jstar])
+
+  expect_lte(max(abs(out_discrete$B - out_orig$B)), 1e-3)
+
+  z_discrete <- update_z(Y, X, out_discrete$B)
+  log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete))
+
+  z_orig <- update_z(Y, X, out_orig$B)
+  log_means_orig <- X %*% out_orig$B + matrix(z_orig, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_orig <- sum(Y * log_means_orig - exp(log_means_orig))
+
+  ## expect the new likelihood to be no more than 1% lower
+  expect_true(ll_new_discrete > ll_new_orig*0.99)
+
+})
+
+
+test_that("new discrete null aligns with older code", {
+
+  skip("Too slow for automated testing; 3 mins on laptop")
+
+  set.seed(4)
+  n <- 12
+  J <- 5
+  p <- 3
+  beta <- matrix(rnorm(p*J, mean = 2, sd=1), ncol = J)
+  X <- cbind(1, c(rep(0, n/3), rep(1, n/3), rep(0, n/3)), c(rep(0, n/3), rep(0, n/3), rep(1, n/3)))
+  Y <- matrix(rpois(n=n*J, lambda=exp(2 + X %*% beta)), ncol = J)
+  Y <- pmax(Y, 1)
+
+  my_jstar <- J-2
+  my_jref <- J-1
+  my_kstar <- 2
+
+  t_discrete <- system.time({
+    out_discrete <- fit_null_discrete_pseudohuber(Y = Y, X = X, k_constr = my_kstar, j_constr = my_jstar, j_ref = my_jref,
+                                                  tol = 1e-12)
+  })
+
+  t_orig <- system.time({
+    out_orig <- fit_null(B=matrix(0, nrow = p, ncol = J),
+                         Y=Y, X = X,
+                         k_constr=my_kstar, j_constr=my_jstar, j_ref=my_jref,
+                         constraint_fn=list(pseudohuber_median, pseudohuber_median),
+                         constraint_grad_fn=list(radEmu::dpseudohuber_median_dx, radEmu::dpseudohuber_median_dx),
+                         B_tol=1e-3, constraint_tol=1e-6, maxit = 1e8)
+  }) # 94.249 66.621 163.194
+
+  ## check faster
+  expect_lt(t_discrete[3], t_orig[3])
+
+  ## check align
+  expect_lte(max(abs(out_discrete$B - out_orig$B)), 0.02)
+
+  ## check no worse
+  z_discrete <- update_z(Y, X, out_discrete$B)
+  log_means_discrete <- X %*% out_discrete$B + matrix(z_discrete, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_discrete <- sum(Y * log_means_discrete - exp(log_means_discrete))
+
+  z_orig <- update_z(Y, X, out_orig$B)
+  log_means_orig <- X %*% out_orig$B + matrix(z_orig, ncol = 1) %*% matrix(1, ncol = J, nrow = 1)
+  ll_new_orig <- sum(Y * log_means_orig - exp(log_means_orig))
+
+  ## expect the new likelihood to be no more than 1% lower
+  expect_true(ll_new_discrete > ll_new_orig*0.99)
+
+})
\ No newline at end of file
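
A quick numerical sanity check for dpseudohuber_median_d2x (an illustrative
sketch, assuming pseudohuber_median, dpseudohuber_median_dx, and
dpseudohuber_median_d2x are all loaded): the returned Hessian should agree
with a central finite difference of the gradient dpseudohuber_median_dx.

x <- rnorm(8)
h <- 1e-6
H_analytic <- dpseudohuber_median_d2x(x, d = 0.1)
# column q: finite-difference derivative of the gradient with respect to x[q]
H_numeric <- sapply(seq_along(x), function(q) {
  e_q <- replace(numeric(length(x)), q, h)
  (dpseudohuber_median_dx(x + e_q, d = 0.1) -
     dpseudohuber_median_dx(x - e_q, d = 0.1)) / (2 * h)
})
max(abs(H_analytic - H_numeric)) # should be near zero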