From 0898d8bd6a89f1fbc31476be171274ba5ab4832a Mon Sep 17 00:00:00 2001 From: Rutgervdspek Date: Tue, 18 Nov 2025 11:40:02 +0100 Subject: [PATCH] Added CMeanEst, CCovEst, and CEllGenEst --- DESCRIPTION | 2 +- NAMESPACE | 3 + R/CCovEst.R | 196 +++++++++++++++++++++++++++++++++++++++++ R/CEllGenEst.R | 217 ++++++++++++++++++++++++++++++++++++++++++++++ R/CMeanEst.R | 111 ++++++++++++++++++++++++ man/CCovEst.Rd | 104 ++++++++++++++++++++++ man/CEllGenEst.Rd | 142 ++++++++++++++++++++++++++++++ man/CMeanEst.Rd | 66 ++++++++++++++ 8 files changed, 840 insertions(+), 1 deletion(-) create mode 100644 R/CCovEst.R create mode 100644 R/CEllGenEst.R create mode 100644 R/CMeanEst.R create mode 100644 man/CCovEst.Rd create mode 100644 man/CEllGenEst.Rd create mode 100644 man/CMeanEst.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 23b5247..58e8dff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,7 @@ Authors@R: c( License: GPL-3 Encoding: UTF-8 LazyLoad: yes -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 3.6.0) Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 7994600..09b4ef7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,8 @@ # Generated by roxygen2: do not edit by hand +export(CCovEst) +export(CEllGenEst) +export(CMeanEst) export(Convert_g1_To_Fg1) export(Convert_g1_To_Qg1) export(Convert_g1_To_f1) diff --git a/R/CCovEst.R b/R/CCovEst.R new file mode 100644 index 0000000..4d451a1 --- /dev/null +++ b/R/CCovEst.R @@ -0,0 +1,196 @@ +#' Conditional Covariance Estimator +#' +#' This function estimates the conditional covariance matrix +#' \deqn{\Sigma(z) = \mathrm{Cov}(X \mid Z = z)} +#' of a multivariate response variable \eqn{X} given a conditioning variable +#' \eqn{Z}, using kernel smoothing. Three different estimators are provided. +#' +#' The kernel weights are defined as +#' \deqn{ +#' w_{n,i}(z) = +#' \frac{K\!\left( \frac{Z_i - z}{h} \right)} +#' {\sum_{j=1}^n K\!\left( \frac{Z_j - z}{h} \right)} , +#' } +#' where \eqn{K} is the chosen kernel and \eqn{h} is the bandwidth. +#' +#' The supported estimators are: +#' +#' \describe{ +#' \item{type = 1}{ +#' Covariance estimator using the conditional mean evaluated at the grid point: +#' \deqn{ +#' \widehat{\Sigma}_n(z) +#' = \sum_{i=1}^n w_{n,i}(z) +#' \bigl(X_i - \widehat{\mu}_n(z)\bigr) +#' \bigl(X_i - \widehat{\mu}_n(z)\bigr)^\top . +#' } +#' } +#' +#' \item{type = 2}{ +#' Covariance estimator using the conditional mean evaluated at each observation: +#' \deqn{ +#' \widetilde{\Sigma}_n(z) +#' = \sum_{i=1}^n w_{n,i}(z) +#' \bigl(X_i - \widehat{\mu}_n(Z_i)\bigr) +#' \bigl(X_i - \widehat{\mu}_n(Z_i)\bigr)^\top . +#' } +#' } +#' +#' \item{type = 3}{ +#' Pairwise covariance estimator: +#' \deqn{ +#' \widecheck{\Sigma}_n(z) +#' = +#' \frac{\sum_{i 0} and \eqn{a > 0} are tuning parameters, \eqn{K} is a kernel +#' function, and \eqn{w_{n,i}(z)} are Nadaraya-Watson weights: +#' \deqn{ +#' w_{n,i}(z) = \frac{K\left( (z - Z_i)/h \right)}{\sum_{j=1}^n K\left( (z - Z_j)/h \right)}. +#' } +#' +#' @param dataMatrix a matrix of size \eqn{n \times d} containing the \eqn{n} +#' observations of the \eqn{d}-dimensional response variable \eqn{X}. The pairs +#' \eqn{(X_i, Z_i)} are assumed to be i.i.d. realizations of a joint random vector. +#' +#' @param observedZ vector with \eqn{n} observations of the conditioning +#' variable \eqn{Z}. +#' +#' @param gridZ vector of points \eqn{z} at which the conditional covariance matrix +#' \eqn{\Sigma(z) = \mathrm{Cov}(X \mid Z = z)} is estimated. +#' +#' @param h bandwidth of the kernel for kernel smoothing over \eqn{Z}. +#' +#' @template param-Kernel +#' +#' @param mu matrix of size \eqn{d \times \code{length(gridZ)}} with +#' conditional means \eqn{\mu(z)}. +#' +#' @param sigma array of size \eqn{d \times d \times +#' \code{length(gridZ)}} with conditional covariances \eqn{\Sigma(z)}. +#' +#' @param grid vector of \eqn{\xi} values where the generator is +#' evaluated. +#' +#' @param a tuning parameter controlling bias at \eqn{\xi = 0}. +#' +#' @param h_psi optional bandwidth for kernel smoothing over +#' \eqn{\psi(\xi)}. If not specified, the Silverman's rule of thumb is used. +#' +#' @return A matrix of size \eqn{\code{length(grid)} \times +#' \code{length(gridZ)}}, \eqn{\widehat{g}_{n}(\xi \mid z)} containing the estimated conditional density +#' generator at each \eqn{\xi} and \eqn{z}. +#' +#' @references Liebscher, E. (2005). A semiparametric density estimator based on +#' elliptical distributions. Journal of Multivariate Analysis, 92(1), 205-222. +#' +#' @examples +#' # Conditional generator with Z-dependent mean/covariance +#' +#' n = 5000 +#' d = 3 +#' Z = runif(n) +#' +#' data_X = matrix(NA, n, d) +#' for (i in 1:n) { +#' mean_i = rep(Z[i], d) +#' sigma_i = matrix(c(Z[i], Z[i]/2, Z[i]/3, +#' Z[i]/2, Z[i], Z[i]/2, +#' Z[i]/3, Z[i]/2, Z[i]), nrow = d) +#' data_X[i,] = mvtnorm::rmvnorm(1, mean = mean_i, sigma = sigma_i) +#' } +#' +#' h <- 1.06 * sd(Z) * n^(-1/5) +#' gridZ <- c(0.2, 0.8) +#' +#' mu_est = CMeanEst(data_X, Z, gridZ, h) +#' Sigma_est = CCovEst(data_X, Z, gridZ, h, type = 1) +#' +#' grid = seq(0, 8, length.out = 80) +#' g_est = CEllGenEst( +#' dataMatrix = data_X, +#' observedZ = Z, +#' mu = mu_est, +#' sigma = Sigma_est, +#' gridZ = gridZ, +#' grid = grid, +#' h = h, +#' Kernel = "epanechnikov", +#' a = 1 +#' ) +#' +#' g_true <- function(r){ (2*pi)^(-d/2) * exp(-r/2) } +#' g_grid_true = g_true(grid) +#' +#' plot(grid, g_est[,1], type="l", col="blue", lwd=2, +#' main="Conditional Generator: Estimated vs True", +#' xlab="u", ylab="g(u | Z)") +#' lines(grid, g_est[,2], col="red", lwd=2) +#' lines(grid, g_grid_true, col="black", lwd=2, lty=2) +#' legend("topright", +#' legend=c("Estimated Z = 0.2", "Estimated Z = 0.8", "True Gaussian generator"), +#' col=c("blue", "red", "black"), lwd=2, lty=c(1,1,2)) +#' +#' @export +#' +CEllGenEst <- function(dataMatrix, observedZ, mu, sigma, gridZ, grid, h, + Kernel = "epanechnikov", a = 1, h_psi = NULL) +{ + kernelFun = ElliptCopulas:::getKernel(Kernel = Kernel) + d = ncol( dataMatrix ) + n = nrow( dataMatrix ) + nz = length( gridZ ) + n1 = length( grid ) + + if(length(observedZ) != n) { + stop(errorCondition( + message = paste0("The length of observedZ and the number of rows in ", + "'dataMatrix'must be equal. Here they are respectively: ", + length(observedZ), ", ", n), + class = "DifferentLengthsError") ) + } + + if (ncol(mu) != nz) { + stop(errorCondition( + message = paste0("The number of columns in 'mu' (", ncol(mu), + ") must equal length(gridZ) (", nz, ")."), + class = "DifferentLengthsError" + )) + } + + if (!all(dim(sigma)[1:2] == c(d, d))) { + stop(errorCondition( + message = paste0("'sigma' must be an array with first two dimensions ", + "equal to d x d (", d, " x ", d, "). ", + "Current dimensions are ", + paste(dim(sigma)[1:2], collapse = " x "), "."), + class = "DifferentLengthsError" + )) + } + + if (dim(sigma)[3] != nz) { + stop(errorCondition( + message = paste0("The third dimension of 'sigma' (", dim(sigma)[3], + ") must equal length(gridZ) (", nz, ")."), + class = "DifferentLengthsError" + )) + } + + R = matrix(data = NA, nrow = n, ncol = nz) + psiR = matrix(data = NA, nrow = n, ncol = nz) + + for(i in 1:n){ + for(j in 1:nz){ + R[i,j] = t(dataMatrix[i,] - mu[,j]) %*% solve(sigma[,,j]) %*% + (dataMatrix[i,] - mu[,j]) + psiR[i,j] = -a + (a ^ (d/2) + ( R[i,j] ) ^ (d/2) ) ^ (2/d) + } + } + + matrixWeights = matrix(data = NA, nrow = n, ncol = nz) + + for(i in 1:nz){ + matrixWeights[,i] = computeWeights( + vectorZ = observedZ, h = h, + pointZ = gridZ[i], Kernel = Kernel, + normalization = TRUE) + } + + qEst = matrix(data = NA, nrow = n1, ncol = nz) + + psiGrid = -a + (a ^ (d/2) + ( grid ) ^ (d/2) ) ^ (2/d) + + if(is.null(h_psi)) { + psiR_pooled = as.vector(psiR) + h_psi = 1.06 * sd(psiR_pooled) * n^(-1/5) + } + + for(i in 1:nz){ + for(j in 1:n1){ + S = 0 + for(k in 1:n){ + S = S + matrixWeights[k,i] / (h_psi) * + ( kernelFun( (psiGrid[j] - psiR[k,i]) / h_psi ) + + kernelFun( (psiGrid[j] + psiR[k,i]) / h_psi ) ) + } + qEst[j,i] = S + } + } + + gEst = matrix(data = NA, nrow = n1, ncol = nz) + + s_d = pi^(d/2) / gamma(d/2) + + for(i in 1:n1){ + psiPGrid = grid[i]^(d/2 - 1) * (a ^ (d/2) + grid[i]^(d/2)) ^ (2/d - 1) + for(j in 1:nz){ + gEst[i,j] = 1/s_d * grid[i]^(-d/2 + 1) * psiPGrid * qEst[i,j] + } + } + + return(gEst) +} diff --git a/R/CMeanEst.R b/R/CMeanEst.R new file mode 100644 index 0000000..cdd39da --- /dev/null +++ b/R/CMeanEst.R @@ -0,0 +1,111 @@ +#' Conditional Mean Estimator +#' +#' This function estimates the conditional mean +#' \deqn{\mu(z) = E[X \mid Z = z]} +#' of a multivariate response variable using kernel smoothing. +#' +#' The estimator is the kernel-weighted average +#' \deqn{ +#' \widehat{\mu}_n(z) +#' = \sum_{i=1}^n w_{n,i}(z) X_i , +#' } +#' where the normalized kernel weights are +#' \deqn{ +#' w_{n,i}(z) +#' = +#' \frac{ K\!\left( \frac{Z_i - z}{h} \right) } +#' { \sum_{j=1}^n K\!\left( \frac{Z_j - z}{h} \right) } . +#' } +#' +#' +#' @param dataMatrix a matrix of size \eqn{n \times d} containing the \eqn{n} +#' observations of the \eqn{d}-dimensional response variable \eqn{X}. The pairs +#' \eqn{(X_i, Z_i)} are assumed to be i.i.d. realizations of a joint random vector. +#' +#' @param observedZ vector with \eqn{n} observations of the conditioning +#' variable \eqn{Z}. +#' +#' @param gridZ vector of points \eqn{z} at which the conditional mean +#' \eqn{\mu(z)} is estimated. +#' +#' @param h bandwidth of the kernel. +#' +#' @template param-Kernel +#' +#' @return A matrix of size \eqn{d \times \code{length(gridZ)}}, +#' \eqn{\widehat{\mu}_n(z)} containing the estimated conditional means +#' of the \eqn{d}-dimensional random variable \eqn{X} at each point of `gridZ`. +#' +#' @examples +#' # Comparison between the estimated and true conditional mean +#' n = 500 +#' d = 1 +#' Z = runif(n, -2, 2) +#' X = matrix(2 * Z + rnorm(n), ncol = d) +#' gridZ = seq(-2, 2, length.out = 50) +#' h = 0.3 +#' CMean_estimates = CMeanEst(X, Z, gridZ, h) +#' +#' true_mean = 2 * gridZ +#' plot(gridZ, CMean_estimates, type = "l", col = "blue", lwd = 2, +#' ylab = "Conditional Mean", xlab = "Z") +#' lines(gridZ, true_mean, col = "red", lwd = 2, lty = 2) +#' legend("topleft", legend = c("Estimated", "True"), col = c("blue", "red"), +#' lty = c(1, 2), lwd = 2) +#' +#' @export +#' +CMeanEst <- function(dataMatrix, observedZ, gridZ, h, Kernel = "epanechnikov") +{ + d = ncol( dataMatrix ) + n = nrow( dataMatrix ) + nz = length( gridZ ) + + if(length(observedZ) != n) { + stop(errorCondition( + message = paste0("The length of observedZ and the number of rows in ", + "'dataMatrix'must be equal. Here they are respectively: ", + length(observedZ), ", ", n), + class = "DifferentLengthsError") ) + } + + matrixWeights = matrix(data = NA, nrow = n, ncol = nz) + + for(i in 1:nz){ + matrixWeights[, i] = computeWeights( + vectorZ = observedZ, h = h, + pointZ = gridZ[i], Kernel = Kernel, + normalization = TRUE + ) + } + + estimate = t(dataMatrix) %*% matrixWeights + + return(estimate) +} + + +#' @keywords internal +#' +computeWeights <- function(vectorZ, h, pointZ, Kernel = "epanechnikov", normalization = TRUE) { + n = length(vectorZ) + + kernelFun = ElliptCopulas:::getKernel(Kernel = Kernel) + + u = (vectorZ - pointZ) / h + w_raw = kernelFun(u) + + if (normalization) { + denom = sum(w_raw) + if (denom == 0) { + warning("All kernel weights are zero at pointZ = ", pointZ) + w = rep(0, n) + } else { + w = w_raw / denom + } + } else { + w = w_raw + } + + return(w) +} diff --git a/man/CCovEst.Rd b/man/CCovEst.Rd new file mode 100644 index 0000000..a1dc915 --- /dev/null +++ b/man/CCovEst.Rd @@ -0,0 +1,104 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CCovEst.R +\name{CCovEst} +\alias{CCovEst} +\title{Conditional Covariance Estimator} +\usage{ +CCovEst(dataMatrix, observedZ, gridZ, h, Kernel = "epanechnikov", type = 1) +} +\arguments{ +\item{dataMatrix}{a matrix of size \eqn{n \times d} containing the \eqn{n} +observations of the \eqn{d}-dimensional response variable \eqn{X}. The pairs +\eqn{(X_i, Z_i)} are assumed to be i.i.d. realizations of a joint random vector.} + +\item{observedZ}{vector with \eqn{n} observations of the conditioning +variable \eqn{Z}.} + +\item{gridZ}{vector of points \eqn{z} at which the conditional covariance matrix +\eqn{\Sigma(z) = \mathrm{Cov}(X \mid Z = z)} is estimated.} + +\item{h}{bandwidth of the kernel.} + +\item{Kernel}{name of the kernel. Possible choices are +\code{"gaussian"}, \code{"epanechnikov"}, \code{"triangular"}.} + +\item{type}{integer in \{1,2,3\} indicating which estimator to compute.} +} +\value{ +An array of dimension \eqn{d \times d \times \code{length(gridZ)}}, +\eqn{\widehat{\Sigma}_n(z)} containing the estimated conditional covariance +matrices of the \eqn{d}-dimensional random variable \eqn{X} at each point of \code{gridZ}. +} +\description{ +This function estimates the conditional covariance matrix +\deqn{\Sigma(z) = \mathrm{Cov}(X \mid Z = z)} +of a multivariate response variable \eqn{X} given a conditioning variable +\eqn{Z}, using kernel smoothing. Three different estimators are provided. +} +\details{ +The kernel weights are defined as +\deqn{ + w_{n,i}(z) = + \frac{K\!\left( \frac{Z_i - z}{h} \right)} + {\sum_{j=1}^n K\!\left( \frac{Z_j - z}{h} \right)} , +} +where \eqn{K} is the chosen kernel and \eqn{h} is the bandwidth. + +The supported estimators are: + +\describe{ +\item{type = 1}{ +Covariance estimator using the conditional mean evaluated at the grid point: +\deqn{ + \widehat{\Sigma}_n(z) + = \sum_{i=1}^n w_{n,i}(z) + \bigl(X_i - \widehat{\mu}_n(z)\bigr) + \bigl(X_i - \widehat{\mu}_n(z)\bigr)^\top . + } +} + +\item{type = 2}{ +Covariance estimator using the conditional mean evaluated at each observation: +\deqn{ + \widetilde{\Sigma}_n(z) + = \sum_{i=1}^n w_{n,i}(z) + \bigl(X_i - \widehat{\mu}_n(Z_i)\bigr) + \bigl(X_i - \widehat{\mu}_n(Z_i)\bigr)^\top . + } +} + +\item{type = 3}{ +Pairwise covariance estimator: +\deqn{ + \widecheck{\Sigma}_n(z) + = + \frac{\sum_{i 0} and \eqn{a > 0} are tuning parameters, \eqn{K} is a kernel +function, and \eqn{w_{n,i}(z)} are Nadaraya-Watson weights: +\deqn{ +w_{n,i}(z) = \frac{K\left( (z - Z_i)/h \right)}{\sum_{j=1}^n K\left( (z - Z_j)/h \right)}. +} +} +\examples{ +# Conditional generator with Z-dependent mean/covariance + +n = 5000 +d = 3 +Z = runif(n) + +data_X = matrix(NA, n, d) +for (i in 1:n) { + mean_i = rep(Z[i], d) + sigma_i = matrix(c(Z[i], Z[i]/2, Z[i]/3, + Z[i]/2, Z[i], Z[i]/2, + Z[i]/3, Z[i]/2, Z[i]), nrow = d) + data_X[i,] = mvtnorm::rmvnorm(1, mean = mean_i, sigma = sigma_i) +} + +h <- 1.06 * sd(Z) * n^(-1/5) +gridZ <- c(0.2, 0.8) + +mu_est = CMeanEst(data_X, Z, gridZ, h) +Sigma_est = CCovEst(data_X, Z, gridZ, h, type = 1) + +grid = seq(0, 8, length.out = 80) +g_est = CEllGenEst( + dataMatrix = data_X, + observedZ = Z, + mu = mu_est, + sigma = Sigma_est, + gridZ = gridZ, + grid = grid, + h = h, + Kernel = "epanechnikov", + a = 1 +) + +g_true <- function(r){ (2*pi)^(-d/2) * exp(-r/2) } +g_grid_true = g_true(grid) + +plot(grid, g_est[,1], type="l", col="blue", lwd=2, + main="Conditional Generator: Estimated vs True", + xlab="u", ylab="g(u | Z)") + lines(grid, g_est[,2], col="red", lwd=2) + lines(grid, g_grid_true, col="black", lwd=2, lty=2) + legend("topright", + legend=c("Estimated Z = 0.2", "Estimated Z = 0.8", "True Gaussian generator"), + col=c("blue", "red", "black"), lwd=2, lty=c(1,1,2)) + +} +\references{ +Liebscher, E. (2005). A semiparametric density estimator based on +elliptical distributions. Journal of Multivariate Analysis, 92(1), 205-222. +} diff --git a/man/CMeanEst.Rd b/man/CMeanEst.Rd new file mode 100644 index 0000000..bea0a52 --- /dev/null +++ b/man/CMeanEst.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CMeanEst.R +\name{CMeanEst} +\alias{CMeanEst} +\title{Conditional Mean Estimator} +\usage{ +CMeanEst(dataMatrix, observedZ, gridZ, h, Kernel = "epanechnikov") +} +\arguments{ +\item{dataMatrix}{a matrix of size \eqn{n \times d} containing the \eqn{n} +observations of the \eqn{d}-dimensional response variable \eqn{X}. The pairs +\eqn{(X_i, Z_i)} are assumed to be i.i.d. realizations of a joint random vector.} + +\item{observedZ}{vector with \eqn{n} observations of the conditioning +variable \eqn{Z}.} + +\item{gridZ}{vector of points \eqn{z} at which the conditional mean +\eqn{\mu(z)} is estimated.} + +\item{h}{bandwidth of the kernel.} + +\item{Kernel}{name of the kernel. Possible choices are +\code{"gaussian"}, \code{"epanechnikov"}, \code{"triangular"}.} +} +\value{ +A matrix of size \eqn{d \times \code{length(gridZ)}}, +\eqn{\widehat{\mu}_n(z)} containing the estimated conditional means +of the \eqn{d}-dimensional random variable \eqn{X} at each point of \code{gridZ}. +} +\description{ +This function estimates the conditional mean +\deqn{\mu(z) = E[X \mid Z = z]} +of a multivariate response variable using kernel smoothing. +} +\details{ +The estimator is the kernel-weighted average +\deqn{ + \widehat{\mu}_n(z) + = \sum_{i=1}^n w_{n,i}(z) X_i , +} +where the normalized kernel weights are +\deqn{ + w_{n,i}(z) + = + \frac{ K\!\left( \frac{Z_i - z}{h} \right) } + { \sum_{j=1}^n K\!\left( \frac{Z_j - z}{h} \right) } . +} +} +\examples{ +# Comparison between the estimated and true conditional mean +n = 500 +d = 1 +Z = runif(n, -2, 2) +X = matrix(2 * Z + rnorm(n), ncol = d) +gridZ = seq(-2, 2, length.out = 50) +h = 0.3 +CMean_estimates = CMeanEst(X, Z, gridZ, h) + +true_mean = 2 * gridZ +plot(gridZ, CMean_estimates, type = "l", col = "blue", lwd = 2, + ylab = "Conditional Mean", xlab = "Z") +lines(gridZ, true_mean, col = "red", lwd = 2, lty = 2) +legend("topleft", legend = c("Estimated", "True"), col = c("blue", "red"), + lty = c(1, 2), lwd = 2) + +}