Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 22 additions & 31 deletions R/Aggregation.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2021 Observational Health Data Sciences and Informatics
# Copyright 2022 Observational Health Data Sciences and Informatics
#
# This file is part of FeatureExtraction
#
Expand Down Expand Up @@ -27,7 +27,7 @@
aggregateCovariates <- function(covariateData) {
if (!isCovariateData(covariateData))
stop("Data not of class CovariateData")
if (!Andromeda::isValidAndromeda(covariateData))
if (!Andromeda::isValidAndromeda(covariateData))
stop("CovariateData object is closed")
if (isAggregatedCovariateData(covariateData))
stop("Data appears to already be aggregated")
Expand All @@ -39,17 +39,18 @@ aggregateCovariates <- function(covariateData) {
attr(result, "metaData") <- attr(covariateData, "metaData")
class(result) <- "CovariateData"
attr(class(result), "package") <- "FeatureExtraction"
populationSize <- attr(covariateData, "metaData")$populationSize
populationSize <- attr(covariateData, "metaData")$populationSize

# Aggregate binary variables
result$covariates <- covariateData$analysisRef %>%
filter(rlang::sym("isBinary") == "Y") %>%
inner_join(covariateData$covariateRef, by = "analysisId") %>%
inner_join(covariateData$covariates, by = "covariateId") %>%
group_by(rlang::sym("covariateId")) %>%
summarize(sumValue = sum(rlang::sym("covariateValue"), na.rm = TRUE),
averageValue = sum(rlang::sym("covariateValue") / populationSize, na.rm = TRUE))

averageValue = sum(rlang::sym("covariateValue")/populationSize,
na.rm = TRUE))

# Aggregate continuous variables where missing means zero
computeStats <- function(data) {
zeroFraction <- 1 - (nrow(data)/populationSize)
Expand All @@ -61,47 +62,37 @@ aggregateCovariates <- function(covariateData) {
result <- tibble(covariateId = data$covariateId[1],
countValue = nrow(data),
minValue = quants[1],
maxValue = quants[7],
averageValue = mean(data$covariateValue) * (1 - zeroFraction),
standardDeviation = sqrt((populationSize * sum(data$covariateValue^2) - sum(data$covariateValue)^2)/(populationSize * (populationSize - 1))),
medianValue = quants[4],
p10Value = quants[2],
p25Value = quants[3],
p75Value = quants[5],
p90Value = quants[6])

maxValue = quants[7], averageValue = mean(data$covariateValue) * (1 - zeroFraction), standardDeviation = sqrt((populationSize *
sum(data$covariateValue^2) - sum(data$covariateValue)^2)/(populationSize * (populationSize -
1))), medianValue = quants[4], p10Value = quants[2], p25Value = quants[3], p75Value = quants[5],
p90Value = quants[6])
}

covariatesContinuous1 <- covariateData$analysisRef %>%
filter(rlang::sym("isBinary") == "N" & rlang::sym("missingMeansZero") == "Y") %>%
inner_join(covariateData$covariateRef, by = "analysisId") %>%
inner_join(covariateData$covariates, by = "covariateId") %>%
Andromeda::groupApply("covariateId", computeStats) %>%
Andromeda::groupApply("covariateId", computeStats) %>%
bind_rows()

# Aggregate continuous variables where missing means missing
computeStats <- function(data) {
probs <- c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1)
quants <- quantile(data$covariateValue, probs = probs, type = 1)
result <- tibble(covariateId = data$covariateId[1],
countValue = length(data$covariateValue),
minValue = quants[1],
maxValue = quants[7],
averageValue = mean(data$covariateValue),
standardDeviation = sd(data$covariateValue),
medianValue = quants[4],
p10Value = quants[2],
p25Value = quants[3],
p75Value = quants[5],
p90Value = quants[6])
result <- tibble(covariateId = data$covariateId[1], countValue = length(data$covariateValue),
minValue = quants[1], maxValue = quants[7], averageValue = mean(data$covariateValue), standardDeviation = sd(data$covariateValue),
medianValue = quants[4], p10Value = quants[2], p25Value = quants[3], p75Value = quants[5],
p90Value = quants[6])
}

covariatesContinuous2 <- covariateData$analysisRef %>%
filter(rlang::sym("isBinary") == "N" & rlang::sym("missingMeansZero") == "N") %>%
inner_join(covariateData$covariateRef, by = "analysisId") %>%
inner_join(covariateData$covariates, by = "covariateId") %>%
Andromeda::groupApply("covariateId", computeStats) %>%
Andromeda::groupApply("covariateId", computeStats) %>%
bind_rows()

covariatesContinuous <- bind_rows(covariatesContinuous1, covariatesContinuous2)
if (nrow(covariatesContinuous) > 0) {
result$covariatesContinuous <- covariatesContinuous
Expand Down
51 changes: 27 additions & 24 deletions R/CompareCohorts.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2021 Observational Health Data Sciences and Informatics
# Copyright 2022 Observational Health Data Sciences and Informatics
#
# This file is part of FeatureExtraction
#
Expand Down Expand Up @@ -32,7 +32,10 @@
#' of mean.
#'
#' @export
computeStandardizedDifference <- function(covariateData1, covariateData2, cohortId1 = NULL, cohortId2 = NULL) {
computeStandardizedDifference <- function(covariateData1,
covariateData2,
cohortId1 = NULL,
cohortId2 = NULL) {
if (!isCovariateData(covariateData1))
stop("covariateData1 is not of type 'covariateData'")
if (!isCovariateData(covariateData1))
Expand All @@ -48,21 +51,19 @@ computeStandardizedDifference <- function(covariateData1, covariateData2, cohort
covariates1 <- covariates1 %>%
filter(.data$cohortDefinitionId == cohortId1)
}
covariates1 <- covariates1 %>%
select(covariateId = "covariateId",
count1 = "sumValue") %>%
covariates1 <- covariates1 %>%
select(covariateId = "covariateId", count1 = "sumValue") %>%
collect()

covariates2 <- covariateData2$covariates
if (!is.null(cohortId2)) {
covariates2 <- covariates2 %>%
filter(.data$cohortDefinitionId == cohortId2)
}
covariates2 <- covariates2 %>%
select(covariateId = "covariateId",
count2 = "sumValue") %>%
covariates2 <- covariates2 %>%
select(covariateId = "covariateId", count2 = "sumValue") %>%
collect()

n1 <- attr(covariateData1, "metaData")$populationSize
if (!is.null(cohortId1)) {
n1 <- n1[as.character(cohortId1)]
Expand All @@ -78,9 +79,11 @@ computeStandardizedDifference <- function(covariateData1, covariateData2, cohort
m$mean2 <- m$count2/n2
m$sd1 <- sqrt(m$mean1 * (1 - m$mean1))
m$sd2 <- sqrt(m$mean2 * (1 - m$mean2))
m$sd <- sqrt((m$sd1^2 + m$sd2^2) / 2)
m$sd <- sqrt((m$sd1^2 + m$sd2^2)/2)
m$stdDiff <- (m$mean2 - m$mean1)/m$sd
result <- bind_rows(result, m[, c("covariateId", "mean1", "sd1", "mean2", "sd2", "sd", "stdDiff")])
result <- bind_rows(result,
m[,
c("covariateId", "mean1", "sd1", "mean2", "sd2", "sd", "stdDiff")])
}
if (!is.null(covariateData1$covariatesContinuous) && !is.null(covariateData2$covariatesContinuous)) {
covariates1 <- covariateData1$covariatesContinuous
Expand All @@ -89,39 +92,39 @@ computeStandardizedDifference <- function(covariateData1, covariateData2, cohort
filter(.data$cohortDefinitionId == cohortId1)
}
covariates1 <- covariates1 %>%
select(covariateId = "covariateId",
mean1 = "averageValue",
sd1 = "standardDeviation") %>%
select(covariateId = "covariateId", mean1 = "averageValue", sd1 = "standardDeviation") %>%
collect()

covariates2 <- covariateData2$covariatesContinuous
if (!is.null(cohortId2)) {
covariates2 <- covariates2 %>%
filter(.data$cohortDefinitionId == cohortId2)
}
covariates2 <- covariates2 %>%
select(covariateId = "covariateId",
mean2 = "averageValue",
sd2 = "standardDeviation") %>%
select(covariateId = "covariateId", mean2 = "averageValue", sd2 = "standardDeviation") %>%
collect()

m <- merge(covariates1, covariates2, all = T)
m$mean1[is.na(m$mean1)] <- 0
m$sd1[is.na(m$sd1)] <- 0
m$mean2[is.na(m$mean2)] <- 0
m$sd2[is.na(m$sd2)] <- 0
m$sd <- sqrt(m$sd1^2 + m$sd2^2)
m$stdDiff <- (m$mean2 - m$mean1)/m$sd
result <- bind_rows(result, m[, c("covariateId", "mean1", "sd1", "mean2", "sd2", "sd", "stdDiff")])
result <- bind_rows(result,
m[,
c("covariateId", "mean1", "sd1", "mean2", "sd2", "sd", "stdDiff")])
}
covariateRef1 <- covariateData1$covariateRef %>%
collect()
covariateRef2 <- covariateData2$covariateRef %>%
collect()

result <- result %>%
left_join(select(covariateRef1, covariateId = "covariateId", covariateName1 = "covariateName"), by = "covariateId") %>%
left_join(select(covariateRef2, covariateId = "covariateId", covariateName2 = "covariateName"), by = "covariateId") %>%
left_join(select(covariateRef1, covariateId = "covariateId", covariateName1 = "covariateName"),
by = "covariateId") %>%
left_join(select(covariateRef2, covariateId = "covariateId", covariateName2 = "covariateName"),
by = "covariateId") %>%
mutate(covariateName = case_when(is.na(covariateName1) ~ covariateName2,
TRUE ~ covariateName1)) %>%
select(-rlang::sym("covariateName1"), -rlang::sym("covariateName2")) %>%
Expand Down
Loading