diff --git a/R/mergedatasetsbycase.R b/R/mergedatasetsbycase.R index 21199a3..ae2dff3 100644 --- a/R/mergedatasetsbycase.R +++ b/R/mergedatasetsbycase.R @@ -2066,7 +2066,7 @@ combineAsCategoricalVariable <- function(var.list, data.sets, } } - if (isIntegerValued(result) && all(abs(result) <= .Machine$integer.max, na.rm = TRUE)) + if (isIntegerValued(result, merged.val.attr)) { result <- as.integer(result) nms <- names(merged.val.attr) @@ -2496,7 +2496,7 @@ combineAsNumericVariable <- function(var.list, data.sets, v.types) v })) - if (isIntegerValued(result) && all(abs(result) <= .Machine$integer.max, na.rm = TRUE)) + if (isIntegerValued(result)) result <- as.integer(result) return(result) diff --git a/R/mergedatasetsbyvariable.R b/R/mergedatasetsbyvariable.R index abfae77..5a24f22 100644 --- a/R/mergedatasetsbyvariable.R +++ b/R/mergedatasetsbyvariable.R @@ -580,12 +580,6 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix, merged.var[non.missing.ind] <- input.var[matched.cases.matrix[non.missing.ind, data.set.ind]] - if (isIntegerValued(merged.var)) { - recode.object <- recodeOutOfBoundsIntegersIfNecessary(merged.var, input.var = input.var) - merged.var <- recode.object[["merged.var"]] - input.var <- recode.object[["input.var"]] - merged.var <- as.integer(merged.var) - } v.type <- variableType(input.var) if (v.type == CATEGORICAL.VARIABLE.TYPE) { @@ -671,43 +665,3 @@ print.MergeDataSetByVariablePage <- function(x, ...) do.call(DataSetMergingByVariableWidget, args) } - -recodeOutOfBoundsIntegersIfNecessary <- function(merged.var, input.var) { - merged.unique.vals <- unique(merged.var) - merged.unique.vals <- removeNA(merged.unique.vals) - merged.val.attr <- attr(merged.var, "labels", exact = TRUE) - input.val.attr <- attr(input.var, "labels", exact = TRUE) - all.unique.vals <- unique(c(unclass(merged.unique.vals), unclass(input.val.attr))) - - bad.vals <- abs(all.unique.vals) > 1e9 - n.bad.vals <- length(which(bad.vals)) - if (n.bad.vals == 0) - return(list(merged.var = merged.var, - input.var = input.var)) - lab <- attr(input.var, "label", exact = TRUE) - if (n.bad.vals > 1 ) { - stop("Variable: '", - lab, - "' contains multiple values outside the allowable range. ", - "Values larger than 1,000,000,000 or smaller than -1,000,000,000 ", - "should be recoded before attempting to merge these files.") - } - - offending.value <- all.unique.vals[bad.vals] - remaining.values <- setdiff(all.unique.vals, offending.value) - # -99 is an industry convention - new.value <- if (-99 %in% remaining.values) min(remaining.values) - 1 else -99 - warning("Variable: '", - lab, - "' contains a value outside of the allowable range (", - offending.value, - "). This value has been recoded as ", - new.value) - merged.var[merged.var == offending.value] <- new.value - input.val.attr[input.val.attr == offending.value] <- new.value - merged.val.attr[merged.val.attr == offending.value] <- new.value - attr(input.var, "labels") <- input.val.attr - attr(merged.var, "labels") <- merged.val.attr - list(merged.var = merged.var, - input.var = input.var) -} diff --git a/R/mergingandstackingutilities.R b/R/mergingandstackingutilities.R index ab24028..bf4f694 100644 --- a/R/mergingandstackingutilities.R +++ b/R/mergingandstackingutilities.R @@ -98,6 +98,8 @@ createReadErrorHandler <- function(data.set.name) #' @importFrom tools file_path_sans_ext writeDataSet <- function(data.set, data.set.name, is.saved.to.cloud) { + if (any(invalid.columns <- findInvalidIntegerValueColumns(data.set))) + data.set <- updateClassForLabelledIntegerVariables(data.set, invalid.columns) if (is.saved.to.cloud) { warn.msg <- paste0("The data file ", data.set.name, @@ -352,16 +354,23 @@ splitByComma <- function(input.text, ignore.commas.in.parentheses = FALSE) #' isIntegerValued(c(1, 2, 3)) # TRUE #' isIntegerValued(c(1, 2.1, 3)) # FALSE #' @noRd -isIntegerValued <- function(x) +isIntegerValued <- function(x, val.attr = NULL) { - val.attr <- attr(x, "labels", exact = TRUE) - if (!any(is.nan(val.attr)) && is.numeric(x)) - { - x.without.na <- removeNA(x) - all(floor(x.without.na) == x.without.na & - !is.infinite(x.without.na)) - }else - FALSE + if (!is.numeric(x)) + return(FALSE) + + if (is.null(val.attr)) + val.attr <- attr(x, "labels", exact = TRUE) + if (!is.null(val.attr) && + (any(is.nan(val.attr)) || + any(abs(val.attr) > .Machine$integer.max))) + return(FALSE) + + + x.without.na <- removeNA(x) + all(floor(x.without.na) == x.without.na & + !is.infinite(x.without.na) & + abs(x.without.na) <= .Machine$integer.max) } #' @param data.set.name A character scalar of the user-input name for @@ -617,3 +626,31 @@ throwCombinedDataSetTooLargeError <- function() { stop("The combined data set is too large to create. ", "Consider omitting variables from the combined data set.") } + +#' Checks each labelled integer variable in a data.frame for integer values +#' greater than .Machine$integer.max (including in the value attributes) +#' @param df A data.frame containing haven::labelled and vctrs variables +#' @return a logical vector for each +#' @noRd +findInvalidIntegerValueColumns <- function(df) +{ + n.col <- ncol(df) + invalid.columns <- logical(n.col) + .invalidValues <- function(x) + any(abs(x) > .Machine$integer.max, na.rm = TRUE) + .invalidVariable <- function(variable) + .invalidValues(variable) || + (!is.null(val.attr <- attr(variable, "labels", exact = TRUE)) && + .invalidValues(val.attr)) + vapply(df, .invalidVariable, logical(1L)) +} + +#' Updates the class of specified columns of a data.frame from integer to double +#' so they may be saved by haven::write_sav +#' @noRd +updateClassForLabelledIntegerVariables <- function(data.set, col.idx) +{ + for (i in which(col.idx)) + class(data.set[, col.idx])[class(data.set[, col.idx]) %in% "integer"] <- "double" + return(data.set) +} diff --git a/tests/testthat/test-mergedatasetsbycase.R b/tests/testthat/test-mergedatasetsbycase.R index 4b81441..be17a57 100644 --- a/tests/testthat/test-mergedatasetsbycase.R +++ b/tests/testthat/test-mergedatasetsbycase.R @@ -1338,6 +1338,62 @@ test_that("DS-5306 Support when both data sets have meresrc variables", { expect_equal(mergesrc, expected) }) +test_that("DS-5236: Merging with missing data (NaN value attr.) stays NaN in output", +{ + input.data <- structure(list(Q1 = + structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 4), + label = "I love cats", + format.spss = "F4.0", + labels = c(`Strongly disagree` = 1, Disagree = 2, + `Neither agree nor disagree` = 3, + Agree = 4, `Strongly agree` = 5), + class = c("haven_labelled", "vctrs_vctr", "double")), + Q2 = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 4), + label = "I love dogs", + format.spss = "F4.0", + labels = c(`Strongly disagree` = 1, Disagree = 2, + `Neither agree nor disagree` = 3, Agree = 4, + `Strongly agree` = 5), + class = c("haven_labelled", "vctrs_vctr", "double")), + Q3 = structure(c(4, 4, 4, 3, 4, 4, 3, 3, 3, 2), label = "QA3", + format.spss = "F4.0", + labels = c(`Dog` = 1, Pig = 2, + `Giraffe` = 3, Cow = 4, + `Missing data` = NaN), + class = c("haven_labelled", "vctrs_vctr", "double"))), + row.names = c(NA, -10L), + class = c("tbl_df", "tbl", "data.frame")) + in.tfile <- "temp_data_for_merge.sav" + out.tfile <- "temp_data_merged.sav" + haven::write_sav(input.data, in.tfile) + + output <- do.call(MergeDataSetsByCase, list(c(in.tfile, in.tfile), out.tfile)) + expected.labels <- attr(input.data[["Q3"]], "labels") + output.labels <- attr(haven::read_sav(out.tfile)[[3]], "labels") + expect_equal(expected.labels, output.labels) + unlink(in.tfile) + unlink(out.tfile) +}) + +test_that("DS-5115: Merging with missing data (NaN value attr.) stays NaN in output", +{ + out.file <- "temp_data_for_merge.sav" + in.file <- findInstDirFile("SPSSWithIntegerValueError1.sav") + + expect_silent(do.call(MergeDataSetsByCase, + list(c(in.file, in.file), out.file))) + output.data <- haven::read_spss(out.file) + input.data <- haven::read_spss(in.file) + expected.labels <- attr(input.data[["badvar1"]], "labels") + output.labels <- attr(output.data[["badvar1"]], "labels") + expect_equal(expected.labels, output.labels) + + expected.badvar <- c(input.data[["badvar1"]], input.data[["badvar1"]]) + attr(expected.badvar, "format.spss") <- "F8.2" + expect_equal(expected.badvar, output.data[["badvar1"]]) + unlink(out.file) +}) + if (file.exists("Combined data set.sav")) file.remove("Combined data set.sav") diff --git a/tests/testthat/test-mergedatasetsbyvariable.R b/tests/testthat/test-mergedatasetsbyvariable.R index 32fa26b..271e452 100644 --- a/tests/testthat/test-mergedatasetsbyvariable.R +++ b/tests/testthat/test-mergedatasetsbyvariable.R @@ -261,20 +261,15 @@ test_that("exampleIDValues", { }) test_that("DS-5115: Handle integer values outside R's allowable range", { - expect_warning( + expect_silent( MergeDataSetsByVariable(data.set.names = c(findInstDirFile("SPSSWithIntegerValueError1.sav"), - findInstDirFile("SPSSWithIntegerValueError2.sav"))), - "contains a value outside of the allowable range" + findInstDirFile("SPSSWithIntegerValueError2.sav"))) ) - - expect_warning( - expect_error( + + expect_silent( MergeDataSetsByVariable(data.set.names = c(findInstDirFile("SPSSWithIntegerValueError3.sav"), - findInstDirFile("SPSSWithIntegerValueError2.sav"))), - "contains multiple values outside the allowable range" - ), - "contains a value outside of the allowable range" - ) + findInstDirFile("SPSSWithIntegerValueError2.sav"))) + ) }) if (file.exists("Combined data set.sav"))