diff --git a/R/mergedatasetsbyvariable.R b/R/mergedatasetsbyvariable.R index 360996b..ff56563 100644 --- a/R/mergedatasetsbyvariable.R +++ b/R/mergedatasetsbyvariable.R @@ -450,12 +450,7 @@ mergedDataSetVariableNames <- function(input.data.sets.metadata, v.names.list[[data.set.ind]]) } - merged.data.set.var.names <- character(0) - for (i in seq_len(n.data.sets)) - for (nm in included.variable.names.list[[i]]) - merged.data.set.var.names <- c(merged.data.set.var.names, - uniqueName(nm, merged.data.set.var.names, delimiter = "_")) - merged.data.set.var.names <- sanitizeSPSSVariableNames(merged.data.set.var.names) + merged.data.set.var.names <- sanitizeSPSSVariableNames(unlist(included.variable.names.list)) attr(merged.data.set.var.names, "included.variable.names.list") <- included.variable.names.list attr(merged.data.set.var.names, "omitted.variable.names.list") <- omitted.var.names.list diff --git a/R/mergingandstackingutilities.R b/R/mergingandstackingutilities.R index 35ffcf6..57a78bc 100644 --- a/R/mergingandstackingutilities.R +++ b/R/mergingandstackingutilities.R @@ -517,6 +517,49 @@ uniqueName <- function(new.name, existing.names, delimiter = "") } } +makeValidNameForSpss <- function(input.name, existing.names, delimiter = "") +{ + input.name |> + removeWhitespace() |> + removeInvalidStartingCharacters() |> + truncateNameToByteLimit() |> + trimTrailingPeriods() |> + defaultNameIfEmpty() |> + replaceReservedKeywords() |> + uniqueName(existing.names, delimiter) +} + +removeWhitespace <- function(name) +{ + gsub("\\s+", "", name) +} + +removeInvalidStartingCharacters <- function(name) +{ + gsub("^[^a-zA-Z@]+", "", name) +} + +trimTrailingPeriods <- function(name) +{ + gsub("\\.+$", "", name) +} + +defaultNameIfEmpty <- function(name) +{ + ifelse(name == "", "VAR", name) +} + +replaceReservedKeywords <- function(name) +{ + reserved.keywords <- c("ALL", "AND", "BY", "EQ", "GE", "GT", "LE", "LT", "NE", "NOT", "OR", "TO", "WITH") + ifelse(name %in% reserved.keywords, paste0(name, "_r"), name) +} + +truncateNameToByteLimit <- function(name) +{ + addSuffixFittingByteLimit(name) +} + #' @description Return variable name matches to wildcard.text. Throw error if no matches #' found and error.if.not.found == TRUE. #' @param wildcard.text Character scalar of the wildcard pattern to match for. @@ -546,73 +589,15 @@ parseVariableWildcardForMerging <- function(wildcard.text, variable.names, } sanitizeSPSSVariableNames <- function(variable.names) { - variable.names <- trimPeriods(variable.names) - variable.names <- replaceReservedKeywords(variable.names) - variable.names <- truncateVariableNames(variable.names) - - # Extra call to trimPeriods in case truncateVariableNames - # results in trailling periods - variable.names <- trimPeriods(variable.names) - - variable.names <- deduplicateVariableNames(variable.names) - variable.names -} - -trimPeriods <- function(variable.names) -{ - # Can't begin with or end with a period - starts.or.ends.with.period <- startsWith(variable.names, ".") | endsWith(variable.names, ".") - if (any(starts.or.ends.with.period)) { - warning("Cannot save variables names which begin or end with '.'. Some variables have had '.' removed from their names: ", - paste0(variable.names[starts.or.ends.with.period], collapse = ", ")) - variable.names[starts.or.ends.with.period] <- gsub("^\\.+", "", gsub("\\.+$", "", variable.names[starts.or.ends.with.period])) - } - variable.names -} - -replaceReservedKeywords <- function(variable.names) -{ - # SPSS variable names can't be reserved keywords - reserved.keywords <- c("ALL", "AND", "BY", "EQ", "GE", "GT", "LE", "LT", "NE", "NOT", "OR", "TO", "WITH") - forbidden.keywords <- variable.names %in% reserved.keywords - if (any(forbidden.keywords)) { - warning("Cannot save variables whose names are SPSS reserved keywords. The following variables have had '_r' added to their names:", - paste0(variable.names[forbidden.keywords], collapse = ", ")) - variable.names[forbidden.keywords] <- paste0(variable.names[forbidden.keywords], "_r") + sanitized.names <- character(length(variable.names)) + for (i in seq_along(variable.names)) { + sanitized.names[i] <- makeValidNameForSpss(variable.names[i], + existing.names = sanitized.names[seq_len(i - 1)], + delimiter = "_") } - variable.names + sanitized.names } -truncateVariableNames <- function(variable.names) -{ - # SPSS variable names can't be longer than 64 bytes - bad.length <- nchar(variable.names, type = "bytes") > 64 - if (any(bad.length)) { - warning("Some variable names were too long and have been truncated: ", - paste0(variable.names[bad.length], collapse = ", ")) - variable.names[bad.length] <- vapply(variable.names[bad.length], - FUN = addSuffixFittingByteLimit, - FUN.VALUE = character(1)) - } - variable.names -} - -deduplicateVariableNames <- function(variable.names) -{ - # SPSS variable names must be unique - dupes <- duplicated(tolower(variable.names)) - if (any(dupes)) { - dupe.ind <- which(dupes) - for (i in dupe.ind) { - variable.names[i] <- uniqueName(variable.names[i], - existing.names = variable.names, - delimiter = "_") - } - } - variable.names -} - - addSuffixFittingByteLimit <- function(string, suffix = "", byte.limit = 64) { new.string <- paste0(string, suffix) size <- nchar(new.string, type = "bytes") diff --git a/R/stacking.R b/R/stacking.R index 8db387d..1bc3a30 100644 --- a/R/stacking.R +++ b/R/stacking.R @@ -1126,7 +1126,7 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata, attr(v, "is.manually.stacked") <- is.manually.stacked[ind] attr(v, "stacking.input.variable.names") <- input.v.names[group.ind] attr(v, "stacking.input.variable.labels") <- input.v.labels[group.ind] - attr(v, "label") <- stackedVariableLabel(group.ind, input.v.labels, nm) + attr(v, "label") <- stackedVariableLabel(group.ind, input.v.labels) val.attr <- stackedValueAttributes(group.ind, input.data.set.metadata$variable.value.attributes) if (!is.null(val.attr)) @@ -1167,7 +1167,7 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata, attr(v, "labels") <- val.attr class(v) <- c(class(v), "haven_labelled") } - nm <- uniqueName(input.v.names[i], names(stacked.data.set)) + nm <- makeValidNameForSpss(input.v.names[i], names(stacked.data.set)) stacked.data.set[[nm]] <- v } } @@ -1179,8 +1179,8 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata, attr(original.case, "is.stacked") <- FALSE attr(original.case, "is.manually.stacked") <- NA attr(original.case, "is.original.case") <- TRUE - stacked.data.set[[uniqueName("original_case", - names(stacked.data.set))]] <- original.case + nm <- makeValidNameForSpss("original_case", names(stacked.data.set)) + stacked.data.set[[nm]] <- original.case } if (include.observation.variable && has.stacking) @@ -1199,14 +1199,11 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata, attr(observation, "is.stacked") <- FALSE attr(observation, "is.manually.stacked") <- NA attr(observation, "is.observation") <- TRUE - - stacked.data.set[[uniqueName("observation", - names(stacked.data.set))]] <- observation + nm <- makeValidNameForSpss("observation", names(stacked.data.set)) + stacked.data.set[[nm]] <- observation } - stacked.data.set <- data.frame(stacked.data.set, check.names = FALSE) - colnames(stacked.data.set) <- sanitizeSPSSVariableNames(colnames(stacked.data.set)) - stacked.data.set + data.frame(stacked.data.set, check.names = FALSE) } stackedVariableName <- function(group.ind, input.variable.names, taken.names) @@ -1219,10 +1216,10 @@ stackedVariableName <- function(group.ind, input.variable.names, taken.names) if (candidate == "") candidate <- "stacked_var" - uniqueName(candidate, taken.names, "_") + makeValidNameForSpss(candidate, taken.names, "_") } -stackedVariableLabel <- function(group.ind, input.variable.labels, stacked.variable.name) +stackedVariableLabel <- function(group.ind, input.variable.labels) { ind <- removeNA(group.ind) lbl <- input.variable.labels[ind] diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R index f98b090..cea2ae7 100644 --- a/tests/testthat/test-mergingandstackingutilities.R +++ b/tests/testthat/test-mergingandstackingutilities.R @@ -126,26 +126,22 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", { # Period at beginning bad.names <- c(".A", ".B", ".C") - expect_warning(z <- sanitizeSPSSVariableNames(bad.names), - "Cannot save variables names which begin or end with '.'") + z <- sanitizeSPSSVariableNames(bad.names) expect_equal(z, c("A", "B", "C")) # Period at end bad.names <- c("A.", "B.", "C.") - expect_warning(z <- sanitizeSPSSVariableNames(bad.names), - "Cannot save variables names which begin or end with '.'") + z <- sanitizeSPSSVariableNames(bad.names) expect_equal(z, c("A", "B", "C")) # Multiple periods bad.names <- c("..A...", "..B...", "..C...") - expect_warning(z <- sanitizeSPSSVariableNames(bad.names), - "Cannot save variables names which begin or end with '.'") + z <- sanitizeSPSSVariableNames(bad.names) expect_equal(z, c("A", "B", "C")) # Restricted names bad.names <- c("A", "B", "WITH") - expect_warning(z <- sanitizeSPSSVariableNames(bad.names), - "Cannot save variables whose names are SPSS reserved keywords.") + z <- sanitizeSPSSVariableNames(bad.names) expect_equal(z, c("A", "B", "WITH_r")) # Too long @@ -154,22 +150,42 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", { 'L2LeisureActivitiesConsideration_OtherpleasespecifyL2LeisureActivitiesConsideration_Otherpleasespecify', 'L2LeisureActivitiesConsideration_Otherpleasespecify_0L2LeisureActivitiesConsideration_Otherpleasespecify_0', 'PQ4a_OtherwithchildrenathomepleasespecifyPQ4a_Otherwithchildrenathomepleasespecify') - expect_warning(z <- sanitizeSPSSVariableNames(bad.names), - "Some variable names were too long and have been truncated: ") + z <- sanitizeSPSSVariableNames(bad.names) expect_true(all(nchar(z, type = 'bytes') <= 64)) # Too long, not ascii bad.names <- c("トム・クルーズが嫌いな理由を10語以内で教えてください", "春に訪れたい日本で一番好きな都市はどこですか") - - expect_warning(z <- sanitizeSPSSVariableNames(bad.names), - "Some variable names were too long and have been truncated: ") + z <- sanitizeSPSSVariableNames(bad.names) expect_true(all(nchar(z, type = 'bytes') <= 64)) - # Prevent duplicates bad.names <- c("A", "B", "WITH", "A", "B", "WITH") - expect_warning(z <- sanitizeSPSSVariableNames(bad.names), - "Cannot save variables whose names are SPSS reserved keywords") + z <- sanitizeSPSSVariableNames(bad.names) expect_equal(z, c("A", "B", "WITH_r", "A_1", "B_1", "WITH_r_1")) + + # Invalid starting characters + bad.names <- c("_A", "??B") + z <- sanitizeSPSSVariableNames(bad.names) + expect_equal(z, c("A", "B")) + + # Spaces + bad.names <- c(" A \n B ") + z <- sanitizeSPSSVariableNames(bad.names) + expect_equal(z, c("AB")) + + # Edge cases + bad.names <- c(" _.WITH.", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab.a", # 65 bytes long + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabaa", + "_.", + "__.") + z <- sanitizeSPSSVariableNames(bad.names) + expect_equal(z, c("WITH_r", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaba", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_1", + "VAR", + "VAR_1")) }) diff --git a/tests/testthat/test-stacking.R b/tests/testthat/test-stacking.R index e55dc6c..f5f1be4 100644 --- a/tests/testthat/test-stacking.R +++ b/tests/testthat/test-stacking.R @@ -334,9 +334,9 @@ test_that("stackWithCommonLabels", { variable.labels = v.labels, variable.types = v.types, variable.value.attributes = v.val.attr)), - "Some variables could not be stacked due to mismatching ", + paste0("Some variables could not be stacked due to mismatching ", "variable types or value attributes. See Notes section in ", - "output for more details.") + "output for more details.")) expect_equal(stacking.groups, structure(integer(0), .Dim = c(0L, 3L), unstackable.names = list(c("Q2_A", "Q2_B", "Q2_C")))) @@ -350,9 +350,9 @@ test_that("stackWithCommonLabels", { variable.labels = v.labels, variable.types = v.types, variable.value.attributes = v.val.attr)), - "Some variables could not be stacked due to mismatching ", + paste0("Some variables could not be stacked due to mismatching ", "variable types or value attributes. See Notes section in ", - "output for more details.") + "output for more details.")) expect_equal(stacking.groups, structure(integer(0), .Dim = c(0L, 3L), unstackable.names = list(c("Q2_A", "Q2_B", "Q2_C")))) @@ -475,7 +475,7 @@ test_that("stackingSpecifiedByVariable", { names(val.attr.2) <- letters[1:3] v.val.attr.2 <- v.val.attr v.val.attr.2[[2]] <- val.attr.2 - + ## DS-4405: No longer a warning when some labels have duplicate attr. names { expect_warning( stacking.groups <- stackingSpecifiedByVariable(