Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions R/mergedatasetsbyvariable.R
Original file line number Diff line number Diff line change
Expand Up @@ -450,12 +450,7 @@ mergedDataSetVariableNames <- function(input.data.sets.metadata,
v.names.list[[data.set.ind]])
}

merged.data.set.var.names <- character(0)
for (i in seq_len(n.data.sets))
for (nm in included.variable.names.list[[i]])
merged.data.set.var.names <- c(merged.data.set.var.names,
uniqueName(nm, merged.data.set.var.names, delimiter = "_"))
merged.data.set.var.names <- sanitizeSPSSVariableNames(merged.data.set.var.names)
merged.data.set.var.names <- sanitizeSPSSVariableNames(unlist(included.variable.names.list))
attr(merged.data.set.var.names, "included.variable.names.list") <- included.variable.names.list
attr(merged.data.set.var.names, "omitted.variable.names.list") <- omitted.var.names.list

Expand Down
113 changes: 49 additions & 64 deletions R/mergingandstackingutilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,49 @@ uniqueName <- function(new.name, existing.names, delimiter = "")
}
}

makeValidNameForSpss <- function(input.name, existing.names, delimiter = "")
{
input.name |>
removeWhitespace() |>
removeInvalidStartingCharacters() |>
truncateNameToByteLimit() |>
trimTrailingPeriods() |>
defaultNameIfEmpty() |>
replaceReservedKeywords() |>
uniqueName(existing.names, delimiter)
}

removeWhitespace <- function(name)
{
gsub("\\s+", "", name)
}

removeInvalidStartingCharacters <- function(name)
{
gsub("^[^a-zA-Z@]+", "", name)
}

trimTrailingPeriods <- function(name)
{
gsub("\\.+$", "", name)
}

defaultNameIfEmpty <- function(name)
{
ifelse(name == "", "VAR", name)
}

replaceReservedKeywords <- function(name)
{
reserved.keywords <- c("ALL", "AND", "BY", "EQ", "GE", "GT", "LE", "LT", "NE", "NOT", "OR", "TO", "WITH")
ifelse(name %in% reserved.keywords, paste0(name, "_r"), name)
}

truncateNameToByteLimit <- function(name)
{
addSuffixFittingByteLimit(name)
}

#' @description Return variable name matches to wildcard.text. Throw error if no matches
#' found and error.if.not.found == TRUE.
#' @param wildcard.text Character scalar of the wildcard pattern to match for.
Expand Down Expand Up @@ -546,73 +589,15 @@ parseVariableWildcardForMerging <- function(wildcard.text, variable.names,
}

sanitizeSPSSVariableNames <- function(variable.names) {
variable.names <- trimPeriods(variable.names)
variable.names <- replaceReservedKeywords(variable.names)
variable.names <- truncateVariableNames(variable.names)

# Extra call to trimPeriods in case truncateVariableNames
# results in trailling periods
variable.names <- trimPeriods(variable.names)

variable.names <- deduplicateVariableNames(variable.names)
variable.names
}

trimPeriods <- function(variable.names)
{
# Can't begin with or end with a period
starts.or.ends.with.period <- startsWith(variable.names, ".") | endsWith(variable.names, ".")
if (any(starts.or.ends.with.period)) {
warning("Cannot save variables names which begin or end with '.'. Some variables have had '.' removed from their names: ",
paste0(variable.names[starts.or.ends.with.period], collapse = ", "))
variable.names[starts.or.ends.with.period] <- gsub("^\\.+", "", gsub("\\.+$", "", variable.names[starts.or.ends.with.period]))
}
variable.names
}

replaceReservedKeywords <- function(variable.names)
{
# SPSS variable names can't be reserved keywords
reserved.keywords <- c("ALL", "AND", "BY", "EQ", "GE", "GT", "LE", "LT", "NE", "NOT", "OR", "TO", "WITH")
forbidden.keywords <- variable.names %in% reserved.keywords
if (any(forbidden.keywords)) {
warning("Cannot save variables whose names are SPSS reserved keywords. The following variables have had '_r' added to their names:",
paste0(variable.names[forbidden.keywords], collapse = ", "))
variable.names[forbidden.keywords] <- paste0(variable.names[forbidden.keywords], "_r")
sanitized.names <- character(length(variable.names))
for (i in seq_along(variable.names)) {
sanitized.names[i] <- makeValidNameForSpss(variable.names[i],
existing.names = sanitized.names[seq_len(i - 1)],
delimiter = "_")
}
variable.names
sanitized.names
}

truncateVariableNames <- function(variable.names)
{
# SPSS variable names can't be longer than 64 bytes
bad.length <- nchar(variable.names, type = "bytes") > 64
if (any(bad.length)) {
warning("Some variable names were too long and have been truncated: ",
paste0(variable.names[bad.length], collapse = ", "))
variable.names[bad.length] <- vapply(variable.names[bad.length],
FUN = addSuffixFittingByteLimit,
FUN.VALUE = character(1))
}
variable.names
}

deduplicateVariableNames <- function(variable.names)
{
# SPSS variable names must be unique
dupes <- duplicated(tolower(variable.names))
if (any(dupes)) {
dupe.ind <- which(dupes)
for (i in dupe.ind) {
variable.names[i] <- uniqueName(variable.names[i],
existing.names = variable.names,
delimiter = "_")
}
}
variable.names
}


addSuffixFittingByteLimit <- function(string, suffix = "", byte.limit = 64) {
new.string <- paste0(string, suffix)
size <- nchar(new.string, type = "bytes")
Expand Down
21 changes: 9 additions & 12 deletions R/stacking.R
Original file line number Diff line number Diff line change
Expand Up @@ -1126,7 +1126,7 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata,
attr(v, "is.manually.stacked") <- is.manually.stacked[ind]
attr(v, "stacking.input.variable.names") <- input.v.names[group.ind]
attr(v, "stacking.input.variable.labels") <- input.v.labels[group.ind]
attr(v, "label") <- stackedVariableLabel(group.ind, input.v.labels, nm)
attr(v, "label") <- stackedVariableLabel(group.ind, input.v.labels)
val.attr <- stackedValueAttributes(group.ind,
input.data.set.metadata$variable.value.attributes)
if (!is.null(val.attr))
Expand Down Expand Up @@ -1167,7 +1167,7 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata,
attr(v, "labels") <- val.attr
class(v) <- c(class(v), "haven_labelled")
}
nm <- uniqueName(input.v.names[i], names(stacked.data.set))
nm <- makeValidNameForSpss(input.v.names[i], names(stacked.data.set))
stacked.data.set[[nm]] <- v
}
}
Expand All @@ -1179,8 +1179,8 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata,
attr(original.case, "is.stacked") <- FALSE
attr(original.case, "is.manually.stacked") <- NA
attr(original.case, "is.original.case") <- TRUE
stacked.data.set[[uniqueName("original_case",
names(stacked.data.set))]] <- original.case
nm <- makeValidNameForSpss("original_case", names(stacked.data.set))
stacked.data.set[[nm]] <- original.case
}

if (include.observation.variable && has.stacking)
Expand All @@ -1199,14 +1199,11 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata,
attr(observation, "is.stacked") <- FALSE
attr(observation, "is.manually.stacked") <- NA
attr(observation, "is.observation") <- TRUE

stacked.data.set[[uniqueName("observation",
names(stacked.data.set))]] <- observation
nm <- makeValidNameForSpss("observation", names(stacked.data.set))
stacked.data.set[[nm]] <- observation
}

stacked.data.set <- data.frame(stacked.data.set, check.names = FALSE)
colnames(stacked.data.set) <- sanitizeSPSSVariableNames(colnames(stacked.data.set))
stacked.data.set
data.frame(stacked.data.set, check.names = FALSE)
}

stackedVariableName <- function(group.ind, input.variable.names, taken.names)
Expand All @@ -1219,10 +1216,10 @@ stackedVariableName <- function(group.ind, input.variable.names, taken.names)
if (candidate == "")
candidate <- "stacked_var"

uniqueName(candidate, taken.names, "_")
makeValidNameForSpss(candidate, taken.names, "_")
}

stackedVariableLabel <- function(group.ind, input.variable.labels, stacked.variable.name)
stackedVariableLabel <- function(group.ind, input.variable.labels)
{
ind <- removeNA(group.ind)
lbl <- input.variable.labels[ind]
Expand Down
48 changes: 32 additions & 16 deletions tests/testthat/test-mergingandstackingutilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -126,26 +126,22 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", {

# Period at beginning
bad.names <- c(".A", ".B", ".C")
expect_warning(z <- sanitizeSPSSVariableNames(bad.names),
"Cannot save variables names which begin or end with '.'")
z <- sanitizeSPSSVariableNames(bad.names)
expect_equal(z, c("A", "B", "C"))

# Period at end
bad.names <- c("A.", "B.", "C.")
expect_warning(z <- sanitizeSPSSVariableNames(bad.names),
"Cannot save variables names which begin or end with '.'")
z <- sanitizeSPSSVariableNames(bad.names)
expect_equal(z, c("A", "B", "C"))

# Multiple periods
bad.names <- c("..A...", "..B...", "..C...")
expect_warning(z <- sanitizeSPSSVariableNames(bad.names),
"Cannot save variables names which begin or end with '.'")
z <- sanitizeSPSSVariableNames(bad.names)
expect_equal(z, c("A", "B", "C"))

# Restricted names
bad.names <- c("A", "B", "WITH")
expect_warning(z <- sanitizeSPSSVariableNames(bad.names),
"Cannot save variables whose names are SPSS reserved keywords.")
z <- sanitizeSPSSVariableNames(bad.names)
expect_equal(z, c("A", "B", "WITH_r"))

# Too long
Expand All @@ -154,22 +150,42 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", {
'L2LeisureActivitiesConsideration_OtherpleasespecifyL2LeisureActivitiesConsideration_Otherpleasespecify',
'L2LeisureActivitiesConsideration_Otherpleasespecify_0L2LeisureActivitiesConsideration_Otherpleasespecify_0',
'PQ4a_OtherwithchildrenathomepleasespecifyPQ4a_Otherwithchildrenathomepleasespecify')
expect_warning(z <- sanitizeSPSSVariableNames(bad.names),
"Some variable names were too long and have been truncated: ")
z <- sanitizeSPSSVariableNames(bad.names)
expect_true(all(nchar(z, type = 'bytes') <= 64))

# Too long, not ascii
bad.names <- c("トム・クルーズが嫌いな理由を10語以内で教えてください",
"春に訪れたい日本で一番好きな都市はどこですか")

expect_warning(z <- sanitizeSPSSVariableNames(bad.names),
"Some variable names were too long and have been truncated: ")
z <- sanitizeSPSSVariableNames(bad.names)
expect_true(all(nchar(z, type = 'bytes') <= 64))


# Prevent duplicates
bad.names <- c("A", "B", "WITH", "A", "B", "WITH")
expect_warning(z <- sanitizeSPSSVariableNames(bad.names),
"Cannot save variables whose names are SPSS reserved keywords")
z <- sanitizeSPSSVariableNames(bad.names)
expect_equal(z, c("A", "B", "WITH_r", "A_1", "B_1", "WITH_r_1"))

# Invalid starting characters
bad.names <- c("_A", "??B")
z <- sanitizeSPSSVariableNames(bad.names)
expect_equal(z, c("A", "B"))

# Spaces
bad.names <- c(" A \n B ")
z <- sanitizeSPSSVariableNames(bad.names)
expect_equal(z, c("AB"))

# Edge cases
bad.names <- c(" _.WITH.",
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab.a", # 65 bytes long
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabaa",
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabaa",
"_.",
"__.")
z <- sanitizeSPSSVariableNames(bad.names)
expect_equal(z, c("WITH_r",
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab",
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaba",
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_1",
"VAR",
"VAR_1"))
})
10 changes: 5 additions & 5 deletions tests/testthat/test-stacking.R
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes in this file fix an existing fail in the unit tests

Original file line number Diff line number Diff line change
Expand Up @@ -334,9 +334,9 @@ test_that("stackWithCommonLabels", {
variable.labels = v.labels,
variable.types = v.types,
variable.value.attributes = v.val.attr)),
"Some variables could not be stacked due to mismatching ",
paste0("Some variables could not be stacked due to mismatching ",
"variable types or value attributes. See Notes section in ",
"output for more details.")
"output for more details."))
expect_equal(stacking.groups,
structure(integer(0), .Dim = c(0L, 3L),
unstackable.names = list(c("Q2_A", "Q2_B", "Q2_C"))))
Expand All @@ -350,9 +350,9 @@ test_that("stackWithCommonLabels", {
variable.labels = v.labels,
variable.types = v.types,
variable.value.attributes = v.val.attr)),
"Some variables could not be stacked due to mismatching ",
paste0("Some variables could not be stacked due to mismatching ",
"variable types or value attributes. See Notes section in ",
"output for more details.")
"output for more details."))
expect_equal(stacking.groups,
structure(integer(0), .Dim = c(0L, 3L),
unstackable.names = list(c("Q2_A", "Q2_B", "Q2_C"))))
Expand Down Expand Up @@ -475,7 +475,7 @@ test_that("stackingSpecifiedByVariable", {
names(val.attr.2) <- letters[1:3]
v.val.attr.2 <- v.val.attr
v.val.attr.2[[2]] <- val.attr.2

## DS-4405: No longer a warning when some labels have duplicate attr. names {
expect_warning(
stacking.groups <- stackingSpecifiedByVariable(
Expand Down