diff --git a/R/mergingandstackingutilities.R b/R/mergingandstackingutilities.R index 57a78bc..77de2f3 100644 --- a/R/mergingandstackingutilities.R +++ b/R/mergingandstackingutilities.R @@ -521,6 +521,7 @@ makeValidNameForSpss <- function(input.name, existing.names, delimiter = "") { input.name |> removeWhitespace() |> + removeInvalidCharacters() |> removeInvalidStartingCharacters() |> truncateNameToByteLimit() |> trimTrailingPeriods() |> @@ -534,6 +535,19 @@ removeWhitespace <- function(name) gsub("\\s+", "", name) } +removeInvalidCharacters <- function(name) +{ + # Removes every character matched by the regex, which matches all characters except: + # \\pL = any kind of letter from any language + # Numeric characters 0-9 + # \\p{Sc} = any kind of currency symbol + # The special characters \ . _ $ # @ + # + # This is stricter than either haven or Displayr, because their sets of allowed characters differ. + # See unit tests for removeInvalidCharacters + gsub("[^\\pL0-9\\p{Sc}\\\\._$#@]", "", name, perl = TRUE) +} + removeInvalidStartingCharacters <- function(name) { gsub("^[^a-zA-Z@]+", "", name) diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R index cea2ae7..c7e3335 100644 --- a/tests/testthat/test-mergingandstackingutilities.R +++ b/tests/testthat/test-mergingandstackingutilities.R @@ -189,3 +189,16 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", { "VAR", "VAR_1")) }) + +test_that("removeInvalidCharacters", { + expect_equal(removeInvalidCharacters(intToUtf8(0:127)), "#$.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\_abcdefghijklmnopqrstuvwxyz") # check characters in basic ASCII + expect_equal(removeInvalidCharacters("ç"), "ç") # letter characters in extended ASCII are allowed + expect_equal(removeInvalidCharacters("½"), "") # number characters in extended ASCII are removed (allowed by haven but not Displayr) + expect_equal(removeInvalidCharacters("¥"), "¥") # currency characters in extended ASCII are allowed + 
expect_equal(removeInvalidCharacters("…"), "") # punctuation characters in extended ASCII are removed (allowed by Displayr but not haven) + expect_equal(removeInvalidCharacters("©"), "") # other characters in extended ASCII are removed (allowed by haven but not Displayr) + expect_equal(removeInvalidCharacters("名称"), "名称") # "letter" unicode characters are allowed + expect_equal(removeInvalidCharacters("∞"), "") # math symbol characters in unicode are removed (allowed by haven but not Displayr) + expect_equal(removeInvalidCharacters("€"), "€") # currency unicode characters are allowed + expect_equal(removeInvalidCharacters("¿"), "") # punctuation unicode characters are removed (not allowed by either) +})