From 951af7fda0c4cc7558011287775a8dfb0d929adf Mon Sep 17 00:00:00 2001 From: Justin Yap Date: Wed, 26 Nov 2025 09:29:06 +1100 Subject: [PATCH 1/2] Remove invalid characters when making name for sav files --- R/mergingandstackingutilities.R | 19 +++++++++++++++++++ .../test-mergingandstackingutilities.R | 5 +++++ 2 files changed, 24 insertions(+) diff --git a/R/mergingandstackingutilities.R b/R/mergingandstackingutilities.R index 57a78bc..2155200 100644 --- a/R/mergingandstackingutilities.R +++ b/R/mergingandstackingutilities.R @@ -521,6 +521,7 @@ makeValidNameForSpss <- function(input.name, existing.names, delimiter = "") { input.name |> removeWhitespace() |> + removeInvalidCharacters() |> removeInvalidStartingCharacters() |> truncateNameToByteLimit() |> trimTrailingPeriods() |> @@ -534,6 +535,24 @@ removeWhitespace <- function(name) gsub("\\s+", "", name) } +removeInvalidCharacters <- function(name) +{ + # Invalid characters are any ASCII (including extended ASCII, i.e., codes 0-255) that are not + # letters, numbers, or the special characters @ # $ _ \ . + # Note that Unicode is permitted in SPSS variable names. + # The regex was created by combining the ranges of the invalid characters. The valid ASCII characters are: + # 0-9 \x30-\x39 + # A-Z \x41-\x5A + # a-z \x61-\x7A + # Character # \x23 + # Character $ \x24 + # Character . 
\x2E + # Character @ \x40 + # Character \ \x5C + # Character _ \x5F + gsub("[\\x00-\\x22\\x25-\\x2D\\x2F\\x3A-\\x3F\\x5B\\x5D-\\x5E\\x60\\x7B-\\xFF]", "", name, perl = TRUE) +} + removeInvalidStartingCharacters <- function(name) { gsub("^[^a-zA-Z@]+", "", name) diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R index cea2ae7..edd744e 100644 --- a/tests/testthat/test-mergingandstackingutilities.R +++ b/tests/testthat/test-mergingandstackingutilities.R @@ -189,3 +189,8 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", { "VAR", "VAR_1")) }) + +test_that("removeInvalidCharacters", { + expect_equal(removeInvalidCharacters(intToUtf8(0:255)), "#$.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\_abcdefghijklmnopqrstuvwxyz") + expect_equal(removeInvalidCharacters("名称"), "名称") # unicode characters are allowed +}) \ No newline at end of file From 826fd0d1aef993e29895c7848bb8fe0b1177ec8f Mon Sep 17 00:00:00 2001 From: Justin Yap Date: Wed, 26 Nov 2025 14:44:52 +1100 Subject: [PATCH 2/2] Remove characters if not supported by either Displayr or haven --- R/mergingandstackingutilities.R | 23 ++++++++----------- .../test-mergingandstackingutilities.R | 14 ++++++++--- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/R/mergingandstackingutilities.R b/R/mergingandstackingutilities.R index 2155200..77de2f3 100644 --- a/R/mergingandstackingutilities.R +++ b/R/mergingandstackingutilities.R @@ -537,20 +537,15 @@ removeWhitespace <- function(name) removeInvalidCharacters <- function(name) { - # Invalid characters are any ASCII (including extended ASCII, i.e., codes 0-255) that are not - # letters, numbers, or the special characters @ # $ _ \ . - # Note that Unicode is permitted in SPSS variable names. - # The regex was created by combining the ranges of the invalid characters. 
The valid ASCII characters are: - # 0-9 \x30-\x39 - # A-Z \x41-\x5A - # a-z \x61-\x7A - # Character # \x23 - # Character $ \x24 - # Character . \x2E - # Character @ \x40 - # Character \ \x5C - # Character _ \x5F - gsub("[\\x00-\\x22\\x25-\\x2D\\x2F\\x3A-\\x3F\\x5B\\x5D-\\x5E\\x60\\x7B-\\xFF]", "", name, perl = TRUE) + # The regex matches all characters except: + # \\pL = any kind of letter from any language + # Numeric characters 0-9 + # \\p{Sc} = any kind of currency symbol + # The special characters \ . _ $ # @ + # + # This is stricter than either haven or Displayr alone, because the two allow different sets of characters; only characters accepted by both are kept. + # See unit tests for removeInvalidCharacters + gsub("[^\\pL0-9\\p{Sc}\\\\._$#@]", "", name, perl = TRUE) } removeInvalidStartingCharacters <- function(name) diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R index edd744e..c7e3335 100644 --- a/tests/testthat/test-mergingandstackingutilities.R +++ b/tests/testthat/test-mergingandstackingutilities.R @@ -191,6 +191,14 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", { }) test_that("removeInvalidCharacters", { - expect_equal(removeInvalidCharacters(intToUtf8(0:255)), "#$.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\_abcdefghijklmnopqrstuvwxyz") - expect_equal(removeInvalidCharacters("名称"), "名称") # unicode characters are allowed -}) \ No newline at end of file + expect_equal(removeInvalidCharacters(intToUtf8(0:127)), "#$.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\_abcdefghijklmnopqrstuvwxyz") # check characters in basic ASCII + expect_equal(removeInvalidCharacters("ç"), "ç") # letter characters in extended ASCII are allowed + expect_equal(removeInvalidCharacters("½"), "") # number characters in extended ASCII are removed (allowed by haven but not Displayr) + expect_equal(removeInvalidCharacters("¥"), "¥") # currency characters in extended ASCII are allowed + expect_equal(removeInvalidCharacters("…"), "") # 
punctuation unicode characters (the ellipsis is U+2026, outside the 0-255 range) are removed (allowed by Displayr but not haven) + expect_equal(removeInvalidCharacters("©"), "") # other characters in extended ASCII are removed (allowed by haven but not Displayr) + expect_equal(removeInvalidCharacters("名称"), "名称") # "letter" unicode characters are allowed + expect_equal(removeInvalidCharacters("∞"), "") # math symbol characters in unicode are removed (allowed by haven but not Displayr) + expect_equal(removeInvalidCharacters("€"), "€") # currency unicode characters are allowed + expect_equal(removeInvalidCharacters("¿"), "") # punctuation characters in extended ASCII (U+00BF) are removed (not allowed by either) +})