From 951af7fda0c4cc7558011287775a8dfb0d929adf Mon Sep 17 00:00:00 2001
From: Justin Yap <justin.yap@displayr.com>
Date: Wed, 26 Nov 2025 09:29:06 +1100
Subject: [PATCH 1/2] Remove invalid characters when making name for sav files

---
 R/mergingandstackingutilities.R               | 19 +++++++++++++++++++
 .../test-mergingandstackingutilities.R        |  5 +++++
 2 files changed, 24 insertions(+)

diff --git a/R/mergingandstackingutilities.R b/R/mergingandstackingutilities.R
index 57a78bc..2155200 100644
--- a/R/mergingandstackingutilities.R
+++ b/R/mergingandstackingutilities.R
@@ -521,6 +521,7 @@ makeValidNameForSpss <- function(input.name, existing.names, delimiter = "")
 {
     input.name |>
         removeWhitespace() |>
+        removeInvalidCharacters() |>
         removeInvalidStartingCharacters() |>
         truncateNameToByteLimit() |>
         trimTrailingPeriods() |>
@@ -534,6 +535,24 @@ removeWhitespace <- function(name)
     gsub("\\s+", "", name)
 }
 
+removeInvalidCharacters <- function(name)
+{
+    # Invalid characters are any ASCII (including extended ASCII, i.e., codes 0-255) that are not
+    # letters, numbers, or the special characters @ # $ _ \ .
+    # Note that Unicode is permitted in SPSS variable names.
+    # The regex was created by combining the ranges of the invalid characters. The valid ASCII characters are:
+    # 0-9 \x30-\x39
+    # A-Z \x41-\x5A
+    # a-z \x61-\x7A
+    # Character # \x23
+    # Character $ \x24
+    # Character . \x2E
+    # Character @ \x40
+    # Character \ \x5C
+    # Character _ \x5F
+    gsub("[\\x00-\\x22\\x25-\\x2D\\x2F\\x3A-\\x3F\\x5B\\x5D-\\x5E\\x60\\x7B-\\xFF]", "", name, perl = TRUE)
+}
+
 removeInvalidStartingCharacters <- function(name)
 {
     gsub("^[^a-zA-Z@]+", "", name)
diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R
index cea2ae7..edd744e 100644
--- a/tests/testthat/test-mergingandstackingutilities.R
+++ b/tests/testthat/test-mergingandstackingutilities.R
@@ -189,3 +189,8 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", {
                       "VAR",
                       "VAR_1"))
 })
+
+test_that("removeInvalidCharacters", {
+    expect_equal(removeInvalidCharacters(intToUtf8(0:255)), "#$.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\_abcdefghijklmnopqrstuvwxyz")
+    expect_equal(removeInvalidCharacters("名称"), "名称") # unicode characters are allowed
+})
\ No newline at end of file

From 826fd0d1aef993e29895c7848bb8fe0b1177ec8f Mon Sep 17 00:00:00 2001
From: Justin Yap <justin.yap@displayr.com>
Date: Wed, 26 Nov 2025 14:44:52 +1100
Subject: [PATCH 2/2] Remove characters if not supported by either Displayr or
 haven

---
 R/mergingandstackingutilities.R               | 23 ++++++++-----------
 .../test-mergingandstackingutilities.R        | 14 ++++++++---
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/R/mergingandstackingutilities.R b/R/mergingandstackingutilities.R
index 2155200..77de2f3 100644
--- a/R/mergingandstackingutilities.R
+++ b/R/mergingandstackingutilities.R
@@ -537,20 +537,15 @@ removeWhitespace <- function(name)
 
 removeInvalidCharacters <- function(name)
 {
-    # Invalid characters are any ASCII (including extended ASCII, i.e., codes 0-255) that are not
-    # letters, numbers, or the special characters @ # $ _ \ .
-    # Note that Unicode is permitted in SPSS variable names.
-    # The regex was created by combining the ranges of the invalid characters. The valid ASCII characters are:
-    # 0-9 \x30-\x39
-    # A-Z \x41-\x5A
-    # a-z \x61-\x7A
-    # Character # \x23
-    # Character $ \x24
-    # Character . \x2E
-    # Character @ \x40
-    # Character \ \x5C
-    # Character _ \x5F
-    gsub("[\\x00-\\x22\\x25-\\x2D\\x2F\\x3A-\\x3F\\x5B\\x5D-\\x5E\\x60\\x7B-\\xFF]", "", name, perl = TRUE)
+    # The regex matches all characters except:
+    #   \\pL = any kind of letter from any language
+    #   Numeric characters 0-9
+    #   \\p{Sc} = any kind of currency symbol
+    #   The special characters \ . _ $ # @
+    #
+    # This is stricter than either haven or Displayr alone, because their sets of allowed characters differ, so only characters accepted by both are kept.
+    # See unit tests for removeInvalidCharacters
+    gsub("[^\\pL0-9\\p{Sc}\\\\._$#@]", "", name, perl = TRUE)
 }
 
 removeInvalidStartingCharacters <- function(name)
diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R
index edd744e..c7e3335 100644
--- a/tests/testthat/test-mergingandstackingutilities.R
+++ b/tests/testthat/test-mergingandstackingutilities.R
@@ -191,6 +191,14 @@ test_that("DS-4210: SPSS variable names sanitized before attempting to save", {
 })
 
 test_that("removeInvalidCharacters", {
-    expect_equal(removeInvalidCharacters(intToUtf8(0:255)), "#$.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\_abcdefghijklmnopqrstuvwxyz")
-    expect_equal(removeInvalidCharacters("名称"), "名称") # unicode characters are allowed
-})
\ No newline at end of file
+    expect_equal(removeInvalidCharacters(intToUtf8(0:127)), "#$.0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\_abcdefghijklmnopqrstuvwxyz") # check characters in basic ASCII
+    expect_equal(removeInvalidCharacters("ç"), "ç") # letter characters in extended ASCII are allowed
+    expect_equal(removeInvalidCharacters("½"), "") # number characters in extended ASCII are removed (allowed by haven but not Displayr)
+    expect_equal(removeInvalidCharacters("¥"), "¥") # currency characters in extended ASCII are allowed
+    expect_equal(removeInvalidCharacters("…"), "") # punctuation characters in extended ASCII are removed (allowed by Displayr but not haven)
+    expect_equal(removeInvalidCharacters("©"), "") # other characters in extended ASCII are removed (allowed by haven but not Displayr)
+    expect_equal(removeInvalidCharacters("名称"), "名称") # "letter" unicode characters are allowed
+    expect_equal(removeInvalidCharacters("∞"), "") # math symbol unicode characters are removed (allowed by haven but not Displayr)
+    expect_equal(removeInvalidCharacters("€"), "€") # currency unicode characters are allowed
+    expect_equal(removeInvalidCharacters("¿"), "") # punctuation unicode characters are removed (not allowed by either)
+})