From f385330550a0e2424a9af301d489cc449eb4c458 Mon Sep 17 00:00:00 2001 From: michaelv2 <1800075+michaelv2@users.noreply.github.com> Date: Wed, 8 Apr 2026 09:10:48 -0400 Subject: [PATCH] Add weights_only=True to all torch.load() calls PyTorch's pickle-based deserialization can execute arbitrary code when loading a crafted .pt file. Adding weights_only=True restricts deserialization to tensor data only, preventing this class of attack. This has been the recommended practice since the option was introduced. Note that CVE-2025-32434 demonstrates weights_only=True alone is insufficient on PyTorch < 2.6; users should also upgrade to PyTorch >= 2.6.0 for full protection. Affected call sites: - dictionary.py: AutoEncoder, GatedAutoEncoder, JumpReluAutoEncoder, AutoEncoderNew from_pretrained() - trainers/top_k.py: AutoEncoderTopK from_pretrained() - trainers/batch_top_k.py: BatchTopKSAE from_pretrained() - trainers/matryoshka_batch_top_k.py: MatryoshkaBatchTopKSAE from_pretrained() - activault_s3_buffer.py: compile() --- dictionary_learning/activault_s3_buffer.py | 2 +- dictionary_learning/dictionary.py | 8 ++++---- dictionary_learning/trainers/batch_top_k.py | 2 +- dictionary_learning/trainers/matryoshka_batch_top_k.py | 2 +- dictionary_learning/trainers/top_k.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dictionary_learning/activault_s3_buffer.py b/dictionary_learning/activault_s3_buffer.py index 1b94a7e..0722b3b 100644 --- a/dictionary_learning/activault_s3_buffer.py +++ b/dictionary_learning/activault_s3_buffer.py @@ -118,7 +118,7 @@ def compile(byte_buffers, shuffle=True, seed=None, return_ids=False): # t = torch.from_numpy(n) # t = torch.frombuffer(combined_bytes, dtype=dtype) # torch.float32 buffer = io.BytesIO(combined_bytes) - t = torch.load(buffer) + t = torch.load(buffer, weights_only=True) if ( isinstance(t, dict) and "states" in t and not return_ids ): # backward compatibility diff --git a/dictionary_learning/dictionary.py b/dictionary_learning/dictionary.py index 238a866..5076cdf 100644 --- a/dictionary_learning/dictionary.py +++
b/dictionary_learning/dictionary.py @@ -129,7 +129,7 @@ def from_pretrained(cls, path, dtype=t.float, device=None, normalize_decoder=Tru """ Load a pretrained autoencoder from a file. """ - state_dict = t.load(path) + state_dict = t.load(path, weights_only=True) dict_size, activation_dim = state_dict["encoder.weight"].shape autoencoder = cls(activation_dim, dict_size) autoencoder.load_state_dict(state_dict) @@ -279,7 +279,7 @@ def from_pretrained(path, device=None): """ Load a pretrained autoencoder from a file. """ - state_dict = t.load(path) + state_dict = t.load(path, weights_only=True) dict_size, activation_dim = state_dict["encoder.weight"].shape autoencoder = GatedAutoEncoder(activation_dim, dict_size) autoencoder.load_state_dict(state_dict) @@ -358,7 +358,7 @@ def from_pretrained( loading function. """ if not load_from_sae_lens: - state_dict = t.load(path) + state_dict = t.load(path, weights_only=True) activation_dim, dict_size = state_dict["W_enc"].shape autoencoder = JumpReluAutoEncoder(activation_dim, dict_size) autoencoder.load_state_dict(state_dict) @@ -429,7 +429,7 @@ def from_pretrained(path, device=None): """ Load a pretrained autoencoder from a file. 
""" - state_dict = t.load(path) + state_dict = t.load(path, weights_only=True) dict_size, activation_dim = state_dict["encoder.weight"].shape autoencoder = AutoEncoderNew(activation_dim, dict_size) autoencoder.load_state_dict(state_dict) diff --git a/dictionary_learning/trainers/batch_top_k.py b/dictionary_learning/trainers/batch_top_k.py index 8cb2ecf..59c1675 100644 --- a/dictionary_learning/trainers/batch_top_k.py +++ b/dictionary_learning/trainers/batch_top_k.py @@ -79,7 +79,7 @@ def scale_biases(self, scale: float): @classmethod def from_pretrained(cls, path, k=None, device=None, **kwargs) -> "BatchTopKSAE": - state_dict = t.load(path) + state_dict = t.load(path, weights_only=True) dict_size, activation_dim = state_dict["encoder.weight"].shape if k is None: k = state_dict["k"].item() diff --git a/dictionary_learning/trainers/matryoshka_batch_top_k.py b/dictionary_learning/trainers/matryoshka_batch_top_k.py index 03c195b..e81d1b6 100644 --- a/dictionary_learning/trainers/matryoshka_batch_top_k.py +++ b/dictionary_learning/trainers/matryoshka_batch_top_k.py @@ -121,7 +121,7 @@ def scale_biases(self, scale: float): def from_pretrained( cls, path, k=None, device=None, **kwargs ) -> "MatryoshkaBatchTopKSAE": - state_dict = t.load(path) + state_dict = t.load(path, weights_only=True) activation_dim, dict_size = state_dict["W_enc"].shape if k is None: k = state_dict["k"].item() diff --git a/dictionary_learning/trainers/top_k.py b/dictionary_learning/trainers/top_k.py index e81259f..712d91d 100644 --- a/dictionary_learning/trainers/top_k.py +++ b/dictionary_learning/trainers/top_k.py @@ -137,7 +137,7 @@ def from_pretrained(path, k: Optional[int] = None, device=None): """ Load a pretrained autoencoder from a file. """ - state_dict = t.load(path) + state_dict = t.load(path, weights_only=True) dict_size, activation_dim = state_dict["encoder.weight"].shape if k is None: