
Commit fa57b0b

Refactor ActivationCache and NormalizableMixin for improved clarity and functionality
This commit introduces the following changes:

- Sequence ranges are now always stored when collecting an activation cache with store_tokens=True and shuffle_shards=False; the BOS-token check is no longer required.
- Updated `NormalizableMixin` so that variance calculations are performed along the last dimension, with shape assertions added for clarity. For the crosscoder this means the variance is computed per layer.
- Adjusted the normalization and denormalization methods to maintain tensor shapes correctly during these operations.
- Enhanced logging in `CrossCoderTrainer` to include layer-wise RMS norms for better monitoring of training dynamics.

These modifications improve the clarity and maintainability of the code while ensuring correct behaviour in activation caching and normalization.
1 parent 48debb2 commit fa57b0b

4 files changed (+24, -17 lines)


dictionary_learning/cache.py
Lines changed: 1 addition & 4 deletions

@@ -531,12 +531,9 @@ def collect(
             not shuffle_shards or not store_tokens
         ), "Shuffling shards and storing tokens is not supported yet"

-        # Check if we need to store sequence ranges
-        has_bos_token = model.tokenizer.bos_token is not None
         store_sequence_ranges = (
             store_tokens and
-            not shuffle_shards and
-            not has_bos_token
+            not shuffle_shards
         )

         dataloader = DataLoader(data, batch_size=batch_size, num_workers=num_workers)
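For reference, the new condition keeps sequence ranges whenever tokens are stored and shards are not shuffled, independent of whether the tokenizer has a BOS token. A minimal sketch of that rule (the helper name below is hypothetical, not part of the repository):

def should_store_sequence_ranges(store_tokens: bool, shuffle_shards: bool) -> bool:
    # Mirrors the updated expression in collect(): the BOS-token check is gone.
    return store_tokens and not shuffle_shards

assert should_store_sequence_ranges(store_tokens=True, shuffle_shards=False)
assert not should_store_sequence_ranges(store_tokens=False, shuffle_shards=False)
assert not should_store_sequence_ranges(store_tokens=True, shuffle_shards=True)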

dictionary_learning/dictionary.py
Lines changed: 6 additions & 5 deletions

@@ -45,7 +45,7 @@ def __init__(
             normalization is a no-op.
         activation_shape: Shape of the activation tensor. Required if activation_mean and activation_std are None for proper initialization and registration of the buffers.
         keep_relative_variance: If True, performs global scaling so that the
-            sum of variances is 1 while their relative magnitudes stay unchanged. If false we normalize neuron-wise.
+            sum of variances is 1 while their relative magnitudes stay unchanged. If false we normalize neuron-wise. We normalize the last dimension.
         target_rms: Target RMS for input activation normalization.
         """
         super().__init__()
@@ -69,11 +69,12 @@ def __init__(
             self.register_buffer("activation_std", th.nan * th.ones(activation_shape))

         if self.keep_relative_variance and self.has_activation_normalizer:
-            total_var = (self.activation_std**2).sum()
+            total_var = (self.activation_std**2).sum(dim=-1)
+            assert total_var.shape == self.activation_mean.shape[:-1]
             activation_global_scale = self.target_rms / th.sqrt(total_var + 1e-8)
             self.register_buffer("activation_global_scale", activation_global_scale)
         else:
-            self.register_buffer("activation_global_scale", th.tensor(1.0))
+            self.register_buffer("activation_global_scale", th.ones(activation_shape[:-1]))

     @property
     def has_activation_normalizer(self) -> bool:
@@ -103,7 +104,7 @@ def normalize_activations(self, x: th.Tensor, inplace: bool = False) -> th.Tensor:
         x = x - self.activation_mean

         if self.keep_relative_variance:
-            return x * self.activation_global_scale
+            return (x.T * self.activation_global_scale).T
         else:
             return x / (self.activation_std + 1e-8)
         return x
@@ -127,7 +128,7 @@ def denormalize_activations(self, x: th.Tensor, inplace: bool = False) -> th.Tensor:
         assert isinstance(self.activation_std, th.Tensor)

         if self.keep_relative_variance:
-            x = x / (self.activation_global_scale + 1e-8)
+            x = (x.T / (self.activation_global_scale + 1e-8)).T
         else:
             x = x * (self.activation_std + 1e-8)
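The effect of summing variances over the last dimension is one global scale per leading index, i.e. per layer for a crosscoder, instead of a single scalar. A self-contained sketch of that behaviour, assuming statistics of shape (num_layers, d_model) and activations of shape (batch, num_layers, d_model); broadcasting is done here with an explicit unsqueeze rather than the transpose trick used in the diff:

import torch as th

batch, num_layers, d_model = 8, 2, 16
target_rms = 1.0

# Per-(layer, neuron) statistics, as stored in the mixin's buffers (assumed shapes).
activation_mean = th.randn(num_layers, d_model)
activation_std = th.rand(num_layers, d_model) + 0.5

# Variance summed over the last (neuron) dimension gives one total per layer.
total_var = (activation_std**2).sum(dim=-1)                        # (num_layers,)
assert total_var.shape == activation_mean.shape[:-1]
activation_global_scale = target_rms / th.sqrt(total_var + 1e-8)   # (num_layers,)

x = th.randn(batch, num_layers, d_model)
# Normalize: subtract the mean, then rescale each layer by its own global scale.
x_norm = (x - activation_mean) * activation_global_scale.unsqueeze(-1)
# Denormalize: undo the scaling and add the mean back; shapes are preserved throughout.
x_back = x_norm / (activation_global_scale.unsqueeze(-1) + 1e-8) + activation_mean
assert x_norm.shape == x.shape and th.allclose(x_back, x, atol=1e-4)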

dictionary_learning/trainers/crosscoder.py
Lines changed: 10 additions & 7 deletions

@@ -204,17 +204,20 @@ def loss(
         if not logging:
             return loss
         else:
+            log_dict = {
+                "l2_loss": l2_loss.item(),
+                "mse_loss": mse_loss.item(),
+                "sparsity_loss": l1_loss.item(),
+                "loss": loss.item(),
+                "deads": deads if return_deads else None,
+            }
+            for layer in range(x.shape[1]):
+                log_dict[f"rms_norm_l{layer}"] = th.sqrt((x[:, layer, :].pow(2).sum(-1)).mean()).item()
             return namedtuple("LossLog", ["x", "x_hat", "f", "losses"])(
                 x,
                 x_hat,
                 f,
-                {
-                    "l2_loss": l2_loss.item(),
-                    "mse_loss": mse_loss.item(),
-                    "sparsity_loss": l1_loss.item(),
-                    "loss": loss.item(),
-                    "deads": deads if return_deads else None,
-                },
+                log_dict,
             )

     def update(self, step, activations):
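Each new entry logs the square root of the batch-mean squared L2 norm of that layer's activations, which makes it easy to spot layers whose activation scale drifts during training. A standalone sketch of the same expression, with made-up tensor sizes:

import torch as th

x = th.randn(32, 2, 512)  # (batch, num_layers, d_model); sizes are illustrative only
log_dict = {}
for layer in range(x.shape[1]):
    # sqrt of the batch-mean of each sample's squared L2 norm for this layer.
    log_dict[f"rms_norm_l{layer}"] = th.sqrt(x[:, layer, :].pow(2).sum(-1).mean()).item()
print(log_dict)  # e.g. {"rms_norm_l0": ..., "rms_norm_l1": ...}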

dictionary_learning/training.py
Lines changed: 7 additions & 1 deletion

@@ -89,6 +89,7 @@ def log_stats(
     stage: str = "train",
     use_threshold: bool = True,
     epoch_idx_per_step: Optional[List[int]] = None,
+    num_tokens: int = None,
 ):
     with th.no_grad():
         log = {}
@@ -111,6 +112,8 @@ def log_stats(

         if epoch_idx_per_step is not None:
             log["epoch"] = epoch_idx_per_step[step]
+        if num_tokens is not None:
+            log["num_tokens"] = num_tokens
         wandb.log(log, step=step)


@@ -285,11 +288,12 @@ def trainSAE(
         with open(os.path.join(save_dir, "config.json"), "w") as f:
             json.dump(config, f, indent=4)

+    num_tokens = 0
     for step, act in enumerate(tqdm(data, total=steps)):
         if steps is not None and step >= steps:
             break
         act = act.to(trainer.device).to(dtype)
-
+        num_tokens += act.shape[0]
         # logging
         if log_steps is not None and step % log_steps == 0 and step != 0:
             with th.no_grad():
@@ -301,6 +305,7 @@ def trainSAE(
                     transcoder,
                     use_threshold=False,
                     epoch_idx_per_step=epoch_idx_per_step,
+                    num_tokens=num_tokens,
                 )
             if isinstance(trainer, BatchTopKCrossCoderTrainer) or isinstance(trainer, BatchTopKTrainer):
                 log_stats(
@@ -312,6 +317,7 @@ def trainSAE(
                     use_threshold=True,
                     stage="trainthres",
                     epoch_idx_per_step=epoch_idx_per_step,
+                    num_tokens=num_tokens,
                 )

     # saving