Refactor and clean up unused code

s1lent4gnt · s1lent4gnt · commit 7067e2dcf6fa · 2025-09-01T15:25:21.000+02:00
diff --git a/src/lerobot/policies/__init__.py b/src/lerobot/policies/__init__.py
@@ -14,8 +14,8 @@
 
 from .act.configuration_act import ACTConfig as ACTConfig
 from .diffusion.configuration_diffusion import DiffusionConfig as DiffusionConfig
+from .octo.configuration_octo import OctoConfig as OctoConfig
 from .pi0.configuration_pi0 import PI0Config as PI0Config
 from .smolvla.configuration_smolvla import SmolVLAConfig as SmolVLAConfig
 from .tdmpc.configuration_tdmpc import TDMPCConfig as TDMPCConfig
 from .vqbet.configuration_vqbet import VQBeTConfig as VQBeTConfig
-from .octo.configuration_octo import OctoConfig as OctoConfig
diff --git a/src/lerobot/policies/factory.py b/src/lerobot/policies/factory.py
@@ -26,13 +26,13 @@
 from lerobot.envs.utils import env_to_policy_features
 from lerobot.policies.act.configuration_act import ACTConfig
 from lerobot.policies.diffusion.configuration_diffusion import DiffusionConfig
+from lerobot.policies.octo.configuration_octo import OctoConfig
 from lerobot.policies.pi0.configuration_pi0 import PI0Config
 from lerobot.policies.pi0fast.configuration_pi0fast import PI0FASTConfig
 from lerobot.policies.pretrained import PreTrainedPolicy
 from lerobot.policies.sac.configuration_sac import SACConfig
 from lerobot.policies.sac.reward_model.configuration_classifier import RewardClassifierConfig
 from lerobot.policies.smolvla.configuration_smolvla import SmolVLAConfig
-from lerobot.policies.octo.configuration_octo import OctoConfig
 from lerobot.policies.tdmpc.configuration_tdmpc import TDMPCConfig
 from lerobot.policies.vqbet.configuration_vqbet import VQBeTConfig
 
diff --git a/src/lerobot/policies/octo/configuration_octo.py b/src/lerobot/policies/octo/configuration_octo.py
@@ -15,7 +15,7 @@
 from dataclasses import dataclass, field
 
 from lerobot.configs.policies import PreTrainedConfig
-from lerobot.configs.types import FeatureType, NormalizationMode, PolicyFeature
+from lerobot.configs.types import NormalizationMode
 from lerobot.optim.optimizers import AdamWConfig
 from lerobot.optim.schedulers import (
     CosineDecayWithWarmupSchedulerConfig,
@@ -31,12 +31,12 @@ class OctoConfig(PreTrainedConfig):
     num_layers: int = 12
     num_heads: int = 12
     mlp_dim: int = 3072
-    
+
     # Input / output structure
     n_obs_steps: int = 1
     chunk_size: int = 10  # max_horizon in octo
     n_action_steps: int = 4  # action_horizon in octo
-    
+
     # Normalization
     normalization_mapping: dict[str, NormalizationMode] = field(
         default_factory=lambda: {
@@ -47,22 +47,22 @@ class OctoConfig(PreTrainedConfig):
     )
 
     push_to_hub: bool = False
-    
+
     # Image preprocessing
     resize_primary_image: tuple[int, int] = (256, 256)
     resize_wrist_image: tuple[int, int] = (128, 128)
-    
+
     # Language model
     language_model_name: str = "t5-base"
     language_max_length: int = 16
     freeze_language_encoder: bool = True
-    
+
     # Transformer settings
     repeat_task_tokens: bool = True
     dropout_rate: float = 0.0
     attention_dropout_rate: float = 0.0
     add_position_embedding: bool = False
-    
+
     # Diffusion settings
     diffusion_steps: int = 20
     n_diffusion_samples: int = 1
@@ -73,26 +73,26 @@ class OctoConfig(PreTrainedConfig):
     num_blocks: int = 3
     hidden_dim: int = 256
     use_layer_norm: bool = True
-    
+
     # Finetuning settings
     freeze_transformer: bool = False
     freeze_vision_encoder: bool = True
     train_action_head_only: bool = False
-    
+
     # Training presets
     optimizer_lr: float = 1e-4
     optimizer_betas: tuple[float, float] = (0.9, 0.999)
     optimizer_eps: float = 1e-8
     optimizer_weight_decay: float = 1e-4
     optimizer_grad_clip_norm: float = 10.0
-    
+
     scheduler_warmup_steps: int = 1_000
     scheduler_decay_steps: int = 100_000
     scheduler_decay_lr: float = 1e-5
 
     def __post_init__(self):
         super().__post_init__()
-        
+
         # Set architecture parameters based on model_name
         if self.model_name == "octo-base":
             self.token_embedding_size = 768
@@ -106,7 +106,7 @@ def __post_init__(self):
             self.mlp_dim = 1536
         else:
             raise ValueError(f"Unknown model name: {self.model_name}")
-        
+
         # Input validation
         if self.n_action_steps > self.chunk_size:
             raise ValueError(
diff --git a/src/lerobot/policies/octo/diffusion.py b/src/lerobot/policies/octo/diffusion.py
@@ -19,7 +19,6 @@
 
 import torch
 import torch.nn as nn
-
 from einops import rearrange
 
 from lerobot.policies.octo.base import TokenGroup
diff --git a/src/lerobot/policies/octo/modeling_octo.py b/src/lerobot/policies/octo/modeling_octo.py
@@ -43,23 +43,20 @@
 
 import torch
 import torch.nn as nn
-
 from torch import Tensor
 
-from lerobot.constants import ACTION, OBS_STATE
+from lerobot.constants import ACTION
 from lerobot.policies.normalize import Normalize, Unnormalize
 from lerobot.policies.octo.configuration_octo import OctoConfig
-
-from lerobot.policies.pretrained import PreTrainedPolicy
-from lerobot.policies.utils import populate_queues, log_model_loading_keys
-
+from lerobot.policies.octo.diffusion import DiffusionActionHead
 from lerobot.policies.octo.tokenizers import TextProcessor
 from lerobot.policies.octo.transformer import OctoWithoutHead
-from lerobot.policies.octo.diffusion import DiffusionActionHead
-
+from lerobot.policies.pretrained import PreTrainedPolicy
+from lerobot.policies.utils import log_model_loading_keys, populate_queues
 
 # TODO(lilkm): Be aware of normalization the image tokenizer (normalize_images function)
 
+
 class OctoPolicy(PreTrainedPolicy):
     """Wrapper class around Octo model to train and run inference within LeRobot."""
 
@@ -110,16 +107,16 @@ def reset(self):
         self._queues = {
             ACTION: deque(maxlen=self.config.n_action_steps),
         }
- 
+
     def _apply_selective_freezing(self):
         """Apply selective freezing based on configuration settings."""
-        if hasattr(self.model.octo_transformer, 'task_tokenizers'):
+        if hasattr(self.model.octo_transformer, "task_tokenizers"):
             for name, tokenizer in self.model.octo_transformer.task_tokenizers.items():
-                if name == 'language_instruction':
+                if name == "language_instruction":
                     for param in tokenizer.parameters():
                         param.requires_grad = False
-                    print(f"✓ T5 language encoder frozen (always frozen during finetuning)")
-        
+                    print("✓ T5 language encoder frozen (always frozen during finetuning)")
+
         # If train_action_head_only is True, freeze everything except the action head
         if self.config.train_action_head_only:
             # Freeze transformer
@@ -133,10 +130,10 @@ def _apply_selective_freezing(self):
             if self.config.freeze_transformer:
                 for param in self.model.octo_transformer.parameters():
                     param.requires_grad = False
- 
+
             if self.config.freeze_vision_encoder:
                 # Freeze vision encoder components in the transformer
-                if hasattr(self.model.octo_transformer, 'observation_tokenizers'):
+                if hasattr(self.model.octo_transformer, "observation_tokenizers"):
                     for tokenizer in self.model.octo_transformer.observation_tokenizers.values():
                         for param in tokenizer.parameters():
                             param.requires_grad = False
@@ -170,18 +167,18 @@ def _transform_state_dict_keys(cls, state_dict: dict) -> dict:
             # 1. Replace "action_head." with "head."
             if "action_head." in new_key:
                 new_key = new_key.replace("action_head.", "head.")
- 
+
             # 2. Adjust the transformer nesting to match the LeRobot model.
             # The checkpoint has `transformer.transformer` but LeRobot expects
             # `transformer.transformer.transformer`.
             if "octo_transformer.transformer.transformer." in new_key:
-                 new_key = new_key.replace(
-                     "octo_transformer.transformer.transformer.",
-                     "octo_transformer.transformer.transformer.transformer."
-                 )
+                new_key = new_key.replace(
+                    "octo_transformer.transformer.transformer.",
+                    "octo_transformer.transformer.transformer.transformer.",
+                )
 
             transformed_dict[new_key] = value
- 
+
         return transformed_dict
 
     @classmethod
@@ -190,8 +187,9 @@ def _load_as_safetensor(
     ) -> "OctoPolicy":
         """Override to apply key transformations before loading."""
         from safetensors.torch import load_file
+
         from lerobot.utils.utils import init_logging
- 
+
         init_logging()
         # Load the state dict from file safely
         state_dict = load_file(model_file, device=map_location)
@@ -205,7 +203,7 @@ def _load_as_safetensor(
         # Log message
         log_model_loading_keys(msg.missing_keys, msg.unexpected_keys)
         return model
- 
+
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         """Override the from_pretrained method to display important information."""
@@ -217,15 +215,17 @@ def from_pretrained(cls, *args, **kwargs):
         )
         return super().from_pretrained(*args, **kwargs)
 
-    def _prepare_batch(self, batch: dict[str, Tensor], raw_tasks: Optional[Sequence[str]] = None) -> dict[str, Tensor]:
+    def _prepare_batch(
+        self, batch: dict[str, Tensor], raw_tasks: Optional[Sequence[str]] = None
+    ) -> dict[str, Tensor]:
         """
         Prepare batch for model input.
         Transforms a batch from the LeRobotDataset format to the format expected by the OctoModel.
         """
         batch = self.normalize_inputs(batch)
         # Get device from any available tensor in the batch
         device = next(iter(batch.values())).device
- 
+
         image_primary = batch["observation.images.front"].to(device)
         image_wrist = batch["observation.images.wrist"].to(device)
         proprio = batch["observation.state"].to(device)
@@ -254,34 +254,36 @@ def _prepare_batch(self, batch: dict[str, Tensor], raw_tasks: Optional[Sequence[
         # Create timestep_pad_mask - all True since we have real data (no padding)
         timestep_pad_mask = torch.ones((batch_size, window_size), dtype=torch.bool, device=device)
 
-        task_completed = torch.zeros((batch_size, window_size, action_horizon), dtype=torch.bool, device=device)
+        task_completed = torch.zeros(
+            (batch_size, window_size, action_horizon), dtype=torch.bool, device=device
+        )
 
         # Create pad_mask_dict for observations
         obs_pad_mask_dict = {
-            'image_primary': torch.ones((batch_size, window_size), dtype=torch.bool, device=device),
-            'image_wrist': torch.ones((batch_size, window_size), dtype=torch.bool, device=device),
-            'proprio': torch.ones((batch_size, window_size), dtype=torch.bool, device=device),
-            'timestep': torch.ones((batch_size, window_size), dtype=torch.bool, device=device),
+            "image_primary": torch.ones((batch_size, window_size), dtype=torch.bool, device=device),
+            "image_wrist": torch.ones((batch_size, window_size), dtype=torch.bool, device=device),
+            "proprio": torch.ones((batch_size, window_size), dtype=torch.bool, device=device),
+            "timestep": torch.ones((batch_size, window_size), dtype=torch.bool, device=device),
         }
 
         observations = {
-            'image_primary': image_primary,
-            'image_wrist': image_wrist,
-            'proprio': proprio,
-            'timestep': timestep,
-            'timestep_pad_mask': timestep_pad_mask,
-            'task_completed': task_completed,
-            'pad_mask_dict': obs_pad_mask_dict
+            "image_primary": image_primary,
+            "image_wrist": image_wrist,
+            "proprio": proprio,
+            "timestep": timestep,
+            "timestep_pad_mask": timestep_pad_mask,
+            "task_completed": task_completed,
+            "pad_mask_dict": obs_pad_mask_dict,
         }
 
         language_instruction = self.text_processor.encode(raw_tasks)
         language_instruction = {k: v.to(device) for k, v in language_instruction.items()}
 
         tasks = {
-            'language_instruction': language_instruction,
-            'pad_mask_dict': {
-                'language_instruction': torch.ones(batch_size, dtype=torch.bool, device=device)
-            }
+            "language_instruction": language_instruction,
+            "pad_mask_dict": {
+                "language_instruction": torch.ones(batch_size, dtype=torch.bool, device=device)
+            },
         }
 
         # Handle actions only if they're present (during training)
@@ -295,10 +297,10 @@ def _prepare_batch(self, batch: dict[str, Tensor], raw_tasks: Optional[Sequence[
             # actions to be the target for the diffusion model.
             # raw_actions has shape [batch_size, num_timestamps, action_dim]
             # We need shape [batch_size, window_size, action_horizon, action_dim]
-            
+
             # Select the first `action_horizon` actions from the sequence.
             actions = raw_actions[:, :action_horizon]
-            
+
             # Add the window_size dimension.
             actions = actions.unsqueeze(1)
 
@@ -326,8 +328,10 @@ def _prepare_batch(self, batch: dict[str, Tensor], raw_tasks: Optional[Sequence[
         # return batch
 
     def create_tasks(
-        self, goals: Optional[Dict[str, torch.Tensor]] = None, texts: Optional[Sequence[str]] = None,
-        device: Optional[torch.device] = None
+        self,
+        goals: Optional[Dict[str, torch.Tensor]] = None,
+        texts: Optional[Sequence[str]] = None,
+        device: Optional[torch.device] = None,
     ):
         """Creates tasks dict from goals and texts."""
         assert goals is not None or texts is not None
@@ -348,9 +352,15 @@ def create_tasks(
         else:
             batch_size = len(texts)
             # Create dummy goals if none are provided
-            tasks.update({"image_primary": torch.zeros((batch_size, 256, 256, 3), dtype=torch.uint8, device=device)})
+            tasks.update(
+                {"image_primary": torch.zeros((batch_size, 256, 256, 3), dtype=torch.uint8, device=device)}
+            )
             tasks["pad_mask_dict"].update(
-                {k: torch.zeros(batch_size, dtype=torch.bool, device=device) for k in tasks.keys() if k != "pad_mask_dict"}
+                {
+                    k: torch.zeros(batch_size, dtype=torch.bool, device=device)
+                    for k in tasks.keys()
+                    if k != "pad_mask_dict"
+                }
             )
 
         if texts is not None:
@@ -359,14 +369,18 @@ def create_tasks(
             # Move to the correct device
             encoded = {k: v.to(device) for k, v in encoded.items()}
             tasks["language_instruction"] = encoded
-            tasks["pad_mask_dict"]["language_instruction"] = torch.ones(len(texts), dtype=torch.bool, device=device)
+            tasks["pad_mask_dict"]["language_instruction"] = torch.ones(
+                len(texts), dtype=torch.bool, device=device
+            )
         else:
             batch_size = next(iter(goals.values())).shape[0]
             dummy_texts = [""] * batch_size
             encoded = self.text_processor.encode(dummy_texts)
             encoded = {k: v.to(device) for k, v in encoded.items()}
             tasks["language_instruction"] = encoded
-            tasks["pad_mask_dict"]["language_instruction"] = torch.zeros(batch_size, dtype=torch.bool, device=device)
+            tasks["pad_mask_dict"]["language_instruction"] = torch.zeros(
+                batch_size, dtype=torch.bool, device=device
+            )
 
         return tasks
 
@@ -401,10 +415,10 @@ def predict_action_chunk(self, batch: dict[str, Tensor], tasks: Optional[Sequenc
     def select_action(self, batch: dict[str, Tensor]) -> Tensor:
         """Select a single action given environment observations."""
         self.eval()
- 
+
         # First, populate queues with the original, simple batch
         self._queues = populate_queues(self._queues, batch, exclude_keys=[ACTION])
- 
+
         # Then, prepare the complex batch for the model
         prepared_batch = self._prepare_batch(batch)
 
@@ -489,7 +503,6 @@ def forward(
         timestep_pad_mask: torch.Tensor,
         embodiment_action_dim: Optional[int] = None,
     ) -> torch.Tensor:
-
         transformer_outputs = self.octo_transformer(observations, tasks, timestep_pad_mask)
         actions = self.head.predict_action(
             transformer_outputs=transformer_outputs, embodiment_action_dim=embodiment_action_dim
diff --git a/src/lerobot/policies/octo/tokenizers.py b/src/lerobot/policies/octo/tokenizers.py
diff --git a/src/lerobot/policies/octo/transformer.py b/src/lerobot/policies/octo/transformer.py