Commit a33f592

[RLlib] Clean up meta learning class and example files. (#52680)
Parent: 04c4a54

14 files changed: +522 −253 lines

rllib/BUILD

Lines changed: 23 additions & 0 deletions

@@ -3357,6 +3357,29 @@ py_test(
     ],
 )

+py_test(
+    name = "examples/algorithms/maml_lr_supervised_learning",
+    size = "large",
+    srcs = ["examples/algorithms/maml_lr_supervised_learning.py"],
+    args = [
+        "--enable-new-api-stack",
+        "--as-test",
+        "--stop-iters=70000",
+        "--meta-lr=0.001",
+        "--meta-train-batch-size=5",
+        "--fine-tune-iters=10",
+        "--fine-tune-batch-size=5",
+        "--fine-tune-lr=0.01",
+        "--noise-std=0.0",
+        "--no-plot",
+    ],
+    main = "examples/algorithms/maml_lr_supervised_learning.py",
+    tags = [
+        "examples",
+        "team:rllib",
+    ],
+)
+
 # subdirectory: catalogs/
 # ....................................
 py_test(
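
The new target wraps the MAML supervised-learning example in a CI test. For local runs, the script can be invoked directly with the same flags as in the `args` list above (a sketch; the path is relative to the repository root):

    python rllib/examples/algorithms/maml_lr_supervised_learning.py \
        --enable-new-api-stack --as-test --stop-iters=70000 \
        --meta-lr=0.001 --meta-train-batch-size=5 \
        --fine-tune-iters=10 --fine-tune-batch-size=5 --fine-tune-lr=0.01 \
        --noise-std=0.0 --no-plot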

rllib/algorithms/algorithm.py

Lines changed: 26 additions & 23 deletions

@@ -640,7 +640,7 @@ def setup(self, config: AlgorithmConfig) -> None:
         else:
             self.offline_data = None

-        if not self.offline_data:
+        if self.config.is_online or not self.config.enable_env_runner_and_connector_v2:
             # Create a set of env runner actors via a EnvRunnerGroup.
             self.env_runner_group = EnvRunnerGroup(
                 env_creator=self.env_creator,

@@ -2822,28 +2822,31 @@ def get_state(
         state = {}

         # Get (local) EnvRunner state (w/o RLModule).
-        if self._check_component(COMPONENT_ENV_RUNNER, components, not_components):
-            if self.env_runner:
-                state[COMPONENT_ENV_RUNNER] = self.env_runner.get_state(
-                    components=self._get_subcomponents(COMPONENT_RL_MODULE, components),
-                    not_components=force_list(
-                        self._get_subcomponents(COMPONENT_RL_MODULE, not_components)
+        if self.config.is_online:
+            if self._check_component(COMPONENT_ENV_RUNNER, components, not_components):
+                if self.env_runner:
+                    state[COMPONENT_ENV_RUNNER] = self.env_runner.get_state(
+                        components=self._get_subcomponents(
+                            COMPONENT_RL_MODULE, components
+                        ),
+                        not_components=force_list(
+                            self._get_subcomponents(COMPONENT_RL_MODULE, not_components)
+                        )
+                        # We don't want the RLModule state from the EnvRunners (it's
+                        # `inference_only` anyway and already provided in full by the
+                        # Learners).
+                        + [COMPONENT_RL_MODULE],
+                        **kwargs,
                     )
-                    # We don't want the RLModule state from the EnvRunners (it's
-                    # `inference_only` anyway and already provided in full by the
-                    # Learners).
-                    + [COMPONENT_RL_MODULE],
-                    **kwargs,
-                )
-            else:
-                state[COMPONENT_ENV_RUNNER] = {
-                    COMPONENT_ENV_TO_MODULE_CONNECTOR: (
-                        self.env_to_module_connector.get_state()
-                    ),
-                    COMPONENT_MODULE_TO_ENV_CONNECTOR: (
-                        self.module_to_env_connector.get_state()
-                    ),
-                }
+                else:
+                    state[COMPONENT_ENV_RUNNER] = {
+                        COMPONENT_ENV_TO_MODULE_CONNECTOR: (
+                            self.env_to_module_connector.get_state()
+                        ),
+                        COMPONENT_MODULE_TO_ENV_CONNECTOR: (
+                            self.module_to_env_connector.get_state()
+                        ),
+                    }

         # Get (local) evaluation EnvRunner state (w/o RLModule).
         if self.eval_env_runner and self._check_component(

@@ -2936,7 +2939,7 @@ def get_checkpointable_components(self) -> List[Tuple[str, "Checkpointable"]]:
         components = [
             (COMPONENT_LEARNER_GROUP, self.learner_group),
         ]
-        if not self.config.is_offline and self.env_runner:
+        if self.config.is_online:
             components.append(
                 (COMPONENT_ENV_RUNNER, self.env_runner),
             )
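
Taken together, these changes key EnvRunner creation and checkpointing off the explicit `config.is_online` flag instead of inferring it from the presence of offline data. A minimal sketch of the hybrid case this enables (assuming an online-by-default config such as PPO's, with a hypothetical data path):

    from ray.rllib.algorithms.ppo import PPOConfig

    # A config can now be offline (reads recorded episodes) and online (runs
    # env rollouts) at the same time.
    config = PPOConfig().offline_data(input_="/tmp/my_recorded_episodes")
    assert config.is_offline  # An offline data source is configured ...
    assert config.is_online   # ... but EnvRunners are still created/checkpointed.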

rllib/algorithms/algorithm_config.py

Lines changed: 47 additions & 7 deletions

@@ -88,7 +88,9 @@
     from ray.rllib.algorithms.algorithm import Algorithm
     from ray.rllib.connectors.connector_v2 import ConnectorV2
     from ray.rllib.core.learner import Learner
+    from ray.rllib.core.learner.differentiable_learner import DifferentiableLearner
     from ray.rllib.core.learner.learner_group import LearnerGroup
+    from ray.rllib.core.learner.torch.torch_meta_learner import TorchMetaLearner
     from ray.rllib.core.rl_module.rl_module import RLModule
     from ray.rllib.utils.typing import EpisodeType

@@ -358,6 +360,7 @@ def __init__(self, algo_class: Optional[type] = None):
         self.update_worker_filter_stats = True
         self.use_worker_filter_stats = True
         self.sampler_perf_stats_ema_coef = None
+        self._is_online = True

         # `self.learners()`
         self.num_learners = 0

@@ -5157,9 +5160,13 @@ def _validate_offline_settings(self):
         # action and observation spaces. Note, we require here the spaces,
         # i.e. a user cannot provide an environment instead because we do
         # not want to create the environment to receive spaces.
-        if self.is_offline and (
-            not (self.evaluation_num_env_runners > 0 or self.evaluation_interval)
-            and (self.action_space is None or self.observation_space is None)
+        if (
+            self.is_offline
+            and not self.is_online
+            and (
+                not (self.evaluation_num_env_runners > 0 or self.evaluation_interval)
+                and (self.action_space is None or self.observation_space is None)
+            )
         ):
             self._value_error(
                 "If no evaluation should be run, `action_space` and "

@@ -5228,6 +5235,14 @@ def _validate_offline_settings(self):
                 "recorded episodes cannot be read in for training."
             )

+    @property
+    def is_online(self) -> bool:
+        """Defines if this config is for online RL.
+
+        Note, a config can be for on- and offline training at the same time.
+        """
+        return self._is_online
+
     @property
     def is_offline(self) -> bool:
         """Defines, if this config is for offline RL."""

@@ -6045,6 +6060,7 @@ def learners(
         self,
         *,
         differentiable_learner_configs: List[DifferentiableLearnerConfig] = NotProvided,
+        **kwargs,
     ) -> "DifferentiableAlgorithmConfig":
         """Sets the configurations for differentiable learners.

@@ -6053,6 +6069,8 @@ def learners(
         defining the `DifferentiableLearner` classes used for the nested updates in
         `Algorithm`'s learner.
         """
+        super().learners(**kwargs)
+
         if differentiable_learner_configs is not NotProvided:
             self.differentiable_learner_configs = differentiable_learner_configs

@@ -6092,18 +6110,40 @@ def validate(self):
                 "one instance is not a `DifferentiableLearnerConfig`."
             )

-    def get_default_learner_class(self):
-        """Returns the Learner class to use for this algorithm.
+    def get_default_learner_class(self) -> Union[Type["TorchMetaLearner"], str]:
+        """Returns the `MetaLearner` class to use for this algorithm.

         Override this method in the sub-class to return the `MetaLearner`.

         Returns:
             The `MetaLearner` class to use for this algorithm either as a class
             type or as a string. (e.g. "ray.rllib.core.learner.torch.torch_meta_learner.TorchMetaLearner")
         """
-        from ray.rllib.core.learner.torch.torch_meta_learner import TorchMetaLearner
+        return NotImplemented

-        return TorchMetaLearner
+    def get_differentiable_learner_classes(
+        self,
+    ) -> List[Union[Type["DifferentiableLearner"], str]]:
+        """Returns the `DifferentiableLearner` classes to use for this algorithm.
+
+        Override this method in the sub-class to return the `DifferentiableLearner`.
+
+        Returns:
+            The `DifferentiableLearner` class to use for this algorithm either as a class
+            type or as a string. (e.g.
+            "ray.rllib.core.learner.torch.torch_meta_learner.TorchDifferentiableLearner").
+        """
+        return NotImplemented
+
+    def get_differentiable_learner_configs(self) -> List[DifferentiableLearnerConfig]:
+        """Returns the `DifferentiableLearnerConfig`s for all `DifferentiableLearner`s.
+
+        Override this method in the sub-class to return the `DifferentiableLearnerConfig`s.
+
+        Returns:
+            The `DifferentiableLearnerConfig` instances to use for this algorithm.
+        """
+        return self.differentiable_learner_configs


 class TorchCompileWhatToCompile(str, Enum):
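
Since `get_default_learner_class()` and `get_differentiable_learner_classes()` now return `NotImplemented` in the base class, sub-classes are expected to override them. A sketch of what such an override might look like (the `TorchDifferentiableLearner` import path is assumed by analogy with `TorchMetaLearner`; the config class name is hypothetical):

    from ray.rllib.core.learner.torch.torch_differentiable_learner import (
        TorchDifferentiableLearner,
    )
    from ray.rllib.core.learner.torch.torch_meta_learner import TorchMetaLearner


    class MyMAMLConfig(DifferentiableAlgorithmConfig):
        def get_default_learner_class(self):
            # The MetaLearner drives the outer (meta) update.
            return TorchMetaLearner

        def get_differentiable_learner_classes(self):
            # DifferentiableLearners run the nested, functional updates.
            return [TorchDifferentiableLearner]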

rllib/algorithms/marwil/marwil.py

Lines changed: 1 addition & 1 deletion

@@ -166,7 +166,7 @@ def __init__(self, algo_class=None):
         }

         super().__init__(algo_class=algo_class or MARWIL)
-
+        self._is_online = False
         # fmt: off
         # __sphinx_doc_begin__
         # MARWIL specific settings:
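
With the base config now defaulting to `_is_online = True`, MARWIL opts out explicitly. A quick sketch of the resulting behavior:

    from ray.rllib.algorithms.marwil import MARWILConfig

    config = MARWILConfig()
    assert not config.is_online  # MARWIL reports itself as not-online by default.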

rllib/core/learner/differentiable_learner.py

Lines changed: 22 additions & 3 deletions

@@ -123,7 +123,7 @@ def build(self) -> None:

         # TODO (simon): Move the `build_learner_connector` to the
         # `DifferentiableLearnerConfig`.
-        self._learner_connector = self.config.build_learner_connector(
+        self._learner_connector = self.learner_config.build_learner_connector(
             input_observation_space=None,
             input_action_space=None,
             device=None,

@@ -383,14 +383,13 @@ def update(
         # gradient steps inside the iterator loop above (could be a complete epoch)
         # the target networks might need to be updated earlier.
         # self.after_gradient_based_update(timesteps=timesteps or {})
-
         self.metrics.deactivate_tensor_mode()

         # Reduce results across all minibatch update steps.
         if not _no_metrics_reduce:
             return params, loss_per_module, self.metrics.reduce()
         else:
-            return params, loss_per_module, None
+            return params, loss_per_module, {}

     def _create_iterator_if_necessary(
         self,

@@ -679,6 +678,26 @@ def _check_is_built(self, error: bool = True) -> bool:
             return False
         return True

+    @abc.abstractmethod
+    def _get_tensor_variable(
+        self,
+        value: Any,
+        dtype: Any = None,
+        trainable: bool = False,
+    ) -> TensorType:
+        """Returns a framework-specific tensor variable with the initial given value.
+
+        This is a framework-specific method that should be implemented by the
+        framework-specific sub-classes.
+
+        Args:
+            value: The initial value for the tensor variable.
+
+        Returns:
+            The framework-specific tensor variable of the given initial value,
+            dtype and trainable/requires_grad property.
+        """
+
     # TODO (simon): Duplicate in Learner. Move to base class "Learnable".
     def _reset(self):
         self.metrics = MetricsLogger()
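
The new abstract `_get_tensor_variable()` mirrors the corresponding hook on `Learner`. A torch sub-class might implement it roughly like this (a sketch only, not part of this commit; `trainable` maps to `requires_grad`):

    import torch

    class MyTorchDifferentiableLearner(DifferentiableLearner):
        def _get_tensor_variable(self, value, dtype=None, trainable=False):
            # Build a tensor from the initial value; default to float32.
            return torch.tensor(
                value,
                dtype=dtype or torch.float32,
                requires_grad=trainable,
            )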

rllib/core/learner/differentiable_learner_config.py

Lines changed: 102 additions & 2 deletions

@@ -1,7 +1,12 @@
-from dataclasses import dataclass
-from typing import Callable
+from dataclasses import dataclass, fields

+from typing import Callable, List, Optional, Union
+
+from ray.rllib.connectors.connector_v2 import ConnectorV2
 from ray.rllib.core.learner.differentiable_learner import DifferentiableLearner
+from ray.rllib.core.rl_module.rl_module import RLModule
+from ray.rllib.core.rl_module.multi_rl_module import MultiRLModuleSpec
+from ray.rllib.utils.typing import ModuleID


 @dataclass

@@ -13,6 +18,16 @@ class DifferentiableLearnerConfig:
     # The `DifferentiableLearner` class. Must be derived from `DifferentiableLearner`.
     learner_class: Callable

+    learner_connector: Optional[
+        Callable[["RLModule"], Union["ConnectorV2", List["ConnectorV2"]]]
+    ] = None
+
+    add_default_connectors_to_learner_pipeline: bool = True
+
+    is_multi_agent: bool = False
+
+    policies_to_update: List[ModuleID] = None
+
     # The learning rate to use for the nested update. Note, in the default case this
     # learning rate is only used to update parameters in a functional form, i.e. the
     # `RLModule`'s stateful parameters are only updated in the `MetaLearner`. Different

@@ -45,3 +60,88 @@ def __post_init__(self):
                 "`learner_class` must be a subclass of `DifferentiableLearner "
                 f"but is {self.learner_class}."
             )
+
+    def build_learner_connector(
+        self,
+        input_observation_space,
+        input_action_space,
+        device=None,
+    ):
+        from ray.rllib.connectors.learner import (
+            AddColumnsFromEpisodesToTrainBatch,
+            AddObservationsFromEpisodesToBatch,
+            AddStatesFromEpisodesToBatch,
+            AddTimeDimToBatchAndZeroPad,
+            AgentToModuleMapping,
+            BatchIndividualItems,
+            LearnerConnectorPipeline,
+            NumpyToTensor,
+        )
+
+        custom_connectors = []
+        # Create a learner connector pipeline (including RLlib's default
+        # learner connector piece) and return it.
+        if self.learner_connector is not None:
+            val_ = self.learner_connector(
+                input_observation_space,
+                input_action_space,
+                # device,  # TODO (sven): Also pass device into custom builder.
+            )
+
+            from ray.rllib.connectors.connector_v2 import ConnectorV2
+
+            # ConnectorV2 (piece or pipeline).
+            if isinstance(val_, ConnectorV2):
+                custom_connectors = [val_]
+            # Sequence of individual ConnectorV2 pieces.
+            elif isinstance(val_, (list, tuple)):
+                custom_connectors = list(val_)
+            # Unsupported return value.
+            else:
+                raise ValueError(
+                    "`AlgorithmConfig.training(learner_connector=..)` must return "
+                    "a ConnectorV2 object or a list thereof (to be added to a "
+                    f"pipeline)! Your function returned {val_}."
+                )
+
+        pipeline = LearnerConnectorPipeline(
+            connectors=custom_connectors,
+            input_observation_space=input_observation_space,
+            input_action_space=input_action_space,
+        )
+        if self.add_default_connectors_to_learner_pipeline:
+            # Append OBS handling.
+            pipeline.append(
+                AddObservationsFromEpisodesToBatch(as_learner_connector=True)
+            )
+            # Append all other columns handling.
+            pipeline.append(AddColumnsFromEpisodesToTrainBatch())
+            # Append time-rank handler.
+            pipeline.append(AddTimeDimToBatchAndZeroPad(as_learner_connector=True))
+            # Append STATE_IN/STATE_OUT handler.
+            pipeline.append(AddStatesFromEpisodesToBatch(as_learner_connector=True))
+            # If multi-agent -> Map from AgentID-based data to ModuleID based data.
+            if self.is_multi_agent:
+                pipeline.append(
+                    AgentToModuleMapping(
+                        rl_module_specs=(
+                            self.rl_module_spec.rl_module_specs
+                            if isinstance(self.rl_module_spec, MultiRLModuleSpec)
+                            else set(self.policies)
+                        ),
+                        agent_to_module_mapping_fn=self.policy_mapping_fn,
+                    )
+                )
+            # Batch all data.
+            pipeline.append(BatchIndividualItems(multi_agent=self.is_multi_agent))
+            # Convert to Tensors.
+            pipeline.append(NumpyToTensor(as_learner_connector=True, device=device))
+        return pipeline
+
+    def update_from_kwargs(self, **kwargs):
+        """Sets all slots with values defined in `kwargs`."""
+        # Get all field names (i.e., slot names).
+        field_names = {f.name for f in fields(self)}
+        for key, value in kwargs.items():
+            if key in field_names:
+                setattr(self, key, value)
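
A usage sketch for the two new methods (the `TorchDifferentiableLearner` import path is assumed, not part of this diff):

    from ray.rllib.core.learner.differentiable_learner_config import (
        DifferentiableLearnerConfig,
    )
    from ray.rllib.core.learner.torch.torch_differentiable_learner import (
        TorchDifferentiableLearner,
    )

    config = DifferentiableLearnerConfig(learner_class=TorchDifferentiableLearner)
    # Set any dataclass field by name; unknown keys are silently ignored.
    config.update_from_kwargs(policies_to_update=["default_policy"])
    # Build the default learner connector pipeline (single-agent, no custom pieces).
    pipeline = config.build_learner_connector(
        input_observation_space=None,
        input_action_space=None,
        device=None,
    )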
