Source code for objectrl.utils.environment.reward_wrappers

# -----------------------------------------------------------------------------------
# ObjectRL: An Object-Oriented Reinforcement Learning Codebase
# Copyright (C) 2025 ADIN Lab

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------------

import gymnasium as gym
import numpy as np



[docs]
class PositionDelayWrapper(gym.RewardWrapper):
    """
    A Gymnasium wrapper that modifies the reward function based on position delay and control cost.
    This wrapper delays reward until the agent reaches a certain position (`position_delay`).
    It also penalizes large control signals to encourage smoother actions.

    Attributes:
        env (gym.Env): The environment to wrap.
        position_delay (float): Minimum x-position the agent must reach before receiving reward.
        ctrl_w (float): Weight for the control cost penalty term.
    """


[docs]
    def __init__(
        self, env: gym.Env, position_delay: float = 2, ctrl_w: float = 0.001
    ) -> None:
        """
        Initialize the PositionDelayWrapper.

        Args:
            env (gym.Env): The environment to wrap.
            position_delay (float): Minimum x-position the agent must reach before receiving reward.
            ctrl_w (float): Weight for the control cost penalty term.
        Returns:
            None
        """
        super().__init__(env)
        self.position_delay = position_delay
        self.ctrl_w = ctrl_w



[docs]
    def step(self, action: np.ndarray) -> tuple:
        """
        Take a step in the environment, modifying the reward.
        The environment's reward is replaced with a custom one that combines delayed
        forward movement reward and a control cost.

        Args:
            action (np.ndarray): Action taken by the agent.
        Returns:
            tuple: (observation, modified_reward, terminated, truncated, info)
                - `info["x_pos"]`: Current x-position of the agent.
                - `info["action_norm"]`: Squared norm of the action.
        """
        observation, reward, terminated, truncated, info = self.env.step(action)
        info["x_pos"] = self.data.qpos[0]
        info["action_norm"] = np.sum(np.square(action))
        return (
            observation,
            self.reward(observation, action),
            terminated,
            truncated,
            info,
        )



[docs]
    def reward(self, observation: np.ndarray, action: np.ndarray) -> float:
        """
        Compute the modified reward based on position delay and control penalty.

        Args:
            observation: Current observation (unused here, but kept for compatibility).
            action (np.ndarray): Action taken by the agent.
        Returns:
            float: Modified reward value.
        """
        x_pos = self.data.qpos[0]
        x_vel = self.data.qvel[0]
        ctrl_cost = self.ctrl_w * np.sum(np.square(action))
        forward_reward = (x_pos >= self.position_delay) * x_vel
        rewards = forward_reward - ctrl_cost
        return rewards