Source code for objectrl.utils.environment.reward_wrappers

# -----------------------------------------------------------------------------------
# ObjectRL: An Object-Oriented Reinforcement Learning Codebase
# Copyright (C) 2025 ADIN Lab

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
# -----------------------------------------------------------------------------------

import gymnasium as gym
import numpy as np


class PositionDelayWrapper(gym.RewardWrapper):
    """
    A Gymnasium wrapper that replaces the reward with a position-delayed
    forward reward minus a control cost.

    The forward reward (the x-velocity) is only granted once the agent's
    x-position has reached ``position_delay``; before that the agent only
    receives the (negative) control cost. The control cost penalizes large
    control signals to encourage smoother actions.

    The wrapper reads ``data.qpos[0]`` and ``data.qvel[0]`` from the
    unwrapped environment, so the base environment must expose a ``data``
    object with ``qpos`` and ``qvel`` arrays.

    Attributes:
        env (gym.Env): The environment to wrap.
        position_delay (float): Minimum x-position the agent must reach
            before receiving the forward reward.
        ctrl_w (float): Weight for the control cost penalty term.
    """

    def __init__(
        self, env: gym.Env, position_delay: float = 2, ctrl_w: float = 0.001
    ) -> None:
        """
        Initialize the PositionDelayWrapper.

        Args:
            env (gym.Env): The environment to wrap.
            position_delay (float): Minimum x-position the agent must reach
                before receiving the forward reward.
            ctrl_w (float): Weight for the control cost penalty term.

        Returns:
            None
        """
        super().__init__(env)
        self.position_delay = position_delay
        self.ctrl_w = ctrl_w

    @staticmethod
    def _squared_action_norm(action: np.ndarray) -> float:
        """Return the sum of squared action components."""
        return np.sum(np.square(action))

    def step(self, action: np.ndarray) -> tuple:
        """
        Take a step in the environment, replacing the reward.

        The environment's own reward is discarded and replaced with the value
        computed by :meth:`reward`. ``step`` is overridden (rather than relying
        on ``gym.RewardWrapper.step``) because :meth:`reward` here needs the
        observation and the action instead of the environment's reward.

        Args:
            action (np.ndarray): Action taken by the agent.

        Returns:
            tuple: (observation, modified_reward, terminated, truncated, info)
                - `info["x_pos"]`: Current x-position of the agent.
                - `info["action_norm"]`: Squared norm of the action.
        """
        # The environment's native reward is intentionally ignored.
        observation, _, terminated, truncated, info = self.env.step(action)

        # Access the simulator state through ``unwrapped`` instead of relying
        # on implicit attribute forwarding from the wrapper to the wrapped
        # environment, which Gymnasium >= 1.0 no longer provides.
        info["x_pos"] = self.unwrapped.data.qpos[0]
        info["action_norm"] = self._squared_action_norm(action)

        return (
            observation,
            self.reward(observation, action),
            terminated,
            truncated,
            info,
        )

    def reward(self, observation: np.ndarray, action: np.ndarray) -> float:
        """
        Compute the modified reward based on position delay and control penalty.

        The result is ``forward_reward - ctrl_cost`` where ``forward_reward``
        is the x-velocity if the x-position is at least ``position_delay``
        (and zero otherwise), and ``ctrl_cost`` is ``ctrl_w`` times the
        squared norm of the action.

        Args:
            observation: Current observation (unused here, but kept for
                compatibility).
            action (np.ndarray): Action taken by the agent.

        Returns:
            float: Modified reward value.
        """
        # Read the state from the base environment explicitly; see ``step``.
        data = self.unwrapped.data
        x_pos = data.qpos[0]
        x_vel = data.qvel[0]

        ctrl_cost = self.ctrl_w * self._squared_action_norm(action)
        # The boolean gate multiplies the velocity by 0 until the agent has
        # travelled at least ``position_delay`` along x.
        forward_reward = (x_pos >= self.position_delay) * x_vel

        return forward_reward - ctrl_cost