Skip to content

Strategies

Base Interfaces

interfaces

BaseActionStrategy

Bases: ABC

An abstract base class for defining action spaces and handling agent actions.

Source code in src/quantrl_lab/environments/core/interfaces.py
class BaseActionStrategy(ABC):
    """Abstract interface for defining an action space and applying
    agent actions to the environment."""

    @abstractmethod
    def define_action_space(self) -> gym.spaces.Space:
        """
        Build the gym action space used by the environment.

        Returns:
            gym.spaces.Space: The action space for the environment.
        """
        ...

    @abstractmethod
    def handle_action(self, env_self: TradingEnvProtocol, action: Any) -> Tuple[Any, Dict[str, Any]]:
        """
        Decode and apply the agent's action within the environment.

        Args:
            env_self (TradingEnvProtocol): The environment instance where the action is taken.
            action (Any): The action taken by the agent.

        Returns:
            Tuple[Any, Dict[str, Any]]: The outcome of the action taken in the environment.
        """
        ...

define_action_space() abstractmethod

Defines the action space for the environment.

Returns:

Type Description
Space

gym.spaces.Space: The action space for the environment.

Source code in src/quantrl_lab/environments/core/interfaces.py
@abstractmethod
def define_action_space(self) -> gym.spaces.Space:
    """
    Build and return the environment's action space.

    Returns:
        gym.spaces.Space: The action space for the environment.
    """
    ...

handle_action(env_self, action) abstractmethod

Handles the action taken by the agent in the environment.

Parameters:

Name Type Description Default
env_self TradingEnvProtocol

The environment instance where the action is taken.

required
action Any

The action taken by the agent.

required

Returns:

Type Description
Tuple[Any, Dict[str, Any]]

Tuple[Any, Dict[str, Any]]: The outcome of the action taken in the environment

Source code in src/quantrl_lab/environments/core/interfaces.py
@abstractmethod
def handle_action(self, env_self: TradingEnvProtocol, action: Any) -> Tuple[Any, Dict[str, Any]]:
    """
    Decode and apply the agent's action within the environment.

    Args:
        env_self (TradingEnvProtocol): The environment instance where the action is taken.
        action (Any): The action taken by the agent.

    Returns:
        Tuple[Any, Dict[str, Any]]: The outcome of the action taken in the environment.
    """
    ...

BaseObservationStrategy

Bases: ABC

Abstract base class for defining how an agent perceives the environment.

Source code in src/quantrl_lab/environments/core/interfaces.py
class BaseObservationStrategy(ABC):
    """Abstract interface describing how an agent perceives the
    environment."""

    @abstractmethod
    def define_observation_space(self, env: TradingEnvProtocol) -> gym.spaces.Space:
        """
        Construct the observation space for the given environment.

        Args:
            env (TradingEnvProtocol): The trading environment.

        Returns:
            gym.spaces.Space: The observation space.
        """
        ...

    @abstractmethod
    def build_observation(self, env: TradingEnvProtocol) -> np.ndarray:
        """
        Assemble the observation vector for the environment's current state.

        Args:
            env (TradingEnvProtocol): The trading environment.

        Returns:
            np.ndarray: The observation vector.
        """
        ...

    @abstractmethod
    def get_feature_names(self, env: TradingEnvProtocol) -> List[str]:
        """
        List the feature names in the exact order of the elements of the
        flattened observation vector.

        Args:
            env (TradingEnvProtocol): The trading environment.

        Returns:
            List[str]: A list of feature names (e.g., ["Close_t-1", "RSI_t", ...])
        """
        ...

define_observation_space(env) abstractmethod

Defines and returns the observation space for the environment.

Parameters:

Name Type Description Default
env TradingEnvProtocol

The trading environment.

required

Returns:

Type Description
Space

gym.spaces.Space: The observation space.

Source code in src/quantrl_lab/environments/core/interfaces.py
@abstractmethod
def define_observation_space(self, env: TradingEnvProtocol) -> gym.spaces.Space:
    """
    Construct the observation space for the given environment.

    Args:
        env (TradingEnvProtocol): The trading environment.

    Returns:
        gym.spaces.Space: The observation space.
    """
    ...

build_observation(env) abstractmethod

Builds the observation vector for the current state.

Parameters:

Name Type Description Default
env TradingEnvProtocol

The trading environment.

required

Returns:

Type Description
ndarray

np.ndarray: The observation vector.

Source code in src/quantrl_lab/environments/core/interfaces.py
@abstractmethod
def build_observation(self, env: TradingEnvProtocol) -> np.ndarray:
    """
    Assemble the observation vector for the environment's current state.

    Args:
        env (TradingEnvProtocol): The trading environment.

    Returns:
        np.ndarray: The observation vector.
    """
    ...

get_feature_names(env) abstractmethod

Returns a list of feature names corresponding to the exact order of elements in the flattened observation vector.

Parameters:

Name Type Description Default
env TradingEnvProtocol

The trading environment.

required

Returns:

Type Description
List[str]

List[str]: A list of feature names (e.g., ["Close_t-1", "RSI_t", ...])

Source code in src/quantrl_lab/environments/core/interfaces.py
@abstractmethod
def get_feature_names(self, env: TradingEnvProtocol) -> List[str]:
    """
    List the feature names in the exact order of the elements of the
    flattened observation vector.

    Args:
        env (TradingEnvProtocol): The trading environment.

    Returns:
        List[str]: A list of feature names (e.g., ["Close_t-1", "RSI_t", ...])
    """
    ...

BaseRewardStrategy

Bases: ABC

Abstract base class for calculating rewards.

Source code in src/quantrl_lab/environments/core/interfaces.py
class BaseRewardStrategy(ABC):
    """Abstract base class for reward computation strategies."""

    @abstractmethod
    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """
        Compute the reward resulting from the action just taken in the
        environment.

        Args:
            env (TradingEnvProtocol): The trading environment instance.

        Returns:
            float: The calculated reward.
        """
        raise NotImplementedError("Subclasses should implement this method.")

    def on_step_end(self, env: TradingEnvProtocol):
        """Optional hook for updating internal state at the end of a step."""
        return None

calculate_reward(env) abstractmethod

Calculate the reward based on the action taken in the environment.

Parameters:

Name Type Description Default
env TradingEnvProtocol

The trading environment instance.

required

Returns:

Name Type Description
float float

The calculated reward.

Source code in src/quantrl_lab/environments/core/interfaces.py
@abstractmethod
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """
    Compute the reward resulting from the action just taken in the
    environment.

    Args:
        env (TradingEnvProtocol): The trading environment instance.

    Returns:
        float: The calculated reward.
    """
    raise NotImplementedError("Subclasses should implement this method.")

on_step_end(env)

Optional: A hook to update any internal state if needed.

Source code in src/quantrl_lab/environments/core/interfaces.py
def on_step_end(self, env: TradingEnvProtocol):
    """Optional hook for updating internal state at the end of a step."""
    return None

Action Strategies

standard

StandardActionStrategy

Bases: BaseActionStrategy

Implements the full-featured action space with a 3-part Box space.

Action: [action_type, amount, price_modifier]

Source code in src/quantrl_lab/environments/stock/strategies/actions/standard.py
class StandardActionStrategy(BaseActionStrategy):
    """
    Implements the full-featured action space with a 3-part Box space.

    Action: [action_type, amount, price_modifier]
    """

    # Price-modifier bounds for limit/risk orders. Shared by
    # define_action_space() and handle_action() so the clipping range
    # can never drift out of sync with the declared space.
    PRICE_MOD_LOW = 0.9
    PRICE_MOD_HIGH = 1.1

    def define_action_space(self) -> gym.spaces.Box:
        """
        Defines the action space for the trading environment.

        Returns:
            gym.spaces.Box: The action space as a Box space.
        """
        # We use a symmetric action space [-1, 1] for the action type to help RL agents
        # explore more effectively. An uninitialized agent outputs values near 0.
        # If we used [0, N], 0 would map to Action 0 (Hold), causing inactivity.
        # With [-1, 1], 0 maps to the middle action, encouraging interaction.
        action_type_low = -1.0
        action_type_high = 1.0

        # Use symmetric space for amount as well to avoid 0.0 default
        amount_low = -1.0
        amount_high = 1.0

        return gym.spaces.Box(
            low=np.array([action_type_low, amount_low, self.PRICE_MOD_LOW], dtype=np.float32),
            high=np.array([action_type_high, amount_high, self.PRICE_MOD_HIGH], dtype=np.float32),
            shape=(3,),
            dtype=np.float32,
        )

    def handle_action(self, env_self: TradingEnvProtocol, action: np.ndarray) -> Tuple[Any, Dict[str, Any]]:
        """
        Handles the action by decoding it and instructing the
        environment's portfolio.

        Args:
            env_self (TradingEnvProtocol): The environment instance.
            action (np.ndarray): The raw action from the agent.

        Returns:
            Tuple[Any, Dict[str, Any]]: The decoded action type and a dictionary of details.
        """
        # --- 1. Decode the action ---

        # Rescale action type from [-1, 1] to [0, len(Actions)-1].
        raw_type = np.clip(action[0], -1.0, 1.0)
        max_action_index = len(Actions) - 1
        action_type_int = int(np.round(((raw_type + 1) / 2) * max_action_index))

        # Rescale amount from [-1, 1] to [0, 1] so that a neutral 0.0
        # output means a 50% amount rather than 0%.
        raw_amount = np.clip(action[1], -1.0, 1.0)
        amount_pct = (raw_amount + 1) / 2

        price_modifier = np.clip(action[2], self.PRICE_MOD_LOW, self.PRICE_MOD_HIGH)

        try:
            action_type = Actions(action_type_int)
        except ValueError:
            # An undecodable action index falls back to a no-op.
            action_type = Actions.Hold

        # The environment is responsible for providing the current price;
        # a zero/near-zero price forces a Hold.
        current_price = env_self._get_current_price()
        if current_price <= 1e-9:
            action_type = Actions.Hold

        # --- 2. Execute the action by calling methods on the portfolio ---

        had_no_shares = env_self.portfolio.total_shares <= 0
        invalid_action_attempt = False

        # Portfolio methods need the current step for order bookkeeping.
        current_step = env_self.current_step

        # NOTE: sell-side orders with no shares are flagged as invalid in
        # decoded_info, but are still forwarded — the portfolio owns the
        # final validation/rejection logic.
        if action_type == Actions.Hold:
            pass
        elif action_type == Actions.Buy:
            env_self.portfolio.execute_market_order(action_type, current_price, amount_pct, current_step)
        elif action_type == Actions.Sell:
            if had_no_shares:
                invalid_action_attempt = True
            env_self.portfolio.execute_market_order(action_type, current_price, amount_pct, current_step)
        elif action_type == Actions.LimitBuy:
            env_self.portfolio.place_limit_order(action_type, current_price, amount_pct, price_modifier, current_step)
        elif action_type == Actions.LimitSell:
            if had_no_shares:
                invalid_action_attempt = True
            env_self.portfolio.place_limit_order(action_type, current_price, amount_pct, price_modifier, current_step)
        elif action_type in (Actions.StopLoss, Actions.TakeProfit):
            if had_no_shares:
                invalid_action_attempt = True
            env_self.portfolio.place_risk_management_order(
                action_type, current_price, amount_pct, price_modifier, current_step
            )

        # --- 3. Return decoded info ---
        decoded_info = {
            "type": action_type.name,
            "amount_pct": amount_pct,
            "price_modifier": price_modifier,
            "raw_input": action,
            "invalid_action_attempt": invalid_action_attempt,
        }

        return action_type, decoded_info

define_action_space()

Defines the action space for the trading environment.

Returns:

Type Description
Box

gym.spaces.Box: The action space as a Box space.

Source code in src/quantrl_lab/environments/stock/strategies/actions/standard.py
def define_action_space(self) -> gym.spaces.Box:
    """
    Defines the action space for the trading environment.

    Returns:
        gym.spaces.Box: The action space as a Box space.
    """
    # Channel layout: [action_type, amount, price_modifier].
    # The action-type channel is symmetric in [-1, 1] so that an
    # untrained agent (whose outputs sit near 0) lands on a mid-range
    # action instead of always hitting Action 0 (Hold); a [0, N] range
    # would cause exactly that inactivity. The amount channel is
    # symmetric for the same reason. The price-modifier channel brackets
    # limit prices at 0.9x-1.1x of the current price.
    lows = np.array([-1.0, -1.0, 0.9], dtype=np.float32)
    highs = np.array([1.0, 1.0, 1.1], dtype=np.float32)

    return gym.spaces.Box(low=lows, high=highs, shape=(3,), dtype=np.float32)

handle_action(env_self, action)

Handles the action by decoding it and instructing the environment's portfolio.

Parameters:

Name Type Description Default
env_self TradingEnvProtocol

The environment instance.

required
action ndarray

The raw action from the agent.

required

Returns:

Type Description
Tuple[Any, Dict[str, Any]]

Tuple[Any, Dict[str, Any]]: The decoded action type and a dictionary of details.

Source code in src/quantrl_lab/environments/stock/strategies/actions/standard.py
def handle_action(self, env_self: TradingEnvProtocol, action: np.ndarray) -> Tuple[Any, Dict[str, Any]]:
    """
    Decode the raw agent action and route it to the environment's
    portfolio for execution.

    Args:
        env_self (TradingEnvProtocol): The environment instance.
        action (np.ndarray): The raw action from the agent.

    Returns:
        Tuple[Any, Dict[str, Any]]: The decoded action type and a dictionary of details.
    """
    # --- Decode ---
    # Map the action-type channel from [-1, 1] onto [0, len(Actions) - 1].
    clipped_type = np.clip(action[0], -1.0, 1.0)
    highest_index = len(Actions) - 1
    action_type_int = int(np.round((clipped_type + 1) / 2 * highest_index))

    # Map the amount channel from [-1, 1] onto [0, 1]; a neutral 0.0
    # output therefore means a 50% amount, not 0%.
    amount_pct = (np.clip(action[1], -1.0, 1.0) + 1) / 2
    price_modifier = np.clip(action[2], 0.9, 1.1)

    try:
        action_type = Actions(action_type_int)
    except ValueError:
        action_type = Actions.Hold

    # The environment supplies the current price; a degenerate price
    # forces a Hold.
    current_price = env_self._get_current_price()
    if current_price <= 1e-9:
        action_type = Actions.Hold

    # --- Execute via the portfolio ---
    had_no_shares = env_self.portfolio.total_shares <= 0
    current_step = env_self.current_step
    invalid_action_attempt = False

    if action_type == Actions.Buy:
        env_self.portfolio.execute_market_order(action_type, current_price, amount_pct, current_step)
    elif action_type == Actions.Sell:
        invalid_action_attempt = had_no_shares
        env_self.portfolio.execute_market_order(action_type, current_price, amount_pct, current_step)
    elif action_type == Actions.LimitBuy:
        env_self.portfolio.place_limit_order(action_type, current_price, amount_pct, price_modifier, current_step)
    elif action_type == Actions.LimitSell:
        invalid_action_attempt = had_no_shares
        env_self.portfolio.place_limit_order(action_type, current_price, amount_pct, price_modifier, current_step)
    elif action_type in (Actions.StopLoss, Actions.TakeProfit):
        invalid_action_attempt = had_no_shares
        env_self.portfolio.place_risk_management_order(
            action_type, current_price, amount_pct, price_modifier, current_step
        )
    # Actions.Hold intentionally does nothing.

    return action_type, {
        "type": action_type.name,
        "amount_pct": amount_pct,
        "price_modifier": price_modifier,
        "raw_input": action,
        "invalid_action_attempt": invalid_action_attempt,
    }

time_in_force

TimeInForceActionStrategy

Bases: BaseActionStrategy

Implements an advanced action space with Time-In-Force (TIF) control.

Action: [action_type, amount, price_modifier, tif_type]

TIF Types: 0: GTC (Good Till Cancelled) 1: IOC (Immediate or Cancel) 2: TTL (Time To Live - uses order_expiration_steps)

Source code in src/quantrl_lab/environments/stock/strategies/actions/time_in_force.py
class TimeInForceActionStrategy(BaseActionStrategy):
    """
    Implements an advanced action space with Time-In-Force (TIF)
    control.

    Action: [action_type, amount, price_modifier, tif_type]

    TIF Types:
    0: GTC (Good Till Cancelled)
    1: IOC (Immediate or Cancel)
    2: TTL (Time To Live - uses order_expiration_steps)
    """

    def define_action_space(self) -> gym.spaces.Box:
        """
        Defines the action space for the trading environment.

        Returns:
            gym.spaces.Box: The action space as a Box space.
        """
        # Channel layout: [action_type, amount, price_modifier, tif_type].
        # Categorical channels (action type, TIF) use a symmetric [-1, 1]
        # range to aid exploration; the amount channel is a fraction in
        # [0, 1]; the price-modifier channel brackets the current price
        # at 0.9x-1.1x.
        lows = np.array([-1.0, 0.0, 0.9, -1.0], dtype=np.float32)
        highs = np.array([1.0, 1.0, 1.1, 1.0], dtype=np.float32)

        return gym.spaces.Box(low=lows, high=highs, shape=(4,), dtype=np.float32)

    def handle_action(self, env_self: TradingEnvProtocol, action: np.ndarray) -> Tuple[Any, Dict[str, Any]]:
        """
        Decode the raw agent action (including its TIF flag) and route
        it to the environment's portfolio for execution.

        Args:
            env_self (TradingEnvProtocol): The environment instance.
            action (np.ndarray): The raw action from the agent.

        Returns:
            Tuple[Any, Dict[str, Any]]: The decoded action type and a dictionary of details.
        """
        # --- Decode ---
        # Map the action-type channel from [-1, 1] onto [0, len(Actions) - 1].
        clipped_type = np.clip(action[0], -1.0, 1.0)
        highest_index = len(Actions) - 1
        action_type_int = int(np.round((clipped_type + 1) / 2 * highest_index))

        amount_pct = np.clip(action[1], 0.0, 1.0)
        price_modifier = np.clip(action[2], 0.9, 1.1)

        # Map the TIF channel from [-1, 1] onto the index set {0, 1, 2}.
        clipped_tif = np.clip(action[3], -1.0, 1.0)
        tif_int = int(np.round((clipped_tif + 1) / 2 * 2))
        tif_type = (OrderTIF.GTC, OrderTIF.IOC, OrderTIF.TTL)[tif_int]

        try:
            action_type = Actions(action_type_int)
        except ValueError:
            action_type = Actions.Hold

        # The environment supplies the current price; a degenerate price
        # forces a Hold.
        current_price = env_self._get_current_price()
        if current_price <= 1e-9:
            action_type = Actions.Hold

        # --- Execute via the portfolio ---
        had_no_shares = env_self.portfolio.total_shares <= 0
        current_step = env_self.current_step
        invalid_action_attempt = False

        if action_type == Actions.Buy:
            # Market orders ignore TIF (effectively IOC by definition).
            env_self.portfolio.execute_market_order(action_type, current_price, amount_pct, current_step)
        elif action_type == Actions.Sell:
            invalid_action_attempt = had_no_shares
            env_self.portfolio.execute_market_order(action_type, current_price, amount_pct, current_step)
        elif action_type == Actions.LimitBuy:
            env_self.portfolio.place_limit_order(
                action_type, current_price, amount_pct, price_modifier, current_step, tif=tif_type
            )
        elif action_type == Actions.LimitSell:
            invalid_action_attempt = had_no_shares
            env_self.portfolio.place_limit_order(
                action_type, current_price, amount_pct, price_modifier, current_step, tif=tif_type
            )
        elif action_type in (Actions.StopLoss, Actions.TakeProfit):
            invalid_action_attempt = had_no_shares
            # Stop orders normally rest until triggered, so IOC is an odd
            # fit; the portfolio owns that validation, so the TIF is
            # forwarded untouched.
            env_self.portfolio.place_risk_management_order(
                action_type, current_price, amount_pct, price_modifier, current_step, tif=tif_type
            )
        # Actions.Hold intentionally does nothing.

        return action_type, {
            "type": action_type.name,
            "amount_pct": amount_pct,
            "price_modifier": price_modifier,
            "tif": tif_type.value,
            "raw_input": action,
            "invalid_action_attempt": invalid_action_attempt,
        }

define_action_space()

Defines the action space for the trading environment.

Returns:

Type Description
Box

gym.spaces.Box: The action space as a Box space.

Source code in src/quantrl_lab/environments/stock/strategies/actions/time_in_force.py
def define_action_space(self) -> gym.spaces.Box:
    """
    Defines the action space for the trading environment.

    Returns:
        gym.spaces.Box: The action space as a Box space.
    """
    # Channel layout: [action_type, amount, price_modifier, tif_type].
    # Categorical channels (action type, TIF: 0 GTC / 1 IOC / 2 TTL) use
    # a symmetric [-1, 1] range to aid exploration; the amount channel
    # is a fraction in [0, 1]; the price-modifier channel brackets the
    # current price at 0.9x-1.1x.
    lows = np.array([-1.0, 0.0, 0.9, -1.0], dtype=np.float32)
    highs = np.array([1.0, 1.0, 1.1, 1.0], dtype=np.float32)

    return gym.spaces.Box(low=lows, high=highs, shape=(4,), dtype=np.float32)

handle_action(env_self, action)

Handles the action by decoding it and instructing the environment's portfolio.

Parameters:

Name Type Description Default
env_self TradingEnvProtocol

The environment instance.

required
action ndarray

The raw action from the agent.

required

Returns:

Type Description
Tuple[Any, Dict[str, Any]]

Tuple[Any, Dict[str, Any]]: The decoded action type and a dictionary of details.

Source code in src/quantrl_lab/environments/stock/strategies/actions/time_in_force.py
def handle_action(self, env_self: TradingEnvProtocol, action: np.ndarray) -> Tuple[Any, Dict[str, Any]]:
    """
    Decode the raw agent action (including its TIF flag) and route it
    to the environment's portfolio for execution.

    Args:
        env_self (TradingEnvProtocol): The environment instance.
        action (np.ndarray): The raw action from the agent.

    Returns:
        Tuple[Any, Dict[str, Any]]: The decoded action type and a dictionary of details.
    """
    # --- Decode ---
    # Map the action-type channel from [-1, 1] onto [0, len(Actions) - 1].
    clipped_type = np.clip(action[0], -1.0, 1.0)
    highest_index = len(Actions) - 1
    action_type_int = int(np.round((clipped_type + 1) / 2 * highest_index))

    amount_pct = np.clip(action[1], 0.0, 1.0)
    price_modifier = np.clip(action[2], 0.9, 1.1)

    # Map the TIF channel from [-1, 1] onto the index set {0, 1, 2}.
    clipped_tif = np.clip(action[3], -1.0, 1.0)
    tif_int = int(np.round((clipped_tif + 1) / 2 * 2))
    tif_type = (OrderTIF.GTC, OrderTIF.IOC, OrderTIF.TTL)[tif_int]

    try:
        action_type = Actions(action_type_int)
    except ValueError:
        action_type = Actions.Hold

    # The environment supplies the current price; a degenerate price
    # forces a Hold.
    current_price = env_self._get_current_price()
    if current_price <= 1e-9:
        action_type = Actions.Hold

    # --- Execute via the portfolio ---
    had_no_shares = env_self.portfolio.total_shares <= 0
    current_step = env_self.current_step
    invalid_action_attempt = False

    if action_type == Actions.Buy:
        # Market orders ignore TIF (effectively IOC by definition).
        env_self.portfolio.execute_market_order(action_type, current_price, amount_pct, current_step)
    elif action_type == Actions.Sell:
        invalid_action_attempt = had_no_shares
        env_self.portfolio.execute_market_order(action_type, current_price, amount_pct, current_step)
    elif action_type == Actions.LimitBuy:
        env_self.portfolio.place_limit_order(
            action_type, current_price, amount_pct, price_modifier, current_step, tif=tif_type
        )
    elif action_type == Actions.LimitSell:
        invalid_action_attempt = had_no_shares
        env_self.portfolio.place_limit_order(
            action_type, current_price, amount_pct, price_modifier, current_step, tif=tif_type
        )
    elif action_type in (Actions.StopLoss, Actions.TakeProfit):
        invalid_action_attempt = had_no_shares
        # Stop orders normally rest until triggered, so IOC is an odd
        # fit; the portfolio owns that validation, so the TIF is
        # forwarded untouched.
        env_self.portfolio.place_risk_management_order(
            action_type, current_price, amount_pct, price_modifier, current_step, tif=tif_type
        )
    # Actions.Hold intentionally does nothing.

    return action_type, {
        "type": action_type.name,
        "amount_pct": amount_pct,
        "price_modifier": price_modifier,
        "tif": tif_type.value,
        "raw_input": action,
        "invalid_action_attempt": invalid_action_attempt,
    }

Observation Strategies

feature_aware

FeatureAwareObservationStrategy

Bases: BaseObservationStrategy

Feature-aware observation strategy with smart normalization.

Unlike the standard strategy which normalizes everything relative to the window start, this strategy discriminates between feature types: 1. Price-like (Open, High, Low, Close, SMA, EMA, BB): Normalized relative to the first step in the window. 2. Stationary (RSI, STOCH, MFI, ADX, Time Features): Passed through raw or scaled independently, preserving their absolute values (e.g., Overbought/Oversold levels).

Source code in src/quantrl_lab/environments/stock/strategies/observations/feature_aware.py
class FeatureAwareObservationStrategy(BaseObservationStrategy):
    """
    Feature-aware observation strategy with smart normalization.

    Unlike the standard strategy which normalizes everything relative to the
    window start, this strategy discriminates between feature types:
    1. Price-like (Open, High, Low, Close, SMA, EMA, BB): Normalized relative to the first step in the window.
    2. Stationary (RSI, STOCH, MFI, ADX, Time Features): Passed through raw or scaled
       independently, preserving their absolute values (e.g., Overbought/Oversold levels).
    """

    # Number of portfolio/engineered features appended after the flattened
    # market window. Must stay in sync with the `portfolio_features` array in
    # build_observation() and the names listed in get_feature_names().
    NUM_PORTFOLIO_FEATURES = 9

    def __init__(
        self,
        volatility_lookback: int = 10,
        trend_lookback: int = 10,
        normalize_stationary: bool = True,
    ):
        """
        Args:
            volatility_lookback: Steps to calculate recent volatility.
            trend_lookback: Steps to calculate trend.
            normalize_stationary: If True, attempts to scale known 0-100 indicators to 0-1.
        """
        super().__init__()
        self.volatility_lookback = volatility_lookback
        self.trend_lookback = trend_lookback
        self.normalize_stationary = normalize_stationary

        # Keywords to identify stationary features that shouldn't be relatively normalized
        # (matched case-insensitively as substrings of column names in _identify_columns).
        self.stationary_keywords = {
            "RSI",
            "STOCH",
            "MFI",
            "ADX",
            "WILLR",
            "CCI",
            "ATR",  # Moved ATR here to handle it specially (ATR/Price)
            "MACD",  # Price-difference based, normalise by Close to make it scale-free
            "sentiment",
            "grade",
            "rating",
            "day_sin",
            "day_cos",
            "month_sin",
            "month_cos",
            "time_features",
            "BB_bandwidth",
            "BB_percent",
            "%B",
            "OBV",  # OBV is technically unbounded but we will Z-score it or similar
            "sector",
            "industry",
            "change",
        }

        # Cache for column indices (computed lazily in _identify_columns on
        # the first build_observation call and reused thereafter).
        self._price_cols_idx: List[int] = []
        self._stationary_cols_idx: List[int] = []
        self._initialized_indices = False

    def define_observation_space(self, env: TradingEnvProtocol) -> gym.spaces.Box:
        """Flat, unbounded Box: (window_size * num_features) market values
        followed by NUM_PORTFOLIO_FEATURES portfolio/engineered values."""
        obs_market_shape = env.window_size * env.num_features
        total_obs_dim = obs_market_shape + self.NUM_PORTFOLIO_FEATURES
        return spaces.Box(low=-np.inf, high=np.inf, shape=(total_obs_dim,), dtype=np.float32)

    def _identify_columns(self, env: TradingEnvProtocol) -> None:
        """Identify which columns are stationary vs price-like based on
        names."""
        # Idempotent: only the first call does any work.
        if self._initialized_indices:
            return

        # Default: All price-like if no names available (fallback to old behavior)
        if not hasattr(env, "original_columns") or env.original_columns is None:
            self._price_cols_idx = list(range(env.num_features))
            self._stationary_cols_idx = []
            self._initialized_indices = True
            return

        self._price_cols_idx = []
        self._stationary_cols_idx = []

        for i, col_name in enumerate(env.original_columns):
            is_stationary = False
            col_upper = col_name.upper()

            # Check against keywords (case-insensitive substring match)
            for kw in self.stationary_keywords:
                if kw.upper() in col_upper:
                    is_stationary = True
                    break

            if is_stationary:
                self._stationary_cols_idx.append(i)
            else:
                self._price_cols_idx.append(i)

        self._initialized_indices = True

    def build_observation(self, env: TradingEnvProtocol) -> np.ndarray:
        # Lazy init of indices
        self._identify_columns(env)

        # === 1. Market Window Extraction ===
        # Window covers [current_step - window_size + 1, current_step] inclusive.
        start_idx = max(0, env.current_step - env.window_size + 1)
        end_idx = env.current_step + 1

        # Get raw window. NOTE: when no padding is needed this is a *view* of
        # env.data; the fancy-indexed subsets taken below are copies, so the
        # in-place edits further down never mutate env.data.
        raw_window = env.data[start_idx:end_idx, :]

        # Padding if needed (at start of episode): repeat the first row so
        # the window always has exactly env.window_size rows.
        actual_len = raw_window.shape[0]
        if actual_len < env.window_size:
            if actual_len > 0:
                padding = np.repeat(raw_window[0, :][np.newaxis, :], env.window_size - actual_len, axis=0)
            else:
                padding = np.zeros((env.window_size - actual_len, env.num_features), dtype=env.data.dtype)
            raw_window = np.concatenate((padding, raw_window), axis=0)

        # === 2. Smart Normalization ===
        normalized_window = raw_window.copy()

        # A. Normalize Price-like columns (Relative to first step in window)
        # Formula: value / first_value
        if self._price_cols_idx:
            # Advanced (list) indexing returns a copy, not a view.
            price_subset = raw_window[:, self._price_cols_idx]
            first_step_prices = price_subset[0, :]

            # Avoid division by zero
            denominator = np.where(np.abs(first_step_prices) < 1e-9, 1.0, first_step_prices)

            norm_prices = price_subset / denominator
            # Zero out where denominator was effectively zero (to avoid massive explosions)
            norm_prices[:, np.abs(first_step_prices) < 1e-9] = 0.0

            normalized_window[:, self._price_cols_idx] = norm_prices

        # B. Normalize Stationary columns
        if self._stationary_cols_idx:
            # Copy via advanced indexing; edited in place, written back below.
            stationary_subset = raw_window[:, self._stationary_cols_idx]

            if self.normalize_stationary and hasattr(env, "original_columns"):
                price_col_idx = env.price_column_index
                # We need close prices for ATR normalization
                close_prices = raw_window[:, price_col_idx]

                for local_idx, global_idx in enumerate(self._stationary_cols_idx):
                    col_name = env.original_columns[global_idx].upper()

                    # 1. Oscillators (0-100) -> Scale to 0-1
                    if any(x in col_name for x in ["RSI", "STOCH", "MFI", "ADX"]):
                        stationary_subset[:, local_idx] = stationary_subset[:, local_idx] / 100.0

                    # 2. Williams %R (-100 to 0) -> Scale to 0-1
                    elif "WILLR" in col_name:
                        stationary_subset[:, local_idx] = (stationary_subset[:, local_idx] + 100.0) / 100.0

                    # 3. CCI (Unbounded, typ +/- 200) -> Scale roughly to -1 to 1
                    elif "CCI" in col_name:
                        stationary_subset[:, local_idx] = stationary_subset[:, local_idx] / 200.0

                    # 4. ATR (Price-based volatility) -> Normalize by Close Price
                    elif "ATR" in col_name:
                        # ATR / Close = Percentage Volatility
                        stationary_subset[:, local_idx] = stationary_subset[:, local_idx] / (close_prices + 1e-9)

                    # 5. MACD (EMA difference, price-denominated) -> Normalize by Close Price
                    elif "MACD" in col_name:
                        # MACD / Close = scale-free momentum signal
                        stationary_subset[:, local_idx] = stationary_subset[:, local_idx] / (close_prices + 1e-9)

                    # 6. OBV (Unbounded Volume) -> Z-Score locally
                    elif "OBV" in col_name:
                        vals = stationary_subset[:, local_idx]
                        mean = np.mean(vals)
                        std = np.std(vals) + 1e-9
                        stationary_subset[:, local_idx] = (vals - mean) / std

            # Columns whose names matched no branch above pass through raw.
            normalized_window[:, self._stationary_cols_idx] = stationary_subset

        # === 3. Portfolio State (Standard Logic) ===
        current_price = env._get_current_price()
        total_shares = env.portfolio.total_shares

        # All position-derived features default to 0.0 when flat.
        position_size_ratio, unrealized_pl_pct, risk_reward_ratio, dist_stop, dist_target = 0.0, 0.0, 0.0, 0.0, 0.0

        if total_shares > 0:
            portfolio_value = env.portfolio.get_value(current_price)
            if portfolio_value > 1e-9:
                position_size_ratio = (total_shares * current_price) / portfolio_value

            # NOTE(review): this averages every buy fill in the full history,
            # not just fills belonging to the currently-open position — confirm
            # this is the intended entry-price estimate.
            entry_prices = [
                o["price"]
                for o in env.portfolio.executed_orders_history
                if o["type"] in ["market_buy", "limit_buy_executed"]
            ]
            avg_entry = np.mean(entry_prices) if entry_prices else current_price

            if avg_entry > 1e-9:
                unrealized_pl_pct = (current_price - avg_entry) / avg_entry

            # Risk metrics
            sl_orders = env.portfolio.stop_loss_orders
            tp_orders = env.portfolio.take_profit_orders
            if sl_orders and tp_orders:
                avg_sl = np.mean([o.price for o in sl_orders])
                avg_tp = np.mean([o.price for o in tp_orders])
                if abs(current_price - avg_sl) > 1e-9:
                    risk_reward_ratio = (avg_tp - current_price) / (current_price - avg_sl)
                if current_price > 1e-9:
                    dist_stop = (current_price - avg_sl) / current_price
                    dist_target = (avg_tp - current_price) / current_price

        # === 4. Feature Engineering (Volatility & Trend) ===
        recent_slice = env.data[max(0, env.current_step - self.volatility_lookback + 1) : end_idx]
        price_col = env.price_column_index

        recent_high = np.max(recent_slice[:, price_col]) if len(recent_slice) > 0 else current_price
        recent_low = np.min(recent_slice[:, price_col]) if len(recent_slice) > 0 else current_price

        # Price position within recent range (0.0 to 1.0)
        price_range = recent_high - recent_low
        price_pos = (current_price - recent_low) / price_range if price_range > 1e-9 else 0.5

        # Volatility = std dev of simple one-step returns over the lookback.
        volatility = 0.0
        if len(recent_slice) > 1:
            rets = np.diff(recent_slice[:, price_col]) / recent_slice[:-1, price_col]
            volatility = np.std(rets)

        trend_start_idx = max(0, env.current_step - self.trend_lookback + 1)
        trend_slice = env.data[trend_start_idx:end_idx, price_col]
        trend = calc_trend(trend_slice)

        # Order here must match get_feature_names() and NUM_PORTFOLIO_FEATURES.
        portfolio_features = np.array(
            [
                env.portfolio.balance / env.portfolio.initial_balance,
                position_size_ratio,
                unrealized_pl_pct,
                price_pos,
                volatility,
                trend,
                risk_reward_ratio,
                dist_stop,
                dist_target,
            ],
            dtype=np.float32,
        )

        return np.concatenate((normalized_window.flatten(), portfolio_features))

    def get_feature_names(self, env: TradingEnvProtocol) -> List[str]:
        """
        Generates the ordered list of feature names corresponding to the
        observation vector.

        The observation space consists of:
        1. The flattened market window (oldest step to newest step)
        2. The portfolio & engineering features
        """
        feature_names = []

        # 1. Market Window Features
        # The window is flattened row by row, from start_idx to end_idx.
        # So we iterate through the window steps: t-(window_size-1) up to t
        original_cols = getattr(env, "original_columns", [f"Feature_{i}" for i in range(env.num_features)])

        for step_idx in range(env.window_size):
            lag = env.window_size - 1 - step_idx
            time_label = "t" if lag == 0 else f"t-{lag}"

            for col in original_cols:
                feature_names.append(f"{col}_{time_label}")

        # 2. Portfolio Features (Must match the exact order in build_observation)
        portfolio_features = [
            "portfolio_balance_ratio",
            "position_size_ratio",
            "unrealized_pl_pct",
            "price_pos_in_range",
            "recent_volatility",
            "recent_trend",
            "risk_reward_ratio",
            "dist_to_stop_loss",
            "dist_to_take_profit",
        ]

        feature_names.extend(portfolio_features)

        return feature_names

__init__(volatility_lookback=10, trend_lookback=10, normalize_stationary=True)

Parameters:

Name Type Description Default
volatility_lookback int

Steps to calculate recent volatility.

10
trend_lookback int

Steps to calculate trend.

10
normalize_stationary bool

If True, attempts to scale known 0-100 indicators to 0-1.

True
Source code in src/quantrl_lab/environments/stock/strategies/observations/feature_aware.py
def __init__(
    self,
    volatility_lookback: int = 10,
    trend_lookback: int = 10,
    normalize_stationary: bool = True,
):
    """
    Args:
        volatility_lookback: Number of steps used for the recent-volatility estimate.
        trend_lookback: Number of steps used for the trend estimate.
        normalize_stationary: When True, known bounded indicators (e.g. 0-100
            oscillators) are rescaled to compact ranges such as 0-1.
    """
    super().__init__()
    self.normalize_stationary = normalize_stationary
    self.volatility_lookback = volatility_lookback
    self.trend_lookback = trend_lookback

    # Column-name substrings marking "stationary" features, i.e. features
    # that must NOT be normalized relative to the window's first value.
    self.stationary_keywords = {
        # Bounded oscillators
        "RSI", "STOCH", "MFI", "ADX", "WILLR", "CCI",
        # Price-denominated signals (divided by Close to become scale-free)
        "ATR", "MACD",
        # Bollinger-derived ratios
        "BB_bandwidth", "BB_percent", "%B",
        # Volume flow (z-scored locally)
        "OBV",
        # Cyclical time encodings
        "day_sin", "day_cos", "month_sin", "month_cos", "time_features",
        # Qualitative / categorical / already-relative features
        "sentiment", "grade", "rating", "sector", "industry", "change",
    }

    # Lazily-populated cache of column-index partitions.
    self._price_cols_idx: List[int] = []
    self._stationary_cols_idx: List[int] = []
    self._initialized_indices = False

get_feature_names(env)

Generates the ordered list of feature names corresponding to the observation vector.

The observation space consists of: 1. The flattened market window (oldest step to newest step) 2. The portfolio & engineering features

Source code in src/quantrl_lab/environments/stock/strategies/observations/feature_aware.py
def get_feature_names(self, env: TradingEnvProtocol) -> List[str]:
    """
    Return feature names in the exact order of the observation vector.

    Layout:
    1. The flattened market window, oldest step first (t-(window_size-1) .. t).
    2. The portfolio & engineered features.
    """
    # Fall back to synthetic names when the env carries no column labels.
    market_cols = getattr(env, "original_columns", [f"Feature_{i}" for i in range(env.num_features)])

    names: List[str] = []

    # 1. Market window: one label per (step, column), rows flattened in order.
    for offset in range(env.window_size):
        lag = env.window_size - 1 - offset
        suffix = "t" if lag == 0 else f"t-{lag}"
        names.extend(f"{col}_{suffix}" for col in market_cols)

    # 2. Portfolio features — must mirror the order used in build_observation.
    names += [
        "portfolio_balance_ratio",
        "position_size_ratio",
        "unrealized_pl_pct",
        "price_pos_in_range",
        "recent_volatility",
        "recent_trend",
        "risk_reward_ratio",
        "dist_to_stop_loss",
        "dist_to_take_profit",
    ]

    return names

Reward Strategies

portfolio_value

PortfolioValueChangeReward

Bases: BaseRewardStrategy

Calculates reward based on the % change in portfolio value.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/portfolio_value.py
class PortfolioValueChangeReward(BaseRewardStrategy):
    """Reward equal to the fractional change in portfolio value per step."""

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """
        Compute the percentage change in portfolio value since the
        previous step, reading the current value from the environment's
        portfolio component.

        Args:
            env (TradingEnvProtocol): The environment instance.

        Returns:
            float: (current - previous) / previous, or 0.0 when the
                previous value is effectively zero.
        """
        latest_value = env.portfolio.get_value(env._get_current_price())
        baseline = env.prev_portfolio_value

        # Guard against dividing by an (effectively) zero baseline.
        if baseline <= 1e-9:
            return 0.0
        return (latest_value - baseline) / baseline

calculate_reward(env)

Calculates the reward based on the percentage change in portfolio value.

This method now correctly interacts with the environment's portfolio component to get the current value.

Parameters:

Name Type Description Default
env TradingEnvProtocol

The environment instance.

required

Returns:

Name Type Description
float float

The percentage change in portfolio value since the previous step.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/portfolio_value.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """
    Compute the percentage change in portfolio value since the previous
    step, reading the current value from the environment's portfolio
    component.

    Args:
        env (TradingEnvProtocol): The environment instance.

    Returns:
        float: (current - previous) / previous, or 0.0 when the previous
            value is effectively zero.
    """
    latest_value = env.portfolio.get_value(env._get_current_price())
    baseline = env.prev_portfolio_value

    # Guard against dividing by an (effectively) zero baseline.
    if baseline <= 1e-9:
        return 0.0
    return (latest_value - baseline) / baseline

sortino

DifferentialSortinoReward

Bases: BaseRewardStrategy

Reward strategy based on the Differential Sortino Ratio.

Unlike the standard Sortino Ratio which is calculated over a fixed period, the Differential Sortino Ratio provides a dense reward signal at each step, representing the contribution of the current return to the overall Sortino Ratio.

It penalizes downside volatility (returns below target) while rewarding positive returns.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/sortino.py
class DifferentialSortinoReward(BaseRewardStrategy):
    """
    Dense, per-step reward derived from the Sortino Ratio.

    Rather than computing a Sortino Ratio over a fixed window, each step
    emits the current return scaled by an exponentially-decayed estimate
    of downside deviation (returns below the target). Positive returns
    are rewarded; downside volatility inflates the denominator and damps
    the signal.
    """

    def __init__(self, target_return: float = 0.0, decay: float = 0.99):
        """
        Args:
            target_return: Minimum acceptable return (MAR); anything below it
                counts as downside risk.
            decay: Exponential decay factor (0 < decay < 1) for the running
                statistics; values near 1 give longer memory.
        """
        super().__init__()
        self.target_return = target_return
        self.decay = decay

        # Exponentially-decayed running statistics.
        self._step_count = 0
        self._mean_return = 0.0
        self._mean_downside_sq = 0.0  # running mean of squared downside deviations

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """
        Return the differential Sortino reward for the current step.

        Ref: "Online Learning of the Differential Sharpe Ratio" logic adapted for Sortino.
        """
        # Per-step portfolio return.
        value_now = env.portfolio.get_value(env._get_current_price())
        value_prev = env.prev_portfolio_value
        ret = 0.0 if value_prev <= 1e-9 else (value_now - value_prev) / value_prev

        # Only sub-target returns contribute to downside risk.
        downside = min(0, ret - self.target_return)

        # Update the decayed statistics (seed them on the very first step).
        if self._step_count == 0:
            self._mean_return = ret
            self._mean_downside_sq = downside**2
        else:
            dt = 1.0 - self.decay
            self._mean_return = (1 - dt) * self._mean_return + dt * ret
            self._mean_downside_sq = (1 - dt) * self._mean_downside_sq + dt * (downside**2)

        self._step_count += 1

        # Stable proxy for the differential Sortino used in trading RL:
        # current return scaled by the running downside deviation. The
        # epsilon keeps the division defined when no downside has occurred.
        downside_dev = np.sqrt(self._mean_downside_sq) + 1e-9

        # Clip so a near-zero downside deviation cannot explode the reward.
        raw_reward = ret / downside_dev
        return float(np.clip(raw_reward, -10.0, 10.0))

    def on_step_end(self, env: TradingEnvProtocol):
        pass

    def reset(self):
        """Clear the running statistics before a new episode."""
        self._step_count = 0
        self._mean_return = 0.0
        self._mean_downside_sq = 0.0

__init__(target_return=0.0, decay=0.99)

Parameters:

Name Type Description Default
target_return float

Minimum acceptable return (MAR). Returns below this are considered downside risk.

0.0
decay float

Decay factor for the moving average of returns and downside deviation (0 < decay < 1). Closer to 1 means longer memory.

0.99
Source code in src/quantrl_lab/environments/stock/strategies/rewards/sortino.py
def __init__(self, target_return: float = 0.0, decay: float = 0.99):
    """
    Args:
        target_return: Minimum acceptable return (MAR); anything below it
            counts as downside risk.
        decay: Exponential decay factor (0 < decay < 1) for the running
            statistics; values near 1 give longer memory.
    """
    super().__init__()
    self.decay = decay
    self.target_return = target_return

    # Exponentially-decayed running statistics.
    self._step_count = 0
    self._mean_return = 0.0
    self._mean_downside_sq = 0.0  # running mean of squared downside deviations

calculate_reward(env)

Calculate the differential Sortino reward.

Ref: "Online Learning of the Differential Sharpe Ratio" logic adapted for Sortino.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/sortino.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """
    Return the differential Sortino reward for the current step.

    Ref: "Online Learning of the Differential Sharpe Ratio" logic adapted for Sortino.
    """
    # Per-step portfolio return.
    value_now = env.portfolio.get_value(env._get_current_price())
    value_prev = env.prev_portfolio_value
    ret = 0.0 if value_prev <= 1e-9 else (value_now - value_prev) / value_prev

    # Only sub-target returns contribute to downside risk.
    downside = min(0, ret - self.target_return)

    # Update the decayed statistics (seed them on the very first step).
    if self._step_count == 0:
        self._mean_return = ret
        self._mean_downside_sq = downside**2
    else:
        dt = 1.0 - self.decay
        self._mean_return = (1 - dt) * self._mean_return + dt * ret
        self._mean_downside_sq = (1 - dt) * self._mean_downside_sq + dt * (downside**2)

    self._step_count += 1

    # Stable proxy for the differential Sortino used in trading RL:
    # current return scaled by the running downside deviation. The epsilon
    # keeps the division defined when no downside has occurred yet.
    downside_dev = np.sqrt(self._mean_downside_sq) + 1e-9

    # Clip so a near-zero downside deviation cannot explode the reward.
    raw_reward = ret / downside_dev
    return float(np.clip(raw_reward, -10.0, 10.0))

reset()

Reset internal statistics.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/sortino.py
def reset(self):
    """Clear the running statistics before a new episode."""
    self._step_count = 0
    self._mean_return = 0.0
    self._mean_downside_sq = 0.0

sharpe

DifferentialSharpeReward

Bases: BaseRewardStrategy

Reward strategy based on the Differential Sharpe Ratio.

Provides a dense reward signal at each step, representing the contribution of the current return to the overall Sharpe Ratio.

It rewards high returns and penalizes total volatility.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/sharpe.py
class DifferentialSharpeReward(BaseRewardStrategy):
    """
    Reward strategy based on the Differential Sharpe Ratio.

    Provides a dense reward signal at each step, representing the
    contribution of the current return to the overall Sharpe Ratio.

    It rewards high returns and penalizes total volatility.
    """

    def __init__(self, risk_free_rate: float = 0.0, decay: float = 0.99):
        """
        Args:
            risk_free_rate: The risk-free rate (per step) to subtract from returns.
                            Defaults to 0 assuming short time steps.
            decay: Decay factor for the moving average of returns and variance.
                   0 < decay < 1.
        """
        super().__init__()
        self.risk_free_rate = risk_free_rate
        self.decay = decay

        # Moving statistics
        self._mean_return = 0.0
        self._mean_sq_return = 0.0  # E[x^2]
        self._step_count = 0

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """
        Calculate the differential Sharpe reward.

        Args:
            env (TradingEnvProtocol): The environment instance.

        Returns:
            float: Excess return scaled by the running volatility,
                clipped to [-10.0, 10.0].
        """
        # Calculate current step return
        current_price = env._get_current_price()
        current_val = env.portfolio.get_value(current_price)
        prev_val = env.prev_portfolio_value

        if prev_val <= 1e-9:
            ret = 0.0
        else:
            ret = (current_val - prev_val) / prev_val

        # Excess return
        excess_ret = ret - self.risk_free_rate

        # Update moving averages using exponential decay
        if self._step_count == 0:
            self._mean_return = excess_ret
            self._mean_sq_return = excess_ret**2
        else:
            dt = 1.0 - self.decay
            self._mean_return = (1 - dt) * self._mean_return + dt * excess_ret
            self._mean_sq_return = (1 - dt) * self._mean_sq_return + dt * (excess_ret**2)

        self._step_count += 1

        # Calculate Variance: E[x^2] - (E[x])^2
        # (clamped: float error can make it slightly negative)
        variance = max(0.0, self._mean_sq_return - (self._mean_return**2))
        std_dev = np.sqrt(variance) + 1e-9

        # A stable proxy for Differential Sharpe in RL context:
        # Reward = Excess_Return / Moving_Std_Dev
        #
        # Fix: clip the result, mirroring DifferentialSortinoReward. On the
        # very first step the variance is exactly 0 (the mean equals the only
        # sample), so std_dev collapses to the 1e-9 epsilon and the unclipped
        # reward could reach ~1e8 * ret, destabilizing training.
        return float(np.clip(excess_ret / std_dev, -10.0, 10.0))

    def on_step_end(self, env: TradingEnvProtocol):
        pass

    def reset(self):
        """Reset internal statistics."""
        self._mean_return = 0.0
        self._mean_sq_return = 0.0
        self._step_count = 0

__init__(risk_free_rate=0.0, decay=0.99)

Parameters:

Name Type Description Default
risk_free_rate float

The risk-free rate (per step) to subtract from returns. Defaults to 0 assuming short time steps.

0.0
decay float

Decay factor for the moving average of returns and variance. 0 < decay < 1.

0.99
Source code in src/quantrl_lab/environments/stock/strategies/rewards/sharpe.py
def __init__(self, risk_free_rate: float = 0.0, decay: float = 0.99):
    """
    Args:
        risk_free_rate: Per-step risk-free rate subtracted from returns;
            defaults to 0 for short time steps.
        decay: Exponential decay factor (0 < decay < 1) for the running
            return and variance estimates.
    """
    super().__init__()
    self.decay = decay
    self.risk_free_rate = risk_free_rate

    # Exponentially-decayed running statistics.
    self._step_count = 0
    self._mean_return = 0.0
    self._mean_sq_return = 0.0  # running E[x^2]

calculate_reward(env)

Calculate the differential Sharpe reward.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/sharpe.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """
    Calculate the differential Sharpe reward.

    Args:
        env (TradingEnvProtocol): The environment instance.

    Returns:
        float: Excess return scaled by the running volatility,
            clipped to [-10.0, 10.0].
    """
    # Calculate current step return
    current_price = env._get_current_price()
    current_val = env.portfolio.get_value(current_price)
    prev_val = env.prev_portfolio_value

    if prev_val <= 1e-9:
        ret = 0.0
    else:
        ret = (current_val - prev_val) / prev_val

    # Excess return
    excess_ret = ret - self.risk_free_rate

    # Update moving averages using exponential decay
    if self._step_count == 0:
        self._mean_return = excess_ret
        self._mean_sq_return = excess_ret**2
    else:
        dt = 1.0 - self.decay
        self._mean_return = (1 - dt) * self._mean_return + dt * excess_ret
        self._mean_sq_return = (1 - dt) * self._mean_sq_return + dt * (excess_ret**2)

    self._step_count += 1

    # Calculate Variance: E[x^2] - (E[x])^2
    # (clamped: float error can make it slightly negative)
    variance = max(0.0, self._mean_sq_return - (self._mean_return**2))
    std_dev = np.sqrt(variance) + 1e-9

    # A stable proxy for Differential Sharpe in RL context:
    # Reward = Excess_Return / Moving_Std_Dev
    #
    # Fix: clip the result, mirroring DifferentialSortinoReward. On the very
    # first step the variance is exactly 0 (the mean equals the only sample),
    # so std_dev collapses to the 1e-9 epsilon and the unclipped reward could
    # reach ~1e8 * ret, destabilizing training.
    return float(np.clip(excess_ret / std_dev, -10.0, 10.0))

reset()

Reset internal statistics.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/sharpe.py
def reset(self):
    """Clear the running statistics before a new episode."""
    self._step_count = 0
    self._mean_return = 0.0
    self._mean_sq_return = 0.0

drawdown

DrawdownPenaltyReward

Bases: BaseRewardStrategy

Penalizes the agent proportional to the current drawdown depth.

This provides a continuous pressure to recover from losses. Reward = - (Current_Drawdown_Pct * penalty_factor)

Source code in src/quantrl_lab/environments/stock/strategies/rewards/drawdown.py
class DrawdownPenaltyReward(BaseRewardStrategy):
    """
    Applies a negative reward proportional to how far the portfolio
    currently sits below its high-water mark.

    This provides a continuous pressure to recover from losses.
    Reward = - (Current_Drawdown_Pct * penalty_factor)
    """

    def __init__(self, penalty_factor: float = 1.0):
        """
        Args:
            penalty_factor: Scaling factor for the penalty.
        """
        super().__init__()
        self.penalty_factor = penalty_factor
        self._max_portfolio_value = 0.0  # high-water mark, lazily initialized

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """Return the (negative) drawdown penalty for this step."""
        value = env.portfolio.get_value(env._get_current_price())

        # Lazily seed the high-water mark on the first observed value
        # (also covers a portfolio reset without a strategy reset).
        if self._max_portfolio_value == 0.0:
            # Clamp to a tiny positive number so the division below is safe.
            self._max_portfolio_value = value if value > 1e-9 else 1e-9

        # Ratchet the high-water mark upward.
        self._max_portfolio_value = max(self._max_portfolio_value, value)

        # Fractional drawdown from the peak, clamped non-negative to
        # absorb floating-point noise.
        dd = max(0.0, (self._max_portfolio_value - value) / self._max_portfolio_value)
        return -(dd * self.penalty_factor)

    def on_step_end(self, env: TradingEnvProtocol):
        pass

    def reset(self):
        """Reset internal high-water mark."""
        self._max_portfolio_value = 0.0

__init__(penalty_factor=1.0)

Parameters:

Name Type Description Default
penalty_factor float

Scaling factor for the penalty.

1.0
Source code in src/quantrl_lab/environments/stock/strategies/rewards/drawdown.py
def __init__(self, penalty_factor: float = 1.0):
    """
    Initialize the drawdown penalty strategy.

    Args:
        penalty_factor: Scaling factor for the penalty.
    """
    super().__init__()
    self.penalty_factor = penalty_factor
    # High-water mark of the portfolio value; lazily initialized on the
    # first call to calculate_reward.
    self._max_portfolio_value = 0.0

calculate_reward(env)

Calculate drawdown penalty.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/drawdown.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """Return -(current drawdown fraction * penalty_factor) for this step."""
    current_price = env._get_current_price()
    current_val = env.portfolio.get_value(current_price)

    # Initialize max value if first step (or if portfolio was reset but strategy wasn't)
    if self._max_portfolio_value == 0.0:
        self._max_portfolio_value = current_val
        # Avoid division by zero if starting with 0 balance (unlikely)
        if self._max_portfolio_value <= 1e-9:
            self._max_portfolio_value = 1e-9

    # Ratchet the high-water mark upward.
    if current_val > self._max_portfolio_value:
        self._max_portfolio_value = current_val

    # Drawdown as a fraction of the peak value.
    drawdown_pct = (self._max_portfolio_value - current_val) / self._max_portfolio_value

    # Ensure drawdown is non-negative (it should be by definition, but float errors exist)
    drawdown_pct = max(0.0, drawdown_pct)

    return -(drawdown_pct * self.penalty_factor)

reset()

Reset internal high-water mark.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/drawdown.py
def reset(self):
    """Reset the internal high-water mark so a new episode starts fresh."""
    self._max_portfolio_value = 0.0

turnover

TurnoverPenaltyReward

Bases: BaseRewardStrategy

Penalizes excessive trading by applying a multiple of the fees paid.

While PnL implicitly accounts for fees, an explicit penalty helps the agent learn "efficiency" faster, discouraging noise trading where the profit margin is razor-thin compared to the cost.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/turnover.py
class TurnoverPenaltyReward(BaseRewardStrategy):
    """
    Penalizes excessive trading by applying a multiple of the fees paid.

    While PnL implicitly accounts for fees, an explicit penalty helps
    the agent learn "efficiency" faster, discouraging noise trading
    where the profit margin is razor-thin compared to the cost.
    """

    def __init__(self, penalty_factor: float = 1.0):
        """
        Args:
            penalty_factor: Multiplier for fees paid.
                            1.0 means penalty = fees (doubling the cost impact).
                            5.0 means extremely high penalty for churning.
        """
        super().__init__()  # consistent with the other reward strategies
        self.penalty_factor = penalty_factor

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """
        Calculate penalty based on transaction costs incurred in this
        step.

        Scans ``executed_orders_history`` backwards for events logged at
        the current step and approximates the fee portion of each fill.

        Args:
            env (TradingEnvProtocol): The trading environment instance.

        Returns:
            float: Non-positive reward equal to -(estimated fees * penalty_factor).
        """
        fees_paid = 0.0

        if env.portfolio.executed_orders_history:
            # History is appended chronologically, so iterating backwards
            # lets us stop at the first event from an earlier step.
            for event in reversed(env.portfolio.executed_orders_history):
                if event["step"] != env.current_step:
                    break

                # Only actual fills incur a fee: expiration events are also
                # logged at the current step (see OrderExpirationPenaltyReward)
                # and must not be charged as turnover.
                # NOTE(review): if the portfolio also logs order *placement*
                # events carrying price/shares, they should be excluded here
                # too - confirm against StockPortfolio's event types.
                if "expired" in event.get("type", ""):
                    continue

                # StockPortfolio does not log the raw fee, but it applies
                #   Buy:  cost    = shares * price * (1 + fee_pct)
                #   Sell: revenue = shares * price * (1 - fee_pct)
                # so the fee is exactly shares * price * fee_pct.
                traded_value = event.get("price", 0.0) * event.get("shares", 0)
                fees_paid += traded_value * env.portfolio.transaction_cost_pct

        return -(fees_paid * self.penalty_factor)

__init__(penalty_factor=1.0)

Parameters:

Name Type Description Default
penalty_factor float

Multiplier for fees paid. 1.0 means penalty = fees (doubling the cost impact). 5.0 means extremely high penalty for churning.

1.0
Source code in src/quantrl_lab/environments/stock/strategies/rewards/turnover.py
def __init__(self, penalty_factor: float = 1.0):
    """
    Initialize the turnover penalty strategy.

    Args:
        penalty_factor: Multiplier for fees paid.
                        1.0 means penalty = fees (doubling the cost impact).
                        5.0 means extremely high penalty for churning.
    """
    self.penalty_factor = penalty_factor

calculate_reward(env)

Calculate penalty based on transaction costs incurred in this step.

We look at the executed_orders_history for events that happened at the current step.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/turnover.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """
    Calculate penalty based on transaction costs incurred in this
    step.

    We look at the executed_orders_history for events that happened
    at the current step.
    """
    fees_paid = 0.0

    # Check if history exists
    if env.portfolio.executed_orders_history:
        # History is appended chronologically, so iterate backwards and
        # stop at the first event from an earlier step.
        for event in reversed(env.portfolio.executed_orders_history):
            if event["step"] != env.current_step:
                break

            # StockPortfolio does not log the raw fee, but it applies
            #   Buy:  cost    = shares * price * (1 + fee_pct)
            #   Sell: revenue = shares * price * (1 - fee_pct)
            # so the fee is approximated as shares * price * fee_pct.
            # NOTE(review): non-fill events (e.g. "*_expired") logged at
            # this step are also counted here if they carry price/shares -
            # confirm whether they should be skipped.

            price = event.get("price", 0.0)
            shares = event.get("shares", 0)
            value = price * shares

            # Use the environment's configured transaction cost
            # env.portfolio.transaction_cost_pct is available
            fees_paid += value * env.portfolio.transaction_cost_pct

    return -(fees_paid * self.penalty_factor)

invalid_action

InvalidActionPenalty

Bases: BaseRewardStrategy

Applies a fixed penalty for attempting an invalid action.

E.g. Sell or Limit Sell when no shares are held.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/invalid_action.py
class InvalidActionPenalty(BaseRewardStrategy):
    """
    Applies a fixed penalty for attempting an invalid action.

    E.g. Sell or Limit Sell when no shares are held.
    """

    def __init__(self, penalty: float = -1.0):
        # Reward returned whenever the environment flags an invalid attempt.
        self.penalty = penalty

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """
        Return the configured penalty when the last decoded action was
        flagged as invalid, otherwise 0.0.

        Args:
            env (TradingEnvProtocol): The trading environment instance.

        Returns:
            float: ``self.penalty`` on an invalid attempt, else 0.0.
        """
        attempted_invalid = env.decoded_action_info.get("invalid_action_attempt", False)
        return self.penalty if attempted_invalid else 0.0

calculate_reward(env)

Calculate the reward based on the action taken in the environment. If an invalid action is attempted, a penalty is applied.

Parameters:

Name Type Description Default
env TradingEnvProtocol

The trading environment instance.

required

Returns:

Name Type Description
float float

The penalty for invalid action attempt.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/invalid_action.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """
    Calculate the reward based on the action taken in the
    environment. If an invalid action is attempted, a penalty is
    applied.

    Args:
        env (TradingEnvProtocol): The trading environment instance.

    Returns:
        float: The penalty for invalid action attempt.
    """
    # The action handler records the flag in decoded_action_info when it
    # rejects the agent's action (e.g. selling with no shares).
    if env.decoded_action_info.get("invalid_action_attempt", False):
        return self.penalty
    return 0.0

boredom

BoredomPenaltyReward

Bases: BaseRewardStrategy

Penalizes the agent for holding a position too long without significant price movement or profit.

This encourages the agent to: 1. Enter trades only when a move is expected soon. 2. Exit stale positions rather than holding them indefinitely hoping for a turnaround.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/boredom.py
class BoredomPenaltyReward(BaseRewardStrategy):
    """
    Penalizes the agent for holding a position too long without
    significant price movement or profit.

    This encourages the agent to:
    1. Enter trades only when a move is expected soon.
    2. Exit stale positions rather than holding them indefinitely hoping for a turnaround.
    """

    def __init__(self, penalty_per_step: float = -0.001, grace_period: int = 10, min_profit_pct: float = 0.005):
        """
        Args:
            penalty_per_step: The negative reward to apply per step after the grace period.
            grace_period: Number of steps a position can be held without penalty.
            min_profit_pct: The minimum unrealized profit % required to reset the boredom timer.
                            NOTE: currently unused - the portfolio does not expose an average
                            entry price, so the "let winners run" exemption is not implemented yet.
        """
        super().__init__()
        self.penalty_per_step = penalty_per_step
        self.grace_period = grace_period
        self.min_profit_pct = min_profit_pct  # reserved for future unrealized-PnL logic

        self._steps_held = 0     # consecutive steps the current position has been open
        self._entry_price = 0.0  # reserved: entry price is not currently tracked

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """Return the boredom penalty for this step.

        A flat position resets the holding counter and yields 0.0; an
        open position is charged ``penalty_per_step`` once it has been
        held for more than ``grace_period`` steps.
        """
        # Flat position: reset the timer, no penalty.
        if env.portfolio.total_shares <= 0:
            self._steps_held = 0
            self._entry_price = 0.0
            return 0.0

        # The open position ages by one step.
        self._steps_held += 1

        # Holding is free within the grace period.
        if self._steps_held <= self.grace_period:
            return 0.0

        # Stale position past the grace period: flat per-step penalty.
        # TODO: once an average entry price is available from the portfolio,
        # skip the penalty when unrealized profit exceeds min_profit_pct.
        return self.penalty_per_step

    def on_step_end(self, env: TradingEnvProtocol):
        """Reset the holding counter once the position has been closed."""
        # NOTE(review): uses == 0 while calculate_reward uses <= 0; harmless
        # because calculate_reward also resets, but worth unifying.
        if env.portfolio.total_shares == 0:
            self._steps_held = 0

    def reset(self):
        """Reset internal holding state."""
        self._steps_held = 0
        self._entry_price = 0.0

__init__(penalty_per_step=-0.001, grace_period=10, min_profit_pct=0.005)

Parameters:

Name Type Description Default
penalty_per_step float

The negative reward to apply per step after the grace period.

-0.001
grace_period int

Number of steps a position can be held without penalty.

10
min_profit_pct float

The minimum unrealized profit % required to reset the boredom timer. If the position is profitable enough, we don't penalize holding (letting winners run).

0.005
Source code in src/quantrl_lab/environments/stock/strategies/rewards/boredom.py
def __init__(self, penalty_per_step: float = -0.001, grace_period: int = 10, min_profit_pct: float = 0.005):
    """
    Initialize the boredom penalty strategy.

    Args:
        penalty_per_step: The negative reward to apply per step after the grace period.
        grace_period: Number of steps a position can be held without penalty.
        min_profit_pct: The minimum unrealized profit % required to reset the boredom timer.
                        NOTE: not consulted by calculate_reward yet - reserved for
                        future unrealized-PnL logic.
    """
    super().__init__()
    self.penalty_per_step = penalty_per_step
    self.grace_period = grace_period
    self.min_profit_pct = min_profit_pct

    # Consecutive steps the current position has been open.
    self._steps_held = 0
    # Reserved: entry price is not currently tracked.
    self._entry_price = 0.0

calculate_reward(env)

Calculate the boredom penalty.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/boredom.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """Return the boredom penalty for this step.

    A flat position resets the holding counter and yields 0.0; an open
    position is charged ``penalty_per_step`` once it has been held for
    more than ``grace_period`` steps.
    """
    # Flat position: reset the timer, no penalty.
    if env.portfolio.total_shares <= 0:
        self._steps_held = 0
        self._entry_price = 0.0
        return 0.0

    # Placeholder: entry-price tracking for an unrealized-PnL exemption
    # is not implemented yet (the portfolio does not expose an average
    # entry price), so nothing happens on the first held step.
    if self._steps_held == 0:
        pass

    # The open position ages by one step.
    self._steps_held += 1

    # Holding is free within the grace period.
    if self._steps_held <= self.grace_period:
        return 0.0

    # Stale position past the grace period: flat per-step penalty.
    # TODO: once an average entry price is available, skip the penalty
    # when unrealized profit exceeds min_profit_pct (let winners run).
    return self.penalty_per_step

execution_bonus

LimitExecutionReward

Bases: BaseRewardStrategy

Provides a reward proportional to the price improvement achieved by a Limit Order filling instead of executing immediately at market.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/execution_bonus.py
class LimitExecutionReward(BaseRewardStrategy):
    """Provides a reward proportional to the price improvement achieved
    by a Limit Order filling instead of executing immediately at
    market."""

    def __init__(self, improvement_multiplier: float = 10.0):
        """
        Args:
            improvement_multiplier: Scales the % improvement.
                                    e.g., a 2% price improvement * 10.0 = +0.20 reward.
        """
        super().__init__()
        self.improvement_multiplier = improvement_multiplier

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """Sum the scaled price improvement of limit fills at this step."""
        total_bonus = 0.0
        history = env.portfolio.executed_orders_history
        if not history:
            return total_bonus

        # Walk backwards until we leave the events of the current step.
        for event in reversed(history):
            if event.get("step") != env.current_step:
                break

            fill_price = event.get("price", 0.0)
            reference = event.get("reference_price", 0.0)
            if reference <= 1e-9:
                # No usable benchmark price; skip this event.
                continue

            kind = event.get("type", "")
            if kind == "limit_buy_executed":
                # Paid less than the reference price.
                improvement = (reference - fill_price) / reference
            elif kind == "limit_sell_executed":
                # Received more than the reference price.
                improvement = (fill_price - reference) / reference
            else:
                improvement = 0.0

            if improvement > 0:
                total_bonus += improvement * self.improvement_multiplier

        return total_bonus

__init__(improvement_multiplier=10.0)

Parameters:

Name Type Description Default
improvement_multiplier float

Scales the % improvement. e.g., a 2% price improvement * 10.0 = +0.20 reward.

10.0
Source code in src/quantrl_lab/environments/stock/strategies/rewards/execution_bonus.py
def __init__(self, improvement_multiplier: float = 10.0):
    """
    Initialize the limit-execution bonus strategy.

    Args:
        improvement_multiplier: Scales the % improvement.
                                e.g., a 2% price improvement * 10.0 = +0.20 reward.
    """
    super().__init__()
    self.improvement_multiplier = improvement_multiplier

expiration

OrderExpirationPenaltyReward

Bases: BaseRewardStrategy

Penalizes the agent when pending orders expire.

This discourages "order spamming" (placing unrealistic limit orders that never fill and just clog the system until they time out).

Source code in src/quantrl_lab/environments/stock/strategies/rewards/expiration.py
class OrderExpirationPenaltyReward(BaseRewardStrategy):
    """
    Penalizes the agent when pending orders expire.

    This discourages "order spamming" (placing unrealistic limit orders
    that never fill and just clog the system until they time out).
    """

    def __init__(self, penalty_per_order: float = -0.1):
        """
        Args:
            penalty_per_order: Fixed penalty for each expired order in the step.
                               Should be small but non-zero.
        """
        self.penalty_per_order = penalty_per_order

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """Return penalty_per_order for every order that expired in the
        current step."""
        history = env.portfolio.executed_orders_history
        if not history:
            return 0.0

        expired = 0
        # Events are appended chronologically; walk backwards and stop
        # at the first event from a previous step.
        for event in reversed(history):
            if event["step"] != env.current_step:
                break
            # StockPortfolio logs e.g. "limit_buy_expired", "stop_loss_expired".
            if "expired" in event["type"]:
                expired += 1

        return expired * self.penalty_per_order

__init__(penalty_per_order=-0.1)

Parameters:

Name Type Description Default
penalty_per_order float

Fixed penalty for each expired order in the step. Should be small but non-zero.

-0.1
Source code in src/quantrl_lab/environments/stock/strategies/rewards/expiration.py
def __init__(self, penalty_per_order: float = -0.1):
    """
    Initialize the order-expiration penalty strategy.

    Args:
        penalty_per_order: Fixed penalty for each expired order in the step.
                           Should be small but non-zero.
    """
    self.penalty_per_order = penalty_per_order

calculate_reward(env)

Calculate penalty based on number of expired orders in this step.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/expiration.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """Return penalty_per_order for every order that expired in the
    current step."""
    expired_count = 0

    if env.portfolio.executed_orders_history:
        # Events are appended chronologically; walk backwards and stop
        # at the first event from a previous step.
        for event in reversed(env.portfolio.executed_orders_history):
            if event["step"] != env.current_step:
                break

            # Check for expiration event types
            # StockPortfolio logs types like "limit_buy_expired", "stop_loss_expired"
            if "expired" in event["type"]:
                expired_count += 1

    return expired_count * self.penalty_per_order

composite

CompositeReward

Bases: BaseRewardStrategy

A composite strategy that combines multiple reward strategies with weights.

This class implements the Composite design pattern.

Features: - Weight Normalization: Ensures weights sum to 1.0. - Auto-Scaling: Optionally normalizes each component strategy to N(0,1) before weighting, preventing one strategy from dominating others due to scale.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/composite.py
class CompositeReward(BaseRewardStrategy):
    """
    A composite strategy that combines multiple reward strategies with
    weights.

    This class implements the Composite design pattern.

    Features:
    - Weight Normalization: Ensures weights sum to 1.0.
    - Auto-Scaling: Optionally normalizes each component strategy to N(0,1)
      before weighting, preventing one strategy from dominating others due to scale.
    """

    def __init__(
        self,
        strategies: List[BaseRewardStrategy],
        weights: List[float],
        normalize_weights: bool = True,
        auto_scale: bool = False,
    ):
        if len(strategies) != len(weights):
            raise ValueError("The number of strategies and weights must be equal.")

        self.strategies = strategies
        self.weights = weights
        self.normalize_weights = normalize_weights
        self.auto_scale = auto_scale

        # One running normalizer per component when auto-scaling is on.
        self._stats = [_RunningStat() for _ in strategies] if auto_scale else []

    def calculate_reward(self, env: TradingEnvProtocol) -> float:
        """
        Calculate the composite reward based on the child strategies.

        Args:
            env (TradingEnvProtocol): The trading environment instance.

        Returns:
            float: The weighted sum of the child strategy rewards.
        """
        # Recompute normalized weights each call so runtime edits to
        # self.weights are respected.
        effective_weights = self.weights
        if self.normalize_weights:
            total = sum(self.weights)
            if total == 0:
                raise ValueError("Sum of weights must not be zero when normalize_weights is True.")
            effective_weights = [w / total for w in self.weights]

        composite = 0.0
        for idx, (child, weight) in enumerate(zip(self.strategies, effective_weights)):
            reward = child.calculate_reward(env)
            if self.auto_scale:
                # Standardize the component before weighting.
                reward = self._stats[idx].update_and_normalize(reward)
            composite += weight * reward

        return composite

    def on_step_end(self, env: TradingEnvProtocol):
        """Forward the end-of-step hook to every child strategy.

        Args:
            env (TradingEnvProtocol): The trading environment instance.
        """
        for child in self.strategies:
            child.on_step_end(env)

    def reset(self):
        """
        Reset child strategies.

        Note: We do NOT reset running stats (if auto_scale=True) because they
        represent the global distribution of the environment rewards, which
        should persist across episodes for stability.
        """
        for child in self.strategies:
            if hasattr(child, "reset"):
                child.reset()

calculate_reward(env)

Calculate the composite reward based on the child strategies.

Parameters:

Name Type Description Default
env TradingEnvProtocol

The trading environment instance.

required

Returns:

Name Type Description
float float

The composite reward based on the child strategies.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/composite.py
def calculate_reward(self, env: TradingEnvProtocol) -> float:
    """
    Calculate the composite reward based on the child strategies.

    Args:
        env (TradingEnvProtocol): The trading environment instance.

    Returns:
        float: The composite reward based on the child strategies.
    """
    # Recompute normalized weights each call so runtime edits to
    # self.weights are respected.
    weights_to_use = self.weights
    if self.normalize_weights:
        total_weight = sum(self.weights)
        if total_weight == 0:
            raise ValueError("Sum of weights must not be zero when normalize_weights is True.")
        weights_to_use = [w / total_weight for w in self.weights]

    total_reward = 0.0
    for i, (strategy, weight) in enumerate(zip(self.strategies, weights_to_use)):
        # Calculate the reward from the child strategy
        component_reward = strategy.calculate_reward(env)

        # Standardize the component before weighting, if enabled.
        if self.auto_scale:
            component_reward = self._stats[i].update_and_normalize(component_reward)

        total_reward += weight * component_reward

    return total_reward

on_step_end(env)

Optional: A hook to update any internal state if needed. This method is called at the end of each step in the environment.

Parameters:

Name Type Description Default
env TradingEnvProtocol

The trading environment instance.

required
Source code in src/quantrl_lab/environments/stock/strategies/rewards/composite.py
def on_step_end(self, env: TradingEnvProtocol):
    """Forward the end-of-step hook to every child strategy.

    Args:
        env (TradingEnvProtocol): The trading environment instance.
    """
    for strategy in self.strategies:
        strategy.on_step_end(env)

reset()

Reset child strategies.

Note: We do NOT reset running stats (if auto_scale=True) because they represent the global distribution of the environment rewards, which should persist across episodes for stability.

Source code in src/quantrl_lab/environments/stock/strategies/rewards/composite.py
def reset(self):
    """
    Reset child strategies.

    Note: We do NOT reset running stats (if auto_scale=True) because they
    represent the global distribution of the environment rewards, which
    should persist across episodes for stability.
    """
    # Guard with hasattr: child strategies are not required to be stateful.
    for strategy in self.strategies:
        if hasattr(strategy, "reset"):
            strategy.reset()