From d26f2e61470c30a6722616439ee361bd4fcc0d72 Mon Sep 17 00:00:00 2001
From: Pabloo22 <pablete.arino@gmail.com>
Date: Thu, 11 Sep 2025 19:22:41 +0200
Subject: [PATCH 1/4] [Feature] Add `RewardWithPenalties` class and penalty
 functions

---
 job_shop_lib/metaheuristics/__init__.py       |   6 +
 .../metaheuristics/_objective_functions.py    |  85 ++++++++++----
 .../reinforcement_learning/__init__.py        |   9 ++
 .../_reward_observers.py                      |  75 +++++++++++++
 job_shop_lib/reinforcement_learning/_utils.py |  67 ++++++++++-
 .../test_reward_functions.py                  |  36 ------
 .../{test_rl_utils.py => test_utils.py}       | 104 ++++++++++++++++++
 7 files changed, 320 insertions(+), 62 deletions(-)
 delete mode 100644 tests/reinforcement_learning/test_reward_functions.py
 rename tests/reinforcement_learning/{test_rl_utils.py => test_utils.py} (71%)

diff --git a/job_shop_lib/metaheuristics/__init__.py b/job_shop_lib/metaheuristics/__init__.py
index 6118abdd..e4acd00e 100644
--- a/job_shop_lib/metaheuristics/__init__.py
+++ b/job_shop_lib/metaheuristics/__init__.py
@@ -31,12 +31,16 @@
     swap_random_operations
     ObjectiveFunction
     get_makespan_with_penalties_objective
+    compute_penalty_for_deadlines
+    compute_penalty_for_due_dates
 
 """
 
 from job_shop_lib.metaheuristics._objective_functions import (
     ObjectiveFunction,
     get_makespan_with_penalties_objective,
+    compute_penalty_for_deadlines,
+    compute_penalty_for_due_dates,
 )
 from job_shop_lib.metaheuristics._neighbor_generators import (
     NeighborGenerator,
@@ -58,4 +62,6 @@
     "swap_random_operations",
     "ObjectiveFunction",
     "get_makespan_with_penalties_objective",
+    "compute_penalty_for_deadlines",
+    "compute_penalty_for_due_dates",
 ]
diff --git a/job_shop_lib/metaheuristics/_objective_functions.py b/job_shop_lib/metaheuristics/_objective_functions.py
index 9b52ae08..88ffccb0 100644
--- a/job_shop_lib/metaheuristics/_objective_functions.py
+++ b/job_shop_lib/metaheuristics/_objective_functions.py
@@ -45,29 +45,70 @@ def get_makespan_with_penalties_objective(
 
     def objective(schedule: Schedule) -> float:
         makespan = schedule.makespan()
-        instance = schedule.instance
-
-        # Fast path: no constraint attributes present in the instance
-        if not instance.has_deadlines and not instance.has_due_dates:
-            return makespan
-
-        penalty = 0.0
-        for machine_schedule in schedule.schedule:
-            for scheduled_op in machine_schedule:
-                op = scheduled_op.operation
-                # Deadline (hard) penalty
-                if (
-                    op.deadline is not None
-                    and scheduled_op.end_time > op.deadline
-                ):
-                    penalty += deadline_penalty_factor
-                # Due date (soft) penalty
-                if (
-                    op.due_date is not None
-                    and scheduled_op.end_time > op.due_date
-                ):
-                    penalty += due_date_penalty_factor
+        penalty_for_deadlines = compute_penalty_for_deadlines(
+            schedule, deadline_penalty_factor
+        )
+        penalty_for_due_dates = compute_penalty_for_due_dates(
+            schedule, due_date_penalty_factor
+        )
+        penalty = penalty_for_deadlines + penalty_for_due_dates
 
         return makespan + penalty
 
     return objective
+
+
+def compute_penalty_for_deadlines(
+    schedule: Schedule, penalty_per_violation: float
+) -> float:
+    """Compute the total penalty for deadline violations in a schedule.
+
+    Args:
+        schedule:
+            The schedule to evaluate.
+        penalty_per_violation:
+            The penalty to apply for each operation that
+            finishes after its deadline.
+
+    Returns:
+        The total penalty for deadline violations.
+    """
+    if not schedule.instance.has_deadlines or penalty_per_violation == 0:
+        return 0.0
+
+    penalty = 0.0
+    for machine_schedule in schedule.schedule:
+        for scheduled_op in machine_schedule:
+            op = scheduled_op.operation
+            if op.deadline is not None and scheduled_op.end_time > op.deadline:
+                penalty += penalty_per_violation
+
+    return penalty
+
+
+def compute_penalty_for_due_dates(
+    schedule: Schedule, penalty_per_violation: float
+) -> float:
+    """Compute the total penalty for due date violations in a schedule.
+
+    Args:
+        schedule:
+            The schedule to evaluate.
+        penalty_per_violation:
+            The penalty to apply for each operation that
+            finishes after its due date.
+
+    Returns:
+        The total penalty for due date violations.
+    """
+    if not schedule.instance.has_due_dates or penalty_per_violation == 0:
+        return 0.0
+
+    penalty = 0.0
+    for machine_schedule in schedule.schedule:
+        for scheduled_op in machine_schedule:
+            op = scheduled_op.operation
+            if op.due_date is not None and scheduled_op.end_time > op.due_date:
+                penalty += penalty_per_violation
+
+    return penalty
diff --git a/job_shop_lib/reinforcement_learning/__init__.py b/job_shop_lib/reinforcement_learning/__init__.py
index 3757b3e9..c6841d75 100644
--- a/job_shop_lib/reinforcement_learning/__init__.py
+++ b/job_shop_lib/reinforcement_learning/__init__.py
@@ -14,11 +14,14 @@
     RewardObserver
     MakespanReward
     IdleTimeReward
+    RewardWithPenalties
     RenderConfig
     add_padding
     create_edge_type_dict
     map_values
     get_optimal_actions
+    get_deadline_violation_penalty
+    get_due_date_violation_penalty
 
 """
 
@@ -32,6 +35,7 @@
     RewardObserver,
     MakespanReward,
     IdleTimeReward,
+    RewardWithPenalties,
 )
 
 from job_shop_lib.reinforcement_learning._utils import (
@@ -39,6 +43,8 @@
     create_edge_type_dict,
     map_values,
     get_optimal_actions,
+    get_deadline_violation_penalty,
+    get_due_date_violation_penalty,
 )
 
 from job_shop_lib.reinforcement_learning._single_job_shop_graph_env import (
@@ -63,9 +69,12 @@
     "RewardObserver",
     "MakespanReward",
     "IdleTimeReward",
+    "RewardWithPenalties",
     "RenderConfig",
     "add_padding",
     "create_edge_type_dict",
     "map_values",
     "get_optimal_actions",
+    "get_deadline_violation_penalty",
+    "get_due_date_violation_penalty",
 ]
diff --git a/job_shop_lib/reinforcement_learning/_reward_observers.py b/job_shop_lib/reinforcement_learning/_reward_observers.py
index b1a53cea..06c9536b 100644
--- a/job_shop_lib/reinforcement_learning/_reward_observers.py
+++ b/job_shop_lib/reinforcement_learning/_reward_observers.py
@@ -1,6 +1,9 @@
 """Rewards functions are defined as `DispatcherObervers` and are used to
 calculate the reward for a given state."""
 
+from collections.abc import Callable
+
+from job_shop_lib.exceptions import ValidationError
 from job_shop_lib.dispatching import DispatcherObserver, Dispatcher
 from job_shop_lib import ScheduledOperation
 
@@ -83,3 +86,75 @@ def update(self, scheduled_operation: ScheduledOperation):
 
         reward = -idle_time
         self.rewards.append(reward)
+
+
+class RewardWithPenalties(RewardObserver):
+    """Reward function that adds penalties to another reward function.
+
+    The reward is calculated as the reward from another reward function
+    minus the penalty incurred by the scheduled operation (e.g., for
+    violating its due date or deadline).
+
+    Attributes:
+        base_reward_observer:
+            The base reward observer to use for calculating the reward.
+        penalty_function:
+            The function used to compute the penalty of each operation.
+
+    Args:
+        dispatcher:
+            The dispatcher to observe.
+        base_reward_observer:
+            The base reward observer to use for calculating the reward. It
+            must use the same dispatcher as this reward observer. If it is
+            subscribed to the dispatcher, it will be unsubscribed.
+        penalty_function:
+            A function that takes a scheduled operation and the
+            dispatcher as input and returns the penalty for that operation.
+        subscribe:
+            Whether to subscribe to the dispatcher upon initialization.
+
+    Raises:
+        ValidationError:
+            If the base reward observer does not use the same dispatcher as
+            this reward observer.
+
+    .. versionadded:: 1.7.0
+
+    .. seealso::
+        The following functions (along with ``functools.partial``) can be
+        used to create penalty functions:
+
+        - :class:`job_shop_lib.metaheuristics.penalty_for_deadlines`
+        - :class:`job_shop_lib.metaheuristics.penalty_for_due_dates`
+
+    """
+
+    def __init__(
+        self,
+        dispatcher: Dispatcher,
+        *,
+        base_reward_observer: RewardObserver,
+        penalty_function: Callable[[ScheduledOperation, Dispatcher], float],
+        subscribe: bool = True,
+    ) -> None:
+        if base_reward_observer.dispatcher is not dispatcher:
+            raise ValidationError(
+                "The base reward observer must use the same "
+                "dispatcher as this reward observer."
+            )  # raised before subscribing, so no stale observer is left
+        super().__init__(dispatcher, subscribe=subscribe)
+        self.base_reward_observer = base_reward_observer
+        self.penalty_function = penalty_function
+        if base_reward_observer in dispatcher.subscribers:
+            dispatcher.unsubscribe(base_reward_observer)
+
+    def reset(self) -> None:
+        super().reset()
+        self.base_reward_observer.reset()
+
+    def update(self, scheduled_operation: ScheduledOperation):
+        self.base_reward_observer.update(scheduled_operation)
+        base_reward = self.base_reward_observer.last_reward
+        penalty = self.penalty_function(scheduled_operation, self.dispatcher)
+        self.rewards.append(base_reward - penalty)
diff --git a/job_shop_lib/reinforcement_learning/_utils.py b/job_shop_lib/reinforcement_learning/_utils.py
index 4b15d44c..f76336f7 100644
--- a/job_shop_lib/reinforcement_learning/_utils.py
+++ b/job_shop_lib/reinforcement_learning/_utils.py
@@ -5,8 +5,9 @@
 import numpy as np
 from numpy.typing import NDArray
 
+from job_shop_lib import ScheduledOperation
 from job_shop_lib.exceptions import ValidationError
-from job_shop_lib.dispatching import OptimalOperationsObserver
+from job_shop_lib.dispatching import OptimalOperationsObserver, Dispatcher
 
 T = TypeVar("T", bound=np.number)
 
@@ -193,7 +194,65 @@ def get_optimal_actions(
     return optimal_actions
 
 
-if __name__ == "__main__":
-    import doctest
+def get_deadline_violation_penalty(
+    scheduled_operation: ScheduledOperation,
+    unused_dispatcher: Dispatcher,
+    deadline_penalty_factor: float = 10_000,
+) -> float:
+    """Compute the penalty for a scheduled operation that violates its
+    deadline.
 
-    doctest.testmod()
+    Args:
+        scheduled_operation:
+            The scheduled operation to evaluate.
+        unused_dispatcher:
+            This argument is unused but included for compatibility with the
+            penalty function signature.
+        deadline_penalty_factor:
+            Cost added for each operation that
+            finishes after its deadline. Defaults to 10_000.
+    Returns:
+        The penalty for the scheduled operation if it violates its deadline,
+        otherwise 0.
+
+    .. versionadded:: 1.7.0
+    """
+    if (
+        scheduled_operation.operation.deadline is not None
+        and scheduled_operation.end_time
+        > scheduled_operation.operation.deadline
+    ):
+        return deadline_penalty_factor
+    return 0.0
+
+
+def get_due_date_violation_penalty(
+    scheduled_operation: ScheduledOperation,
+    unused_dispatcher: Dispatcher,
+    due_date_penalty_factor: float = 100,
+) -> float:
+    """Compute the penalty for a scheduled operation that violates its
+    due date.
+
+    Args:
+        scheduled_operation:
+            The scheduled operation to evaluate.
+        unused_dispatcher:
+            This argument is unused but included for compatibility with the
+            penalty function signature.
+        due_date_penalty_factor:
+            Cost added for each operation that
+            finishes after its due date. Defaults to 100.
+    Returns:
+        The penalty for the scheduled operation if it violates its due date,
+        otherwise 0.
+
+    .. versionadded:: 1.7.0
+    """
+    if (
+        scheduled_operation.operation.due_date is not None
+        and scheduled_operation.end_time
+        > scheduled_operation.operation.due_date
+    ):
+        return due_date_penalty_factor
+    return 0.0
diff --git a/tests/reinforcement_learning/test_reward_functions.py b/tests/reinforcement_learning/test_reward_functions.py
deleted file mode 100644
index 6632c973..00000000
--- a/tests/reinforcement_learning/test_reward_functions.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from job_shop_lib import JobShopInstance
-from job_shop_lib.reinforcement_learning import MakespanReward, IdleTimeReward
-from job_shop_lib.dispatching import (
-    Dispatcher,
-    filter_dominated_operations,
-)
-from job_shop_lib.dispatching.rules import DispatchingRuleSolver
-
-
-def test_makespan_reward(example_job_shop_instance: JobShopInstance):
-    dispatcher = Dispatcher(example_job_shop_instance)
-    reward_function = MakespanReward(dispatcher)
-    assert not reward_function.rewards
-    solver = DispatchingRuleSolver("most_work_remaining")
-    while not dispatcher.schedule.is_complete():
-        solver.step(dispatcher)
-        assert sum(reward_function.rewards) == -dispatcher.schedule.makespan()
-
-
-def test_idle_time_reward(example_job_shop_instance: JobShopInstance):
-    dispatcher = Dispatcher(
-        example_job_shop_instance,
-        ready_operations_filter=filter_dominated_operations,
-    )
-    reward_function = IdleTimeReward(dispatcher)
-    assert not reward_function.rewards
-    solver = DispatchingRuleSolver("most_work_remaining")
-    solver.solve(example_job_shop_instance, dispatcher)
-
-    assert sum(reward_function.rewards) == -(1 + 1 + 6)
-
-
-if __name__ == "__main__":
-    import pytest
-
-    pytest.main(["-vv", __file__])
diff --git a/tests/reinforcement_learning/test_rl_utils.py b/tests/reinforcement_learning/test_utils.py
similarity index 71%
rename from tests/reinforcement_learning/test_rl_utils.py
rename to tests/reinforcement_learning/test_utils.py
index fa356835..170ea764 100644
--- a/tests/reinforcement_learning/test_rl_utils.py
+++ b/tests/reinforcement_learning/test_utils.py
@@ -3,10 +3,14 @@
 from numpy.typing import NDArray
 
 from job_shop_lib.exceptions import ValidationError
+from job_shop_lib import Operation, JobShopInstance, ScheduledOperation
+from job_shop_lib.dispatching import Dispatcher
 from job_shop_lib.reinforcement_learning import (
     add_padding,
     create_edge_type_dict,
     map_values,
+    get_deadline_violation_penalty,
+    get_due_date_violation_penalty,
 )
 
 
@@ -297,5 +301,105 @@ def test_invalid_global_id():
         map_values(edge_index, mapping)
 
 
+def _make_scheduled_operation(
+    *,
+    duration: int,
+    start_time: int,
+    machine: int = 0,
+    deadline=None,
+    due_date=None,
+):
+    """Helper to build a minimal scheduled operation and dispatcher."""
+    jobs = [
+        [
+            Operation(
+                machine,
+                duration=duration,
+                deadline=deadline,
+                due_date=due_date,
+            )
+        ]
+    ]
+    instance = JobShopInstance(jobs, name="PenaltyTestInstance")
+    dispatcher = Dispatcher(instance)
+    op = instance.jobs[0][0]
+    scheduled_op = ScheduledOperation(
+        op, start_time=start_time, machine_id=machine
+    )
+    return scheduled_op, dispatcher
+
+
+# ---------------- Deadline penalty tests ---------------- #
+
+
+def test_deadline_penalty_violation():
+    scheduled_op, dispatcher = _make_scheduled_operation(
+        duration=10, start_time=0, deadline=5
+    )  # end_time = 10 > 5
+    assert get_deadline_violation_penalty(scheduled_op, dispatcher) == 10_000
+
+
+def test_deadline_penalty_no_violation_equal_boundary():
+    scheduled_op, dispatcher = _make_scheduled_operation(
+        duration=5, start_time=0, deadline=5
+    )  # end_time = 5 == 5
+    assert get_deadline_violation_penalty(scheduled_op, dispatcher) == 0.0
+
+
+def test_deadline_penalty_none_deadline():
+    scheduled_op, dispatcher = _make_scheduled_operation(
+        duration=4, start_time=0, deadline=None
+    )
+    assert get_deadline_violation_penalty(scheduled_op, dispatcher) == 0.0
+
+
+def test_deadline_penalty_custom_factor():
+    scheduled_op, dispatcher = _make_scheduled_operation(
+        duration=3, start_time=0, deadline=2
+    )  # end_time = 3 > 2
+    assert (
+        get_deadline_violation_penalty(
+            scheduled_op, dispatcher, deadline_penalty_factor=123.45
+        )
+        == 123.45
+    )
+
+
+# ---------------- Due date penalty tests ---------------- #
+
+
+def test_due_date_penalty_violation():
+    scheduled_op, dispatcher = _make_scheduled_operation(
+        duration=7, start_time=0, due_date=6
+    )  # end_time = 7 > 6
+    assert get_due_date_violation_penalty(scheduled_op, dispatcher) == 100
+
+
+def test_due_date_penalty_no_violation_equal_boundary():
+    scheduled_op, dispatcher = _make_scheduled_operation(
+        duration=5, start_time=0, due_date=5
+    )  # end_time = 5 == 5
+    assert get_due_date_violation_penalty(scheduled_op, dispatcher) == 0.0
+
+
+def test_due_date_penalty_none_due_date():
+    scheduled_op, dispatcher = _make_scheduled_operation(
+        duration=4, start_time=0, due_date=None
+    )
+    assert get_due_date_violation_penalty(scheduled_op, dispatcher) == 0.0
+
+
+def test_due_date_penalty_custom_factor():
+    scheduled_op, dispatcher = _make_scheduled_operation(
+        duration=9, start_time=0, due_date=1
+    )  # end_time = 9 > 1
+    assert (
+        get_due_date_violation_penalty(
+            scheduled_op, dispatcher, due_date_penalty_factor=7.5
+        )
+        == 7.5
+    )
+
+
 if __name__ == "__main__":
     pytest.main(["-vv", __file__])

From 6ecc2c7cfd7d9f5dfb879e7f3ce0888f701ede80 Mon Sep 17 00:00:00 2001
From: Pabloo22 <pablete.arino@gmail.com>
Date: Fri, 12 Sep 2025 18:27:45 +0200
Subject: [PATCH 2/4] [Tests] Add tests for RL utils and reward observers

---
 .../_reward_observers.py                      |   6 +-
 tests/conftest.py                             |  14 ++
 .../test_reward_observers.py                  | 191 ++++++++++++++++++
 tests/reinforcement_learning/test_utils.py    |  79 ++++++++
 4 files changed, 287 insertions(+), 3 deletions(-)
 create mode 100644 tests/reinforcement_learning/test_reward_observers.py

diff --git a/job_shop_lib/reinforcement_learning/_reward_observers.py b/job_shop_lib/reinforcement_learning/_reward_observers.py
index 06c9536b..32818ba3 100644
--- a/job_shop_lib/reinforcement_learning/_reward_observers.py
+++ b/job_shop_lib/reinforcement_learning/_reward_observers.py
@@ -125,10 +125,10 @@ class RewardWithPenalties(RewardObserver):
         The following functions (along with ``functools.partial``) can be
         used to create penalty functions:
 
-        - :class:`job_shop_lib.metaheuristics.penalty_for_deadlines`
-        - :class:`job_shop_lib.metaheuristics.penalty_for_due_dates`
+        - :func:`~job_shop_lib.reinforcement_learning.get_deadline_violation_penalty`
+        - :func:`~job_shop_lib.reinforcement_learning.get_due_date_violation_penalty`
 
-    """
+    """  # noqa: E501
 
     def __init__(
         self,
diff --git a/tests/conftest.py b/tests/conftest.py
index 62aa2109..c65c1848 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -334,3 +334,17 @@ def ft06_instance():
 @pytest.fixture
 def seeded_rng() -> random.Random:
     return random.Random(42)
+
+
+@pytest.fixture
+def single_machine_instance() -> JobShopInstance:
+    # Two single-op jobs on same machine
+    jobs = [[Operation(0, 2)], [Operation(0, 3)]]
+    return JobShopInstance(jobs, name="SingleMachine")
+
+
+@pytest.fixture
+def two_machines_instance() -> JobShopInstance:
+    # Two jobs, each with one operation on different machines
+    jobs = [[Operation(0, 5)], [Operation(1, 3)]]
+    return JobShopInstance(jobs, name="TwoMachines")
diff --git a/tests/reinforcement_learning/test_reward_observers.py b/tests/reinforcement_learning/test_reward_observers.py
new file mode 100644
index 00000000..adbb3377
--- /dev/null
+++ b/tests/reinforcement_learning/test_reward_observers.py
@@ -0,0 +1,191 @@
+# pylint: disable=missing-function-docstring, redefined-outer-name
+import functools
+import pytest
+
+from job_shop_lib import JobShopInstance, Operation
+from job_shop_lib.dispatching import Dispatcher
+from job_shop_lib.exceptions import ValidationError
+from job_shop_lib.reinforcement_learning import (
+    MakespanReward,
+    IdleTimeReward,
+    RewardWithPenalties,
+    get_due_date_violation_penalty,
+    get_deadline_violation_penalty,
+)
+
+
+def test_makespan_reward_basic(single_machine_instance: JobShopInstance):
+    dispatcher = Dispatcher(single_machine_instance)
+    reward_obs = MakespanReward(dispatcher)
+
+    # Schedule first job on machine 0
+    op0 = single_machine_instance.jobs[0][0]
+    dispatcher.dispatch(op0, 0)
+    assert reward_obs.rewards[-1] == -2
+
+    # Schedule second job on same machine
+    op1 = single_machine_instance.jobs[1][0]
+    dispatcher.dispatch(op1, 0)
+    # makespan increases from 2 to 5
+    assert reward_obs.rewards[-1] == -3
+
+    # Sum of rewards equals -final_makespan
+    assert sum(reward_obs.rewards) == -dispatcher.schedule.makespan() == -5
+
+
+def test_makespan_reward_zero_when_no_increase(
+    two_machines_instance: JobShopInstance,
+):
+    dispatcher = Dispatcher(two_machines_instance)
+    reward_obs = MakespanReward(dispatcher)
+
+    # Schedule the longer op first -> makespan = 5
+    op_long = two_machines_instance.jobs[0][0]
+    dispatcher.dispatch(op_long, 0)
+    assert reward_obs.rewards[-1] == -5
+
+    # Now schedule the shorter op on another machine -> ends
+    # at 3 < current makespan
+    op_short = two_machines_instance.jobs[1][0]
+    dispatcher.dispatch(op_short, 1)
+    # No makespan increase -> zero reward
+    assert reward_obs.rewards[-1] == 0
+
+
+def test_idle_time_reward_computation():
+    # Construct instance that creates idle time on machine 0
+    # Job1: M0(1) then M1(1)
+    # Job0: M1(5) then M0(1) -> causes M0 idle from t=1 to t=5
+    jobs = [
+        [Operation(1, 5), Operation(0, 1)],  # job 0
+        [Operation(0, 1), Operation(1, 1)],  # job 1
+    ]
+    instance = JobShopInstance(jobs, name="IdleTimeExample")
+    dispatcher = Dispatcher(instance)
+    idle_obs = IdleTimeReward(dispatcher)
+
+    # 1) j1[0] on M0 at t=0..1
+    dispatcher.dispatch(instance.jobs[1][0], 0)
+    assert idle_obs.rewards[-1] == 0  # first op on machine -> start_time 0
+
+    # 2) j0[0] on M1 at t=0..5
+    dispatcher.dispatch(instance.jobs[0][0], 1)
+    assert idle_obs.rewards[-1] == 0  # first op on machine -> start_time 0
+
+    # 3) j1[1] on M1 at t=5..6 (no idle on M1)
+    dispatcher.dispatch(instance.jobs[1][1], 1)
+    assert idle_obs.rewards[-1] == 0
+
+    # 4) j0[1] on M0 at t=5..6 (idle on M0 from 1 to 5 -> reward = -4)
+    dispatcher.dispatch(instance.jobs[0][1], 0)
+    assert idle_obs.rewards[-1] == -4
+
+
+def test_reward_with_penalties_due_date():
+    # Build small instance where second op violates due date
+    jobs = [
+        [Operation(0, 1)],
+        [
+            Operation(0, 10, due_date=5)
+        ],  # will start at 1 and end at 11 -> late
+    ]
+    instance = JobShopInstance(jobs, name="DueDatePenalty")
+    dispatcher = Dispatcher(instance)
+
+    base = MakespanReward(dispatcher)
+    penalty_fn = functools.partial(
+        get_due_date_violation_penalty, due_date_penalty_factor=7
+    )
+    reward = RewardWithPenalties(
+        dispatcher,
+        base_reward_observer=base,
+        penalty_function=penalty_fn,
+    )
+
+    # First op (no penalty)
+    dispatcher.dispatch(instance.jobs[0][0], 0)
+    assert base.rewards[-1] == -1
+    assert reward.rewards[-1] == -1
+
+    # Second op violates due date -> penalty 7
+    dispatcher.dispatch(instance.jobs[1][0], 0)
+    assert base.rewards[-1] == -10
+    assert reward.rewards[-1] == -10 - 7
+
+
+def test_reward_with_penalties_deadline():
+    jobs = [
+        [Operation(0, 1)],
+        [Operation(0, 10, deadline=5)],  # ends at 11 -> deadline violation
+    ]
+    instance = JobShopInstance(jobs, name="DeadlinePenalty")
+    dispatcher = Dispatcher(instance)
+
+    base = MakespanReward(dispatcher)
+    penalty_fn = functools.partial(
+        get_deadline_violation_penalty, deadline_penalty_factor=13
+    )
+    reward = RewardWithPenalties(
+        dispatcher,
+        base_reward_observer=base,
+        penalty_function=penalty_fn,
+    )
+
+    dispatcher.dispatch(instance.jobs[0][0], 0)
+    dispatcher.dispatch(instance.jobs[1][0], 0)
+    assert reward.rewards[-1] == -10 - 13
+
+
+def test_reward_with_penalties_requires_same_dispatcher():
+    instance = JobShopInstance([[Operation(0, 1)]])
+    d1 = Dispatcher(instance)
+    d2 = Dispatcher(instance)
+    base = MakespanReward(d1)
+
+    with pytest.raises(ValidationError):
+        RewardWithPenalties(
+            d2, base_reward_observer=base, penalty_function=lambda op, d: 0.0
+        )
+
+
+def test_reward_with_penalties_unsubscribes_base():
+    instance = JobShopInstance([[Operation(0, 1)], [Operation(0, 1)]])
+    dispatcher = Dispatcher(instance)
+
+    base = MakespanReward(dispatcher)
+    assert base in dispatcher.subscribers
+
+    reward = RewardWithPenalties(
+        dispatcher,
+        base_reward_observer=base,
+        penalty_function=lambda op, d: 0.0,
+    )
+    # Base should be unsubscribed; wrapper is subscribed
+    assert base not in dispatcher.subscribers
+    assert reward in dispatcher.subscribers
+
+    # test reset
+    reward.reset()
+    assert not reward.rewards
+    assert not base.rewards
+
+
+def test_reward_observers_reset():
+    instance = JobShopInstance([[Operation(0, 1)], [Operation(0, 1)]])
+    dispatcher = Dispatcher(instance)
+
+    m_reward = MakespanReward(dispatcher)
+    i_reward = IdleTimeReward(dispatcher)
+
+    dispatcher.dispatch(instance.jobs[0][0], 0)
+    dispatcher.dispatch(instance.jobs[1][0], 0)
+
+    # Ensure rewards collected
+    assert m_reward.rewards
+    assert i_reward.rewards
+
+    # Reset and ensure cleared and internal state matches
+    m_reward.reset()
+    i_reward.reset()
+    assert not m_reward.rewards
+    assert not i_reward.rewards
diff --git a/tests/reinforcement_learning/test_utils.py b/tests/reinforcement_learning/test_utils.py
index 170ea764..c7516d0c 100644
--- a/tests/reinforcement_learning/test_utils.py
+++ b/tests/reinforcement_learning/test_utils.py
@@ -9,9 +9,12 @@
     add_padding,
     create_edge_type_dict,
     map_values,
+    get_optimal_actions,
     get_deadline_violation_penalty,
     get_due_date_violation_penalty,
 )
+from job_shop_lib.dispatching import OptimalOperationsObserver
+from job_shop_lib.dispatching.rules import DispatchingRuleSolver
 
 
 def test_add_padding_int_array():
@@ -401,5 +404,81 @@ def test_due_date_penalty_custom_factor():
     )
 
 
+# ---------------- get_optimal_actions tests ---------------- #
+
+
+def test_get_optimal_actions_initial_and_after_step(
+    example_job_shop_instance: JobShopInstance,
+):
+    # Build a reference schedule using a simple heuristic solver
+    solver = DispatchingRuleSolver()
+    reference_schedule = solver.solve(example_job_shop_instance)
+
+    # Fresh dispatcher and observer on same instance
+    dispatcher = Dispatcher(example_job_shop_instance)
+    optimal_obs = OptimalOperationsObserver(dispatcher, reference_schedule)
+
+    # Build available actions tuples (operation_id, machine_id, job_id)
+    available_ops = dispatcher.available_operations()
+    actions = [
+        (op.operation_id, op.machine_id, op.job_id) for op in available_ops
+    ]
+
+    # Compute mapping and expected optimal ids
+    mapping = get_optimal_actions(optimal_obs, actions)
+    expected_ones = {
+        (op.operation_id, op.machine_id, op.job_id)
+        for op in optimal_obs.optimal_available
+    }
+
+    # Check 1 for optimal, 0 otherwise
+    for a in actions:
+        assert mapping[a] == int(a in expected_ones)
+
+    # Dispatch one optimal operation and validate mapping updates
+    op_to_dispatch = next(iter(optimal_obs.optimal_available))
+    dispatcher.dispatch(op_to_dispatch)
+
+    available_ops = dispatcher.available_operations()
+    actions = [
+        (op.operation_id, op.machine_id, op.job_id) for op in available_ops
+    ]
+    mapping = get_optimal_actions(optimal_obs, actions)
+    expected_ones = {
+        (op.operation_id, op.machine_id, op.job_id)
+        for op in optimal_obs.optimal_available
+    }
+    for a in actions:
+        assert mapping[a] == int(a in expected_ones)
+
+
+def test_get_optimal_actions_marks_non_optimal_zero(
+    example_job_shop_instance: JobShopInstance,
+):
+    solver = DispatchingRuleSolver()
+    reference_schedule = solver.solve(example_job_shop_instance)
+    dispatcher = Dispatcher(example_job_shop_instance)
+    optimal_obs = OptimalOperationsObserver(dispatcher, reference_schedule)
+
+    # Valid available actions
+    available_ops = dispatcher.available_operations()
+    actions = [
+        (op.operation_id, op.machine_id, op.job_id) for op in available_ops
+    ]
+
+    # Add an artificial non-optimal action tuple (invalid machine id)
+    if actions:
+        fake_action = (actions[0][0], actions[0][1] + 99, actions[0][2])
+        actions_with_fake = actions + [fake_action]
+    else:
+        actions_with_fake = []
+
+    mapping = get_optimal_actions(optimal_obs, actions_with_fake)
+
+    # Fake action should be marked as non-optimal (0)
+    if actions_with_fake:
+        assert mapping[fake_action] == 0
+
+
 if __name__ == "__main__":
     pytest.main(["-vv", __file__])

From 99aa60c8493bb304ad1ef1b552fdd3adbc9bec01 Mon Sep 17 00:00:00 2001
From: Pabloo22 <pablete.arino@gmail.com>
Date: Sat, 13 Sep 2025 21:12:22 +0200
Subject: [PATCH 3/4] [Tests] Add comprehensive tests for objective functions
 and penalties

---
 .../test_objective_functions.py               | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100644 tests/metaheuristics/test_objective_functions.py

diff --git a/tests/metaheuristics/test_objective_functions.py b/tests/metaheuristics/test_objective_functions.py
new file mode 100644
index 00000000..852204ec
--- /dev/null
+++ b/tests/metaheuristics/test_objective_functions.py
@@ -0,0 +1,108 @@
+# pylint: disable=missing-function-docstring, redefined-outer-name
+import pytest
+
+from job_shop_lib import (
+    JobShopInstance,
+    Operation,
+    Schedule,
+    ScheduledOperation,
+)
+from job_shop_lib.metaheuristics import (
+    get_makespan_with_penalties_objective,
+    compute_penalty_for_deadlines,
+    compute_penalty_for_due_dates,
+)
+
+
+@pytest.fixture
+def schedule_no_penalties() -> Schedule:
+    # Two machines; no due_date/deadline is passed to any operation
+    jobs = [[Operation(0, 2)], [Operation(1, 3)]]
+    instance = JobShopInstance(jobs, name="NoPenalties")
+    # Build schedule manually: M0: job0@t0..2; M1: job1@t0..3
+    s0 = ScheduledOperation(instance.jobs[0][0], start_time=0, machine_id=0)
+    s1 = ScheduledOperation(instance.jobs[1][0], start_time=0, machine_id=1)
+    schedule = Schedule(instance, [[s0], [s1]])
+    return schedule
+
+
+@pytest.fixture
+def schedule_with_deadlines() -> Schedule:
+    # Single machine sequence; first op violates deadline, second is OK
+    jobs = [
+        [Operation(0, 2, deadline=1)],  # ends at 2 -> violation
+        [Operation(0, 3, deadline=5)],  # ends at 5 -> boundary, no violation
+    ]
+    instance = JobShopInstance(jobs, name="Deadlines")
+    s0 = ScheduledOperation(instance.jobs[0][0], start_time=0, machine_id=0)
+    s1 = ScheduledOperation(instance.jobs[1][0], start_time=2, machine_id=0)
+    schedule = Schedule(instance, [[s0, s1]])
+    return schedule
+
+
+@pytest.fixture
+def schedule_with_due_dates() -> Schedule:
+    # Single machine sequence; first op OK, second violates due date
+    jobs = [
+        [Operation(0, 1, due_date=1)],  # ends at 1 -> equal, OK
+        [Operation(0, 4, due_date=3)],  # ends at 5 -> violation
+    ]
+    instance = JobShopInstance(jobs, name="DueDates")
+    s0 = ScheduledOperation(instance.jobs[0][0], start_time=0, machine_id=0)
+    s1 = ScheduledOperation(instance.jobs[1][0], start_time=1, machine_id=0)
+    schedule = Schedule(instance, [[s0, s1]])
+    return schedule
+
+
+@pytest.fixture
+def schedule_with_both() -> Schedule:
+    # Mixed: first violates deadline, second violates due date
+    jobs = [
+        [Operation(0, 3, deadline=2, due_date=10)],  # ends at 3 > deadline=2
+        [Operation(0, 4, deadline=10, due_date=6)],  # ends at 7 > due_date=6
+    ]
+    instance = JobShopInstance(jobs, name="Both")
+    s0 = ScheduledOperation(instance.jobs[0][0], start_time=0, machine_id=0)
+    s1 = ScheduledOperation(instance.jobs[1][0], start_time=3, machine_id=0)
+    schedule = Schedule(instance, [[s0, s1]])
+    return schedule
+
+
+def test_compute_penalty_for_deadlines_none(schedule_no_penalties: Schedule):
+    assert compute_penalty_for_deadlines(schedule_no_penalties, 1000) == 0.0
+
+
+def test_compute_penalty_for_due_dates_none(schedule_no_penalties: Schedule):
+    assert compute_penalty_for_due_dates(schedule_no_penalties, 100) == 0.0
+
+
+def test_compute_penalty_for_deadlines(schedule_with_deadlines: Schedule):
+    # Only first op violates -> 1 penalty
+    assert compute_penalty_for_deadlines(schedule_with_deadlines, 7.5) == 7.5
+
+
+def test_compute_penalty_for_due_dates(schedule_with_due_dates: Schedule):
+    # Only second op violates -> 1 penalty
+    assert compute_penalty_for_due_dates(schedule_with_due_dates, 3.0) == 3.0
+
+
+def test_objective_makespan_only_when_zero_factors(
+    schedule_with_both: Schedule,
+):
+    objective = get_makespan_with_penalties_objective(
+        deadline_penalty_factor=0, due_date_penalty_factor=0
+    )
+    assert objective(schedule_with_both) == schedule_with_both.makespan()
+
+
+def test_objective_with_penalties(schedule_with_both: Schedule):
+    # s0: 0..3 (violates deadline=2) -> +d_factor
+    # s1: 3..7 (violates due_date=6) -> +dd_factor
+    d_factor = 123.0
+    dd_factor = 4.0
+    objective = get_makespan_with_penalties_objective(
+        deadline_penalty_factor=d_factor,
+        due_date_penalty_factor=dd_factor,
+    )
+    expected = schedule_with_both.makespan() + d_factor + dd_factor
+    assert objective(schedule_with_both) == expected

From 976765be66e3d9bf6afbeba1239e5607a2a19499 Mon Sep 17 00:00:00 2001
From: Pabloo22 <pablete.arino@gmail.com>
Date: Sat, 13 Sep 2025 21:12:40 +0200
Subject: [PATCH 4/4] [Docs] Fix `RewardWithPenalties` class documentation

---
 job_shop_lib/reinforcement_learning/_reward_observers.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/job_shop_lib/reinforcement_learning/_reward_observers.py b/job_shop_lib/reinforcement_learning/_reward_observers.py
index 32818ba3..d25786a0 100644
--- a/job_shop_lib/reinforcement_learning/_reward_observers.py
+++ b/job_shop_lib/reinforcement_learning/_reward_observers.py
@@ -98,8 +98,9 @@ class RewardWithPenalties(RewardObserver):
     Attributes:
         base_reward_observer:
             The base reward observer to use for calculating the reward.
-        penalty_per_violation:
-            The penalty to apply for each constraint violation.
+        penalty_function:
+            A function that takes a scheduled operation and the dispatcher as
+            input and returns the penalty for that operation.
 
     Args:
         dispatcher:
@@ -125,8 +126,8 @@ class RewardWithPenalties(RewardObserver):
         The following functions (along with ``functools.partial``) can be
         used to create penalty functions:
 
-        - :class:`~job_shop_lib.reinforcement_learning.get_deadline_violation_penalty`
-        - :class:`~job_shop_lib.reinforcement_learning.get_due_date_violation_penalty`
+        - :func:`~job_shop_lib.reinforcement_learning.get_deadline_violation_penalty`
+        - :func:`~job_shop_lib.reinforcement_learning.get_due_date_violation_penalty`
 
     """  # noqa: E501