2 changes: 2 additions & 0 deletions README.md
@@ -28,6 +28,7 @@ See the following two examples to understand the mechanics behind Pathmind's Pyt

- **Single Agent Example**: [Mouse chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/mouse_env_pathmind.py)
- **Multi-Agent Example** - [Multiple mice chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/multi_mouse_env_pathmind.py)
- **Multi-Reward Example** - [Reward-balancing mice chasing cheese example](https://github.com/PathmindAI/pathmind-api/blob/main/tests/examples/mouse/two_reward_balance.py)

Once you have a `Simulation` ready, you can use it to train a model with Pathmind
as follows.
@@ -142,6 +143,7 @@ The interface is inspired by OpenAI gym, but differs in certain points:
select which ones to use for training in the Pathmind app.
- You can rely on Pathmind's scalable backend for training and don't have to
run your workloads on your machine.
- Competing reward signals can optionally be weighted and auto-normalized.

## Development and running tests

19 changes: 18 additions & 1 deletion pathmind/simulation.py
@@ -54,9 +54,16 @@ class Simulation:
a single value for agent 0, then action[0] will be a float value, otherwise
a numpy array with specified shape. You use "action" to apply the next actions
to your agents in the "step" function.

During training, you may optionally provide "reward_weights", one weight per
reward term. At each step, the reward signal is the weighted sum of these terms.
Reward terms can optionally be preprocessed so that each contributes a comparable
total signal magnitude over the episode; enable this with "auto_norm_reward".
"""

action: Dict[int, Union[float, np.ndarray]] = None
reward_weights: List[float] = None
auto_norm_reward: bool = False

def __init__(self, *args, **kwargs):
"""Set any properties and initial states needed for your simulation."""
@@ -169,7 +176,13 @@ def run(
write_table(table=table, out_csv=out_csv)
write_table(table=summary, out_csv=summary_csv)

def train(self, base_folder: str = "./", observation_yaml: str = None):
def train(
self,
base_folder: str = "./",
observation_yaml: str = None,
reward_weights: Optional[List[float]] = None,
auto_norm_reward: bool = False,
):
"""
:param base_folder the path to your base folder containing all your Python code. Defaults to the current
working directory, which assumes you start training from the base of your code base.
@@ -185,6 +198,9 @@ def train(self, base_folder: str = "./", observation_yaml: str = None):
else:
obs_yaml = observation_yaml

self.reward_weights = reward_weights
self.auto_norm_reward = auto_norm_reward

token = os.environ.get("PATHMIND_TOKEN")
if not token:
raise ValueError(
@@ -202,6 +218,7 @@ def train(self, base_folder: str = "./", observation_yaml: str = None):
-F 'start=true' \
-F 'multiAgent={multi_agent}' \
-F 'obsSelection={obs_yaml}' \
-F 'useAutoNorm={auto_norm_reward}' \
https://api.pathmind.com/py/upload
"""

55 changes: 55 additions & 0 deletions tests/examples/mouse/two_reward_balance.py
@@ -0,0 +1,55 @@
import typing

from pathmind.simulation import Continuous, Discrete, Simulation


class TwoRewardMouseAndCheese(Simulation):

mouse = (0, 0)
cheese = (4, 4)
steps = 0
reward_weights = [1.0, 0.5]
auto_norm_reward = True

def number_of_agents(self) -> int:
return 1

def action_space(self, agent_id) -> typing.Union[Continuous, Discrete]:
return Discrete(4)

def reset(self) -> None:
self.mouse = (0, 0)
self.cheese = (4, 4)
self.steps = 0

def step(self) -> None:
self.steps += 1

action = self.action[0]

if action == 0: # move up
self.mouse = (min(self.mouse[0] + 1, 5), self.mouse[1])
elif action == 1: # move right
self.mouse = (self.mouse[0], min(self.mouse[1] + 1, 5))
elif action == 2: # move down
self.mouse = (max(self.mouse[0] - 1, 0), self.mouse[1])
elif action == 3: # move left
self.mouse = (self.mouse[0], max(self.mouse[1] - 1, 0))
else:
raise ValueError("Invalid action")

def get_observation(self, agent_id) -> typing.Dict[str, float]:
return {
"mouse_row": float(self.mouse[0]) / 5.0,
"mouse_col": float(self.mouse[1]) / 5.0,
"distance_to_cheese_row": abs(self.cheese[0] - self.mouse[0]) / 5.0,
"distance_to_cheese_col": abs(self.cheese[1] - self.mouse[1]) / 5.0,
"cheese_row": float(self.cheese[0]) / 5.0,
"cheese_col": float(self.cheese[1]) / 5.0,
}

def get_reward(self, agent_id) -> typing.Dict[str, float]:
return {"found_cheese": 1 if self.mouse == self.cheese else 0, "took_step": -1}

def is_done(self, agent_id) -> bool:
return self.mouse == self.cheese
53 changes: 53 additions & 0 deletions tests/examples/mouse/two_reward_no_balance.py
@@ -0,0 +1,53 @@
import typing

from pathmind.simulation import Continuous, Discrete, Simulation


class TwoRewardMouseAndCheeseNoBalance(Simulation):

mouse = (0, 0)
cheese = (4, 4)
steps = 0

def number_of_agents(self) -> int:
return 1

def action_space(self, agent_id) -> typing.Union[Continuous, Discrete]:
return Discrete(4)

def reset(self) -> None:
self.mouse = (0, 0)
self.cheese = (4, 4)
self.steps = 0

def step(self) -> None:
self.steps += 1

action = self.action[0]

if action == 0: # move up
self.mouse = (min(self.mouse[0] + 1, 5), self.mouse[1])
elif action == 1: # move right
self.mouse = (self.mouse[0], min(self.mouse[1] + 1, 5))
elif action == 2: # move down
self.mouse = (max(self.mouse[0] - 1, 0), self.mouse[1])
elif action == 3: # move left
self.mouse = (self.mouse[0], max(self.mouse[1] - 1, 0))
else:
raise ValueError("Invalid action")

def get_observation(self, agent_id) -> typing.Dict[str, float]:
return {
"mouse_row": float(self.mouse[0]) / 5.0,
"mouse_col": float(self.mouse[1]) / 5.0,
"distance_to_cheese_row": abs(self.cheese[0] - self.mouse[0]) / 5.0,
"distance_to_cheese_col": abs(self.cheese[1] - self.mouse[1]) / 5.0,
"cheese_row": float(self.cheese[0]) / 5.0,
"cheese_col": float(self.cheese[1]) / 5.0,
}

def get_reward(self, agent_id) -> typing.Dict[str, float]:
return {"found_cheese": 1 if self.mouse == self.cheese else 0, "took_step": -1}

def is_done(self, agent_id) -> bool:
return self.mouse == self.cheese