From ef13950e018a922853565c7a661f340ee4766915 Mon Sep 17 00:00:00 2001
From: Jakob
Date: Fri, 18 Apr 2025 11:15:53 +0200
Subject: [PATCH 1/2] Reordered the code and structured the settings params.

---
 train_and_play/train_dqn_sweep.py | 48 +++++++++++++++++++------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/train_and_play/train_dqn_sweep.py b/train_and_play/train_dqn_sweep.py
index c0a76a8..78a05cd 100644
--- a/train_and_play/train_dqn_sweep.py
+++ b/train_and_play/train_dqn_sweep.py
@@ -30,47 +30,63 @@
 # --- Training Parameters ---
 params: dict[str, Any] = {
-    "nr_of_episodes": 15000,  # Number of training games
+    # Game settings
     "rows": 3,  # Board size (rows x rows)
+    "win_length": 3,  # Number of in-a-row needed to win
+    "rewards": {
+        "W": 1.0,  # Reward for a win
+        "L": -1.0,  # Reward for a loss
+        "D": 0.5,  # Reward for a draw
+    },
+
+    # Training settings
+    "nr_of_episodes": 15000,  # Number of training games
     "learning_rate": 0.0001,  # Optimizer learning rate
     "gamma": 0.95,  # Discount factor for future rewards
     "switching": True,  # Whether players switch turns
-    "win_length": 3,  # Number of in-a-row needed to win
+    "target_update_frequency": 25,  # Frequency to sync target network
+
+    # Evaluation settings
+    "evaluation_frequency": 100,  # Episodes between evaluations
+    "evaluation_batch_size": 300,  # Games to evaluate per round
+    "wandb": False,  # Enable Weights & Biases logging
+    "wandb_logging_frequency": 25,  # Logging frequency (in episodes)
+
+    # Exploration rate settings
     "epsilon_start": 0.925,  # Initial exploration rate
     "epsilon_min": 0.01,  # Minimum exploration rate
     "set_exploration_rate_externally": True,  # Adaptive epsilon enabled
     "epsilon_update_threshold": 0.025,  # Epsilon adjustment sensitivity
     "epsilon_decay": 0.95,  # Decay rate for epsilon
     "win_rate_deque_length": 5,  # Length of win rate deques
-    "batch_size": 256,  # Batch size for training updates
-    "target_update_frequency": 25,  # Frequency to sync target network
-    "evaluation_frequency": 100,  # Episodes between evaluations
-    "evaluation_batch_size": 300,  # Games to evaluate per round
+
+    # Device settings
     "device": "mps",  # Device: "cuda", "mps", or "cpu"
-    "wandb": False,  # Enable Weights & Biases logging
-    "wandb_logging_frequency": 25,  # Logging frequency (in episodes)
-    "load_network": False,  # Whether to load pretrained weights
+    # Replay buffer settings
     "replay_buffer_type": "prioritized",  # "uniform" or "prioritized"
     "replay_buffer_length": 10000,  # Max length of replay buffer
+    "batch_size": 256,  # Batch size for training updates
     "priority_alpha": 0.6,
     "priority_beta": 0.4,
     "shared_replay_buffer": False,  # Share replay buffer between agents
+    # Q Network settings
     "network_type": "FullyCNN",  # Network architecture: 'Equivariant', 'FullyCNN', 'FCN', 'CNN'
     "periodic": False,  # Periodic boundary conditions
+    "load_network": False,  # Whether to load pretrained weights
     "save_models": "/Users/jakob/TicTacToe/models/",  # Save weights after training
     "symmetrized_loss": False,  # Use symmetrized loss
     "state_shape": "one-hot",  # state representation: 'flat' with shape (batch_size, rows * rows),
     #                            '2D' with shape (batch_size, 1, rows, rows),
     #                            'one-hot' with shape (batch_size, 3, rows, rows)
-    "rewards": {
-        "W": 1.0,  # Reward for a win
-        "L": -1.0,  # Reward for a loss
-        "D": 0.5,  # Reward for a draw
-    },
 }
 
+# --- Sweep Setup ---
+param_sweep = {"replay_buffer_type": ["prioritized", "uniform"], "periodic": [True, False], "state_shape": ["one-hot", "flat"]}
+sweep_combinations, param_keys = get_param_sweep_combinations(param_sweep)
+
+# --- Shared Replay Buffer Setup ---
 if params["shared_replay_buffer"]:
     state_shape = params["state_shape"]
     rows = params["rows"]
@@ -96,10 +112,6 @@
         params["shared_replay_buffer"] = ReplayBuffer(params["replay_buffer_length"], (params["rows"]**2, ), device=params["device"])
 
-# --- Sweep Setup ---
-param_sweep = {"replay_buffer_type": ["prioritized", "uniform"], "periodic": [True, False], "state_shape": ["one-hot", "flat"]}
-sweep_combinations, param_keys = get_param_sweep_combinations(param_sweep)
-
 model_metadata = []
 
 # --- Sweep Loop ---
 for sweep_idx, combination in enumerate(sweep_combinations):

From 54d7794668aa950e854b8427842e24522a162872 Mon Sep 17 00:00:00 2001
From: Jakob
Date: Fri, 18 Apr 2025 18:33:02 +0200
Subject: [PATCH 2/2] Fixed tests. Fixes #104

---
 src/TicTacToe/Utils.py | 2 +-
 tests/test_utils.py    | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/TicTacToe/Utils.py b/src/TicTacToe/Utils.py
index a12098e..37b7818 100644
--- a/src/TicTacToe/Utils.py
+++ b/src/TicTacToe/Utils.py
@@ -230,7 +230,7 @@ def train_and_evaluate(game: TwoPlayerBoardGame, agent1: DeepQLearningAgent, age
             rewards=params["rewards"],
         )
         if params["set_exploration_rate_externally"]:
-            exploration_rate = update_exploration_rate_smoothly(agent1, agent2, params, eval_data, exploration_rate, (X_win_rates, O_win_rates))
+            exploration_rate = update_exploration_rate_smoothly(agent1, agent2, params, eval_data, exploration_rate, (X_win_rates, O_win_rates), wandb_logging=wandb_logging)
 
     print("Outcomes during learning:")
     print(f"X wins: {outcomes['X'] / params['nr_of_episodes']}, O wins: {outcomes['O'] / params['nr_of_episodes']}, draws: {outcomes['D'] / params['nr_of_episodes']}")
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 2198a30..8ab99d4 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -131,7 +131,7 @@ def test_update_exploration_rate_smoothly():
     O_win_rates = deque([0.5, 0.51], maxlen=3)
     current_epsilon = 0.5
 
-    new_epsilon = update_exploration_rate_smoothly(agent1, agent2, params, eval_data, current_epsilon, (X_win_rates, O_win_rates))
+    new_epsilon = update_exploration_rate_smoothly(agent1, agent2, params, eval_data, current_epsilon, (X_win_rates, O_win_rates), wandb_logging=False)
     assert new_epsilon < current_epsilon
     agent1.set_exploration_rate.assert_called()
     agent2.set_exploration_rate.assert_called()
@@ -161,6 +161,11 @@ def test_train_and_evaluate(mock_tqdm, mock_eval_perf):
         "periodic": False,
         "win_rate_deque_length": 3,
         "state_shape": "flat",
+        "rewards": {
+            "W": 1.0,  # Reward for a win
+            "L": -1.0,  # Reward for a loss
+            "D": 0.5,  # Reward for a draw
+        },
     }
 
     mock_eval_perf.return_value = {
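
Note: the relocated "Sweep Setup" block in the first patch calls get_param_sweep_combinations(), whose implementation is not part of this diff. The sketch below shows the assumed behaviour (a Cartesian product over the sweep values, returned together with the ordered key list); the signature, return types, and the usage comment are illustrative assumptions, not the repository's actual code.

    from itertools import product
    from typing import Any


    def get_param_sweep_combinations(param_sweep: dict[str, list[Any]]) -> tuple[list[tuple[Any, ...]], list[str]]:
        """Expand a {param: [values, ...]} dict into every value combination (assumed behaviour)."""
        param_keys = list(param_sweep.keys())  # keep key order so each combination can be zipped back onto its keys
        sweep_combinations = list(product(*param_sweep.values()))  # Cartesian product of all value lists
        return sweep_combinations, param_keys


    # Presumed usage, matching the sweep loop visible in train_dqn_sweep.py:
    # for sweep_idx, combination in enumerate(sweep_combinations):
    #     params.update(dict(zip(param_keys, combination)))  # hypothetical; the loop body is not shown in this diff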