2 changes: 1 addition & 1 deletion src/TicTacToe/Utils.py
@@ -230,7 +230,7 @@ def train_and_evaluate(game: TwoPlayerBoardGame, agent1: DeepQLearningAgent, age
rewards=params["rewards"],
)
if params["set_exploration_rate_externally"]:
- exploration_rate = update_exploration_rate_smoothly(agent1, agent2, params, eval_data, exploration_rate, (X_win_rates, O_win_rates))
+ exploration_rate = update_exploration_rate_smoothly(agent1, agent2, params, eval_data, exploration_rate, (X_win_rates, O_win_rates), wandb_logging=wandb_logging)

print("Outcomes during learning:")
print(f"X wins: {outcomes['X'] / params['nr_of_episodes']}, O wins: {outcomes['O'] / params['nr_of_episodes']}, draws: {outcomes['D'] / params['nr_of_episodes']}")
7 changes: 6 additions & 1 deletion tests/test_utils.py
@@ -131,7 +131,7 @@ def test_update_exploration_rate_smoothly():
O_win_rates = deque([0.5, 0.51], maxlen=3)
current_epsilon = 0.5

- new_epsilon = update_exploration_rate_smoothly(agent1, agent2, params, eval_data, current_epsilon, (X_win_rates, O_win_rates))
+ new_epsilon = update_exploration_rate_smoothly(agent1, agent2, params, eval_data, current_epsilon, (X_win_rates, O_win_rates), wandb_logging = False)
assert new_epsilon < current_epsilon
agent1.set_exploration_rate.assert_called()
agent2.set_exploration_rate.assert_called()
@@ -161,6 +161,11 @@ def test_train_and_evaluate(mock_tqdm, mock_eval_perf):
"periodic": False,
"win_rate_deque_length": 3,
"state_shape": "flat",
"rewards": {
"W": 1.0, # Reward for a win
"L": -1.0, # Reward for a loss
"D": 0.5, # Reward for a draw
},
}

mock_eval_perf.return_value = {
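Note on the test changes above: the test params now need a rewards mapping because train_and_evaluate forwards params["rewards"] to evaluation (see the Utils.py hunk). As a rough illustration of how such a mapping is typically applied at the end of an episode, a hedged sketch follows; the outcome codes mirror the outcomes dict printed in the Utils.py hunk, but the helper itself is an assumption for illustration, not code from this repository.

# Illustration only -- not code from this repository.
rewards = {"W": 1.0, "L": -1.0, "D": 0.5}


def terminal_rewards(outcome: str, rewards: dict[str, float]) -> tuple[float, float]:
    """Return (reward_for_X, reward_for_O) for a finished game."""
    if outcome == "X":  # X won
        return rewards["W"], rewards["L"]
    if outcome == "O":  # O won
        return rewards["L"], rewards["W"]
    return rewards["D"], rewards["D"]  # draw


print(terminal_rewards("X", rewards))  # (1.0, -1.0)
print(terminal_rewards("D", rewards))  # (0.5, 0.5)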
48 changes: 30 additions & 18 deletions train_and_play/train_dqn_sweep.py
@@ -30,47 +30,63 @@

# --- Training Parameters ---
params: dict[str, Any] = {
"nr_of_episodes": 15000, # Number of training games
# Game settings
"rows": 3, # Board size (rows x rows)
"win_length": 3, # Number of in-a-row needed to win
"rewards": {
"W": 1.0, # Reward for a win
"L": -1.0, # Reward for a loss
"D": 0.5, # Reward for a draw
},

# Training settings
"nr_of_episodes": 15000, # Number of training games
"learning_rate": 0.0001, # Optimizer learning rate
"gamma": 0.95, # Discount factor for future rewards
"switching": True, # Whether players switch turns
"win_length": 3, # Number of in-a-row needed to win
"target_update_frequency": 25, # Frequency to sync target network

# Evaluation settings
"evaluation_frequency": 100, # Episodes between evaluations
"evaluation_batch_size": 300, # Games to evaluate per round
"wandb": False, # Enable Weights & Biases logging
"wandb_logging_frequency": 25, # Logging frequency (in episodes)

# Exploration rate settings
"epsilon_start": 0.925, # Initial exploration rate
"epsilon_min": 0.01, # Minimum exploration rate
"set_exploration_rate_externally": True, # Adaptive epsilon enabled
"epsilon_update_threshold": 0.025, # Epsilon adjustment sensitivity
"epsilon_decay": 0.95, # Decay rate for epsilon
"win_rate_deque_length": 5, # Length of win rate deques
"batch_size": 256, # Batch size for training updates
"target_update_frequency": 25, # Frequency to sync target network
"evaluation_frequency": 100, # Episodes between evaluations
"evaluation_batch_size": 300, # Games to evaluate per round

# Device settings
"device": "mps", # Device: "cuda", "mps", or "cpu"
"wandb": False, # Enable Weights & Biases logging
"wandb_logging_frequency": 25, # Logging frequency (in episodes)
"load_network": False, # Whether to load pretrained weights

# Replay buffer settings
"replay_buffer_type": "prioritized", # "uniform" or "prioritized"
"replay_buffer_length": 10000, # Max length of replay buffer
"batch_size": 256, # Batch size for training updates
"priority_alpha": 0.6,
"priority_beta": 0.4,
"shared_replay_buffer": False, # Share replay buffer between agents

# Q Network settings
"network_type": "FullyCNN", # Network architecture: 'Equivariant', 'FullyCNN', 'FCN', 'CNN'
"periodic": False, # Periodic boundary conditions
"load_network": False, # Whether to load pretrained weights
"save_models": "/Users/jakob/TicTacToe/models/", # Save weights after training
"symmetrized_loss": False, # Use symmetrized loss
"state_shape": "one-hot", # state representation: 'flat' with shape (batch_size, rows * rows),
# '2D' with shape (batch_size, 1, rows, rows),
# 'one-hot' with shape (batch_size, 3, rows, rows)
"rewards": {
"W": 1.0, # Reward for a win
"L": -1.0, # Reward for a loss
"D": 0.5, # Reward for a draw
},
}

+ # --- Sweep Setup ---
+ param_sweep = {"replay_buffer_type": ["prioritized", "uniform"], "periodic": [True, False], "state_shape": ["one-hot", "flat"]}
+ sweep_combinations, param_keys = get_param_sweep_combinations(param_sweep)

# --- Shared Replay Buffer Setup ---
if params["shared_replay_buffer"]:
state_shape = params["state_shape"]
rows = params["rows"]
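Aside on the state_shape option documented in the reorganized params above: the comment lists three tensor layouts. The sketch below shows how a single board could be mapped to each of them; the raw board encoding (1 for X, -1 for O, 0 for empty) and the channel order of the one-hot layout are assumptions made for illustration, not taken from the repository.

# Sketch of the three "state_shape" layouts; board encoding is assumed.
import numpy as np

rows = 3
board = np.array([[1, -1, 0],
                  [0,  1, 0],
                  [0,  0, -1]])          # one hypothetical position

flat = board.reshape(1, rows * rows)     # 'flat':    (batch_size, rows * rows)
two_d = board.reshape(1, 1, rows, rows)  # '2D':      (batch_size, 1, rows, rows)

one_hot = np.stack(                      # 'one-hot': (batch_size, 3, rows, rows)
    [(board == 0), (board == 1), (board == -1)], axis=0
).astype(np.float32)[None, ...]

print(flat.shape, two_d.shape, one_hot.shape)  # (1, 9) (1, 1, 3, 3) (1, 3, 3, 3)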
@@ -96,10 +112,6 @@

params["shared_replay_buffer"] = ReplayBuffer(params["replay_buffer_length"], (params["rows"]**2, ), device=params["device"])

- # --- Sweep Setup ---
- param_sweep = {"replay_buffer_type": ["prioritized", "uniform"], "periodic": [True, False], "state_shape": ["one-hot", "flat"]}
- sweep_combinations, param_keys = get_param_sweep_combinations(param_sweep)
model_metadata = []

# --- Sweep Loop ---
for sweep_idx, combination in enumerate(sweep_combinations):
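Closing note: the sweep setup builds every combination of the listed options and the sweep loop iterates over them. A plausible sketch of get_param_sweep_combinations is shown below; the real helper is imported from the repository, and only its inputs and its (combinations, keys) return shape are taken from this diff, so the body is an assumption.

# Hedged sketch of a Cartesian-product sweep helper; not the repository's code.
from itertools import product
from typing import Any


def get_param_sweep_combinations(param_sweep: dict[str, list[Any]]) -> tuple[list[tuple[Any, ...]], list[str]]:
    param_keys = list(param_sweep.keys())
    sweep_combinations = list(product(*(param_sweep[k] for k in param_keys)))
    return sweep_combinations, param_keys


param_sweep = {
    "replay_buffer_type": ["prioritized", "uniform"],
    "periodic": [True, False],
    "state_shape": ["one-hot", "flat"],
}
sweep_combinations, param_keys = get_param_sweep_combinations(param_sweep)
print(len(sweep_combinations))  # 2 * 2 * 2 = 8 sweep runs

# Inside the sweep loop, each combination would override the base params, e.g.:
for combination in sweep_combinations:
    overrides = dict(zip(param_keys, combination))
    # params.update(overrides)  # then train with this configuration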