"""
Implementation of an asynchronous agent in the CartPole environment.
"""
import gym
import threading
import numpy as np
from collections import deque
import time
import matplotlib.pyplot as plt
from brain import brain
timesteps = 0

class Agent(threading.Thread):
    """
    An agent running in a Gym environment.

    Parameters
    ----------
    render : bool, optional
        Whether to render the environment and plot episode rewards. Default is False.
    discount_factor : float, optional
        By how much the agent discounts its rewards over time; also known as gamma.
        Default is 0.99.
    timesteps_until_update : int, optional
        How many timesteps to wait between updates. Default is 8.
    max_td_steps : int, optional
        The maximum number of steps used in the temporal-difference estimate of the
        advantage for every state. If larger than timesteps_until_update, it is
        capped to that value. Default is 8.
    """
    def __init__(self, render=False, discount_factor=0.99, timesteps_until_update=8, max_td_steps=8):
        super().__init__()
        self.THREAD_DELAY = 0.001  # small sleep each step so agent threads interleave
        self.started = False
        self.render = render
        self.figure = None
        self.plot = None
        self.total_number_of_episodes = 0
        self.episode_rewards = []
        self.discount_factor = discount_factor
        self.timesteps_until_update = timesteps_until_update
        # Cap max_td_steps to timesteps_until_update, as documented above.
        self.max_td_steps = min(max_td_steps, timesteps_until_update)
        self.environment = gym.make("CartPole-v0")
        self.memory = deque()

    def run(self):
        """
        Start the agent. It will keep running episodes until explicitly stopped.
        """
        self.started = True
        if self.render:
            self.figure = plt.figure()
            self.plot = self.figure.add_subplot(1, 1, 1)
        while self.started:
            self.run_episode()

    def stop(self):
        """
        Stop the agent. It will finish its current episode and then turn idle.
        """
        self.started = False

    def run_episode(self):
        """
        Run the agent for a single episode, saving the samples in the shared brain.
        """
        global timesteps
        total_rewards = 0.0
        state = self.environment.reset()
        while True:
            time.sleep(self.THREAD_DELAY)  # yield so other agent threads can run
            if self.render:
                self.environment.render()
            action_index, action = brain.select_action(state)
            next_state, reward, terminal, _ = self.environment.step(action_index)
            if terminal:
                next_state = None  # marks the end of the episode for the brain
            self.push_training_sample(state, action, reward, next_state)
            state = next_state
            timesteps += 1
            total_rewards += reward
            if terminal:
                self.total_number_of_episodes += 1
                if self.render:
                    self.episode_rewards.append(total_rewards)
                    self.plot_episode()
                break

    def push_training_sample(self, state, action, reward, next_state):
        """
        Compute and push a training sample into the brain's training queue.
        """
        self.memory.append((state, action, reward, next_state))
        if next_state is None:
            # We have reached the end of the episode, flush the memory
            while self.memory:
                state, action, _, _ = self.memory[0]
                total_discounted_reward, n_discount, n_state = self.compute_return(self.memory)
                brain.save_step(state, action, total_discounted_reward, n_discount, n_state)
                self.memory.popleft()
        elif len(self.memory) >= self.max_td_steps:
            # We can compute at least one full n-step return
            state, action, _, _ = self.memory[0]
            total_discounted_reward, n_discount, n_state = \
                self.compute_return([self.memory[i] for i in range(self.max_td_steps)])
            brain.save_step(state, action, total_discounted_reward, n_discount, n_state)
            self.memory.popleft()
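
    # Example of the buffering above (hypothetical trace, max_td_steps = 2):
    # after pushing (s0, a0, r0, s1) and (s1, a1, r1, s2) the window is full,
    # so the brain receives (s0, a0, r0 + gamma * r1, gamma ** 2, s2) and s0
    # is dropped; at episode end the remaining, progressively shorter tails
    # are flushed the same way.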

    def compute_return(self, trajectory):
        """
        Compute return components over a trajectory.

        Parameters
        ----------
        trajectory : list or deque of (np.array of shape (4,), np.array of shape (2,), float, np.array of shape (4,))
            The trajectory over which to compute the return components.

        Returns
        -------
        total_discounted_reward : float
            The total discounted reward for the trajectory.
        n_discount : float
            The discount factor for the n-step state value.
        n_state : np.array of shape (4,), or None
            The state at the end of the trajectory (None if the trajectory
            ends the episode).
        """
        total_discounted_reward = 0.0
        for k in range(len(trajectory)):
            total_discounted_reward += self.discount_factor**k * trajectory[k][2]
        n_discount = self.discount_factor**len(trajectory)
        n_state = trajectory[-1][3]
        return total_discounted_reward, n_discount, n_state
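
    # Worked example (hypothetical numbers): with discount_factor = 0.9 and a
    # 3-step trajectory with rewards [1.0, 1.0, 1.0], compute_return gives
    #   total_discounted_reward = 1.0 + 0.9 * 1.0 + 0.81 * 1.0 = 2.71
    #   n_discount = 0.9 ** 3 = 0.729
    # so a value target of the form 2.71 + 0.729 * V(n_state) can be
    # bootstrapped from the brain's value estimate for n_state.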

    def plot_episode(self):
        """
        Update the live plot of the total reward obtained in each episode.
        """
        self.plot.clear()
        self.plot.set_title('Total rewards of each episode')
        self.plot.set_xlabel('Episode')
        self.plot.set_ylabel('Total reward')
        self.plot.plot(range(len(self.episode_rewards)), self.episode_rewards, '-')
        plt.pause(0.001)
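

# A minimal usage sketch. Assumption: the project-local `brain` module exposes
# a shared `brain` object with the `select_action(state)` and `save_step(...)`
# methods used above, and trains on the saved steps in the background. Several
# agents run asynchronously, all feeding experience into the same brain.
if __name__ == "__main__":
    agents = [Agent() for _ in range(4)]
    for agent in agents:
        agent.start()  # threading.Thread.start() invokes run() in a new thread
    try:
        time.sleep(60)  # let the agents gather experience for a while
    finally:
        for agent in agents:
            agent.stop()  # each agent finishes its current episode, then idles
        for agent in agents:
            agent.join()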