Commit 8f21a60

added tensorboard to A2C

1 parent: ee35cc5

File tree

2 files changed (+39, -4 lines)


data/cartpole.gif

-327 KB
Binary file not shown.

stable_baselines/a2c/a2c.py

+39, -4 lines
@@ -67,6 +67,7 @@ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.
         self.initial_state = None
         self.learning_rate_schedule = None
         self.writer = None
+        self.summary = None

         # if we are loading, it is possible the environment is not known, however the obs and action space are known
         if _init_setup_model:
@@ -107,12 +108,29 @@ def setup_model(self):
             self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph)
             loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

+            tf.summary.scalar('entropy_loss', self.entropy)
+            tf.summary.scalar('policy_gradient_loss', self.pg_loss)
+            tf.summary.scalar('value_function_loss', self.vf_loss)
+            tf.summary.scalar('loss', loss)
+
             self.params = find_trainable_variables("model")
             grads = tf.gradients(loss, self.params)
             if self.max_grad_norm is not None:
                 grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
             grads = list(zip(grads, self.params))

+            with tf.variable_scope("info", reuse=False):
+                tf.summary.scalar('rewards', tf.reduce_mean(self.rewards_ph))
+                tf.summary.histogram('rewards', self.rewards_ph)
+                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate))
+                tf.summary.histogram('learning_rate', self.learning_rate)
+                tf.summary.scalar('advs', tf.reduce_mean(self.advs_ph))
+                tf.summary.histogram('advs', self.advs_ph)
+                if len(self.env.observation_space.shape) == 3:
+                    tf.summary.image('observation', train_model.obs_ph)
+                else:
+                    tf.summary.histogram('observation', train_model.obs_ph)
+
             trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                 epsilon=self.epsilon)
             self.apply_backprop = trainer.apply_gradients(grads)
@@ -125,10 +143,12 @@ def setup_model(self):
             self.initial_state = step_model.initial_state
             tf.global_variables_initializer().run(session=self.sess)

+            self.summary = tf.summary.merge_all()
+
             if self.tensorboard_log is not None:
                 self.writer = tf.summary.FileWriter(self.tensorboard_log, graph=self.graph)

-    def _train_step(self, obs, states, rewards, masks, actions, values):
+    def _train_step(self, obs, states, rewards, masks, actions, values, update):
         """
         applies a training step to the model

@@ -138,6 +158,7 @@ def _train_step(self, obs, states, rewards, masks, actions, values):
         :param masks: ([bool]) Whether or not the episode is over (used for recurrent policies)
         :param actions: ([float]) The actions taken
         :param values: ([float]) The logits values
+        :param update: (int) the current step iteration
         :return: (float, float, float) policy loss, value loss, policy entropy
         """
         advs = rewards - values
@@ -152,8 +173,22 @@ def _train_step(self, obs, states, rewards, masks, actions, values):
             td_map[self.train_model.states_ph] = states
             td_map[self.train_model.masks_ph] = masks

-        policy_loss, value_loss, policy_entropy, _ = self.sess.run(
-            [self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], td_map)
+        if update % 10 == 9:
+            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
+            run_metadata = tf.RunMetadata()
+            summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run(
+                [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop],
+                td_map, options=run_options, run_metadata=run_metadata)
+
+            if self.writer is not None:
+                self.writer.add_run_metadata(run_metadata, 'step%d' % (update * (self.n_batch + 1)))
+        else:
+            summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run(
+                [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], td_map)
+
+        if self.writer is not None:
+            self.writer.add_summary(summary, update * (self.n_batch + 1))
+
         return policy_loss, value_loss, policy_entropy

     def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
@@ -168,7 +203,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
         t_start = time.time()
         for update in range(1, total_timesteps // self.n_batch + 1):
             obs, states, rewards, masks, actions, values = runner.run()
-            _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values)
+            _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, update)
             n_seconds = time.time() - t_start
             fps = int((update * self.n_batch) / n_seconds)
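
For reference, a minimal usage sketch of the logging added here (not part of the commit). It assumes the tensorboard_log attribute consumed by the FileWriter in setup_model() is exposed as an A2C constructor keyword, and that the MlpPolicy import path and CartPole environment id shown below match the installed version; treat both as assumptions.

import gym

from stable_baselines.a2c import A2C
from stable_baselines.common.policies import MlpPolicy  # assumed import path

# Any Gym environment works; CartPole matches the gif removed in this commit.
env = gym.make("CartPole-v1")

# tensorboard_log (assumed constructor kwarg) feeds the tf.summary.FileWriter
# created in setup_model(). Without it, self.writer stays None and the merged
# summaries computed in _train_step() are never written out.
model = A2C(MlpPolicy, env, tensorboard_log="./a2c_cartpole_tensorboard/")

# _train_step() now writes the merged scalars/histograms on every update and
# attaches full run metadata (FULL_TRACE) on every 10th update.
model.learn(total_timesteps=100000)

# Inspect the logged losses, rewards, learning rate and observations with:
#     tensorboard --logdir ./a2c_cartpole_tensorboard/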
