@@ -67,6 +67,7 @@ def __init__(self, policy, env, gamma=0.99, n_steps=5, vf_coef=0.25, ent_coef=0.
         self.initial_state = None
         self.learning_rate_schedule = None
         self.writer = None
+        self.summary = None
 
         # if we are loading, it is possible the environment is not known, however the obs and action space are known
         if _init_setup_model:
@@ -107,12 +108,29 @@ def setup_model(self):
             self.vf_loss = mse(tf.squeeze(train_model.value_fn), self.rewards_ph)
             loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef
 
+            tf.summary.scalar('entropy_loss', self.entropy)
+            tf.summary.scalar('policy_gradient_loss', self.pg_loss)
+            tf.summary.scalar('value_function_loss', self.vf_loss)
+            tf.summary.scalar('loss', loss)
+
             self.params = find_trainable_variables("model")
             grads = tf.gradients(loss, self.params)
             if self.max_grad_norm is not None:
                 grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
             grads = list(zip(grads, self.params))
 
+            with tf.variable_scope("info", reuse=False):
+                tf.summary.scalar('rewards', tf.reduce_mean(self.rewards_ph))
+                tf.summary.histogram('rewards', self.rewards_ph)
+                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate))
+                tf.summary.histogram('learning_rate', self.learning_rate)
+                tf.summary.scalar('advs', tf.reduce_mean(self.advs_ph))
+                tf.summary.histogram('advs', self.advs_ph)
+                if len(self.env.observation_space.shape) == 3:
+                    tf.summary.image('observation', train_model.obs_ph)
+                else:
+                    tf.summary.histogram('observation', train_model.obs_ph)
+
             trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                 epsilon=self.epsilon)
             self.apply_backprop = trainer.apply_gradients(grads)
@@ -125,10 +143,12 @@ def setup_model(self):
             self.initial_state = step_model.initial_state
             tf.global_variables_initializer().run(session=self.sess)
 
+            self.summary = tf.summary.merge_all()
+
             if self.tensorboard_log is not None:
                 self.writer = tf.summary.FileWriter(self.tensorboard_log, graph=self.graph)
 
-    def _train_step(self, obs, states, rewards, masks, actions, values):
+    def _train_step(self, obs, states, rewards, masks, actions, values, update):
         """
         applies a training step to the model
 
@@ -138,6 +158,7 @@ def _train_step(self, obs, states, rewards, masks, actions, values):
         :param masks: ([bool]) Whether or not the episode is over (used for reccurent policies)
         :param actions: ([float]) The actions taken
         :param values: ([float]) The logits values
+        :param update: (int) the current step iteration
         :return: (float, float, float) policy loss, value loss, policy entropy
         """
         advs = rewards - values
@@ -152,8 +173,22 @@ def _train_step(self, obs, states, rewards, masks, actions, values):
             td_map[self.train_model.states_ph] = states
             td_map[self.train_model.masks_ph] = masks
 
-        policy_loss, value_loss, policy_entropy, _ = self.sess.run(
-            [self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], td_map)
+        if update % 10 == 9:
+            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
+            run_metadata = tf.RunMetadata()
+            summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run(
+                [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop],
+                td_map, options=run_options, run_metadata=run_metadata)
+
+            if self.writer is not None:
+                self.writer.add_run_metadata(run_metadata, 'step%d' % (update * (self.n_batch + 1)))
+        else:
+            summary, policy_loss, value_loss, policy_entropy, _ = self.sess.run(
+                [self.summary, self.pg_loss, self.vf_loss, self.entropy, self.apply_backprop], td_map)
+
+        if self.writer is not None:
+            self.writer.add_summary(summary, update * (self.n_batch + 1))
+
         return policy_loss, value_loss, policy_entropy
 
     def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
@@ -168,7 +203,7 @@ def learn(self, total_timesteps, callback=None, seed=None, log_interval=100):
         t_start = time.time()
         for update in range(1, total_timesteps // self.n_batch + 1):
            obs, states, rewards, masks, actions, values = runner.run()
-            _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values)
+            _, value_loss, policy_entropy = self._train_step(obs, states, rewards, masks, actions, values, update)
             n_seconds = time.time() - t_start
             fps = int((update * self.n_batch) / n_seconds)
 
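
For context, a rough usage sketch of what this change enables. The import paths, policy class, environment id, and the tensorboard_log constructor argument below are assumptions based on the self.tensorboard_log attribute used in setup_model, not details confirmed by this diff:

    # Hypothetical example -- names and arguments are assumed, not taken from this diff.
    from stable_baselines import A2C
    from stable_baselines.common.policies import MlpPolicy

    # Pointing the model at a log directory makes setup_model create a FileWriter;
    # _train_step then writes the merged summaries every update and full run
    # metadata on every 10th update.
    model = A2C(MlpPolicy, 'CartPole-v1', tensorboard_log='./a2c_tensorboard/')
    model.learn(total_timesteps=50000)

    # Inspect the logged scalars, histograms and observation images with:
    #   tensorboard --logdir ./a2c_tensorboard/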