Commit 9c8ef81

Merge branch 'master' into soft-actor-critic

2 parents: bd15d96 + b25256c

31 files changed: +1073 -141 lines

README.md (+3)
@@ -49,6 +49,7 @@ For more information, you can refer to [ChainerRL's documentation](http://chaine
 | PCL (Path Consistency Learning) |||||
 | PPO ||| x | x |
 | TRPO ||| x | x |
+| TD3 | x || x | x |
 
 Following algorithms have been implemented in ChainerRL:
 - A3C (Asynchronous Advantage Actor-Critic)
@@ -63,6 +64,7 @@ Following algorithms have been implemented in ChainerRL:
 - PCL (Path Consistency Learning)
 - PPO (Proximal Policy Optimization)
 - TRPO (Trust Region Policy Optimization)
+- TD3 (Twin Delayed Deep Deterministic policy gradient algorithm)
 
 Q-function based algorithms such as DQN can utilize a Normalized Advantage Function (NAF) to tackle continuous-action problems as well as DQN-like discrete output networks.
 
@@ -84,6 +86,7 @@ The following papers have been implemented in ChainerRL:
 - [Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477)
 - [Sample Efficient Actor-Critic with Experience Replay](https://arxiv.org/abs/1611.01224)
 - [Bridging the Gap Between Value and Policy Based Reinforcement Learning](https://arxiv.org/abs/1702.08892)
+- [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/abs/1802.09477)
 
 
 ## Visualization

chainerrl/agents/ddpg.py (+2)
@@ -435,9 +435,11 @@ def batch_observe_and_train(
                     next_state=batch_obs[i],
                     next_action=None,
                     is_state_terminal=batch_done[i],
+                    env_id=i,
                 )
                 if batch_reset[i] or batch_done[i]:
                     self.batch_last_obs[i] = None
+                    self.replay_buffer.stop_current_episode(env_id=i)
             self.replay_updater.update_if_necessary(self.t)
 
     def batch_observe(self, batch_obs, batch_reward,

chainerrl/agents/dqn.py (+2)
@@ -470,9 +470,11 @@ def batch_observe_and_train(self, batch_obs, batch_reward,
                     next_state=batch_obs[i],
                     next_action=None,
                     is_state_terminal=batch_done[i],
+                    env_id=i,
                 )
                 if batch_reset[i] or batch_done[i]:
                     self.batch_last_obs[i] = None
+                    self.replay_buffer.stop_current_episode(env_id=i)
             self.replay_updater.update_if_necessary(self.t)
 
     def batch_observe(self, batch_obs, batch_reward,

chainerrl/agents/td3.py (+13 -9)
@@ -241,7 +241,7 @@ def update(self, experiences, errors_out=None):
         self.update_policy(batch)
         self.sync_target_network()
 
-    def select_greedy_action(self, obs):
+    def select_onpolicy_action(self, obs):
         with chainer.no_backprop_mode(), chainer.using_config('train', False):
             s = self.batch_states([obs], self.xp, self.phi)
             action = self.policy(s).sample().array
@@ -255,8 +255,9 @@ def act_and_train(self, obs, reward):
                 and self.policy_optimizer.t == 0):
             action = self.burnin_action_func()
         else:
-            greedy_action = self.select_greedy_action(obs)
-            action = self.explorer.select_action(self.t, lambda: greedy_action)
+            onpolicy_action = self.select_onpolicy_action(obs)
+            action = self.explorer.select_action(
+                self.t, lambda: onpolicy_action)
         self.t += 1
 
         if self.last_state is not None:
@@ -278,16 +279,16 @@ def act_and_train(self, obs, reward):
         return self.last_action
 
     def act(self, obs):
-        return self.select_greedy_action(obs)
+        return self.select_onpolicy_action(obs)
 
-    def batch_select_greedy_action(self, batch_obs):
+    def batch_select_onpolicy_action(self, batch_obs):
         with chainer.using_config('train', False), chainer.no_backprop_mode():
             batch_xs = self.batch_states(batch_obs, self.xp, self.phi)
             batch_action = self.policy(batch_xs).sample().array
         return list(cuda.to_cpu(batch_action))
 
     def batch_act(self, batch_obs):
-        return self.batch_select_greedy_action(batch_obs)
+        return self.batch_select_onpolicy_action(batch_obs)
 
     def batch_act_and_train(self, batch_obs):
         """Select a batch of actions for training.
@@ -304,11 +305,12 @@ def batch_act_and_train(self, batch_obs):
             batch_action = [self.burnin_action_func()
                             for _ in range(len(batch_obs))]
         else:
-            batch_greedy_action = self.batch_select_greedy_action(batch_obs)
+            batch_onpolicy_action = self.batch_select_onpolicy_action(
+                batch_obs)
             batch_action = [
                 self.explorer.select_action(
-                    self.t, lambda: batch_greedy_action[i])
-                for i in range(len(batch_greedy_action))]
+                    self.t, lambda: batch_onpolicy_action[i])
+                for i in range(len(batch_onpolicy_action))]
 
         self.batch_last_obs = list(batch_obs)
         self.batch_last_action = list(batch_action)
@@ -329,9 +331,11 @@ def batch_observe_and_train(
                     next_state=batch_obs[i],
                     next_action=None,
                     is_state_terminal=batch_done[i],
+                    env_id=i,
                 )
                 if batch_reset[i] or batch_done[i]:
                     self.batch_last_obs[i] = None
+                    self.replay_buffer.stop_current_episode(env_id=i)
             self.replay_updater.update_if_necessary(self.t)
 
     def batch_observe(self, batch_obs, batch_reward,
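
The `env_id=i` / `stop_current_episode(env_id=i)` pattern added in the ddpg.py, dqn.py, and td3.py hunks above lets a single replay buffer keep a separate open episode per parallel environment, so finishing an episode in environment i does not cut off the episodes of the other environments. The sketch below only illustrates that bookkeeping and is not ChainerRL's actual replay buffer: the method names mirror the calls in the diffs, while the class name `PerEnvEpisodicBuffer` and its internals are made up for this example.

from collections import defaultdict


class PerEnvEpisodicBuffer(object):
    """Illustrative buffer that keeps one open episode per env_id."""

    def __init__(self):
        self.finished_episodes = []        # closed episodes, from any env
        self.current = defaultdict(list)   # env_id -> currently open episode

    def append(self, env_id=0, **transition):
        # Transitions from different envs never interleave within an episode.
        self.current[env_id].append(transition)

    def stop_current_episode(self, env_id=0):
        # Close only this environment's episode; the others stay open.
        if self.current[env_id]:
            self.finished_episodes.append(self.current[env_id])
            self.current[env_id] = []


buf = PerEnvEpisodicBuffer()
buf.append(env_id=3, reward=1.0, is_state_terminal=True)
buf.stop_current_episode(env_id=3)  # affects env 3 only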

chainerrl/experiments/evaluator.py (+1)
@@ -230,6 +230,7 @@ def eval_performance(env, agent, n_steps, n_episodes, max_episode_len=None,
     Args:
         env (Environment): Environment used for evaluation
         agent (Agent): Agent to evaluate.
+        n_steps (int): Number of timesteps to evaluate for.
         n_episodes (int): Number of evaluation episodes.
         max_episode_len (int or None): If specified, episodes longer than this
             value will be truncated.
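
With the `n_steps` entry added above, the docstring now covers both ways of bounding an evaluation. A usage sketch, assuming the convention in ChainerRL's evaluator that exactly one of `n_steps` and `n_episodes` is given while the other is passed as None, and that `env` and `agent` are already constructed:

from chainerrl.experiments.evaluator import eval_performance

# Evaluate for a fixed number of episodes, with a per-episode length cap.
stats_by_episodes = eval_performance(
    env, agent, n_steps=None, n_episodes=10, max_episode_len=1000)

# Or evaluate for a fixed total number of timesteps instead.
stats_by_steps = eval_performance(env, agent, n_steps=10000, n_episodes=None)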

chainerrl/initializers/__init__.py (+2)
@@ -1,5 +1,7 @@
 from chainerrl.initializers.constant import VarianceScalingConstant  # NOQA
 
+from chainerrl.initializers.orthogonal import Orthogonal  # NOQA
+
 # LeCunNormal was merged into Chainer v3, thus removed from ChainerRL.
 # For backward compatibility, it is still imported in this namespace.
 from chainer.initializers import LeCunNormal  # NOQA

chainerrl/initializers/orthogonal.py (+104, new file)
@@ -0,0 +1,104 @@
+"""
+This is copied from https://github.com/chainer/chainer/pull/6031 and will be
+unnecessary once the PR is merged to Chainer.
+"""
+import functools
+import operator
+
+import numpy
+
+from chainer import cuda
+from chainer import initializer
+
+
+# Only Chainer v6 or later has chainer.utils.size_of_shape
+def size_of_shape(shape):
+    return functools.reduce(operator.mul, shape, 1)
+
+
+_orthogonal_constraints = {  # (assert emb., assert proj.)
+    'auto': (False, False),
+    'projection': (False, True),
+    'embedding': (True, False),
+    'basis': (True, True),
+}
+
+
+# Original code forked from MIT licensed keras project
+# https://github.com/fchollet/keras/blob/master/keras/initializations.py
+
+class Orthogonal(initializer.Initializer):
+    """Initializes array with an orthogonal system.
+
+    This initializer first makes a matrix of the same shape as the
+    array to be initialized whose elements are drawn independently from
+    standard Gaussian distribution.
+    Next, it applies QR decomposition to (the transpose of) the matrix.
+    To make the decomposition (almost surely) unique, we require the diagonal
+    of the triangular matrix R to be non-negative (see e.g. Edelman & Rao,
+    https://web.eecs.umich.edu/~rajnrao/Acta05rmt.pdf).
+    Then, it initializes the array with the (semi-)orthogonal matrix Q.
+    Finally, the array is multiplied by the constant ``scale``.
+
+    If the ``ndim`` of the input array is more than 2, we consider the array
+    to be a matrix by concatenating all axes except the first one.
+
+    The number of vectors consisting of the orthogonal system
+    (i.e. first element of the shape of the array) must be equal to or smaller
+    than the dimension of each vector (i.e. second element of the shape of
+    the array).
+
+    Attributes:
+        scale (float): A constant to be multiplied by.
+        dtype: Data type specifier.
+        mode (str): Assertion on the initialized shape.
+            ``'auto'`` (default), ``'projection'`` (before v6),
+            ``'embedding'``, or ``'basis'``.
+
+    Reference: Saxe et al., https://arxiv.org/abs/1312.6120
+
+    """
+
+    def __init__(self, scale=1.1, dtype=None, mode='auto'):
+        self.scale = scale
+        self.mode = mode
+        try:
+            self._checks = _orthogonal_constraints[mode]
+        except KeyError:
+            raise ValueError(
+                'Invalid mode: {}. Choose from {}.'.format(
+                    repr(mode),
+                    ', '.join(repr(m) for m in _orthogonal_constraints)))
+        super(Orthogonal, self).__init__(dtype)
+
+    # TODO(Kenta Oono)
+    # How do we treat overcomplete base-system case?
+    def __call__(self, array):
+        if self.dtype is not None:
+            assert array.dtype == self.dtype
+        xp = cuda.get_array_module(array)
+        if not array.shape:  # 0-dim case
+            array[...] = self.scale * (2 * numpy.random.randint(2) - 1)
+        elif not array.size:
+            raise ValueError('Array to be initialized must be non-empty.')
+        else:
+            # numpy.prod returns float value when the argument is empty.
+            out_dim = len(array)
+            in_dim = size_of_shape(array.shape[1:])
+            if (in_dim > out_dim and self._checks[0]) or (
+                    in_dim < out_dim and self._checks[1]):
+                raise ValueError(
+                    'Cannot make orthogonal {}.'
+                    'shape = {}, interpreted as '
+                    '{}-dim input and {}-dim output.'.format(
+                        self.mode, array.shape, in_dim, out_dim))
+            transpose = in_dim > out_dim
+            a = numpy.random.normal(size=(out_dim, in_dim))
+            if transpose:
+                a = a.T
+            # cupy.linalg.qr requires cusolver in CUDA 8+
+            q, r = numpy.linalg.qr(a)
+            q *= numpy.copysign(self.scale, numpy.diag(r))
+            if transpose:
+                q = q.T
+            array[...] = xp.asarray(q.reshape(array.shape))
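
Combined with the `__init__.py` change above, the new initializer is importable as `chainerrl.initializers.Orthogonal` and can be passed wherever Chainer accepts a weight initializer. A minimal sketch, assuming a plain `chainer.links.Linear` layer (the layer sizes are arbitrary); with `scale=1.0` the rows of the resulting weight matrix are orthonormal up to floating-point error:

import chainer.links as L
import numpy as np

from chainerrl.initializers import Orthogonal

# (out_size, in_size) = (32, 64): 32 row vectors of dimension 64,
# drawn as an orthogonal system and scaled by `scale`.
layer = L.Linear(64, 32, initialW=Orthogonal(scale=1.0))

W = layer.W.array
print(np.allclose(W.dot(W.T), np.eye(32), atol=1e-4))  # expected: True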
