
Commit 979bbf6

add log pop stats
1 parent 741ccc0 commit 979bbf6

10 files changed (+26 −22 lines)

README.md (+9 −4)
@@ -62,10 +62,10 @@ from training import run_experiment
 
 example_config = {
     "experiment_name": "test_BipedalWalker_v0",
-    "plot_path": "../plots/",
-    "model_path": "../models/", # optional
-    "log_path": "../logs/" # optional
-    "init_model": "../models/test_BipedalWalker_v5.0.pkl", # optional
+    "plot_path": "plots/",
+    "model_path": "models/", # optional
+    "log_path": "logs/", # optional
+    "init_model": "models/test_BipedalWalker_v5.0.pkl", # optional
     "env": "BipedalWalker-v3",
     "n_sessions": 128,
     "env_steps": 1600,
@@ -120,6 +120,11 @@ noise/lr annealing - https://cs231n.github.io/neural-networks-3/#anneal, https:
 
 <!-- Observations: it easily solves environments that are easy to explore and try different things in, since the gradient estimate is then more accurate and there is more reward data. Environments with very sparse rewards are solved very poorly, because a long time may pass before a reward event happens by chance, since the search up to that point is random and there is no learning. Taxi-v3: both the genetic algorithm and the cross-entropy method perform poorly -->
 
+<!-- # if env.spec._env_name == 'MountainCarContinuous':
+# reward = reward + 10 * abs(new_obs[1])
+# TODO: add novelty search reward (https://lilianweng.github.io/lil-log/2019/09/05/evolution-strategies.html)
+# potential-based shaping method: https://habr.com/ru/company/hsespb/blog/444428/
+# reward = reward + 300 * (0.99 * abs(new_obs[1]) - abs(obs[1])) -->
 ## References
 
 [Evolution Strategies as a Scalable Alternative to Reinforcement Learning](https://arxiv.org/abs/1703.03864) (Tim Salimans, Jonathan Ho, Xi Chen, Ilya Sutskever)
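For context, the new relative paths make the README example runnable from the repository root. A minimal usage sketch built only from the keys visible in this hunk (the remaining ES hyperparameters such as population_size, learning_rate and noise_std would follow, as in tests/bipedal_walker.py; the n_jobs value here is illustrative):

from training import run_experiment

# Sketch only: keys mirror the README hunk above; "model_path", "log_path"
# and "init_model" are optional, and further ES hyperparameters would follow.
example_config = {
    "experiment_name": "test_BipedalWalker_v0",
    "plot_path": "plots/",
    "model_path": "models/",   # optional
    "log_path": "logs/",       # optional
    "env": "BipedalWalker-v3",
    "n_sessions": 128,
    "env_steps": 1600,
}

policy = run_experiment(example_config, n_jobs=4)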

evaluation.py (−6)

@@ -21,12 +21,6 @@ def eval_policy(policy, env, n_steps=200):
 
         new_obs, reward, done, _ = env.step(action)
 
-        # if env.spec._env_name == 'MountainCarContinuous':
-        # reward = reward + 10 * abs(new_obs[1])
-        # TODO: add novelty search reward (https://lilianweng.github.io/lil-log/2019/09/05/evolution-strategies.html)
-        # potential-based shaping method: https://habr.com/ru/company/hsespb/blog/444428/
-        # reward = reward + 300 * (0.99 * abs(new_obs[1]) - abs(obs[1]))
-
         total_reward = total_reward + reward
         obs = new_obs
 
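The removed comments sketched reward shaping for MountainCarContinuous, where obs[1] is the cart velocity. If that idea were reinstated, the potential-based shaping they point to could look roughly like the sketch below, with Φ(s) = scale·|velocity| and the same constants as the removed lines; the helper name and its placement are hypothetical, not part of this commit:

def shaped_reward(reward, obs, new_obs, gamma=0.99, scale=300.0):
    # Potential-based shaping: F(s, s') = gamma * phi(s') - phi(s),
    # with phi(s) = scale * |velocity|; constants mirror the removed comment.
    phi_old = scale * abs(obs[1])
    phi_new = scale * abs(new_obs[1])
    return reward + gamma * phi_new - phi_old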

models/test_BipedalWalker_v5.1.pkl (22.8 KB, binary file not shown)

models/test_BipedalWalker_v5.2.pkl (22.8 KB, binary file not shown)

models/test_BipedalWalker_v5.3.pkl (22.8 KB, binary file not shown)

plots/test_BipedalWalker_v5.1.png (58.3 KB)

plots/test_BipedalWalker_v5.2.png (72.2 KB)

plots/test_BipedalWalker_v5.3.png (63.4 KB)

tests/bipedal_walker.py (+9 −9)

@@ -8,24 +8,24 @@
 # solving the task as getting an average score of 300+ over 100 consecutive random trials.
 def test():
     test_config = {
-        "experiment_name": "test_BipedalWalker_v5.1",
+        "experiment_name": "test_BipedalWalker_v6.0",
         "plot_path": "../plots/",
         "model_path": "../models/",
-        "log_path": "../logs/"
-        "init_model": "../models/test_BipedalWalker_v5.0.pkl",
+        "log_path": "../logs/",
         "env": "BipedalWalker-v3",
-        "n_sessions": 130,
-        "env_steps": 1200,
+        "n_sessions": 512,
+        "env_steps": 1300,
         "population_size": 128,
-        "learning_rate": 0.06,
-        "noise_std": 0.09704,
-        "noise_decay": 0.995,
+        "learning_rate": 0.065,
+        "noise_std": 0.1,
+        "noise_decay": 0.995,
         "decay_step": 20,
         "eval_step": 10,
-        "hidden_sizes": (40, 40) # sizes from https://designrl.github.io/
+        "hidden_sizes": (64, 40) # sizes from https://designrl.github.io/
     }
 
     policy = run_experiment(test_config, n_jobs=4)
 
+
 if __name__ == "__main__":
     test()

training.py (+8 −3)

@@ -13,7 +13,7 @@
 from linear import ThreeLayerNetwork
 from es import OpenAiES
 from plot import plot_rewards
-from evaluation import eval_policy, eval_policy_delayed
+from evaluation import eval_policy_delayed, eval_policy
 
 # env: (n_states, n_actions)
 ENV_INFO = {
@@ -45,6 +45,10 @@ def train_loop(policy, env, config, n_jobs=1, verbose=True):
         rewards = np.array(Parallel(n_jobs=n_jobs)(rewards_jobs))
 
         es.update_population(rewards)
+
+        # population stats
+        log["pop_mean_rewards"].append(np.mean(rewards))
+        log["pop_std_rewards"].append(np.std(rewards))
 
         # best policy stats
         if session % config.get("eval_step", 2) == 0:
@@ -82,10 +86,11 @@ def run_experiment(config, n_jobs=4, verbose=True):
         out_features=n_actions,
         hidden_sizes=config["hidden_sizes"]
     )
+    # TODO: save model on KeyboardInterrupt exception
    log = train_loop(policy, env, config, n_jobs, verbose)
 
    if config.get("log_path", None):
-        with open(f"{config['log_path']}{config['experiment_name']}", "wb") as file:
+        with open(f"{config['log_path']}{config['experiment_name']}.pkl", "wb") as file:
             pickle.dump(log, file)
 
    if config.get("model_path", None):
@@ -112,5 +117,5 @@ def render_policy(model_path, env_name, n_videos=1):
 
 
 if __name__ == "__main__":
-    render_policy("models/test_BipedalWalker_v5.0.pkl", "BipedalWalker-v3")
+    render_policy("models/test_BipedalWalker_v5.3.pkl", "BipedalWalker-v3")
 
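With the new pop_mean_rewards / pop_std_rewards entries and the .pkl suffix on the log file, a saved log could be inspected roughly as below. This is a sketch only: it assumes log is a plain dict of lists, as the appends in train_loop suggest, and the file name and matplotlib plotting are illustrative, not the repo's plot_rewards.

import pickle

import matplotlib.pyplot as plt
import numpy as np

# Hypothetical path: log_path + experiment_name + ".pkl", as written by run_experiment.
with open("logs/test_BipedalWalker_v6.0.pkl", "rb") as file:
    log = pickle.load(file)

pop_mean = np.array(log["pop_mean_rewards"])
pop_std = np.array(log["pop_std_rewards"])

# Mean population reward per session with a +/- one-std band.
sessions = np.arange(len(pop_mean))
plt.plot(sessions, pop_mean, label="population mean reward")
plt.fill_between(sessions, pop_mean - pop_std, pop_mean + pop_std, alpha=0.3)
plt.xlabel("session")
plt.ylabel("reward")
plt.legend()
plt.savefig("plots/pop_stats.png")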
