
Commit 979bbf6

add log pop stats
1 parent 741ccc0 commit 979bbf6

10 files changed (+26 −22 lines)

README.md (+9 −4)
@@ -62,10 +62,10 @@ from training import run_experiment
 
 example_config = {
     "experiment_name": "test_BipedalWalker_v0",
-    "plot_path": "../plots/",
-    "model_path": "../models/", # optional
-    "log_path": "../logs/" # optional
-    "init_model": "../models/test_BipedalWalker_v5.0.pkl", # optional
+    "plot_path": "plots/",
+    "model_path": "models/", # optional
+    "log_path": "logs/", # optional
+    "init_model": "models/test_BipedalWalker_v5.0.pkl", # optional
     "env": "BipedalWalker-v3",
     "n_sessions": 128,
     "env_steps": 1600,
@@ -120,6 +120,11 @@ noise/lr annealing - https://cs231n.github.io/neural-networks-3/#anneal, https:
 
 <!-- Observations: it easily solves environments that are easy to explore and try different things in, since the gradient estimate is then more accurate and there is more reward data. Environments with very sparse rewards are solved very poorly, because a long time may pass before a reward event happens by chance, since the search up to that point is random and there is no learning. Taxi-v3: both the genetic algorithm and the cross-entropy method perform poorly -->
 
+<!-- # if env.spec._env_name == 'MountainCarContinuous':
+# reward = reward + 10 * abs(new_obs[1])
+# TODO: add novelty search reward (https://lilianweng.github.io/lil-log/2019/09/05/evolution-strategies.html)
+# potential-based shaping method: https://habr.com/ru/company/hsespb/blog/444428/
+# reward = reward + 300 * (0.99 * abs(new_obs[1]) - abs(obs[1])) -->
 ## References
 
 [Evolution Strategies as a Scalable Alternative to Reinforcement Learning](https://arxiv.org/abs/1703.03864) (Tim Salimans, Jonathan Ho, Xi Chen, Ilya Sutskever)
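For context, the new relative paths make the README example runnable from the repository root. A minimal usage sketch built only from the keys visible in this hunk (the remaining ES hyperparameters such as population_size, learning_rate and noise_std would follow, as in tests/bipedal_walker.py; the n_jobs value here is illustrative):

from training import run_experiment

# Sketch only: keys mirror the README hunk above; "model_path", "log_path"
# and "init_model" are optional, and further ES hyperparameters would follow.
example_config = {
    "experiment_name": "test_BipedalWalker_v0",
    "plot_path": "plots/",
    "model_path": "models/",   # optional
    "log_path": "logs/",       # optional
    "env": "BipedalWalker-v3",
    "n_sessions": 128,
    "env_steps": 1600,
}

policy = run_experiment(example_config, n_jobs=4)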

evaluation.py (−6)

@@ -21,12 +21,6 @@ def eval_policy(policy, env, n_steps=200):
 
         new_obs, reward, done, _ = env.step(action)
 
-        # if env.spec._env_name == 'MountainCarContinuous':
-        # reward = reward + 10 * abs(new_obs[1])
-        # TODO: add novelty search reward (https://lilianweng.github.io/lil-log/2019/09/05/evolution-strategies.html)
-        # potential-based shaping method: https://habr.com/ru/company/hsespb/blog/444428/
-        # reward = reward + 300 * (0.99 * abs(new_obs[1]) - abs(obs[1]))
-
         total_reward = total_reward + reward
         obs = new_obs
 
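The removed comments sketched reward shaping for MountainCarContinuous, where obs[1] is the cart velocity. If that idea were reinstated, the potential-based shaping they point to could look roughly like the sketch below, with Φ(s) = scale·|velocity| and the same constants as the removed lines; the helper name and its placement are hypothetical, not part of this commit:

def shaped_reward(reward, obs, new_obs, gamma=0.99, scale=300.0):
    # Potential-based shaping: F(s, s') = gamma * phi(s') - phi(s),
    # with phi(s) = scale * |velocity|; constants mirror the removed comment.
    phi_old = scale * abs(obs[1])
    phi_new = scale * abs(new_obs[1])
    return reward + gamma * phi_new - phi_old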

models/test_BipedalWalker_v5.1.pkl (22.8 KB, binary file not shown)

models/test_BipedalWalker_v5.2.pkl (22.8 KB, binary file not shown)

models/test_BipedalWalker_v5.3.pkl (22.8 KB, binary file not shown)

plots/test_BipedalWalker_v5.1.png (58.3 KB)

plots/test_BipedalWalker_v5.2.png (72.2 KB)

plots/test_BipedalWalker_v5.3.png (63.4 KB)

tests/bipedal_walker.py (+9 −9)

@@ -8,24 +8,24 @@
 # solving the task as getting an average score of 300+ over 100 consecutive random trials.
 def test():
     test_config = {
-        "experiment_name": "test_BipedalWalker_v5.1",
+        "experiment_name": "test_BipedalWalker_v6.0",
         "plot_path": "../plots/",
         "model_path": "../models/",
-        "log_path": "../logs/"
-        "init_model": "../models/test_BipedalWalker_v5.0.pkl",
+        "log_path": "../logs/",
         "env": "BipedalWalker-v3",
-        "n_sessions": 130,
-        "env_steps": 1200,
+        "n_sessions": 512,
+        "env_steps": 1300,
         "population_size": 128,
-        "learning_rate": 0.06,
-        "noise_std": 0.09704,
-        "noise_decay": 0.995,
+        "learning_rate": 0.065,
+        "noise_std": 0.1,
+        "noise_decay": 0.995,
         "decay_step": 20,
         "eval_step": 10,
-        "hidden_sizes": (40, 40) # sizes from https://designrl.github.io/
+        "hidden_sizes": (64, 40) # sizes from https://designrl.github.io/
     }
 
     policy = run_experiment(test_config, n_jobs=4)
 
+
 if __name__ == "__main__":
     test()

training.py (+8 −3)

@@ -13,7 +13,7 @@
 from linear import ThreeLayerNetwork
 from es import OpenAiES
 from plot import plot_rewards
-from evaluation import eval_policy, eval_policy_delayed
+from evaluation import eval_policy_delayed, eval_policy
 
 # env: (n_states, n_actions)
 ENV_INFO = {
@@ -45,6 +45,10 @@ def train_loop(policy, env, config, n_jobs=1, verbose=True):
         rewards = np.array(Parallel(n_jobs=n_jobs)(rewards_jobs))
 
         es.update_population(rewards)
+
+        # population stats
+        log["pop_mean_rewards"].append(np.mean(rewards))
+        log["pop_std_rewards"].append(np.std(rewards))
 
         # best policy stats
         if session % config.get("eval_step", 2) == 0:
@@ -82,10 +86,11 @@ def run_experiment(config, n_jobs=4, verbose=True):
         out_features=n_actions,
         hidden_sizes=config["hidden_sizes"]
     )
+    # TODO: save model on KeyboardInterrupt exception
    log = train_loop(policy, env, config, n_jobs, verbose)
 
    if config.get("log_path", None):
-        with open(f"{config['log_path']}{config['experiment_name']}", "wb") as file:
+        with open(f"{config['log_path']}{config['experiment_name']}.pkl", "wb") as file:
             pickle.dump(log, file)
 
    if config.get("model_path", None):
@@ -112,5 +117,5 @@ def render_policy(model_path, env_name, n_videos=1):
 
 
 if __name__ == "__main__":
-    render_policy("models/test_BipedalWalker_v5.0.pkl", "BipedalWalker-v3")
+    render_policy("models/test_BipedalWalker_v5.3.pkl", "BipedalWalker-v3")
 
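With the new pop_mean_rewards / pop_std_rewards entries and the .pkl suffix on the log file, a saved log could be inspected roughly as below. This is a sketch only: it assumes log is a plain dict of lists, as the appends in train_loop suggest, and the file name and matplotlib plotting are illustrative, not the repo's plot_rewards.

import pickle

import matplotlib.pyplot as plt
import numpy as np

# Hypothetical path: log_path + experiment_name + ".pkl", as written by run_experiment.
with open("logs/test_BipedalWalker_v6.0.pkl", "rb") as file:
    log = pickle.load(file)

pop_mean = np.array(log["pop_mean_rewards"])
pop_std = np.array(log["pop_std_rewards"])

# Mean population reward per session with a +/- one-std band.
sessions = np.arange(len(pop_mean))
plt.plot(sessions, pop_mean, label="population mean reward")
plt.fill_between(sessions, pop_mean - pop_std, pop_mean + pop_std, alpha=0.3)
plt.xlabel("session")
plt.ylabel("reward")
plt.legend()
plt.savefig("plots/pop_stats.png")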
