-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTrainer.fs
247 lines (211 loc) · 8.76 KB
/
Trainer.fs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
namespace DeepKuhnPoker
open MathNet.Numerics.LinearAlgebra
open TorchSharp
module Trainer =
/// Derives the active player's current strategy for the given
/// info set from the given advantage model's predicted advantages.
let getStrategy infoSetKey model =
    use _ = torch.no_grad()   // inference only — no gradients needed (model.eval() may be an alternative; confirm)
    let advantages =
        let tensor = AdvantageModel.getAdvantage infoSetKey model
        tensor.data<float32>() |> DenseVector.ofSeq
    InformationSet.getStrategy advantages
/// Converts the opponent's utilities into the active player's
/// utilities by negating each one (valid because the game is
/// assumed zero-sum).
let private getActiveUtilities utilities =
    utilities
        |> Seq.map (fun utility -> -utility)
        |> DenseVector.ofSeq
/// Evaluates the utility of the given deal by recursively walking
/// the game tree. Returns the sample array gathered along the way:
/// advantage samples (Choice1Of2) at the updating player's decision
/// points, strategy samples (Choice2Of2) at the opponent's.
let private traverse iter deal updatingPlayer (models : _[]) =

    /// Appends an item to the end of an array.
    let append items item =
        [| yield! items; yield item |]

    /// Top-level loop.
    let rec loop history =
        match KuhnPoker.getPayoff deal history with
            | Some payoff ->
                float32 payoff, Array.empty   // game is over
            | None ->
                loopNonTerminal history

    /// Recurses for non-terminal game state.
    and loopNonTerminal history =

            // get info set for current state from this player's point of view
        let activePlayer = KuhnPoker.getActivePlayer history
        let infoSetKey = deal[activePlayer] + history

            // get active player's current strategy for this info set
        let strategy =
            getStrategy infoSetKey models[activePlayer]

            // get utility of this info set
        if activePlayer = updatingPlayer then

                // get utility of each action by exploring every branch
            let actionUtilities, samples =
                let utilities, sampleArrays =
                    KuhnPoker.actions
                        |> Array.map (fun action ->
                            loop (history + action))
                        |> Array.unzip
                getActiveUtilities utilities,
                Array.concat sampleArrays

                // utility of this info set is action utilities weighted by action probabilities
            let utility = actionUtilities * strategy

                // record per-action regrets (action utility minus expected
                // utility) as an advantage sample for this iteration
            let sample =
                AdvantageSample.create
                    infoSetKey
                    (actionUtilities - utility)
                    iter |> Choice1Of2
            utility, append samples sample

        else
                // sample a single action according to the strategy
                // (external sampling: only one branch is explored)
            let utility, samples =
                let action =
                    strategy
                        |> Vector.sample settings.Random
                        |> Array.get KuhnPoker.actions
                loop (history + action)

                // record the opponent's strategy at this info set
            let sample =
                StrategySample.create
                    infoSetKey
                    strategy
                    iter |> Choice2Of2

                // negate: flips perspective back to the other player
                // (zero-sum game)
            -utility, append samples sample

        // evaluate from the empty history; only the samples are kept
    loop "" |> snd
/// Per-player state carried through advantage training.
type private AdvantageState =
    { /// Advantage network being trained for this player.
      Model : AdvantageModel

      /// Reservoir of advantage samples collected for this player.
      Reservoir : Reservoir<AdvantageSample> }
module private AdvantageState =

    /// Builds a fresh advantage model from the configured
    /// hyperparameters.
    let private createModel () =
        AdvantageModel.create
            settings.HiddenSize
            settings.LearningRate

    /// Initializes a player's advantage state: a new model and
    /// an empty sample reservoir.
    let create () =
        {
            Model = createModel ()
            Reservoir =
                Reservoir.create
                    settings.Random
                    settings.NumAdvantageSamples
        }

    /// Swaps in a freshly initialized model, keeping the
    /// reservoir intact.
    let resetModel state =
        { state with Model = createModel () }
/// Generates training data for the given player by running the
/// configured number of game-tree traversals on randomly chosen
/// deals. Returns the advantage samples and strategy samples
/// gathered across all traversals.
let private generateSamples iter updatingPlayer stateMap =

        // the model array is loop-invariant (stateMap does not change
        // inside the loop), so build it once instead of per traversal;
        // Map.values yields values in key order, i.e. indexed by player
    let models =
        stateMap
            |> Map.values
            |> Seq.map (fun state -> state.Model)
            |> Seq.toArray

    Choice.unzip [|
        for _ = 1 to settings.NumTraversals do

                // pick a random deal for this traversal
            let deal =
                let iDeal =
                    settings.Random.Next(
                        KuhnPoker.allDeals.Length)
                KuhnPoker.allDeals[iDeal]

            yield! traverse
                iter deal updatingPlayer models
    |]
/// Folds the new samples into the player's reservoir, then trains
/// the player's advantage model on the reservoir's contents.
/// Returns the updated reservoir and the per-step training losses.
let private trainAdvantageModel state newSamples =
    let reservoir =
        state.Reservoir
            |> Reservoir.addMany newSamples
    let losses =
        state.Model
            |> AdvantageModel.train
                settings.NumAdvantageTrainSteps
                reservoir.Items
    reservoir, losses
/// Trains a single iteration: retrains each player's advantage
/// model from scratch on freshly generated samples, logs progress,
/// and returns the updated state map plus the strategy samples
/// collected for later strategy-model training.
let private trainIteration iter stateMap =

        // train each player's model
    let stratSampleSeqs, stateMap =
        (stateMap, seq { 0 .. KuhnPoker.numPlayers - 1 })
            ||> Seq.mapFold (fun stateMap updatingPlayer ->

                    // generate training data for this player
                let advSamples, stratSamples =
                    generateSamples iter updatingPlayer stateMap

                    // train this player's model (reset first: the model is
                    // refit from scratch on the accumulated reservoir)
                let state =
                    stateMap[updatingPlayer]
                        |> AdvantageState.resetModel
                let resv, losses =
                    trainAdvantageModel state advSamples
                let stateMap =
                    let state = { state with Reservoir = resv }
                    Map.add updatingPlayer state stateMap

                    // log inputs and losses
                settings.Writer.add_scalar(
                    $"advantage reservoir/player{updatingPlayer}",
                    float32 resv.Items.Count,
                    iter)
                for step = 0 to losses.Length - 1 do
                    settings.Writer.add_scalar(
                        $"advantage loss/iter%04d{iter}/player{updatingPlayer}",
                        losses[step], step)

                stratSamples, stateMap)

        // log betting behavior for a few representative info sets
    for infoSetKey in [ "J"; "K"; "Jc"; "Qb"; "Qcb" ] do
        let betProb =
            let model =
                    // history length determines whose turn it is,
                    // hence which player's model to query
                let player = (infoSetKey.Length - 1) % 2
                stateMap[player].Model
                // element 0 is the bet probability — presumably
                // "Bet" is the first entry of KuhnPoker.actions; confirm
            (getStrategy infoSetKey model)[0]
        settings.Writer.add_scalar(
            $"advantage bet probability/{infoSetKey}",
            betProb,
            iter)

    stateMap, Seq.concat stratSampleSeqs
/// Fits a fresh strategy model to the samples held in the given
/// reservoir, logging the loss at every training step, and returns
/// the trained model.
let private trainStrategyModel (resv : Reservoir<_>) =
    let model =
        StrategyModel.create
            settings.HiddenSize
            settings.LearningRate
    StrategyModel.train
        settings.NumStrategyTrainSteps
        resv.Items
        model
        |> Seq.iteri (fun step loss ->
            settings.Writer.add_scalar(
                "strategy loss", loss, step))
    model
/// Trains for the given number of iterations and returns the
/// final strategy model.
let train () =

        // one advantage state per player
    let advStateMap =
        Map [
            for player in 0 .. KuhnPoker.numPlayers - 1 ->
                player, AdvantageState.create ()
        ]

        // runs one iteration and threads the advantage states and
        // strategy-sample reservoir through to the next
    let folder (advStateMap, stratResv) iter =
        let advStateMap, stratSamples =
            trainIteration iter advStateMap
        let stratResv =
            Reservoir.addMany stratSamples stratResv
        settings.Writer.add_scalar(
            $"strategy reservoir",
            float32 stratResv.Items.Count,
            iter)
        advStateMap, stratResv

        // fold all iterations, starting from an empty reservoir
    let _, stratResv =
        let emptyResv =
            Reservoir.create
                settings.Random
                settings.NumStrategySamples
        Seq.fold folder
            (advStateMap, emptyResv)
            (seq { 0 .. settings.NumIterations - 1 })

        // train the final strategy model
    trainStrategyModel stratResv