q_learning_agent.py
import random

from actions import Actions
from agent import Agent


class QLearningAgent(Agent):
    def __init__(self, alpha, discount, epsilon):
        self.alpha = alpha          # learning rate
        self.discount = discount    # discount factor
        self.epsilon = epsilon      # exploration rate
        self.q_values = {}          # (features, action) -> learned Q-value
        self.last_action = None
        self.last_features = None
        self.states_seen = {}       # (features, action) -> visit count
        self.no_policy_moves_made = 0
    def getQValue(self, features, action):
        return self.q_values.get((features, action), 0.0)

    def getValue(self, features, hand):
        # V(s) = max_a Q(s, a); a state with no legal actions is worth 0.
        actions = hand.getPossibleActions()
        if not actions:
            return 0.0
        return max(self.getQValue(features, action) for action in actions)
    def getPolicy(self, features, hand):
        # Hard-coded domain knowledge ("cheating"): with a hard count of 8 or
        # less, one more card can never bust the hand, so always hit.
        if features[0] <= 8:
            return Actions.HIT
        actions = hand.getPossibleActions()
        if not actions:
            return None
        # Pick the highest-valued action, breaking ties at random.
        no_policy = True
        max_value = float("-inf")
        best_actions = []
        for action in actions:
            if (features, action) in self.q_values:
                no_policy = False
            value = self.getQValue(features, action)
            if value > max_value:
                max_value = value
                best_actions = [action]
            elif value == max_value:
                best_actions.append(action)
        if no_policy:
            # No Q-value has been learned yet for any legal action here.
            self.no_policy_moves_made += 1
        return random.choice(best_actions)
    def stateToFeatures(self, gameState, playerHand):
        features = (
            playerHand.getHardCount(),
            playerHand.getHasAce(),
            # Other players' hands are deliberately ignored to keep the state
            # space small; re-enable the entry below to include them:
            # tuple((hand.getHardCount(), hand.getHasAce())
            #       for hand in gameState.getPlayerHands().keys()),
            gameState.getDealerUpCard().getSoftCount())
        return features
    def update(self, features, hand, reward):
        # Q-learning update for the previous (state, action) pair:
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + discount * V(s'))
        key = (self.last_features, self.last_action)
        self.states_seen[key] = self.states_seen.get(key, 0) + 1
        value = self.getValue(features, hand)
        q_value = self.q_values.get(key, 0.0)
        self.q_values[key] = \
            (1.0 - self.alpha) * q_value + self.alpha * (reward + self.discount * value)
    def getNextAction(self, gameState, hand):
        features = self.stateToFeatures(gameState, hand)
        # Intermediate moves carry no reward; just propagate the value estimate.
        if self.last_features is not None and self.last_action is not None:
            self.update(features, hand, 0.0)
        actions = hand.getPossibleActions()
        if not actions:
            action = None
        elif random.random() < self.epsilon:
            # Explore with probability epsilon.
            action = random.choice(actions)
        else:
            # Otherwise exploit the greedy policy.
            action = self.getPolicy(features, hand)
        self.last_action = action
        self.last_features = features
        return action
    def gameOver(self, gameState, hand, reward):
        features = self.stateToFeatures(gameState, hand)
        self.update(features, hand, reward)
        self.last_action = None
        self.last_features = None

    def lose(self, gameState, hand):
        self.gameOver(gameState, hand, -hand.getBet())

    def win(self, gameState, hand):
        self.gameOver(gameState, hand, hand.getBet())

    def tie(self, gameState, hand):
        self.gameOver(gameState, hand, 0)

    def __str__(self):
        return "Q learning agent"

    def needsTraining(self):
        return True

    def trainingOver(self):
        # Freeze the policy: stop exploring and stop updating Q-values.
        self.epsilon = 0.0
        self.alpha = 0.0
        self.no_policy_moves_made = 0
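
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file). The stub classes
# below are hypothetical stand-ins for the real game engine: they implement
# only the methods QLearningAgent actually calls (getPossibleActions,
# getHardCount, getHasAce, getBet, getPlayerHands, getDealerUpCard,
# getSoftCount), just enough to run one toy episode and check the
# Q-learning update arithmetic.
if __name__ == "__main__":
    class StubCard:
        def getSoftCount(self):
            return 10  # dealer shows a ten-value card

    class StubHand:
        def __init__(self, hard_count, bet=1):
            self.hard_count = hard_count
            self.bet = bet

        def getPossibleActions(self):
            return [Actions.HIT]  # the real Actions enum offers more moves

        def getHardCount(self):
            return self.hard_count

        def getHasAce(self):
            return False

        def getBet(self):
            return self.bet

    class StubGameState:
        def getPlayerHands(self):
            return {}

        def getDealerUpCard(self):
            return StubCard()

    agent = QLearningAgent(alpha=0.1, discount=1.0, epsilon=0.1)
    state = StubGameState()
    hand = StubHand(12)

    # One toy episode: act once from hard 12, then lose the bet.
    agent.getNextAction(state, hand)
    agent.lose(state, hand)

    # Expected: Q((12, False, 10), HIT)
    #   = (1 - 0.1) * 0.0 + 0.1 * (-1 + 1.0 * 0.0) = -0.1
    print(agent.q_values)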