import time
import random
from typing import Any

from catanatron.game import Game
from catanatron.models.player import Player
from catanatron_experimental.machine_learning.players.tree_search_utils import (
    expand_spectrum,
    list_prunned_actions,
)
from catanatron_experimental.machine_learning.players.value import (
    DEFAULT_WEIGHTS,
    get_value_fn,
)

ALPHABETA_DEFAULT_DEPTH = 2
MAX_SEARCH_TIME_SECS = 20


class AlphaBetaPlayer(Player):
    """
    Player that runs an AlphaBeta search in which the value of each node
    is taken to be the expected value (weighted by the probability of dice
    rolls, etc.) of its children. At leaf nodes we simply apply the given
    heuristic value function. A standalone sketch of this expectimax-style
    backup appears after this class.

    NOTE: Depths greater than 3 seem to take much longer; it would be
    interesting to measure this with pruning enabled.
    """

    def __init__(
        self,
        color,
        depth=ALPHABETA_DEFAULT_DEPTH,
        prunning=False,
        value_fn_builder_name=None,
        params=DEFAULT_WEIGHTS,
        epsilon=None,
    ):
        super().__init__(color)
        self.depth = int(depth)
        # Accept booleans or the strings "True"/"False" (e.g. from CLI args).
        self.prunning = str(prunning).lower() != "false"
        self.value_fn_builder_name = (
            "contender_fn" if value_fn_builder_name == "C" else "base_fn"
        )
        self.params = params
        self.use_value_function = None
        self.epsilon = epsilon

    def value_function(self, game, p0_color):
        raise NotImplementedError

    def get_actions(self, game):
        if self.prunning:
            return list_prunned_actions(game)
        return game.state.playable_actions

    def decide(self, game: Game, playable_actions):
        actions = self.get_actions(game)
        if len(actions) == 1:
            return actions[0]

        if self.epsilon is not None and random.random() < self.epsilon:
            return random.choice(playable_actions)

        start = time.time()
        state_id = str(len(game.state.actions))
        node = DebugStateNode(state_id, self.color)  # root of the debug search tree
        deadline = start + MAX_SEARCH_TIME_SECS
        result = self.alphabeta(
            game.copy(), self.depth, float("-inf"), float("inf"), deadline, node
        )
        # print("Decision Results:", self.depth, len(actions), time.time() - start)
        # if game.state.num_turns > 10:
        #     render_debug_tree(node)
        #     breakpoint()
        if result[0] is None:
            return playable_actions[0]
        return result[0]

    def __repr__(self) -> str:
        return (
            super().__repr__()
            + f"(depth={self.depth},value_fn={self.value_fn_builder_name},prunning={self.prunning})"
        )

    def alphabeta(self, game, depth, alpha, beta, deadline, node):
        """AlphaBeta MiniMax Algorithm.

        Returns an (action, value) pair; action is None at leaf nodes.
        Levels alternate between state=>action and action=>state
        expansions, so interior values are the probability-weighted
        (expected) values of each action's possible outcome states.
        """
        if depth == 0 or game.winning_color() is not None or time.time() >= deadline:
            value_fn = get_value_fn(
                self.value_fn_builder_name,
                self.params,
                self.value_function if self.use_value_function else None,
            )
            value = value_fn(game, self.color)
            node.expected_value = value
            return None, value

        maximizing_player = game.state.current_color() == self.color
        actions = self.get_actions(game)  # list of actions
        action_outcomes = expand_spectrum(game, actions)  # action => (game, proba)[]

        if maximizing_player:
            best_action = None
            best_value = float("-inf")
            for i, (action, outcomes) in enumerate(action_outcomes.items()):
                action_node = DebugActionNode(action)

                expected_value = 0
                for j, (outcome, proba) in enumerate(outcomes):
                    out_node = DebugStateNode(
                        f"{node.label} {i} {j}", outcome.state.current_color()
                    )

                    result = self.alphabeta(
                        outcome, depth - 1, alpha, beta, deadline, out_node
                    )
                    value = result[1]
                    expected_value += proba * value

                    action_node.children.append(out_node)
                    action_node.probas.append(proba)

                action_node.expected_value = expected_value
                node.children.append(action_node)

                if expected_value > best_value:
                    best_action = action
                    best_value = expected_value
                alpha = max(alpha, best_value)
                if alpha >= beta:
                    break  # beta cutoff

            node.expected_value = best_value
            return best_action, best_value
        else:
            best_action = None
            best_value = float("inf")
            for i, (action, outcomes) in enumerate(action_outcomes.items()):
                action_node = DebugActionNode(action)

                expected_value = 0
                for j, (outcome, proba) in enumerate(outcomes):
                    out_node = DebugStateNode(
                        f"{node.label} {i} {j}", outcome.state.current_color()
                    )

                    result = self.alphabeta(
                        outcome, depth - 1, alpha, beta, deadline, out_node
                    )
                    value = result[1]
                    expected_value += proba * value

                    action_node.children.append(out_node)
                    action_node.probas.append(proba)

                action_node.expected_value = expected_value
                node.children.append(action_node)

                if expected_value < best_value:
                    best_action = action
                    best_value = expected_value
                beta = min(beta, best_value)
                if beta <= alpha:
                    break  # alpha cutoff

            node.expected_value = best_value
            return best_action, best_value
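

# A minimal, self-contained sketch of the expectimax-style backup used by
# AlphaBetaPlayer.alphabeta above: an action's value is the probability-weighted
# average of its outcome states' values, and alpha/beta prune the max/min
# levels. The toy tree structure and this helper are illustrative only; they
# are not part of catanatron's API. Note the real search decides max/min from
# game.state.current_color(), while this sketch simply alternates each ply.
def _expectimax_alphabeta_sketch(
    tree, maximizing, alpha=float("-inf"), beta=float("inf")
):
    """Illustrative only. `tree` is either a leaf value (a number) or a list
    of actions, where each action is a list of (proba, subtree) outcomes."""
    if isinstance(tree, (int, float)):
        return tree  # leaf: heuristic value

    best = float("-inf") if maximizing else float("inf")
    for outcomes in tree:  # one entry per action
        expected = sum(
            proba * _expectimax_alphabeta_sketch(sub, not maximizing, alpha, beta)
            for proba, sub in outcomes
        )
        if maximizing:
            best = max(best, expected)
            alpha = max(alpha, best)
        else:
            best = min(best, expected)
            beta = min(beta, best)
        if alpha >= beta:
            break  # cutoff, as in alphabeta above
    return best


# For example, a single action with two equally likely outcomes worth 1.0 and
# 3.0 backs up to 0.5 * 1.0 + 0.5 * 3.0 == 2.0:
# _expectimax_alphabeta_sketch([[(0.5, 1.0), (0.5, 3.0)]], maximizing=True)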


class DebugStateNode:
    def __init__(self, label, color):
        self.label = label
        self.children = []  # DebugActionNode[]
        self.expected_value = None
        self.color = color


class DebugActionNode:
    def __init__(self, action):
        self.action = action
        self.expected_value: Any = None
        self.children = []  # DebugStateNode[]
        self.probas = []


def render_debug_tree(node):
    from graphviz import Digraph

    dot = Digraph("AlphaBetaSearch")

    agenda = [node]
    while agenda:
        tmp = agenda.pop()
        dot.node(
            tmp.label,
            label=f"<{tmp.label}<br /><font point-size='10'>{tmp.expected_value}</font>>",
            style="filled",
            fillcolor=tmp.color.value,
        )
        for child in tmp.children:
            action_label = (
                f"{tmp.label} - {str(child.action).replace('<', '').replace('>', '')}"
            )
            dot.node(
                action_label,
                label=f"<{action_label}<br /><font point-size='10'>{child.expected_value}</font>>",
                shape="box",
            )
            dot.edge(tmp.label, action_label)
            for action_child, proba in zip(child.children, child.probas):
                dot.node(
                    action_child.label,
                    label=f"<{action_child.label}<br /><font point-size='10'>{action_child.expected_value}</font>>",
                )
                dot.edge(action_label, action_child.label, label=str(proba))
                agenda.append(action_child)

    print(dot.render())


class SameTurnAlphaBetaPlayer(AlphaBetaPlayer):
    """
    Same as AlphaBeta, but limits the search to the current turn: the search
    bottoms out as soon as play passes to another color. A minimal usage
    sketch follows at the end of this module.
    """

    def alphabeta(self, game, depth, alpha, beta, deadline, node):
        """AlphaBeta MiniMax Algorithm, truncated at the end of our turn.

        Returns an (action, value) pair; action is None at leaf nodes.
        Since opponent levels are never searched, every interior level
        is a maximizing one.
        """
        if (
            depth == 0
            or game.state.current_color() != self.color
            or game.winning_color() is not None
            or time.time() >= deadline
        ):
            value_fn = get_value_fn(
                self.value_fn_builder_name,
                self.params,
                self.value_function if self.use_value_function else None,
            )
            value = value_fn(game, self.color)
            node.expected_value = value
            return None, value

        actions = self.get_actions(game)  # list of actions
        action_outcomes = expand_spectrum(game, actions)  # action => (game, proba)[]

        best_action = None
        best_value = float("-inf")
        for i, (action, outcomes) in enumerate(action_outcomes.items()):
            action_node = DebugActionNode(action)

            expected_value = 0
            for j, (outcome, proba) in enumerate(outcomes):
                out_node = DebugStateNode(
                    f"{node.label} {i} {j}", outcome.state.current_color()
                )

                result = self.alphabeta(
                    outcome, depth - 1, alpha, beta, deadline, out_node
                )
                value = result[1]
                expected_value += proba * value

                action_node.children.append(out_node)
                action_node.probas.append(proba)

            action_node.expected_value = expected_value
            node.children.append(action_node)

            if expected_value > best_value:
                best_action = action
                best_value = expected_value
            alpha = max(alpha, best_value)
            if alpha >= beta:
                break  # beta cutoff

        node.expected_value = best_value
        return best_action, best_value
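

# A minimal usage sketch (illustrative, not part of this module's code). It
# assumes the top-level Color/RandomPlayer exports shown in catanatron's
# README; adjust the imports if your installed version exposes them elsewhere.
if __name__ == "__main__":
    from catanatron import Color, RandomPlayer

    players = [
        AlphaBetaPlayer(Color.RED, depth=2, prunning=True),
        RandomPlayer(Color.BLUE),
    ]
    game = Game(players)
    print(game.play())  # plays to completion and returns the winning color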