-
Notifications
You must be signed in to change notification settings - Fork 1
/
self_play.py
executable file
·99 lines (80 loc) · 2.65 KB
/
self_play.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# ====================
# セルフプレイ部
# ====================
# パッケージのインポート
from game import State
from pv_mcts import pv_mcts_scores
from dual_network import DN_OUTPUT_SIZE
from datetime import datetime
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from pathlib import Path
import numpy as np
import pickle
import os
# Self-play parameters
SP_GAME_COUNT = 500 # Number of self-play games to run (the original uses 25000)
SP_TEMPERATURE = 1.0 # Temperature parameter of the Boltzmann distribution
# 先手プレイヤーの価値
def first_player_value(ended_state):
    """Return the result of a finished game from the first player's viewpoint.

    Returns 1 for a first-player win, -1 for a first-player loss and 0 for a
    draw.
    """
    # Nobody lost: the game is a draw.
    if not ended_state.is_lose():
        return 0
    # A loss was reported; is_first_player() decides whose loss it is.
    if ended_state.is_first_player():
        return -1
    return 1
# 学習データの保存
def write_data(history):
    """Pickle ``history`` into a timestamp-named file under ``./data/``.

    The file name is built from the current ``datetime.now()`` value as
    year, month, day, hour, minute and second, followed by ``.history``.
    """
    # Create the output folder when it does not exist yet.
    os.makedirs('./data/', exist_ok=True)
    stamp = datetime.now()
    fields = (stamp.year, stamp.month, stamp.day,
              stamp.hour, stamp.minute, stamp.second)
    target = './data/{:04}{:02}{:02}{:02}{:02}{:02}.history'.format(*fields)
    with open(target, mode='wb') as out:
        pickle.dump(history, out)
# 1ゲームの実行
def play(model):
    """Play one self-play game and return its training samples.

    Args:
        model: Network handed to ``pv_mcts_scores`` for every position.

    Returns:
        A list with one ``[[pieces, enemy_pieces], policies, value]`` entry
        per move played. ``policies`` has ``DN_OUTPUT_SIZE`` slots, filled
        with the search scores at the legal-action indices and 0 elsewhere.
        ``value`` is ``first_player_value`` of the final state for the first
        entry and is negated for each following entry.
    """
    history = []
    state = State()

    while not state.is_done():
        # Probability distribution over the legal actions.
        scores = pv_mcts_scores(model, state, SP_TEMPERATURE)
        # Query the legal actions once; they index both the policy vector
        # and the sampling step below.
        legal_actions = state.legal_actions()

        # Record the position and its policy; the value is filled in once
        # the game result is known.
        policies = [0] * DN_OUTPUT_SIZE
        for action, policy in zip(legal_actions, scores):
            policies[action] = policy
        history.append([[state.pieces, state.enemy_pieces], policies, None])

        # Sample the move to play from the search distribution.
        action = np.random.choice(legal_actions, p=scores)
        state = state.next(action)

    # Back-fill the outcome, flipping the sign from one entry to the next.
    value = first_player_value(state)
    for sample in history:
        sample[2] = value
        value = -value
    return history
# セルフプレイ
def self_play():
    """Generate self-play games with the best model and save them.

    Loads ``./model/best.h5``, plays ``SP_GAME_COUNT`` games with ``play``,
    stores the concatenated history through ``write_data`` and finally clears
    the Keras session. The cleanup also runs when a game or the save raises,
    so a failed run does not leave the session state behind.
    """
    history = []
    # Load the current best player's model.
    model = load_model('./model/best.h5')
    try:
        for i in range(SP_GAME_COUNT):
            history.extend(play(model))
            # Progress indicator rewritten in place on one line.
            print('\rSelfPlay {}/{}'.format(i+1, SP_GAME_COUNT), end='')
        print('')
        write_data(history)
    finally:
        # Release the model and the backend session.
        K.clear_session()
        del model
# Run self-play when this file is executed as a script
if __name__ == '__main__':
    self_play()