06.02.2020

Protocol

Today we transferred the complete algorithm one-to-one to our Connect Four (Vier gewinnt) game, so the function that checks whether someone has won now works here as well. In doing so, however, we ran into a simple problem that we should have thought about beforehand: the state space of Connect Four is much larger than that of Tic Tac Toe. Since each of the 42 cells can be empty, blue, or red, there are over 10^20 possible states (if one does not distinguish between legal and unreachable positions). We can therefore no longer simply use a table, as before, that lists every possible state together with its value. In other words, we have written a program that is theoretically complete but practically infeasible; a rough calculation of the table size follows after the listing below. This means we have to come up with a different way to let our AI play Connect Four against us. At the moment our training program for the AI looks like this:

# Training AI vs AI
import numpy as np
import itertools
import random
import os
 
game_state = np.zeros((6,7), dtype=int)   # Connect Four board filled with zeros
players = ['Blue','Red']                  # Blue starts / Blue is 1 and Red is -1
 
def play_move(state, player, block_num):
    value = 1 if player == 'Blue' else -1   # numeric mark stored on the board
    row = (block_num-1)//7                  # block_num runs from 1 to 42, row by row
    col = (block_num-1)%7
    if(state[row][col] == 0):
        state[row][col] = value
    else:
        print("Error")                      # should never happen during training
 
def copy_game_state(state):
    new_state = np.zeros((6,7), dtype=int)   # Connect Four board filled with zeros
    for i in range(6):
        for j in range(7):
            new_state[i][j] = state[i][j]
    return new_state
 
def check_array(a):
    # slide a window of four over the array; a sum of 4 (or -4) means four in a row for Blue (or Red)
    for i in range(len(a)-3):
        b=sum(a[i:i+4])
        if (b== 4 or b==-4): return b
 
def check_current_state(game_state):   # check whether someone has won or the game ended in a draw
    game_state = np.asarray(game_state)   # also accept plain lists of lists (as stored in states_dict)
    # Columns
    for i in range(7):
        a=check_array(game_state[:6, i:i+1].reshape(6,))
        if(a==4):return "Win"
        if(a==-4):return "Lose"
 
    # Diagonals: offsets -2..3 are the only diagonals long enough to hold four in a row
    for i in range(3): 
        a=check_array(np.diagonal(game_state, i+1))
        if(a==4):return "Win"
        if(a==-4):return "Lose"
        a=check_array(np.diagonal(game_state, -i)) 
        if(a==4):return "Win"
        if(a==-4):return "Lose"
        a=check_array(np.fliplr(game_state).diagonal(i+1))
        if(a==4):return "Win"
        if(a==-4):return "Lose"
        a=check_array(np.fliplr(game_state).diagonal(-i)) 
        if(a==4):return "Win"
        if(a==-4):return "Lose"
 
    # Rows:
    for i in range(6):
        a=check_array(game_state[i:i+1].reshape(7,)) 
        if(a==4):return "Win"
        if(a==-4):return "Lose"
    # Draw check: if any column still has an empty cell in the top row, the game is not over yet
    for i in range(7):  
        if(game_state[0,i] == 0):
           return None
    return "Draw"
 
 
 
# Initialize state values
player = [1,-1,0]
states_dict = {}
all_possible_states = [[list(i[0:7]),list(i[7:14]),list(i[14:21]),list(i[21:28]),list(i[28:35]),list(i[35:42])] for i in itertools.product(player, repeat = 42)]
n_states = len(all_possible_states) # 3**42 combinations (three possible cell values, 42 cells)
n_actions = 42   # 42 spaces
state_values_for_AI_Blue = np.full((n_states),0.0)
state_values_for_AI_Red = np.full((n_states),0.0)
#print("n_states = %i \nn_actions = %i"%(n_states, n_actions))
 
# State values for AI 'Red' (does not move first)
for i in range(n_states):
    states_dict[i] = all_possible_states[i]
    winner = check_current_state(states_dict[i])
    if winner == 'Lose':   # AI won
        state_values_for_AI_Red[i] = 1
    elif winner == 'Win':   # AI lost
        state_values_for_AI_Red[i] = -1
 
# State values for AI 'Blue' (moves first)
for i in range(n_states):
    winner = check_current_state(states_dict[i])
    if winner == 'Lose':   # AI lost
        state_values_for_AI_Blue[i] = -1
    elif winner == 'Win':   # AI won
        state_values_for_AI_Blue[i] = 1
 
# Temporal-difference update: move the current state's value a step toward the next state's value
def update_state_value_Red(curr_state_idx, next_state_idx, learning_rate):
    new_value = state_values_for_AI_Red[curr_state_idx] + learning_rate*(state_values_for_AI_Red[next_state_idx]  - state_values_for_AI_Red[curr_state_idx])
    state_values_for_AI_Red[curr_state_idx] = new_value
 
def update_state_value_Blue(curr_state_idx, next_state_idx, learning_rate):
    new_value = state_values_for_AI_Blue[curr_state_idx] + learning_rate*(state_values_for_AI_Blue[next_state_idx]  - state_values_for_AI_Blue[curr_state_idx])
    state_values_for_AI_Blue[curr_state_idx] = new_value
 
def getBestMove(state, player, epsilon):
    '''
    Reinforcement Learning Algorithm
    '''    
    moves = []
    curr_state_values = []
    empty_cells = []
    for i in range(7):
        if(state[0,i] == 0):                  # column i is not full yet
            a = 5
            while(state[a,i] != 0):           # find the lowest empty row (the piece falls down)
                a = a-1
            empty_cells.append(a*7 + (i+1))   # encode the move as a block number from 1 to 42
 
    for empty_cell in empty_cells:
        moves.append(empty_cell)
        new_state = copy_game_state(state)
        play_move(new_state, player, empty_cell)
        next_state_idx = list(states_dict.keys())[list(states_dict.values()).index(new_state.tolist())]
        if player == 'Blue':
            curr_state_values.append(state_values_for_AI_Blue[next_state_idx])
        else:
            curr_state_values.append(state_values_for_AI_Red[next_state_idx])
 
    best_move_idx = np.argmax(curr_state_values)
 
    if np.random.uniform(0,1) <= epsilon:       # Exploration
        best_move = random.choice(empty_cells)
        epsilon *= 0.99                         # note: this only decays the local copy of epsilon
    else:   #Exploitation
        best_move = moves[best_move_idx]
 
    return best_move
 
# Training / self-play loop

# Load previously trained state values if they already exist (continue training from earlier runs)
if os.path.isfile('trained_state_values_Blue.txt'):
    state_values_for_AI_Blue = np.loadtxt('trained_state_values_Blue.txt', dtype=np.float64)
if os.path.isfile('trained_state_values_Red.txt'):
    state_values_for_AI_Red = np.loadtxt('trained_state_values_Red.txt', dtype=np.float64)
 
learning_rate = 0.2
epsilon = 0.2
num_iterations = 1
for iteration in range(num_iterations):
    game_state = np.zeros((6,7), dtype=int)   # Connect Four board filled with zeros
    winner = None
    current_player_idx = random.choice([0,1])
 
    while winner is None:
        curr_state_idx = list(states_dict.keys())[list(states_dict.values()).index(game_state.tolist())]
        if current_player_idx == 0:     # AI_Blue's turn
            block_choice = getBestMove(game_state, players[current_player_idx], epsilon)
            play_move(game_state ,players[current_player_idx], block_choice)
            new_state_idx = list(states_dict.keys())[list(states_dict.values()).index(game_state.tolist())]

        else:       # AI_Red's turn
            block_choice = getBestMove(game_state, players[current_player_idx], epsilon)
            play_move(game_state ,players[current_player_idx], block_choice)
            new_state_idx = list(states_dict.keys())[list(states_dict.values()).index(game_state.tolist())]
 
        update_state_value_Red(curr_state_idx, new_state_idx, learning_rate)
        update_state_value_Blue(curr_state_idx, new_state_idx, learning_rate)
        winner = check_current_state(game_state)
        if(winner == None):
            current_player_idx = (current_player_idx + 1)%2
 
 
 
# Save state values for future use
np.savetxt('trained_state_values_Blue.txt', state_values_for_AI_Blue, fmt = '%.6f')
np.savetxt('trained_state_values_Red.txt', state_values_for_AI_Red, fmt = '%.6f')
 
print('Training Complete!')
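To put the problem described above into numbers, here is a small back-of-the-envelope sketch. It is not part of the training program; it only assumes, as in the code above, one float64 value (8 bytes) per state.

# Rough size estimate for a full Connect Four state-value table (illustrative sketch only)
n_cells = 6 * 7                     # 42 cells on the board
n_states = 3 ** n_cells             # each cell is empty (0), Blue (1) or Red (-1)
bytes_per_state = 8                 # one np.float64 value per state, as in the arrays above
table_size_bytes = n_states * bytes_per_state

print(f"states: {n_states:.3e}")                               # roughly 1.09e+20
print(f"table size: {table_size_bytes / 1e18:.0f} exabytes")   # roughly 875 exabytes

Even without separating legal from unreachable positions, the value table alone would need hundreds of exabytes, which is why the program above cannot be run as it stands.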

Previous meeting: 23.01.2020
Next meeting: Blocktermin (block session)
