===Protocol===

Today we ported the complete algorithm one-to-one to our Connect Four game, so the function that checks whether someone has won now works for Connect Four as well. In doing so, however, we ran into a simple problem that we should have thought about beforehand: the state space of Connect Four is much larger than that of Tic Tac Toe. There are more than 10^20 possible states (if one does not distinguish between legal and unreachable positions), so we can no longer simply use a table that lists every possible state together with its value, as we did before. In other words, we have written a program that is theoretically finished but practically infeasible, and we now have to come up with a different way of letting our AI play Connect Four against us.
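To make the size of the problem concrete, here is a quick back-of-the-envelope check (a minimal sketch, not part of our program; the 3^42 count corresponds to the ''itertools.product(player, repeat = 42)'' enumeration in the code below and ignores gravity and reachability):

<code python>
# Rough size of the naive Connect Four state table:
# every one of the 42 cells is either empty (0), Blue (1) or Red (-1).
n_states = 3 ** 42
print(f"raw board fillings: {n_states:.3e}")      # about 1.094e+20

# Even at a single byte per stored state value, the table would need
# on the order of 10^8 terabytes of memory:
print(f"approx. table size: {n_states / 1e12:,.0f} TB")
</code>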
At the moment, our training program for the AI looks like this:

<code python>
# Training: AI vs AI
import numpy as np
import itertools
import random

game_state = np.zeros((6, 7), dtype=int)  # Connect Four board filled with zeros
players = ['Blue', 'Red']                 # Blue starts / Blue is 1 and Red is -1


def play_move(state, player, block_num):
    # Cells are numbered 1..42 row by row: block_num = row*7 + column + 1
    row = (block_num - 1) // 7
    col = (block_num - 1) % 7
    value = 1 if player == 'Blue' else -1
    if state[row][col] == 0:
        state[row][col] = value
    else:
        print("Error")  # should never happen during training


def copy_game_state(state):
    new_state = np.zeros((6, 7), dtype=int)
    for i in range(6):
        for j in range(7):
            new_state[i][j] = state[i][j]
    return new_state


def check_array(a):
    # Returns 4 or -4 if four consecutive entries belong to the same player
    for i in range(len(a) - 3):
        b = sum(a[i:i+4])
        if b == 4 or b == -4:
            return b


def check_current_state(game_state):
    # Check whether someone has won or the game ended in a draw
    # Columns
    for i in range(7):
        a = check_array(game_state[:6, i:i+1].reshape(6,))
        if a == 4: return "Win"
        if a == -4: return "Lose"
    # Diagonals (offsets -2..3 are the only ones long enough for four in a row)
    for i in range(3):
        a = check_array(np.diagonal(game_state, i+1))
        if a == 4: return "Win"
        if a == -4: return "Lose"
        a = check_array(np.diagonal(game_state, -i))
        if a == 4: return "Win"
        if a == -4: return "Lose"
        a = check_array(np.fliplr(game_state).diagonal(i+1))
        if a == 4: return "Win"
        if a == -4: return "Lose"
        a = check_array(np.fliplr(game_state).diagonal(-i))
        if a == 4: return "Win"
        if a == -4: return "Lose"
    # Rows
    for i in range(6):
        a = check_array(game_state[i:i+1].reshape(7,))
        if a == 4: return "Win"
        if a == -4: return "Lose"
    # Check if draw (top row completely filled)
    for i in range(7):
        if game_state[0, i] == 0:
            return None
    return "Draw"


# Initialize state values
player = [1, -1, 0]
states_dict = {}
# 3^42 combinations - this enumeration is exactly what makes the program
# practically infeasible
all_possible_states = [[list(i[0:7]), list(i[7:14]), list(i[14:21]),
                        list(i[21:28]), list(i[28:35]), list(i[35:42])]
                       for i in itertools.product(player, repeat=42)]
n_states = len(all_possible_states)  # 2 players, 42 cells
n_actions = 42                       # 42 cells
state_values_for_AI_Blue = np.full((n_states), 0.0)
state_values_for_AI_Red = np.full((n_states), 0.0)
#print("n_states = %i \nn_actions = %i"%(n_states, n_actions))

# State values for AI 'Red' (does not start)
for i in range(n_states):
    states_dict[i] = all_possible_states[i]
    winner = check_current_state(np.array(states_dict[i]))
    if winner == 'Lose':    # AI won
        state_values_for_AI_Red[i] = 1
    elif winner == 'Win':   # AI lost
        state_values_for_AI_Red[i] = -1

# State values for AI 'Blue' (starts)
for i in range(n_states):
    winner = check_current_state(np.array(states_dict[i]))
    if winner == 'Lose':    # AI lost
        state_values_for_AI_Blue[i] = -1
    elif winner == 'Win':   # AI won
        state_values_for_AI_Blue[i] = 1


def update_state_value_Red(curr_state_idx, next_state_idx, learning_rate):
    new_value = state_values_for_AI_Red[curr_state_idx] + learning_rate * (
        state_values_for_AI_Red[next_state_idx] - state_values_for_AI_Red[curr_state_idx])
    state_values_for_AI_Red[curr_state_idx] = new_value


def update_state_value_Blue(curr_state_idx, next_state_idx, learning_rate):
    new_value = state_values_for_AI_Blue[curr_state_idx] + learning_rate * (
        state_values_for_AI_Blue[next_state_idx] - state_values_for_AI_Blue[curr_state_idx])
    state_values_for_AI_Blue[curr_state_idx] = new_value


def getBestMove(state, player, epsilon):
    ''' Reinforcement Learning Algorithm '''
    moves = []
    curr_state_values = []
    empty_cells = []
    for i in range(7):
        if state[0, i] == 0:
            a = 5
            while state[a, i] != 0:   # find the lowest free row in column i
                a = a - 1
            empty_cells.append(a*7 + (i+1))
    for empty_cell in empty_cells:
        moves.append(empty_cell)
        new_state = copy_game_state(state)
        play_move(new_state, player, empty_cell)
        next_state_idx = list(states_dict.keys())[list(states_dict.values()).index(new_state.tolist())]
        if player == 'Blue':
            curr_state_values.append(state_values_for_AI_Blue[next_state_idx])
        else:
            curr_state_values.append(state_values_for_AI_Red[next_state_idx])
    best_move_idx = np.argmax(curr_state_values)
    if np.random.uniform(0, 1) <= epsilon:
        # Exploration
        best_move = random.choice(empty_cells)
        epsilon *= 0.99   # only affects the local copy of epsilon
    else:
        # Exploitation
        best_move = moves[best_move_idx]
    return best_move


# Load previously trained state values (the files must exist from an earlier run)
state_values_for_AI_Blue = np.loadtxt('trained_state_values_Blue.txt', dtype=np.float64)
state_values_for_AI_Red = np.loadtxt('trained_state_values_Red.txt', dtype=np.float64)

learning_rate = 0.2
epsilon = 0.2
num_iterations = 1

for iteration in range(num_iterations):
    game_state = np.zeros((6, 7), dtype=int)  # empty Connect Four board
    winner = None
    current_player_idx = random.choice([0, 1])
    while winner == None:
        curr_state_idx = list(states_dict.keys())[list(states_dict.values()).index(game_state.tolist())]
        if current_player_idx == 0:   # AI_Blue's turn
            block_choice = getBestMove(game_state, players[current_player_idx], epsilon)
            play_move(game_state, players[current_player_idx], block_choice)
            new_state_idx = list(states_dict.keys())[list(states_dict.values()).index(game_state.tolist())]
        else:                         # AI_Red's turn
            block_choice = getBestMove(game_state, players[current_player_idx], epsilon)
            play_move(game_state, players[current_player_idx], block_choice)
            new_state_idx = list(states_dict.keys())[list(states_dict.values()).index(game_state.tolist())]
        update_state_value_Red(curr_state_idx, new_state_idx, learning_rate)
        update_state_value_Blue(curr_state_idx, new_state_idx, learning_rate)
        winner = check_current_state(game_state)
        if winner == None:
            current_player_idx = (current_player_idx + 1) % 2

# Save state values for future use
np.savetxt('trained_state_values_Blue.txt', state_values_for_AI_Blue, fmt='%.6f')
np.savetxt('trained_state_values_Red.txt', state_values_for_AI_Red, fmt='%.6f')
print('Training Complete!')
</code>

Previous meeting: [[some:23.01.2020]]\\
Next meeting: [[some:Blocktermin]]