This page shows the differences between two revisions.

Previous revision: some:23.01.2020 [2020/03/27 16:11] benbaute (created)
Current revision: some:23.01.2020 [2020/03/27 16:52] benbaute (current)

Line 11:
  </code>
- **Protocol for 06.02.2020**
+ Previous meeting: [[some:09.01.2020]]\\ Next meeting: [[some:06.02.2020]]

All of the remaining content below (the protocol notes and the commented code) was likewise removed and is only present in the previous revision:

Training algorithm finished
Function finished: knows whether there are four in a row

Problem: the number of possible states is above 10^20,
i.e. the program does not run yet in this form
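As a quick plausibility check of that estimate (added here as an illustration, not part of the original notes): each of the 6·7 = 42 cells is either empty or holds a stone of one of the two players, which bounds the number of board states by 3^42 ≈ 1.1·10^20. A tabular Q-function over that many states is hopeless, which is why the code below approximates the Q-values with a neural network.

<code python>
# Illustration only: upper bound on the number of Connect Four board states.
upper_bound = 3 ** 42                  # empty / player 1 / player 2 per cell
print(f"{upper_bound:.2e}")            # about 1.09e+20
# A Q-table with one 4-byte entry per (state, action) pair would already need
print(f"{upper_bound * 7 * 4 / 1e18:.0f} exabytes")   # roughly 3000 exabytes
</code>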
Commented version of the code with the NN:

<code python>
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
import keras


def ViergewinntAnfangsmatrix():
    # game_state[x,0,0] selects the plane to use (x=0/1 are the two players; x=2 marks empty cells)
    # game_state[0,x,0] selects the row
    # game_state[0,0,x] selects the column
    game_state = np.zeros_like(np.arange(42*3).reshape(3, 6, 7))
    game_state[2] = 1
    return game_state


def check_array(a):
    # Checks whether an array contains four in a row
    for i in range(len(a)-3):
        b = sum(a[i:i+4])
        if (b == 4): return 4
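
# Added illustration (not part of the original listing): check_array only
# reports a hit when four consecutive entries are 1, e.g.
#   check_array(np.array([0, 1, 1, 1, 1, 0]))  ->  4     (a window sums to 4)
#   check_array(np.array([1, 1, 0, 1, 1, 0]))  ->  None  (no window sums to 4)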

def checkgewonnen(game_state):
    # Checks whether somebody has won
    Winstate = game_state[0]
    Losestate = game_state[1]
    Drawstate = game_state[2]
    # Columns
    for i in range(7):
        a = check_array(Winstate[:6, i:i+1].reshape(6,))
        b = check_array(Losestate[:6, i:i+1].reshape(6,))
        if (a == 4): return "Win"
        if (b == 4): return "Lose"

    # Diagonals:
    for i in range(3):
        a = check_array(np.diagonal(Winstate, i+1))
        b = check_array(np.diagonal(Losestate, i+1))
        if (a == 4): return "Win"
        if (b == 4): return "Lose"
        a = check_array(np.diagonal(Winstate, -i))
        b = check_array(np.diagonal(Losestate, -i))
        if (a == 4): return "Win"
        if (b == 4): return "Lose"
        a = check_array(np.fliplr(Winstate).diagonal(i+1))
        b = check_array(np.fliplr(Losestate).diagonal(i+1))
        if (a == 4): return "Win"
        if (b == 4): return "Lose"
        a = check_array(np.fliplr(Winstate).diagonal(-i))
        b = check_array(np.fliplr(Losestate).diagonal(-i))
        if (a == 4): return "Win"
        if (b == 4): return "Lose"

    # Rows:
    for i in range(6):
        a = check_array(Winstate[i:i+1].reshape(7,))
        b = check_array(Losestate[i:i+1].reshape(7,))
        if (a == 4): return "Win"
        if (b == 4): return "Lose"
    # Check if draw
    for i in range(7):
        if (Drawstate[0, i] == 1):
            return None
    return "Draw"
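
# Added sanity check (illustration only, not in the original listing):
# drop four stones of player 0 into column 3 and verify the win detection.
demo_state = ViergewinntAnfangsmatrix()
for demo_row in range(2, 6):            # four consecutive rows in column 3
    demo_state[0, demo_row, 3] = 1      # stone of player 0
    demo_state[2, demo_row, 3] = 0      # the cell is no longer empty
assert checkgewonnen(demo_state) == "Win"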

class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay memory
        self.gamma = 1.0          # discount rate
        self.epsilon = 1.0        # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()  # the NN is built here

    def _build_model(self):
        # Simple NN
        # Do additional layers have to be added, since Connect Four is more complex?
        model = Sequential()  # standard fully connected NN
        model.add(Dense(48, input_dim=self.state_size, activation='relu',
                        kernel_regularizer=keras.regularizers.l2(0.00001)))  # the input has to be adapted
        model.add(Dropout(0.3))  # meant to avoid overfitting
        model.add(Dense(24, activation='relu', kernel_regularizer=keras.regularizers.l2(0.00001)))  # hidden layer
        model.add(Dense(self.action_size, activation='linear'))  # output, one value per possible action (here 7)
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))  # compile the model
        return model

    def remember(self, state, action, reward, next_state, done):
        # stores every state transition seen so far
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # epsilon-greedy: explore or follow the current policy
        if np.random.rand() <= self.epsilon:  # random action
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)  # prediction of the NN
        return np.argmax(act_values[0])  # returns the action

    def replay(self, batch_size):
        # builds the vector of Q-values as
        # reward at time t + gamma * max(possible rewards at time t+1),
        # i.e. it judges whether a move was good
        minibatch = random.sample(self.memory, batch_size)
        states, targets_f = [], []
        for state, action, reward, next_state, done in minibatch:
            target = reward  # reward adjustment
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # collect states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # keep track of the loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    # loading and saving of the NN weights
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
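
# Added illustration (not in the original code): for one replay sample with
# gamma = 1.0, reward r = 0.5, chosen action a = 3 and predicted next-state
# Q-values [0.1, 0.4, -0.2, 0.0, 0.3, 0.1, 0.2], the training target is
#   target = r + gamma * max_a' Q(next_state, a') = 0.5 + 1.0 * 0.4 = 0.9
# and only entry 3 of the Q-vector predicted for `state` is overwritten with
# 0.9 before the fit() call.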

# Learning:

# play through EPISODES times, until the pole falls over


EPISODES = 10

env = gym.make('CartPole-v1')
# insert Connect Four here


state_size = env.observation_space.shape[0]  # 4 for CartPole; the reshape below must match this
# state_size = 7  # for Connect Four: the 7 columns in which a stone can be placed, or the whole board?


action_size = env.action_space.n
# will be 7 as long as no column is full


agent = Agent(state_size, action_size)
done = False
batch_size = 32  # may also have to be adapted

for e in range(EPISODES):
    state = env.reset()
    # an own reset function is needed here

    state = np.reshape(state, [1, state_size])  # creates an array that represents the state
    cum_reward = 0
    for time in range(500):  # time is not needed for Connect Four, replace with the maximum game length
        # env.render()
        action = agent.act(state)  # choose one of the possible actions
        next_state, reward, done, _ = env.step(action)  # execute this action
        reward = reward/(abs(next_state[0])+1.)**2 if not done else -10  # adjustment of the reward system
        cum_reward += reward
        next_state = np.reshape(next_state, [1, state_size])  # move on to the next state
        agent.remember(state, action, reward, next_state, done)  # remember the transition
        state = next_state
        if done:
            print(("episode: {}/{}, score: {}, e: {:.2}"  # the score should depend on won/lost, not on the number of moves
                   .format(e, EPISODES, time, agent.epsilon)))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # logging training loss and actual reward every 10 timesteps
            if time % 10 == 0:
                print(("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}".format(e, EPISODES, time, cum_reward, loss)))

agent.save("qlearning_cartpole.weights")


# Test the learned model with small perturbations to see how stable it is, with visualization.
# Does not really work here yet.

# Let two AIs play against each other?

agent.load("qlearning_cartpole.weights")  # load the stored weights so that the agent can make good moves

import time as ti  # time is not needed here
for e in range(1000):  # from here ...
    state = env.reset()
    state[0] = state[0] + np.random.randn()*0.1
    state[1] = state[1] + np.random.randn()*0.1
    state[2] = state[2] + np.random.randn()*0.1
    state[3] = state[3] + np.random.randn()*0.1
    env.env.state = state
    state = np.reshape(state, [1, state_size])
    for time in range(2000):  # ... up to here everything has to be replaced completely

        env.render()  # not time dependent, so not needed
        agent.epsilon = 0  # sets the probability of a random action to 0
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        state = next_state  # plays one move
        if done:
            print("Duration: ", time)
            break

    else:
        print("Full time")  # here it matters whether the game was won/lost/drawn
</code>
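
The comments in the listing mark what still has to be replaced for Connect Four: an own reset function, a step that returns the board state and a win/lose/draw reward, and an input that sees the whole board instead of only the 7 columns. Below is a rough, untested sketch of such an environment wrapper; everything in it (the name VierGewinntEnv, the row orientation, the random opponent and the reward values) is an assumption and not part of the protocol. It reuses ViergewinntAnfangsmatrix and checkgewonnen from the listing above and flattens the 3×6×7 board into a 126-dimensional input vector:

<code python>
# Hypothetical sketch only, not part of the original protocol. It mimics the
# gym-style interface used in the training loop above:
#   reset() -> state,  step(action) -> next_state, reward, done, info
import random
import numpy as np

class VierGewinntEnv:
    def __init__(self):
        self.state_size = 3 * 6 * 7   # flattened board planes as NN input (126)
        self.action_size = 7          # one action per column

    def reset(self):
        self.game_state = ViergewinntAnfangsmatrix()
        return self._obs()

    def _obs(self):
        # Flatten the 3x6x7 planes into one vector for the network.
        return self.game_state.reshape(-1).astype(float)

    def _drop(self, plane, column):
        # Place a stone in the lowest empty row of the column (row 5 is treated
        # as the bottom row). Returns False if the column is already full.
        empty_rows = np.where(self.game_state[2][:, column] == 1)[0]
        if len(empty_rows) == 0:
            return False
        row = empty_rows.max()
        self.game_state[plane, row, column] = 1
        self.game_state[2, row, column] = 0
        return True

    def step(self, action):
        # Agent move (plane 0); a move into a full column ends the game.
        if not self._drop(0, action):
            return self._obs(), -10.0, True, {}
        result = checkgewonnen(self.game_state)
        if result == "Win":
            return self._obs(), 1.0, True, {}
        if result == "Draw":
            return self._obs(), 0.0, True, {}
        # Opponent move (plane 1); here simply a random legal column.
        free_columns = [c for c in range(7) if self.game_state[2][0, c] == 1]
        self._drop(1, random.choice(free_columns))
        result = checkgewonnen(self.game_state)
        if result == "Lose":
            return self._obs(), -1.0, True, {}
        if result == "Draw":
            return self._obs(), 0.0, True, {}
        return self._obs(), 0.0, False, {}
</code>

With such a wrapper, env = VierGewinntEnv(), state_size = env.state_size and action_size = env.action_size could stand in for the gym.make('CartPole-v1') lines; the CartPole-specific reward shaping in the training loop and the perturbation test at the end would have to be dropped or rewritten for Connect Four as well.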