**Pacman Code**

ZIP of the code: {{:ws1819:pacman_uni.rar|}}

If you want to try the program yourself, you unfortunately need all the packages from [[Requirements]]; even with those, it working is anything but guaranteed. After a Windows update, it will most likely stop working entirely.

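Before running the full training script, it can be worth checking that the Gym Atari environment itself starts. The following minimal sketch (not part of the original code) needs only gym and plays random actions in the same MsPacman-v0 environment used below:

import gym

# quick smoke test: random actions in the environment the agent will be trained on
env = gym.make('MsPacman-v0')
state = env.reset()
for _ in range(200):
    env.render()
    state, reward, done, _ = env.step(env.action_space.sample())
    if done:
        break
env.close()

The complete training script follows.
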
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.layers import Conv2D, MaxPooling2D, Flatten
import keras

# hyperparameters kept from an earlier TensorFlow variant of the network; not used by the Keras model below
input_width = 80
input_channels = 1
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3
#conv_activation = [tf.nn.relu] * 3
n_hidden_in = 64 * 11 * 10  # conv3 has 64 maps of 11x10 each
n_hidden = 512
#hidden_activation = tf.nn.relu
#n_outputs = env.action_space.n  # 9 discrete actions are available
#initializer = tf.variance_scaling_initializer()

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 1.0    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # simple convolutional network: two conv + LeakyReLU + max-pooling blocks, then two dense layers
        vision_model = Sequential()
        vision_model.add(Conv2D(32, (5, 5), activation=None,
              padding='valid', input_shape=self.state_size))  # note: the input shape must match the observation dimensions
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05)) #1
        vision_model.add(MaxPooling2D((2, 2))) #2
        vision_model.add(Conv2D(64, (3, 3), activation=None, padding='valid')) #3
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05)) #4
        vision_model.add(MaxPooling2D((2, 2))) #5
        vision_model.add(Flatten()) #6
        #vision_model.add(keras.layers.core.Dropout(dropout, noise_shape=None, seed=None)) #7
        vision_model.add(Dense(20, activation=None))  # kernel_regularizer=keras.regularizers.l1(reg) #8
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05)) #9
        vision_model.add(Dense(self.action_size, activation='softmax', name='main_output')) #10
        vision_model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return vision_model

    def remember(self, state, action, reward, next_state, done, total, importance):
        # stores every transition seen so far
        self.memory.append([state, action, reward, next_state, done, total, importance])

    def act(self, state):
        # epsilon-greedy: with probability epsilon pick a random action (off-policy exploration),
        # otherwise follow the learned policy
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        # builds the vector of Q-targets as
        # reward at time t + gamma * max(possible rewards at time t+1)
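        # worked example (illustration only, not from the original code): with gamma = 1.0,
        # a transition with reward 10 whose next state has predicted Q-values [0, 2, 5]
        # gets the target 10 + 1.0 * 5 = 15 for the action that was taken;
        # terminal transitions (done = True) simply use the raw reward as target.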
        # sample a minibatch with probability proportional to each transition's stored importance
        probabilities = np.array([m[-1] for m in self.memory])
        probabilities = 1./np.sum(probabilities) * probabilities
        #print(probabilities.shape)
        minibatch = [self.memory[i] for i in np.random.choice(range(len(self.memory)), size=batch_size, p=probabilities)]
        states, targets_f = [], []
        for state, action, reward, next_state, done, total, importance in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            #print("Reward: ", reward)
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # collect states and targets for a single training batch
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # keep track of the training loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

EPISODES = 22


env = gym.make('MsPacman-v0')
state_size = env.observation_space.shape   # full RGB observation shape, a (height, width, channels) tuple
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32

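# training loop: for each episode, act epsilon-greedily, store every transition in the
# replay memory, and after each step train on a minibatch sampled from that memory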
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, (1,) + state_size)
    cum_reward = 0
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        #additional_reward = -(state[0,0] + state[0,0]*state[0,2]-state[0,1]*state[0,3])  ## factors to be tuned by trial and error
        reward = reward  #+ additional_reward if not done else 10
        cum_reward += reward
        next_state = np.reshape(next_state, (1,) + state_size)
        agent.remember(state, action, reward, next_state, done, reward, 1)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # log training loss and cumulative reward every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}".format(e, EPISODES, time, cum_reward, loss))

    # propagate later rewards backwards: each transition's stored total accumulates the rewards that followed it
    for i in range(time):
        pos = -i-1
        agent.memory[-i-2][-2] += reward
        for j in range(-time, pos):
            new_total = agent.memory[j][-2] + agent.memory[pos][2]
            agent.memory[j][-2] = new_total

    # importance of a transition: how far the observed total exceeds the current Q-estimate for the taken action
    for i in range(time):
        pos = -i-1
        imp = max(agent.memory[pos][-2] - agent.model.predict(agent.memory[pos][0])[0, agent.memory[pos][1]], 0)
        agent.memory[pos][-1] = imp

    agent.save("qlearning_Acrobot_3versuche")

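The weights written by agent.save() can be reloaded later instead of training from scratch; a minimal sketch, assuming the training script above has already produced the file named there:

# reload the trained weights into a fresh agent and act greedily
agent = DQNAgent(state_size, action_size)
agent.load("qlearning_Acrobot_3versuche")
agent.epsilon = 0.0  # no exploration during evaluation

The evaluation script below does essentially this; there the agent.load(...) call is just commented out.
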
# evaluation run: play with the trained agent (epsilon = 0, no further learning)
import gym
env = gym.make('MsPacman-v0')
state_size = env.observation_space.shape
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
zähler = 0  # counts finished episodes

#agent.load("qlearning_Acrobot_3versuche")

import time as ti
for e in range(100):
    state = env.reset()
    #state[0] = state[0] + np.random.randn()*0.1
    #state[1] = state[1] + np.random.randn()*0.1
    #state[2] = state[2] + np.random.randn()*0.1
    #state[3] = state[3] + np.random.randn()*0.1
    #env.env.state = state
    state = np.reshape(state, (1,) + state_size)
    for time in range(2000):
        env.render()
        agent.epsilon = 0
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, (1,) + state_size)
        state = next_state
        if done:
            zähler += 1
            print(zähler, "Duration: ", time)
            break

    else:
        print("full time reached")