**Pacman Code**

ZIP of the code: {{:ws1819:pacman_uni.rar|}}

If you want to try the program yourself, you unfortunately need all the packages from [[Requirements]]; even then, it is anything but guaranteed to work. If a Windows update comes along, it will most likely stop working altogether.
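
The authoritative package list is the [[Requirements]] page; as a rough sketch, the script at least needs gym with the Atari environments, the old standalone Keras with a TensorFlow backend, and NumPy. The exact packages and versions named here are an assumption, not the tested setup. A quick way to check whether the imports used below can be loaded at all:

# minimal pre-flight check (an assumption, not the tested configuration);
# it only verifies that the packages imported by the script below are importable
import importlib

for module in ("gym", "keras", "numpy"):
    try:
        importlib.import_module(module)
        print(module, "available")
    except ImportError as err:
        print(module, "missing:", err)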
import random
from collections import deque

import numpy as np
import gym
import keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.optimizers import Adam

input_channels = 1
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3
#conv_activation = [tf.nn.relu] * 3
n_hidden_in = 64 * 11 * 10  # conv3 has 64 maps of 11x10 each
n_hidden = 512
#hidden_activation = tf.nn.relu
#n_outputs = env.action_space.n  # 9 discrete actions are available
#initializer = tf.variance_scaling_initializer()

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 1.0    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # simple convolutional network
        vision_model = Sequential()
        vision_model.add(Conv2D(32, (5, 5), activation=None,
              padding='valid', input_shape=self.state_size))  ## note: the correct input dimensions must go in here
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05))  #1
        vision_model.add(MaxPooling2D((2, 2)))  #2
        vision_model.add(Conv2D(64, (3, 3), activation=None, padding='valid'))  #3
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05))  #4
        vision_model.add(MaxPooling2D((2, 2)))  #5
        vision_model.add(Flatten())  #6
        #vision_model.add(keras.layers.core.Dropout(dropout, noise_shape=None, seed=None))  #7
        vision_model.add(Dense(20, activation=None))  # kernel_regularizer=keras.regularizers.l1(reg)  #8
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05))  #9
        vision_model.add(Dense(self.action_size, activation='softmax', name='main_output'))  #10
        vision_model.compile(loss='mse',
                             optimizer=Adam(lr=self.learning_rate))
        return vision_model

    def remember(self, state, action, reward, next_state, done, total, importance):
        # stores every transition seen so far
        self.memory.append([state, action, reward, next_state, done, total, importance])

    def act(self, state):
        # epsilon-greedy: random exploration or greedy policy
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        # builds the vector of Q-value targets as
        # reward at time t + gamma * max(possible rewards at time t+1)
        probabilities = np.array([m[-1] for m in self.memory])
        probabilities = 1. / np.sum(probabilities) * probabilities
        #print(probabilities.shape)
        minibatch = [self.memory[i] for i in np.random.choice(range(len(self.memory)), size=batch_size, p=probabilities)]
        states, targets_f = [], []
        for state, action, reward, next_state, done, total, importance in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            #print("Reward: ", reward)
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # collect states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # keep track of the loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

EPISODES = 22


env = gym.make('MsPacman-v0')
state_size = env.observation_space.shape
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, (1,) + state_size)
    cum_reward = 0
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        #additional_reward = -(state[0,0] + state[0,0]*state[0,2]-state[0,1]*state[0,3])  ## factors to be found by trial and error
        reward = reward  #+ additional_reward if not done else 10 #
        cum_reward += reward
        next_state = np.reshape(next_state, (1,) + state_size)
        agent.remember(state, action, reward, next_state, done, reward, 1)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # log training loss and actual reward every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}".format(e, EPISODES, time, cum_reward, loss))

    # after the episode: push the last reward back into the 'total' field of the
    # earlier transitions (the importance field is recomputed in the next loop)
    for i in range(time):
        pos = -i-1
        agent.memory[-i-2][-2] += reward
        for j in range(-time, pos):
            new_total = agent.memory[j][-2] + agent.memory[pos][2]
            mem = agent.memory[j]
            agent.memory[j][-1] = new_total

    # importance of a transition: how far its accumulated 'total' exceeds the current
    # Q-estimate for the action that was taken (replay() uses it as a sampling weight)
    for i in range(time):
        pos = -i-1
        imp = max(agent.memory[pos][-2] - agent.model.predict(agent.memory[pos][0])[0, agent.memory[pos][1]], 0)
        mem = agent.memory[pos]
        agent.memory[pos][-1] = imp

    agent.save("qlearning_Acrobot_3versuche")
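
The target that replay() fits the network to is the usual one-step Q-learning target: the observed reward plus gamma times the best Q-value the model predicts for the next state. A small numeric sketch, with all numbers made up purely for illustration (they are not taken from a real run):

import numpy as np

# toy numbers, purely illustrative: 9 Ms. Pacman actions,
# a step reward of 10 and gamma = 1.0 as in the agent above
gamma = 1.0
reward = 10.0

q_next = np.array([0.2, 1.5, 0.7, 0.0, 0.3, 0.9, 0.1, 0.4, 0.6])  # model.predict(next_state)[0]
target = reward + gamma * np.amax(q_next)  # 10 + 1.0 * 1.5 = 11.5

action = 3  # action taken in this transition
target_f = np.array([0.1, 0.2, 0.3, 0.05, 0.0, 0.1, 0.2, 0.0, 0.05])  # model.predict(state)[0]
target_f[action] = target  # only the entry of the taken action is changed
print(target_f)

In the code above this target overwrites only the Q-value of the action that was actually taken, so the mean squared error only pushes on that one output.
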
import gym
env = gym.make('MsPacman-v0')
state_size = env.observation_space.shape
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
zähler = 0

#agent.load("qlearning_Acrobot_3versuche")

import time as ti
# evaluation: run the greedy policy (epsilon = 0) and count the finished episodes
for e in range(100):
    state = env.reset()
    #state[0] = state[0] + np.random.randn()*0.1
    #state[1] = state[1] + np.random.randn()*0.1
    #state[2] = state[2] + np.random.randn()*0.1
    #state[3] = state[3] + np.random.randn()*0.1
    #env.env.state = state
    state = np.reshape(state, (1,) + state_size)
    for time in range(2000):
        env.render()
        agent.epsilon = 0
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, (1,) + state_size)
        state = next_state
        if done:
            zähler += 1
            print(zähler, "Duration: ", time)
            break
    else:
        print("Full time")  # episode ran the full 2000 steps