**Pacman Code**

ZIP of the code: {{:ws1819:pacman_uni.rar|}}

If you want to try the program yourself, you unfortunately need all of the packages from [[Requirements]], and even then it is far from guaranteed to work. If a Windows update comes along, chances are nothing will run anymore.
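
The code excerpt below starts partway into the script, so the imports it relies on are not visible here. As a rough sketch (assuming the old standalone Keras API and the classic Gym step/reset interface that the code is written against; see [[Requirements]] for the full package list), an import block covering every name used below would look like this:

  # assumed imports, not part of the excerpt below
  import random
  from collections import deque

  import numpy as np
  import gym      # MsPacman-v0 additionally needs the Atari extras of gym
  import keras
  from keras.models import Sequential
  from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
  from keras.optimizers import Adam
  #import tensorflow as tf   # only needed for the commented-out settings below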
input_channels = 1
conv_n_maps = [32, 64, 64]
conv_kernel_sizes = [(8,8), (4,4), (3,3)]
conv_strides = [4, 2, 1]
conv_paddings = ["SAME"] * 3
#conv_activation = [tf.nn.relu] * 3
n_hidden_in = 64 * 11 * 10  # conv3 has 64 maps of 11x10 each
n_hidden = 512
#hidden_activation = tf.nn.relu
#n_outputs = env.action_space.n  # 9 discrete actions are available
#initializer = tf.variance_scaling_initializer()

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 1.0    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # simple convolutional network
        vision_model = Sequential()
        vision_model.add(Conv2D(32, (5, 5), activation=None,
                                padding='valid', input_shape=self.state_size))  # input dimension must match the observation shape
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05))  #1
        vision_model.add(MaxPooling2D((2, 2)))  #2
        vision_model.add(Conv2D(64, (3, 3), activation=None, padding='valid'))  #3
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05))  #4
        vision_model.add(MaxPooling2D((2, 2)))  #5
        vision_model.add(Flatten())  #6
        #vision_model.add(keras.layers.core.Dropout(dropout, noise_shape=None, seed=None))  #7
        vision_model.add(Dense(20, activation=None))  #8  # kernel_regularizer=keras.regularizers.l1(reg)
        vision_model.add(keras.layers.advanced_activations.LeakyReLU(alpha=0.05))  #9
        vision_model.add(Dense(self.action_size, activation='softmax', name='main_output'))  #10
        vision_model.compile(loss='mse',
                             optimizer=Adam(lr=self.learning_rate))
        return vision_model

    def remember(self, state, action, reward, next_state, done, total, importance):
        # stores every transition seen so far
        self.memory.append([state, action, reward, next_state, done, total, importance])

    def act(self, state):
        # epsilon-greedy: either explore randomly or follow the learned policy
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        # builds the vector of Q-values
        # as reward at time t + gamma*max(possible rewards at time t+1)
        probabilities = np.array([m[-1] for m in self.memory])
        probabilities = 1./np.sum(probabilities) * probabilities
        #print(probabilities.shape)
        minibatch = [self.memory[i] for i in np.random.choice(range(len(self.memory)), size=batch_size, p=probabilities)]
        states, targets_f = [], []
        for state, action, reward, next_state, done, total, importance in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            #print("Reward: ", reward)
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # Filtering out states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

EPISODES = 22


env = gym.make('MsPacman-v0')
state_size = env.observation_space.shape
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, (1,) + state_size)
    cum_reward = 0
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        #additional_reward = -(state[0,0] + state[0,0]*state[0,2]-state[0,1]*state[0,3])  # factors found by trial and error
        reward = reward  #+ additional_reward if not done else 10
        cum_reward += reward
        next_state = np.reshape(next_state, (1,) + state_size)
        agent.remember(state, action, reward, next_state, done, reward, 1)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss and actual reward every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}".format(e, EPISODES, time, cum_reward, loss))

    # after the episode: distribute rewards earned later in the episode onto earlier memory entries
    for i in range(time):
        pos = -i-1
        agent.memory[-i-2][-2] += reward
        for j in range(-time, pos):
            new_total = agent.memory[j][-2] + agent.memory[pos][2]
            mem = agent.memory[j]
            agent.memory[j][-1] = new_total

    # importance = how far the stored total reward exceeds the current Q-estimate; replay() samples with these weights
    for i in range(time):
        pos = -i-1
        imp = max(agent.memory[pos][-2] - agent.model.predict(agent.memory[pos][0])[0, agent.memory[pos][1]], 0)
        mem = agent.memory[pos]
        agent.memory[pos][-1] = imp


agent.save("qlearning_Acrobot_3versuche")

import gym
env = gym.make('MsPacman-v0')
state_size = env.observation_space.shape
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
zähler = 0

#agent.load("qlearning_Acrobot_3versuche")
import time as ti
for e in range(100):
    state = env.reset()
    #state[0] = state[0] + np.random.randn()*0.1
    #state[1] = state[1] + np.random.randn()*0.1
    #state[2] = state[2] + np.random.randn()*0.1
    #state[3] = state[3] + np.random.randn()*0.1
    #env.env.state = state
    state = np.reshape(state, (1,) + state_size)
    for time in range(2000):
        env.render()
        agent.epsilon = 0  # purely greedy, no exploration
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, (1,) + state_size)
        state = next_state
        if done:
            zähler += 1
            print(zähler, "Duration: ", time)
            break
    else:
        print("Full time")