This page shows the differences between two versions of the page.
ws1819:programme [2019/01/10 17:50] johannakistenbruegge
ws1819:programme [2019/03/25 20:08] (current) stefanborn
Line 2:
__
+ <code python>
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 13 16:09:46 2018
- @author: Johanna
+ @author: Luca
""" | """ | ||

Line 235:
state
+ | |||
+ | |||
+ | aktueller code | ||
+ | |||
+ | # -*- coding: utf-8 -*- | ||
+ | """ | ||
+ | Created on Thu Jan 24 16:43:47 2019 | ||
+ | |||
+ | @author: Luca | ||
+ | """ | ||
+ | |||
+ | import random | ||
+ | import gym | ||
+ | import numpy as np | ||
+ | from collections import deque | ||
+ | from keras.models import Sequential | ||
+ | from keras.layers import Dense, Dropout | ||
+ | from keras.optimizers import Adam | ||
+ | import keras | ||
+ | |||
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 1.0            # discount rate
        self.epsilon = 1.0          # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()
+ | |||
+ | def _build_model(self): | ||
+ | # Einfaches NN | ||
+ | model = Sequential() | ||
+ | model.add(Dense(48, input_dim=self.state_size, activation='relu', | ||
+ | kernel_regularizer=keras.regularizers.l2(0.00001))) | ||
+ | model.add(Dropout(0.3)) | ||
+ | model.add(Dense(24, activation='relu', kernel_regularizer=keras.regularizers.l2(0.00001))) | ||
+ | model.add(Dense(self.action_size, activation='linear')) | ||
+ | model.compile(loss='mse', | ||
+ | optimizer=Adam(lr=self.learning_rate)) | ||
+ | return model | ||
+ | |||
+ | def remember(self, state, action, reward, next_state, done, total, importance): | ||
+ | # merkt sich alle bisher durchlaufenen Zustände | ||
+ | self.memory.append((state, action, reward, next_state, done,total,importance)) | ||
+ | |||
+ | def act(self, state): | ||
+ | # epsilon-greedy: off-policy oder policy | ||
+ | | ||
+ | if np.random.rand() <= self.epsilon: | ||
+ | return random.randrange(self.action_size) | ||
+ | act_values = self.model.predict(state) | ||
+ | return np.argmax(act_values[0]) # returns action | ||
+ | |||
+ | def replay(self, batch_size): | ||
+ | # baut den Vektor der Q-Werte aus | ||
+ | # als reward zum Zeitpunkt t + gamma*max(moegliche rewards zum Zeitpunkt t+1) | ||
+ | | ||
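        # prioritized sampling: turn the stored importances into a probability distribution over the replay memory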
        probabilities = np.array([m[-1] for m in self.memory])
        probabilities = 1. / np.sum(probabilities) * probabilities
        # print(probabilities.shape)
        minibatch = [self.memory[i] for i in np.random.choice(range(len(self.memory)), size=batch_size, p=probabilities)]
        states, targets_f = [], []
        for state, action, reward, next_state, done, total, importance in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # Filtering out states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss
+ | |||
+ | def load(self, name): | ||
+ | self.model.load_weights(name) | ||
+ | |||
+ | def save(self, name): | ||
+ | self.model.save_weights(name) | ||
+ | | ||
# Learning:

# play through EPISODES times, until the mast falls over


EPISODES = 100


env = gym.make('Acrobot-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cum_reward = 0
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # reward shaping: divide the step reward by (|next_state[0]| + 1)^2; assign -10 once the episode ends
        reward = reward/(abs(next_state[0])+1.)**2 if not done else -10
        cum_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done, reward, 1)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss and actual reward every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}".format(e, EPISODES, time, cum_reward, loss))
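    # After the episode: walk backwards through this episode's transitions and add each
    # later reward to the earlier entries, so slot 5 ("total") holds the undiscounted
    # return from that step to the end of the episode (the importance is reset to 1 here).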
    for i in range(time):
        pos = -i - 1
        for j in range(-time, pos):
            new_total = agent.memory[j][-2] + agent.memory[pos][2]
            mem = agent.memory[j]
            agent.memory[j] = (mem[0], mem[1], mem[2], mem[3], mem[4], new_total, 1)
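    # Recompute each transition's importance as the positive gap between its observed
    # return ("total") and the network's current Q estimate for the action taken;
    # replay() samples transitions proportionally to this value.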
    for i in range(time):
        pos = -i - 1
        imp = max(agent.memory[pos][-2] - agent.model.predict(agent.memory[pos][0])[0, agent.memory[pos][1]], 0)
        mem = agent.memory[pos]
        agent.memory[pos] = (mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], imp)


agent.save("qlearning_Acrobot_1000versuche")
</code>
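
As a usage note (not part of either revision above): a minimal sketch of how the weights saved by agent.save() could be reloaded for a purely greedy evaluation run, assuming the DQNAgent class from the listing, the same gym/keras versions, and the weights file "qlearning_Acrobot_1000versuche" written above; the 500-step limit is taken from the training loop.

<code python>
# Sketch (assumption, not from the original page): reload the trained weights
# and run one greedy evaluation episode on Acrobot-v1.
env = gym.make('Acrobot-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DQNAgent(state_size, action_size)
agent.load("qlearning_Acrobot_1000versuche")  # file written by agent.save() above
agent.epsilon = 0.0                           # disable exploration

state = np.reshape(env.reset(), [1, state_size])
score = 0
for t in range(500):
    env.render()
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    score += reward
    state = np.reshape(next_state, [1, state_size])
    if done:
        break
print("evaluation finished after {} steps, raw return: {}".format(t + 1, score))
</code>

With epsilon set to 0, the random branch in act() is effectively never taken, so the actions come from the argmax of the predicted Q-values.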