Hier werden die Unterschiede zwischen zwei Versionen gezeigt.
Beide Seiten der vorigen Revision Vorhergehende Überarbeitung Nächste Überarbeitung | Vorhergehende Überarbeitung | ||
ws1819:acrobot [2019/03/20 17:05] rhotert |
ws1819:acrobot [2019/03/31 15:24] (aktuell) rhotert |
||
---|---|---|---|
Zeile 1: | Zeile 1: | ||
- | Ich habe mich, um das Thema besser zu verstehen mit den Classic control Acrobot beschäftigt. (https://gym.openai.com/envs/#classic_control) | + | **Acrobot** |
+ | |||
+ | gym environment | ||
+ | |||
+ | |||
+ | Ich habe mich, um das Thema besser zu verstehen, mit dem Classic-Control-Environment Acrobot beschäftigt (https://gym.openai.com/envs/#classic_control). | ||
Zeile 7: | Zeile 12: | ||
Die Schwierigkeit liegt bei diesem Environment darin, dass man einen Weg finden muss, das NN mit den wenigen Erfolgen zu trainieren. (Anfangs kam das Pendel bei 1000 Versuchen ca. 2-mal über die Linie.) | Die Schwierigkeit liegt bei diesem Environment darin, dass man einen Weg finden muss, das NN mit den wenigen Erfolgen zu trainieren. (Anfangs kam das Pendel bei 1000 Versuchen ca. 2-mal über die Linie.) | ||
- | Dazu muss man wissen, dass die klassische KI meist nur durch Belohnungen etwas lernt (die sich durch zwischen Etappen erringen lassen). | + | Dazu muss man wissen, dass die klassische Künstliche Intelligenz (KI) meist nur durch Belohnungen etwas lernt (die sich durch Zwischenetappen erreichen lassen). |
- | Zum Vergleich ich habe das Problem mit zwei unterschiedlichen NN getestet | + | Zum Vergleich: |
- | (hier Graphen zum Vergleich) | + | Ich habe das Problem mit zwei unterschiedlich starken neuronalen Netzen getestet. Nach mehreren hundert Testläufen meinerseits ergab sich, dass die besten Resultate mit dem einfachsten NN und ca. 22 Episoden erzielt wurden |
+ | (Highscore 63). [Der vermeintliche Weltrekord liegt bei 42.] | ||
+ | Die 63 steht hierbei für die Anzahl der Aufschwingversuche. | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | Ich lasse die Trainingsdaten weg, da ich finde, dass es am meisten Spaß macht, beim Lernen zuzugucken. | ||
+ | (Wenn man die Seite bearbeitet, ist der Code richtig formatiert; ich sende ihn auch gerne via Slack.) | ||
+ | |||
+ | |||
+ | ZIP des Codes: {{:ws1819:acrobotzip.rar|}} | ||
Der Code zum besten Ergebnis | Der Code zum besten Ergebnis | ||
- | ich lass die Trainingsdaten weg, da ich finde dass es am meisten Spaß macht beim lernen zu zugucken. | ||
+ | <code python> | ||
+ | import random | ||
+ | import gym | ||
+ | import numpy as np | ||
+ | from collections import deque | ||
+ | from keras.models import Sequential | ||
+ | from keras.layers import Dense, Dropout | ||
+ | from keras.optimizers import Adam | ||
+ | import keras | ||
+ | |||
+ | #////////////////////////////// | ||
+ | |||
class DQNAgent:
    """Deep Q-learning agent with an importance-weighted replay memory.

    Each memory entry is the mutable list
    ``[state, action, reward, next_state, done, total, importance]``.
    ``importance`` (the last element) is used as an unnormalised sampling
    weight when a minibatch is drawn in :meth:`replay`; callers may update
    ``total`` and ``importance`` in place after an episode.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        Args:
            state_size: Number of features in one observation.
            action_size: Number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions drop out first
        self.gamma = 1.0  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the simple Q-network (one hidden layer)."""
        model = Sequential()
        model.add(Dense(16, input_dim=self.state_size, activation='relu',
                        kernel_regularizer=keras.regularizers.l2(0.00001)))
        # Larger variant, kept disabled:
        # model.add(Dropout(0.3))
        # model.add(Dense(24, activation='relu', kernel_regularizer=keras.regularizers.l2(0.00001)))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): newer Keras releases name this argument
        # `learning_rate`; `lr` is kept to match the version this script was
        # written against -- confirm before upgrading.
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done, total, importance):
        """Append one transition to the replay memory.

        The entry is stored as a list (not a tuple) so that ``total`` and
        ``importance`` can be rewritten in place later.
        """
        self.memory.append([state, action, reward, next_state, done, total, importance])

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def _sampling_probabilities(self):
        """Turn the stored importance values into a probability vector.

        Negative weights are clipped to zero. If no usable weight remains
        (sum is zero or not finite) every entry gets the same probability,
        so sampling never fails because of NaN or negative probabilities.
        """
        weights = np.array([entry[-1] for entry in self.memory], dtype=float)
        if weights.size == 0:
            return weights
        weights = np.clip(weights, 0.0, None)
        total = weights.sum()
        if not np.isfinite(total) or total <= 0.0:
            return np.full(weights.size, 1.0 / weights.size)
        return weights / total

    def replay(self, batch_size):
        """Train the network on an importance-sampled minibatch.

        The training target for the taken action is the reward at time t
        plus ``gamma`` times the best predicted Q-value at time t+1 (just
        the reward for terminal transitions). Epsilon is decayed once per
        call until it reaches ``epsilon_min``.

        Args:
            batch_size: Number of transitions to draw (with replacement).

        Returns:
            The training loss reported by the single fit epoch.

        Raises:
            ValueError: If the memory is empty (raised by the sampler).
        """
        probabilities = self._sampling_probabilities()
        indices = np.random.choice(len(self.memory), size=batch_size, p=probabilities)
        minibatch = [self.memory[i] for i in indices]

        # Stored states have shape (1, state_size); stacking them gives one
        # (batch_size, state_size) array so the network is queried twice per
        # replay instead of twice per sample.
        states = np.vstack([entry[0] for entry in minibatch])
        next_states = np.vstack([entry[3] for entry in minibatch])
        actions = np.array([entry[1] for entry in minibatch], dtype=int)
        rewards = np.array([entry[2] for entry in minibatch], dtype=float)
        dones = np.array([entry[4] for entry in minibatch], dtype=bool)

        targets_f = self.model.predict(states)
        best_next = np.amax(self.model.predict(next_states), axis=1)
        targets = np.where(dones, rewards, rewards + self.gamma * best_next)
        # Only the Q-value of the action actually taken is moved.
        targets_f[np.arange(batch_size), actions] = targets

        history = self.model.fit(states, targets_f, epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)
+ | | ||
+ | |||
# Training script: runs EPISODES episodes of Acrobot-v1, trains the agent
# after every step and re-weights the episode's memory entries afterwards.
EPISODES = 7


env = gym.make('Acrobot-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cum_reward = 0
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # Shaping term computed from the state *before* the step.
        # Presumably the observation is (cos t1, sin t1, cos t2, sin t2, ...),
        # which makes this -(cos t1 + cos(t1 + t2)), i.e. the height of the
        # tip -- verify against the Acrobot-v1 documentation.
        # A scaling factor (e.g. 0.2) was being experimented with here.
        additional_reward = -(state[0, 0] + state[0, 0] * state[0, 2] - state[0, 1] * state[0, 3])
        # NOTE(review): `done` may also be set by the environment's step
        # limit, in which case the +10 bonus is granted for a failed
        # episode -- confirm whether that is intended.
        reward = (reward + additional_reward) if not done else 10
        cum_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        # `total` starts as the step's own reward, `importance` as 1.
        agent.remember(state, action, reward, next_state, done, reward, 1)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss and actual reward every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}".format(e, EPISODES, time, cum_reward, loss))

    # Add the episode's final reward to the `total` of the earlier steps.
    # `time` is the index of the last step, so the episode occupies the last
    # time + 1 memory entries; the final entry itself is left unchanged.
    for i in range(time):
        agent.memory[-i - 2][-2] += reward
    # The original code also ran an inner loop here that wrote provisional
    # values into the importance slot; the loop below overwrites every one
    # of those slots without reading them, so that loop was dead work and
    # has been removed.

    # Importance = how much `total` exceeds the network's current estimate
    # for the taken action, floored at zero. The episode's first entry keeps
    # its initial importance of 1.
    for i in range(time):
        entry = agent.memory[-i - 1]
        predicted = agent.model.predict(entry[0])[0, entry[1]]
        entry[-1] = max(entry[-2] - predicted, 0)


agent.save("qlearning_Acrobot_1000versuche")
- | - import random | + | #////////////////////////////////////////////// |
- | - import gym | + | </code> |
- | - import numpy as np | + | |
- | - from collections import deque | + | |
- | - from keras.models import Sequential | + | |
- | - from keras.layers import Dense, Dropout | + | |
- | - from keras.optimizers import Adam | + | |
- | - import keras | + | |
- | - | + | |
- | - | + | |
- | - #////////////////////////////// | + | |
- | - | + | |
- | - class DQNAgent: | + | |
- | - def __init__(self, state_size, action_size): | + | |
- | - self.state_size = state_size | + | |
- | - self.action_size = action_size | + | |
- | - self.memory = deque(maxlen=2000) | + | |
- | - self.gamma = 1.0 # discount rate | + | |
- | - self.epsilon = 1.0 # exploration rate | + | |
- | - self.epsilon_min = 0.01 | + | |
- | - self.epsilon_decay = 0.999 | + | |
- | - self.learning_rate = 0.001 | + | |
- | - self.model = self._build_model() | + | |
- | - | + | |
- | - def _build_model(self): | + | |
- | - # Einfaches NN | + | |
- | - model = Sequential() | + | |
- | - model.add(Dense(16, input_dim=self.state_size, activation='relu', | + | |
- | - kernel_regularizer=keras.regularizers.l2(0.00001))) | + | |
- | - #model.add(Dropout(0.3)) | + | |
- | - #model.add(Dense(24, activation='relu', kernel_regularizer=keras.regularizers.l2(0.00001))) | + | |
- | - model.add(Dense(self.action_size, activation='linear')) | + | |
- | - model.compile(loss='mse', | + | |
- | - optimizer=Adam(lr=self.learning_rate)) | + | |
- | - return model | + | |
- | - | + | |
- | - def remember(self, state, action, reward, next_state, done, total, importance): | + | |
- | - # merkt sich alle bisher durchlaufenen Zustände | + | |
- | - self.memory.append([state, action, reward, next_state, done,total,importance]) | + | |
- | - | + | |
- | - def act(self, state): | + | |
- | - # epsilon-greedy: off-policy oder policy | + | |
- | - | + | |
- | - if np.random.rand() <= self.epsilon: | + | |
- | - return random.randrange(self.action_size) | + | |
- | - act_values = self.model.predict(state) | + | |
- | - return np.argmax(act_values[0]) # returns action | + | |
- | - | + | |
- | - def replay(self, batch_size): | + | |
- | - # baut den Vektor der Q-Werte aus | + | |
- | - # als reward zum Zeitpunkt t + gamma*max(moegliche rewards zum Zeitpunkt t+1) | + | |
- | - | + | |
- | - probabilities = np.array([m[-1] for m in self.memory]) | + | |
- | - probabilities = 1./np.sum(probabilities) * probabilities | + | |
- | - #print( probabilities.shape) | + | |
- | - minibatch = [self.memory[i] for i in np.random.choice(range(len(self.memory)),size=batch_size, p=probabilities)] | + | |
- | - states, targets_f = [], [] | + | |
- | - for state, action, reward, next_state, done,total,importance in minibatch: | + | |
- | - target = reward | + | |
- | - if not done: | + | |
- | - target = (reward + self.gamma * | + | |
- | - np.amax(self.model.predict(next_state)[0])) | + | |
- | - target_f = self.model.predict(state) | + | |
- | - target_f[0][action] = target | + | |
- | - # Filtering out states and targets for training | + | |
- | - states.append(state[0]) | + | |
- | - targets_f.append(target_f[0]) | + | |
- | - history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0) | + | |
- | - # Keeping track of loss | + | |
- | - loss = history.history['loss'][0] | + | |
- | - if self.epsilon > self.epsilon_min: | + | |
- | - self.epsilon *= self.epsilon_decay | + | |
- | - return loss | + | |
- | - | + | |
- | - def load(self, name): | + | |
- | - self.model.load_weights(name) | + | |
- | - | + | |
- | - def save(self, name): | + | |
- | - self.model.save_weights(name) | + | |
- | - | + | |
- | - | + | |
- | - EPISODES = 7 | + | |
- | - | + | |
- | - | + | |
- | - env = gym.make('Acrobot-v1') | + | |
- | - state_size = env.observation_space.shape[0] | + | |
- | - action_size = env.action_space.n | + | |
- | - agent = DQNAgent(state_size, action_size) | + | |
- | - done = False | + | |
- | - batch_size = 32 | + | |
- | - | + | |
- | - for e in range(EPISODES): | + | |
- | - state = env.reset() | + | |
- | - state = np.reshape(state, [1, state_size]) | + | |
- | - cum_reward = 0 | + | |
- | - for time in range(500): | + | |
- | - env.render() | + | |
- | - action = agent.act(state) | + | |
- | - next_state, reward, done, _ = env.step(action) | + | |
- | - additional_reward = -(state[0,0] + state[0,0]*state[0,2]-state[0,1]*state[0,3])#*0.2##faktore aus probieren | + | |
- | - reward = reward + additional_reward if not done else 10 # | + | |
- | - cum_reward += reward | + | |
- | - next_state = np.reshape(next_state, [1, state_size]) | + | |
- | - agent.remember(state, action, reward, next_state, done,reward,1) | + | |
- | - state = next_state | + | |
- | - if done: | + | |
- | - print("episode: {}/{}, score: {}, e: {:.2}" | + | |
- | - .format(e, EPISODES, time, agent.epsilon)) | + | |
- | - break | + | |
- | - if len(agent.memory) > batch_size: | + | |
- | - loss = agent.replay(batch_size) | + | |
- | - # Logging training loss and actual reward every 10 timesteps | + | |
- | - if time % 10 == 0: | + | |
- | - print("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}".format(e, EPISODES, time, cum_reward, loss)) | + | |
- | - | + | |
- | - | + | |
- | - for i in range(time): | + | |
- | - pos = -i-1 | + | |
- | - agent.memory[-i-2][-2] += reward | + | |
- | - for j in range(-time,pos): | + | |
- | - new_total = agent.memory[j][-2] + agent.memory[pos][2] | + | |
- | - mem = agent.memory[j] | + | |
- | - agent.memory[j][-1] =new_total | + | |
- | - | + | |
- | - for i in range(time): | + | |
- | - pos = -i-1 | + | |
- | - imp = max(agent.memory[pos][-2]-agent.model.predict(agent.memory[pos][0])[0,agent.memory[pos][1]],0) | + | |
- | - mem = agent.memory[pos] | + | |
- | - agent.memory[pos][-1] = imp | + | |
- | - | + | |
- | - | + | |
- | - agent.save("qlearning_Acrobot_1000versuche") | + | |
- | - | + | |
- | - #////////////////////////////////////////////// | + | |