__**Program for playing Space Invaders, 20.12.18:**__

# -*- coding: utf-8 -*-
"""
Created on Thu Dec 13 16:09:46 2018

@author: Luca
"""
#!/usr/bin/env python
from __future__ import print_function

import sys, gym, time

#
# Test yourself as a learning agent! Pass environment name as a command-line argument, for example:
#
# python keyboard_agent.py SpaceInvadersNoFrameskip-v4
#
env = gym.make('SpaceInvaders-v0' if len(sys.argv) < 2 else sys.argv[1])

if not hasattr(env.action_space, 'n'):
    raise Exception('Keyboard agent only supports discrete action spaces')
ACTIONS = env.action_space.n
SKIP_CONTROL = 0    # Use previous control decision SKIP_CONTROL times, that's how you
                    # can test what skip is still usable.

human_agent_action = 0
human_wants_restart = False
human_sets_pause = False

def key_press(key, mod):
    global human_agent_action, human_wants_restart, human_sets_pause
    if key == 0xff0d: human_wants_restart = True           # Enter restarts the rollout
    if key == 32: human_sets_pause = not human_sets_pause  # Space toggles pause
    a = int(key - ord('0'))
    if a <= 0 or a >= ACTIONS: return
    human_agent_action = a

def key_release(key, mod):
    global human_agent_action
    a = int(key - ord('0'))
    if a <= 0 or a >= ACTIONS: return
    if human_agent_action == a:
        human_agent_action = 0

env.render()
env.unwrapped.viewer.window.on_key_press = key_press
env.unwrapped.viewer.window.on_key_release = key_release

def rollout(env):
    global human_agent_action, human_wants_restart, human_sets_pause
    human_wants_restart = False
    obser = env.reset()
    skip = 0
    total_reward = 0
    total_timesteps = 0
    while 1:
        if not skip:
            # print("taking action {}".format(human_agent_action))
            a = human_agent_action
            total_timesteps += 1
            skip = SKIP_CONTROL
        else:
            skip -= 1

        obser, r, done, info = env.step(a)
        if r != 0:
            print("reward %0.3f" % r)
        total_reward += r
        window_still_open = env.render()
        if window_still_open == False: return False
        if done: break
        if human_wants_restart: break
        while human_sets_pause:
            env.render()
            time.sleep(0.1)
        time.sleep(0.1)
    print("timesteps %i reward %0.2f" % (total_timesteps, total_reward))

print("ACTIONS={}".format(ACTIONS))
print("Press keys 1 2 3 ... to take actions 1 2 3 ...")
print("No keys pressed is taking action 0")

while 1:
    window_still_open = rollout(env)
    if window_still_open == False:
        break
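The number keys map directly to the environment's discrete action indices (no key pressed means action 0). A small sketch, assuming the same gym/Atari installation as above, that prints what each index does and so makes the key-to-action mapping explicit:

import gym

env = gym.make('SpaceInvaders-v0')
# The wrapped AtariEnv exposes human-readable names for the discrete actions;
# pressing number key i in the keyboard agent selects action i.
print(env.unwrapped.get_action_meanings())
# typically something like ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']
env.close()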
__**Program for CartPole, 10.1.2019:**__

import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Simple feed-forward NN
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        # stores every transition seen so far
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        # epsilon-greedy: random (off-policy) or greedy (on-policy) action
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        # builds the vector of Q-values as
        # reward at time t + gamma * max(possible rewards at time t+1)
        minibatch = random.sample(self.memory, batch_size)
        states, targets_f = [], []
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # Filtering out states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f),
                                 epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)


# Learning:
# play through EPISODES episodes, each until the pole falls over
EPISODES = 1000

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward - abs(next_state[0]) if not done else -10
        print(next_state[0], reward)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, loss: {:.4f}"
                      .format(e, EPISODES, time, loss))

# And train again
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward - abs(next_state[0]) if not done else -10
        print(next_state[0], reward)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, loss: {:.4f}"
                      .format(e, EPISODES, time, loss))

# Exploring the environment interactively:
dir(env)
env.action_space
env.observation_space
state = env.reset()
state
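For reference, the replay() step above regresses the network output Q(s, a) toward the one-step Q-learning target: for a stored transition (s, a, r, s'),

    target = r + 0.95 * max_a' Q(s', a')    if the episode continues,
    target = r                              if the episode is done,

and after every replay() call epsilon is multiplied by 0.995 until it reaches 0.01, so the agent gradually shifts from random exploration to acting greedily.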
__**Current code:**__

# -*- coding: utf-8 -*-
"""
Created on Thu Jan 24 16:43:47 2019

@author: Luca
"""
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
import keras


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 1.0     # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        # Simple NN, now with L2 regularization and dropout
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size, activation='relu',
                        kernel_regularizer=keras.regularizers.l2(0.00001)))
        model.add(Dropout(0.3))
        model.add(Dense(24, activation='relu',
                        kernel_regularizer=keras.regularizers.l2(0.00001)))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done, total, importance):
        # stores every transition seen so far, plus its running total reward and importance weight
        self.memory.append((state, action, reward, next_state, done, total, importance))

    def act(self, state):
        # epsilon-greedy: random (off-policy) or greedy (on-policy) action
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action

    def replay(self, batch_size):
        # builds the vector of Q-values as
        # reward at time t + gamma * max(possible rewards at time t+1);
        # transitions are sampled in proportion to their importance weight
        probabilities = np.array([m[-1] for m in self.memory])
        probabilities = 1. / np.sum(probabilities) * probabilities
        # print(probabilities.shape)
        minibatch = [self.memory[i] for i in
                     np.random.choice(range(len(self.memory)), size=batch_size, p=probabilities)]
        states, targets_f = [], []
        for state, action, reward, next_state, done, total, importance in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # Filtering out states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f),
                                 epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
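# Note: each memory entry is a 7-tuple
#   (state, action, reward, next_state, done, total, importance)
# where "total" accumulates the reward collected from that step onward (filled in
# by the loops after each episode below) and "importance" is the sampling weight
# used by replay().  A standalone illustration of that weighted sampling, with
# hypothetical example weights that are not part of the training run:
example_importance = np.array([0.5, 2.0, 0.1, 1.4])
example_probabilities = example_importance / example_importance.sum()  # normalized to sum to 1
example_batch = np.random.choice(len(example_importance), size=2, p=example_probabilities)
# Entries with weight 0 are never drawn; the sum stays nonzero as long as at
# least one stored transition has a positive weight (new ones are stored with 1).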
# Learning:
# play through EPISODES episodes
EPISODES = 100

env = gym.make('Acrobot-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cum_reward = 0
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward / (abs(next_state[0]) + 1.)**2 if not done else -10
        # cum_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done, reward, 1)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss and actual reward every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}"
                      .format(e, EPISODES, time, cum_reward, loss))
    # After the episode: add each step's reward to all earlier transitions of this
    # episode, so the stored "total" approximates the return from that step onward
    for i in range(time):
        pos = -i - 1
        for j in range(-time, pos):
            new_total = agent.memory[j][-2] + agent.memory[pos][2]
            mem = agent.memory[j]
            agent.memory[j] = (mem[0], mem[1], mem[2], mem[3], mem[4], new_total, 1)
    # Importance weight = positive part of (observed return - current Q estimate)
    for i in range(time):
        pos = -i - 1
        imp = max(agent.memory[pos][-2] -
                  agent.model.predict(agent.memory[pos][0])[0, agent.memory[pos][1]], 0)
        mem = agent.memory[pos]
        agent.memory[pos] = (mem[0], mem[1], mem[2], mem[3], mem[4], mem[5], imp)

agent.save("qlearning_Acrobot_1000versuche")
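To inspect the saved network later, a minimal sketch that reloads the weights and plays one episode; it assumes the same session (DQNAgent, env, state_size, action_size and np as defined above), and setting epsilon to 0 for a purely greedy policy is an addition, not part of the original run:

# Evaluation sketch: reload the saved weights and run one greedy episode.
eval_agent = DQNAgent(state_size, action_size)
eval_agent.load("qlearning_Acrobot_1000versuche")
eval_agent.epsilon = 0.0   # no exploration during evaluation

state = np.reshape(env.reset(), [1, state_size])
done = False
while not done:
    env.render()
    action = eval_agent.act(state)
    next_state, reward, done, _ = env.step(action)
    state = np.reshape(next_state, [1, state_size])
env.close()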