Program for playing Space Invaders, from 20.12.18:

# -*- coding: utf-8 -*-
"""
Created on Thu Dec 13 16:09:46 2018
 
@author: Luca
"""
 
#!/usr/bin/env python
from __future__ import print_function
 
import sys, gym, time
 
#
# Test yourself as a learning agent! Pass environment name as a command-line argument, for example:
#
# python keyboard_agent.py SpaceInvadersNoFrameskip-v4
#
 
env = gym.make('SpaceInvaders-v0' if len(sys.argv)<2 else sys.argv[1])
 
if not hasattr(env.action_space, 'n'):
    raise Exception('Keyboard agent only supports discrete action spaces')
ACTIONS = env.action_space.n
SKIP_CONTROL = 0    # Reuse the previous control decision SKIP_CONTROL times;
                    # this lets you test how much frame skip is still playable.
 
human_agent_action = 0
human_wants_restart = False
human_sets_pause = False
 
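# Key handling: Return (keysym 0xff0d) requests a restart of the episode, Space (32)
# toggles pause, and the digit keys 1 .. ACTIONS-1 select the corresponding discrete action.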
def key_press(key, mod):
    global human_agent_action, human_wants_restart, human_sets_pause
    if key==0xff0d: human_wants_restart = True
    if key==32: human_sets_pause = not human_sets_pause
    a = int( key - ord('0') )
    if a <= 0 or a >= ACTIONS: return
    human_agent_action = a
 
def key_release(key, mod):
    global human_agent_action
    a = int( key - ord('0') )
    if a <= 0 or a >= ACTIONS: return
    if human_agent_action == a:
        human_agent_action = 0
 
env.render()
env.unwrapped.viewer.window.on_key_press = key_press
env.unwrapped.viewer.window.on_key_release = key_release
 
def rollout(env):
    global human_agent_action, human_wants_restart, human_sets_pause
    human_wants_restart = False
    obser = env.reset()
    skip = 0
    total_reward = 0
    total_timesteps = 0
    while 1:
        if not skip:
            #print("taking action {}".format(human_agent_action))
            a = human_agent_action
            total_timesteps += 1
            skip = SKIP_CONTROL
        else:
            skip -= 1
 
        obser, r, done, info = env.step(a)
        if r != 0:
            print("reward %0.3f" % r)
        total_reward += r
        window_still_open = env.render()
        if window_still_open==False: return False
        if done: break
        if human_wants_restart: break
        while human_sets_pause:
            env.render()
            time.sleep(0.1)
        time.sleep(0.1)
    print("timesteps %i reward %0.2f" % (total_timesteps, total_reward))
 
print("ACTIONS={}".format(ACTIONS))
print("Press keys 1 2 3 ... to take actions 1 2 3 ...")
print("No keys pressed is taking action 0")
 
while 1:
    window_still_open = rollout(env)
    if window_still_open==False: break
 
 
 
 
__**CartPole program from 10.1.2019**__
 
 
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
 
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
 
    def _build_model(self):
        # Simple NN
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
 
    def remember(self, state, action, reward, next_state, done):
        # remembers all states visited so far
        self.memory.append((state, action, reward, next_state, done))
 
    def act(self, state):
        # epsilon-greedy: with probability epsilon take a random action, otherwise act greedily w.r.t. the Q-network
 
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
 
    def replay(self, batch_size):
        # builds the vector of target Q-values as
        # reward at time t + gamma * max(predicted Q-values at time t+1)
 
        minibatch = random.sample(self.memory, batch_size)
        states, targets_f = [], []
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target 
            # Collect states and targets for one batched fit
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss
 
    def load(self, name):
        self.model.load_weights(name)
 
    def save(self, name):
        self.model.save_weights(name)
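 
# Note (added for clarity, not part of the original script): replay() multiplies epsilon
# by epsilon_decay = 0.995 after every call, down to epsilon_min = 0.01. Since replay()
# runs roughly once per timestep, exploration reaches its floor after about
# log(0.01)/log(0.995) calls:
import math
print("epsilon reaches epsilon_min after ~{} replay() calls".format(
    math.ceil(math.log(0.01) / math.log(0.995))))   # -> 919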
# Training:
 
#  play EPISODES episodes, each until the pole falls over
 
 
import gym
 
 
EPISODES = 1000
 
 
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
 
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
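        # Reward shaping: subtract the cart's distance from the center (|next_state[0]|); a terminal step gets reward -10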
        reward = reward-abs(next_state[0]) if not done else -10
        print(next_state[0], reward)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, loss: {:.4f}".format(e, EPISODES, time, loss))
 
# And train again
 
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
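        # Reward shaping: subtract the cart's distance from the center (|next_state[0]|); a terminal step gets reward -10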
        reward = reward-abs(next_state[0]) if not done else -10
        print(next_state[0], reward)
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, loss: {:.4f}".format(e, EPISODES, time, loss)) 
dir(env)
env.action_space
env.observation_space
state = env.reset()
state
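 
The DQNAgent class defines save() and load() wrappers around Keras' save_weights()/load_weights(), but the CartPole script above never calls them. A minimal sketch of how the trained weights could be stored and restored (the filename "cartpole-dqn.h5" is only an example, not from the original code):
 
agent.save("cartpole-dqn.h5")
 
# later, e.g. in a fresh session with the same state_size/action_size:
new_agent = DQNAgent(state_size, action_size)
new_agent.load("cartpole-dqn.h5")
new_agent.epsilon = new_agent.epsilon_min   # act (almost) greedily with the learned Q-network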
 
 
 
__**Current code (Acrobot, 24.1.2019)**__
 
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 24 16:43:47 2019
 
@author: Luca
"""
 
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
import keras
 
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 1.0   # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.999
        self.learning_rate = 0.001
        self.model = self._build_model()
 
    def _build_model(self):
        # Simple NN
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size, activation='relu', 
                        kernel_regularizer=keras.regularizers.l2(0.00001)))
        model.add(Dropout(0.3))
        model.add(Dense(24, activation='relu', kernel_regularizer=keras.regularizers.l2(0.00001)))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
 
    def remember(self, state, action, reward, next_state, done, total, importance):
        # remembers all transitions seen so far, together with a running reward total and an importance weight
        self.memory.append((state, action, reward, next_state, done,total,importance))
 
    def act(self, state):
        # epsilon-greedy: with probability epsilon take a random action, otherwise act greedily w.r.t. the Q-network
 
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
 
    def replay(self, batch_size):
        # builds the vector of target Q-values as
        # reward at time t + gamma * max(predicted Q-values at time t+1)
 
        probabilities = np.array([m[-1] for m in self.memory])
        probabilities = 1./np.sum(probabilities) * probabilities
        #print( probabilities.shape)
        minibatch = [self.memory[i] for i in np.random.choice(range(len(self.memory)),size=batch_size, p=probabilities)]
        states, targets_f = [], []
        for state, action, reward, next_state, done,total,importance in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target 
            # Collect states and targets for one batched fit
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        return loss
 
    def load(self, name):
        self.model.load_weights(name)
 
    def save(self, name):
        self.model.save_weights(name)
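 
# Note (added for clarity, not part of the original script): compared to the CartPole agent,
# remember() now stores the 7-tuple (state, action, reward, next_state, done, total, importance).
# replay() samples the minibatch proportionally to the last entry (the importance), e.g.
# importances [1, 2, 1] become sampling probabilities [0.25, 0.5, 0.25]; the training loop
# below keeps updating total (index -2) and importance (index -1) after every step.
print(np.array([1., 2., 1.]) / np.sum([1., 2., 1.]))   # -> [0.25 0.5 0.25]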
 
# Training:
 
#  play EPISODES episodes, each until the episode terminates
 
 
import gym
 
 
EPISODES = 100
 
 
env = gym.make('Acrobot-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32
 
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cum_reward = 0
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
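        # Reward shaping: divide the reward by (|next_state[0]| + 1)^2 while the episode runs; a terminal step gets reward -10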
        reward = reward/(abs(next_state[0])+1.)**2 if not done else -10
        cum_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done,reward,1)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            loss = agent.replay(batch_size)
            # Logging training loss and actual reward every 10 timesteps
            if time % 10 == 0:
                print("episode: {}/{}, time: {}, cumulative reward: {:.4f}, loss: {:.4f}".format(e, EPISODES, time, cum_reward, loss)) 
 
 
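        # Add each later step's immediate reward to the stored running total (index -2) of all earlier transitions of the current episode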
        for i in range(time):
            pos = -i-1
            for j in range(-time,pos):
                new_total =  agent.memory[j][-2] + agent.memory[pos][2]
                mem = agent.memory[j]
                agent.memory[j] = (mem[0],mem[1],mem[2],mem[3], mem[4],new_total,1)
 
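        # Recompute the sampling importance as max(stored total - predicted Q(state, action), 0) for the transitions of the current episode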
        for i in range(time):
            pos = -i-1
            imp = max(agent.memory[pos][-2]-agent.model.predict(agent.memory[pos][0])[0,agent.memory[pos][1]],0)
            mem = agent.memory[pos]
            agent.memory[pos] = (mem[0],mem[1],mem[2],mem[3], mem[4],mem[5],imp)
 
 
agent.save("qlearning_Acrobot_1000versuche")
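 
A minimal sketch (not part of the original program) of how the saved weights could be loaded again and the agent evaluated without exploration, reusing the environment and the DQNAgent class from above:
 
eval_agent = DQNAgent(state_size, action_size)
eval_agent.load("qlearning_Acrobot_1000versuche")
eval_agent.epsilon = 0.0   # no exploration: always pick the argmax of the predicted Q-values
 
state = np.reshape(env.reset(), [1, state_size])
for t in range(500):
    env.render()
    action = eval_agent.act(state)
    next_state, reward, done, _ = env.step(action)
    state = np.reshape(next_state, [1, state_size])
    if done:
        print("evaluation episode finished after {} timesteps".format(t))
        break
env.close()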