1052 lines
42 KiB
Python
1052 lines
42 KiB
Python
import random
|
|
import numpy as np
|
|
import tensorflow as tf
|
|
from tensorflow import keras
|
|
|
|
from labirinth_ai.LabyrinthWorld import LabyrinthWorld
|
|
from labirinth_ai.loss import loss2, loss3
|
|
from labirinth_ai.Models.BaseModel import BaseModel, train, create_optimizer, device, from_numpy
|
|
|
|
# import torch
|
|
# dtype = torch.float
|
|
# device = torch.device("cpu")
|
|
|
|
|
|
class Subject:
    """Baseline agent that wanders the labyrinth by stepping onto a random walkable neighbour.

    Class attributes:
        name: label of this kind of subject.
        col: numeric code of the kind (other classes in this file compare
            ``col`` values to tell kinds apart).
        num: running counter of created subjects; used to assign ``id``.
        random: flag telling whether the subject acts randomly.
        r, g, b: colour components (presumably for rendering - TODO confirm).
    """

    name = 'random'
    col = 8
    num = 0
    random = True
    r = 255
    g = 255
    b = 255

    def __init__(self, x, y):
        """Create a living subject at board position (x, y) and give it a unique id."""
        self.alive = True
        self.x = x
        self.y = y
        self.kills = 0
        self.lives = 1
        self.tick = 0

        self.id = self.num
        Subject.num += 1

    def _walkable_directions(self, world):
        """Return the unit steps that stay on the board and lead to a non-zero board cell.

        Steps are listed in the order left, right, up, down; (0, 0) is the
        top-left corner, so "up" is (0, -1).
        """
        width, height = world.board_shape[0], world.board_shape[1]
        return [
            (dx, dy)
            for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1))
            if 0 <= self.x + dx < width
            and 0 <= self.y + dy < height
            and world.board[self.x + dx, self.y + dy] != 0
        ]

    def update(self, world: "LabyrinthWorld"):
        """Move one step in a random walkable direction.

        Every living subject on the target cell is marked dead and counted
        as a kill. The cell that is left gets its ``trailMix`` value
        increased by one. Dead subjects and subjects without any walkable
        neighbour do nothing.
        """
        if not self.alive:
            return
        directions = self._walkable_directions(world)
        if not directions:
            return

        # Only consult the RNG when there is an actual choice to make.
        step = random.choice(directions) if len(directions) > 1 else directions[0]

        target = (self.x + step[0], self.y + step[1])
        for other in world.subjectDict[target]:
            if other.alive:
                self.kills += 1
                other.alive = False

        world.subjectDict[(self.x, self.y)].remove(self)
        world.trailMix[self.x, self.y] += 1
        self.x, self.y = target
        world.subjectDict[target].append(self)

    def respawnUpdate(self, x, y, world: "LabyrinthWorld"):
        """Re-register the subject at (x, y), revive it and count one more life."""
        world.subjectDict[(self.x, self.y)].remove(self)
        self.x = x
        self.y = y
        world.subjectDict[(self.x, self.y)].append(self)
        self.alive = True
        self.lives += 1
|
|
|
|
|
|
class QLearner(Subject):
    """Subject that picks its moves greedily from a Q-table.

    The table ``Q`` maps ``str(state)`` to a ``{action: value}`` dict. It is
    defined on the class, so every QLearner instance reads and writes the
    same table.
    """

    name = 'QLearner'
    col = 14
    learningRate = 0.25
    discountFactor = 0.5
    random = False

    Q = {}

    def __init__(self, x, y):
        """Create the learner at (x, y) with an empty action/reward memory."""
        super().__init__(x, y)
        self.viewD = 3
        self.lastAction = None
        self.lastState = None
        self.lastReward = 0

    def respawnUpdate(self, x, y, world: "LabyrinthWorld"):
        """Respawn at (x, y) and charge a reward penalty of 20."""
        super().respawnUpdate(x, y, world)
        self.lastReward -= 20

    def _walkable_directions(self, world):
        """Return the unit steps (left, right, up, down order) onto in-bounds, non-zero board cells."""
        width, height = world.board_shape[0], world.board_shape[1]
        return [
            (dx, dy)
            for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1))
            if 0 <= self.x + dx < width
            and 0 <= self.y + dy < height
            and world.board[self.x + dx, self.y + dy] != 0
        ]

    def createState(self, world: "LabyrinthWorld"):
        """Build the integer observation grid centred on this subject.

        The grid has ``2 * viewD + 1`` cells per side. Only subject positions
        are encoded: each subject from ``world.subjects`` whose offset is
        strictly smaller than ``viewD`` on both axes turns its cell value
        ``v`` into ``v * 100 + 1``. The board itself is not copied into the
        state.
        """
        side = 2 * self.viewD + 1
        # The builtin int replaces the np.int alias that NumPy 1.24 removed.
        state = np.zeros((side, side), int)

        for sub in world.subjects:
            dx = sub.x - self.x
            dy = sub.y - self.y
            if abs(dx) < self.viewD and abs(dy) < self.viewD:
                cell = (self.viewD + dx, self.viewD + dy)
                if state[cell] != 3:
                    state[cell] = state[cell] * 100 + 1

        return state

    def update(self, world: "LabyrinthWorld"):
        """Choose the best known action for the current state, learn, then move.

        Unknown (state, action) pairs are initialised with a random integer
        in [0, 5]. Living subjects on the target cell are killed, each kill
        adding 10 to the reward that is used on the next update.
        """
        if not self.alive:
            return
        directions = self._walkable_directions(world)
        if not directions:
            return

        state = self.createState(world)
        key = str(state)

        q_row = self.Q.setdefault(key, {})
        for step in directions:
            if step not in q_row:
                q_row[step] = random.randint(0, 5)

        allowed = {act: val for act, val in q_row.items() if act in directions}
        action = max(allowed, key=allowed.get)

        if self.learningRate != 0:
            # NOTE(review): the update bootstraps from the value of the very
            # entry being updated (current state/action) combined with the
            # previous step's reward; this is not the textbook Q-learning
            # rule. Kept unchanged - confirm whether it is intended.
            q_row[action] = (1 - self.learningRate) * q_row[action] + self.learningRate * (
                self.lastReward + self.discountFactor * q_row[action])

        self.lastAction = action
        self.lastState = state
        self.lastReward = 0

        target = (self.x + action[0], self.y + action[1])
        for sub in world.subjectDict[target]:
            if sub.alive:
                self.kills += 1
                sub.alive = False
                self.lastReward += 10

        world.subjectDict[(self.x, self.y)].remove(self)
        self.x, self.y = target
        world.subjectDict[target].append(self)
|
|
|
|
|
|
class DoubleQLearner(QLearner):
    """Q-learner that keeps two tables (``QA`` and ``QB``) and acts on their sum.

    Both tables map ``str(state)`` to ``{action: value}`` and are defined on
    the class, so they are shared by all instances. Construction and
    respawning are inherited unchanged from ``QLearner`` (the previous
    overrides only repeated the parent's behaviour).
    """

    name = 'DoubleQLearner'
    col = 11
    learningRate = 0.5
    discountFactor = 0.5
    random = False

    QA = {}
    QB = {}

    def _walkable_directions(self, world):
        """Return the unit steps (left, right, up, down order) onto in-bounds, non-zero board cells."""
        width, height = world.board_shape[0], world.board_shape[1]
        return [
            (dx, dy)
            for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1))
            if 0 <= self.x + dx < width
            and 0 <= self.y + dy < height
            and world.board[self.x + dx, self.y + dy] != 0
        ]

    def update(self, world: "LabyrinthWorld"):
        """Pick the action with the highest ``QA + QB`` value, update one table, then move.

        A coin flip decides which table is updated; the update of one table
        uses the other table's value at the first table's own best action.
        Living subjects on the target cell are killed, each kill adding 10
        to the reward used on the next update.
        """
        if not self.alive:
            return
        directions = self._walkable_directions(world)
        if not directions:
            return

        state = self.createState(world)
        key = str(state)

        if key not in self.QA:
            self.QA[key] = {}
            self.QB[key] = {}
        row_a = self.QA[key]
        row_b = self.QB[key]

        for step in directions:
            if step not in row_a:
                row_a[step] = random.randint(0, 5)
                row_b[step] = random.randint(0, 5)

        allowed_a = {act: val for act, val in row_a.items() if act in directions}
        allowed_b = {act: val for act, val in row_b.items() if act in directions}
        combined = {act: allowed_a[act] + allowed_b[act] for act in allowed_a}

        best_a = max(allowed_a, key=allowed_a.get)
        best_b = max(allowed_b, key=allowed_b.get)
        action = max(combined, key=combined.get)

        if self.learningRate != 0:
            if random.randint(0, 1) == 0:
                old = row_a[action]
                row_a[action] = old + self.learningRate * (
                    self.lastReward + self.discountFactor * row_b[best_a] - old)
            else:
                old = row_b[action]
                row_b[action] = old + self.learningRate * (
                    self.lastReward + self.discountFactor * row_a[best_b] - old)

        self.lastAction = action
        self.lastState = state
        self.lastReward = 0

        target = (self.x + action[0], self.y + action[1])
        for sub in world.subjectDict[target]:
            if sub.alive:
                self.kills += 1
                sub.alive = False
                self.lastReward += 10

        world.subjectDict[(self.x, self.y)].remove(self)
        self.x, self.y = target
        world.subjectDict[target].append(self)
|
|
|
|
|
|
class NetLearner(Subject):
    """Subject whose actions are scored by a neural network (``BaseModel``).

    Each step is split into ``calculateAction`` (choose ``self.action``) and
    ``update`` (record the previous transition, optionally train, then call
    ``executeAction``). ``executeAction`` is a no-op here and is meant to be
    supplied by subclasses.

    NOTE(review): the indentation of this class was lost in the source this
    was restored from; the nesting inside ``calculateAction`` and ``update``
    was reconstructed from the data flow and should be confirmed.
    """

    right = (1, 0)
    left = (-1, 0)
    up = (0, -1)
    down = (0, 1)
    # Index of each action in the per-action sample/target arrays.
    act2IDict = {right: 0, left: 1, up: 2, down: 3}

    name = 'NetLearner'
    col = 15
    viewD = 3
    historyLength = 2
    channels = 4

    learningRate = 0.001
    discountFactor = 0.5
    randomBuffer = 0
    batchsize = 1000
    # Act randomly until at least four batches of samples were collected.
    randomBuffer = max(4 * batchsize, randomBuffer)
    randomChance = 9

    historySizeMul = 20

    def __init__(self, x, y):
        """Create the learner at (x, y) with a fresh model, optimizer and empty sample buffer."""
        super().__init__(x, y)

        self.action = None
        self.state = None
        self.actDict = {}

        self.history = []
        self.lastAction = None
        self.lastState = None
        self.lastReward = 0
        self.lastVal = 0
        self.nextTrain = self.randomBuffer

        self.samples = []

        self.x_in = []
        self.actions = []
        self.target = []
        self.model = BaseModel(self.viewD, 4, 4).to(device)
        self.optimizer = create_optimizer(self.model)

        # With an empty buffer this is True whenever randomBuffer > 0.
        self.random = len(self.samples) < self.randomBuffer

        self.strikes = 0

        self.lastRewards = []

        self.accumulated_rewards = 0

    def respawnUpdate(self, x, y, world: "LabyrinthWorld"):
        """Respawn at (x, y) and decide whether the next life is played randomly.

        The subject acts randomly while fewer than ``randomBuffer`` samples
        exist, or otherwise when ``random.randint(0, 10)`` exceeds
        ``randomChance``. The strike counter is reset.
        """
        super().respawnUpdate(x, y, world)
        self.random = (len(self.samples) < self.randomBuffer
                       or random.randint(0, 10) > self.randomChance)
        self.strikes = 0

    def _walkable_directions(self, world):
        """Return the unit steps (left, right, up, down order) onto in-bounds, non-zero board cells."""
        width, height = world.board_shape[0], world.board_shape[1]
        return [
            (dx, dy)
            for dx, dy in (self.left, self.right, self.up, self.down)
            if 0 <= self.x + dx < width
            and 0 <= self.y + dy < height
            and world.board[self.x + dx, self.y + dy] != 0
        ]

    def visualize(self):
        """Plot, for each of the 8 network outputs, the input pattern obtained by
        pushing a one-hot vector backwards through the layers listed in ``layersN``.

        NOTE(review): relies on ``self.model.get_weights()``; whether
        ``BaseModel`` offers that method is not visible here - confirm.
        """
        import matplotlib.pyplot as plt

        print(self.name)
        layers = self.model.get_weights()
        layersN = [[0, 1, 8, 9, 16], [2, 3, 10, 11, 17], [4, 5, 12, 13, 18], [6, 7, 14, 15, 19]]
        direction_names = {0: 'right', 1: 'left', 2: 'up', 3: 'down'}
        channel_titles = ('board', 'subjects', 'trail', 'grass')
        side = 2 * self.viewD + 1

        for action in range(8):
            v = np.zeros((1, 2))
            v[0][0 if action < 4 else 1] = 1.0
            for n in reversed(layersN[action % 4]):
                weights = layers[n]
                if len(weights.shape) == 2:
                    v = np.dot(v, np.transpose(weights))
                else:
                    v = v + np.array([weights])
            # The trailing two entries belong to the last-action input.
            v = np.reshape(v[0, :-2], (4, side, side))

            # outputs 0-3: current, outputs 4-7: future
            time = 'current ' if action < 4 else 'future '

            fig, axs = plt.subplots(2, 2, figsize=(5, 5))
            fig.suptitle(time + direction_names[action % 4])
            for ax, channel, title in zip(axs.flat, v, channel_titles):
                mesh = ax.pcolor(np.rot90(channel))
                # Each colorbar describes its own panel.
                fig.colorbar(mesh, ax=ax)
                ax.set_title(title)
            plt.show(block=True)

    def createState(self, world: "LabyrinthWorld"):
        """Return the network input: four stacked view channels plus the last action.

        Channels, in order: ``world.board``, subject presence (1 where the
        cell holds any subject), ``world.trailMix`` and
        ``world.hunter_grass``. The result has shape
        ``(1, 4 * side * side + 2)`` with ``side = 2 * viewD + 1``; the last
        two values are the previous action, or ``[0, 0]`` if there is none.
        """
        side = 2 * self.viewD + 1
        # The builtin float replaces the np.float alias that NumPy 1.24 removed.
        board, subjects, trail, grass = (np.zeros((side, side), float) for _ in range(4))

        maxdirleft = self.x - max(self.x - self.viewD, 0)
        maxdirright = min(self.x + self.viewD, world.board_shape[0] - 1) - self.x
        maxdirup = self.y - max(self.y - self.viewD, 0)
        maxdirdown = min(self.y + self.viewD, world.board_shape[1] - 1) - self.y

        # NOTE(review): the upper bounds are exclusive, so the cells at offset
        # +maxdirright / +maxdirdown are never copied. Kept as is - confirm.
        view_rows = slice(self.viewD - maxdirleft, self.viewD + maxdirright)
        view_cols = slice(self.viewD - maxdirup, self.viewD + maxdirdown)
        world_rows = slice(self.x - maxdirleft, self.x + maxdirright)
        world_cols = slice(self.y - maxdirup, self.y + maxdirdown)

        board[view_rows, view_cols] = world.board[world_rows, world_cols]

        for dx in range(-maxdirleft, maxdirright):
            for dy in range(-maxdirup, maxdirdown):
                if world.subjectDict[(self.x + dx, self.y + dy)]:
                    # Centre-relative index keeps this channel aligned with
                    # the other three when the view is clipped at the border.
                    subjects[self.viewD + dx, self.viewD + dy] = 1

        trail[view_rows, view_cols] = world.trailMix[world_rows, world_cols]
        grass[view_rows, view_cols] = world.hunter_grass[world_rows, world_cols]

        area = np.reshape(np.stack((board, subjects, trail, grass)), (4 * side * side))
        action = self.lastAction if self.lastAction is not None else [0, 0]
        return np.reshape(np.concatenate((area, action)), (1, 4 * side * side + 2))

    def calculateAction(self, world: "LabyrinthWorld", vals=None, state=None):
        """Choose ``self.action`` for this step and remember the state it was based on.

        Args:
            world: the labyrinth the subject lives in.
            vals: optional precomputed model output for ``state``; the model
                is evaluated when omitted.
            state: optional precomputed state; built with ``createState``
                when omitted.

        Nothing is changed when the subject is dead or has no walkable
        neighbour (the latter also prints ``'Wut?'``).
        """
        directions = self._walkable_directions(world)
        if not directions:
            print('Wut?')
            return
        if not self.alive:
            return

        if state is None:
            state = self.createState(world)
        if vals is None:
            vals = self.model(from_numpy(state)).detach().numpy()
        # assumes vals holds 8 numbers laid out as (4 actions, 2 values) - TODO confirm
        # NOTE(review): applied to caller-supplied vals as well; confirm
        # against the call sites.
        vals = np.reshape(np.transpose(np.reshape(vals, (4, 2)), (1, 0)), (1, 8))

        self.actDict = {
            self.right: vals[0][0] + vals[0][4],
            self.left: vals[0][1] + vals[0][5],
            self.up: vals[0][2] + vals[0][6],
            self.down: vals[0][3] + vals[0][7],
        }
        allowed = {act: val for act, val in self.actDict.items() if act in directions}

        if self.strikes <= 0:
            self.random = False

        if not self.random:
            self.action = max(allowed, key=allowed.get)
        else:
            self.action = self.randomAct(world)

        self.state = state

    def update(self, world: "LabyrinthWorld", doTrain=True):
        """Record the previous transition, train when due, then execute ``self.action``.

        While acting on the model, stepping straight back (current action is
        the inverse of the last one) adds a strike and any other move removes
        one; more than 100 strikes switch to random play. During random play
        strikes decay by one per step.
        """
        if self.lastAction is not None:
            if not self.random:
                went_back = (self.lastAction[0] + self.action[0] == 0
                             and self.lastAction[1] + self.action[1] == 0)
                self.strikes += 1 if went_back else -1
                if self.strikes > 100:
                    self.random = True
            else:
                self.strikes -= 1

            if len(self.history) >= self.historyLength:
                self.history.pop(0)
            self.history.append((self.lastState.copy(), int(self.act2IDict[self.lastAction]),
                                 int(self.lastVal), float(self.lastReward), np.array(self.lastRewards)))

            if len(self.history) == self.historyLength:
                self.samples.append(self.history.copy())

            if len(self.samples) > self.nextTrain and doTrain:
                print('train')
                self.train()
                self.nextTrain = min(self.batchsize + self.nextTrain,
                                     (self.historySizeMul + 1) * self.batchsize)

            self.accumulated_rewards += self.lastReward

        self.lastAction = self.action
        self.lastState = self.state
        self.lastReward = 0
        self.lastVal = self.actDict[self.action]

        self.executeAction(world, self.action)

    def randomAct(self, world: "LabyrinthWorld"):
        """Return a uniformly chosen walkable direction."""
        directions = self._walkable_directions(world)
        return directions[random.randint(0, len(directions) - 1)]

    def executeAction(self, world: "LabyrinthWorld", action):
        """Apply ``action`` to the world; does nothing in this base class."""

    def generateSamples(self):
        """Draw one training batch per action from the stored samples.

        Each stored sample is a history ``[older entry, newer entry]`` whose
        entries are ``(state, action index, value, reward, per-action
        rewards)``. The newest ``batchsize`` samples are left out. For each
        of the 4 actions, ``batchsize / 4`` samples whose older entry used
        that action are drawn with replacement.

        Returns:
            (states, targets): two lists with one element per action.
            ``states[i]`` holds the older entries' states; ``targets[i]`` has
            shape ``(batchsize / 4, 2, 1)`` with the model's value for action
            ``i`` on the newer state at ``[:, 0, 0]`` and a reward at
            ``[:, 1, 0]``. If no sample exists for an action, samples are
            drawn from the whole pool, the per-action reward is used and the
            value part stays 0.
        """
        true_batch = int(self.batchsize / 4)
        # dtype=object: entries mix arrays and scalars, which NumPy >= 1.24
        # refuses to pack without an explicit object dtype.
        pool = np.array(self.samples[:-self.batchsize], dtype=object)

        states = []
        targets = []
        for i in range(4):
            target = np.zeros((true_batch, 2, 1))

            matches_action = np.array([sample[0, 1] == i for sample in pool], dtype=bool)
            samples = pool[matches_action]

            has_samples = len(samples) > 0
            if not has_samples:
                print('No samples for:' + str(i))
                samples = pool

            index = np.random.choice(np.arange(len(samples)), size=true_batch, replace=True)
            samples = samples[index]

            if has_samples:
                target[:, 1, 0] = samples[:, 1, 3]  # reward stored with the newer entry

                nextState = np.concatenate(samples[:, 1, 0])  # states of the newer entries
                nextVals = self.model(from_numpy(nextState)).detach().numpy()

                target[:, 0, 0] = nextVals[:, i, 0] + nextVals[:, i, 1]
            else:
                per_action = np.array([list(rewards) for rewards in samples[:, 1, 4]])
                target[:, 1, 0] = per_action[:, i]

            targets.append(target)
            states.append(np.concatenate(samples[:, 0, 0]))  # states of the older entries

        return states, targets

    def train(self):
        """Train the model on freshly drawn samples and trim the sample buffer
        to the newest ``historySizeMul * batchsize`` entries."""
        print(self.name)
        states, target = self.generateSamples()
        train(states, target, self.model, self.optimizer)

        self.samples = self.samples[-self.historySizeMul * self.batchsize:]
|
|
|
|
|
|
class Herbivore(NetLearner):
    """Net-driven subject that is rewarded for grass and trails and is caught by hunters.

    NOTE(review): the indentation of this class was lost in the source this
    was restored from; in ``executeAction`` the statement ``self.alive =
    False`` was placed inside the "living hunter on the target cell" branch -
    confirm.
    """

    name = 'Herbivore'
    col = 9
    r = 255
    g = 255
    b = 0
    viewD = 3
    historyLength = 2

    learningRate = 0.001
    discountFactor = 0.5
    randomBuffer = 0
    batchsize = 1000
    # Act randomly until at least two batches of samples were collected.
    randomBuffer = max(2 * batchsize, randomBuffer)
    randomChance = 9

    # Class-level default; NetLearner.__init__ gives each instance its own list.
    samples = []

    def _walkable_directions(self, world):
        """Return the unit steps (left, right, up, down order) onto in-bounds, non-zero board cells."""
        width, height = world.board_shape[0], world.board_shape[1]
        return [
            (dx, dy)
            for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1))
            if 0 <= self.x + dx < width
            and 0 <= self.y + dy < height
            and world.board[self.x + dx, self.y + dy] != 0
        ]

    def createState(self, world: "LabyrinthWorld"):
        """Return the network input: four stacked view channels plus the last action.

        Channels, in order: ``world.board``, subject presence (1 where the
        cell holds any subject), ``world.trailMix`` and ``world.grass``. The
        result has shape ``(1, 4 * side * side + 2)`` with
        ``side = 2 * viewD + 1``; the last two values are the previous
        action, or ``[0, 0]`` if there is none.
        """
        side = 2 * self.viewD + 1
        # The builtin float replaces the np.float alias that NumPy 1.24 removed.
        board, subjects, trail, grass = (np.zeros((side, side), float) for _ in range(4))

        maxdirleft = self.x - max(self.x - self.viewD, 0)
        maxdirright = min(self.x + self.viewD, world.board_shape[0] - 1) - self.x
        maxdirup = self.y - max(self.y - self.viewD, 0)
        maxdirdown = min(self.y + self.viewD, world.board_shape[1] - 1) - self.y

        # NOTE(review): the upper bounds are exclusive, so the cells at offset
        # +maxdirright / +maxdirdown are never copied. Kept as is - confirm.
        view_rows = slice(self.viewD - maxdirleft, self.viewD + maxdirright)
        view_cols = slice(self.viewD - maxdirup, self.viewD + maxdirdown)
        world_rows = slice(self.x - maxdirleft, self.x + maxdirright)
        world_cols = slice(self.y - maxdirup, self.y + maxdirdown)

        board[view_rows, view_cols] = world.board[world_rows, world_cols]

        for dx in range(-maxdirleft, maxdirright):
            for dy in range(-maxdirup, maxdirdown):
                if world.subjectDict[(self.x + dx, self.y + dy)]:
                    # Centre-relative index keeps this channel aligned with
                    # the other three when the view is clipped at the border.
                    subjects[self.viewD + dx, self.viewD + dy] = 1

        trail[view_rows, view_cols] = world.trailMix[world_rows, world_cols]
        grass[view_rows, view_cols] = world.grass[world_rows, world_cols]

        area = np.reshape(np.stack((board, subjects, trail, grass)), (4 * side * side))
        action = self.lastAction if self.lastAction is not None else [0, 0]
        return np.reshape(np.concatenate((area, action)), (1, 4 * side * side + 2))

    def executeAction(self, world: "LabyrinthWorld", action):
        """Move by ``action``, collecting rewards and possibly getting caught.

        Every living ``Hunter`` on the target cell scores a kill (+10 reward
        for the hunter) and this herbivore is marked dead. ``lastRewards``
        stores the grass value of the right, left, up and down neighbour (0
        where that neighbour is not walkable), taken before the move. The
        reward grows by the trail value of the cell that is left and by the
        grass on the cell that is entered; that grass (and hunter grass) is
        then cleared and the entered cell's trail is raised to at least 1.
        """
        directions = self._walkable_directions(world)

        target = (self.x + action[0], self.y + action[1])
        for sub in world.subjectDict[target]:
            if isinstance(sub, Hunter) and sub.alive:
                sub.kills += 1
                sub.lastReward += 10
                self.alive = False

        # Order right, left, up, down.
        self.lastRewards = [
            world.grass[self.x + dx, self.y + dy] if (dx, dy) in directions else 0
            for dx, dy in ((1, 0), (-1, 0), (0, -1), (0, 1))
        ]

        world.subjectDict[(self.x, self.y)].remove(self)
        self.lastReward += world.trailMix[self.x, self.y]
        self.x, self.y = target
        world.subjectDict[target].append(self)
        world.trailMix[self.x, self.y] = max(1.0, world.trailMix[self.x, self.y])
        self.lastReward += (world.grass[self.x, self.y] - 0.0)
        world.grass[self.x, self.y] = 0
        world.hunter_grass[self.x, self.y] = 0

    def randomAct(self, world: "LabyrinthWorld"):
        """Return the walkable direction whose cell holds the most grass.

        Ties go to the first direction in the order left, right, up, down.
        Despite the name no randomness is involved.
        """
        grass_by_step = {
            step: world.grass[self.x + step[0], self.y + step[1]]
            for step in self._walkable_directions(world)
        }
        return max(grass_by_step, key=grass_by_step.get)

    def respawnUpdate(self, x, y, world: "LabyrinthWorld"):
        """Respawn at (x, y) and charge a reward penalty of 1."""
        super().respawnUpdate(x, y, world)
        self.lastReward -= 1
|
|
|
|
|
|
class Hunter(NetLearner):
|
|
name = 'Hunter'
|
|
hunterGrassScale = 0.5
|
|
r = 0
|
|
g = 255
|
|
b = 255
|
|
def randomAct(self, world: "LabyrinthWorld"):
    """Return the neighbouring step with the best heuristic score.

    A neighbour qualifies when it is on the board and its board value is
    above 0.01. Its score is the cell's trail value, plus the cell's hunter
    grass scaled by ``hunterGrassScale``, plus ``viewD`` minus the Euclidean
    distance from the cell to the subject found by ``getClosestSubject``
    (distance ``viewD`` when none is found), plus 10 for every subject of
    another kind standing on the cell. Ties go to the first direction in
    the order left, right, up, down.

    Falls back to the parent's ``randomAct`` when no neighbour qualifies.
    """
    width, height = world.board_shape[0], world.board_shape[1]
    actDict = {}

    for step in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        nx = self.x + step[0]
        ny = self.y + step[1]
        if not (0 <= nx < width and 0 <= ny < height):
            continue
        if not world.board[nx, ny] > 0.01:
            continue

        closest = self.getClosestSubject(world, nx, ny)
        dist = self.viewD
        if closest is not None:
            dist = np.sqrt(np.square(nx - closest.x) + np.square(ny - closest.y))
        distReward = self.viewD - dist

        score = world.trailMix[nx, ny] + world.hunter_grass[nx, ny] * self.hunterGrassScale + distReward
        for other in world.subjectDict[(nx, ny)]:
            if other.col != self.col:
                score += 10
        actDict[step] = score

    if not actDict:
        return super(Hunter, self).randomAct(world)
    return max(actDict, key=actDict.get)
|
|
|
|
def respawnUpdate(self, x, y, world: "LabyrinthWorld"):
    """Respawn at (x, y) via the parent implementation and charge a reward penalty of 1."""
    super(Hunter, self).respawnUpdate(x, y, world)
    self.lastReward -= 1
|
|
|
|
def getClosestSubject(self, world, x, y):
    """Return a living subject of another kind near (x, y), or None.

    Square rings around (x, y) are searched from the inside out, for ring
    distances 1 up to ``viewD - 1``; the cell (x, y) itself is not
    inspected. Within a ring the edge with ``dy == +dist`` is scanned
    first, then ``dy == -dist``, then ``dx == +dist``, then ``dx == -dist``.
    The first living subject whose ``col`` differs from this hunter's is
    returned.

    Every edge is scanned including both of its corners; previously the
    ``(+dist, +dist)`` corner of each ring was never looked at.
    """
    width, height = world.board_shape[0], world.board_shape[1]

    for dist in range(1, self.viewD):
        offsets = range(-dist, dist + 1)
        ring = (
            [(dx, dist) for dx in offsets]
            + [(dx, -dist) for dx in offsets]
            + [(dist, dy) for dy in offsets]
            + [(-dist, dy) for dy in offsets]
        )
        for dx, dy in ring:
            if width > x + dx >= 0 and height > y + dy >= 0:
                for sub in world.subjectDict[(x + dx, y + dy)]:
                    if sub.alive and sub.col != self.col:
                        return sub
    return None
|
|
|
|
def executeAction(self, world: LabyrinthWorld, action):
|
|
grass_factor = 0.5
|
|
|
|
right = (1, 0)
|
|
left = (-1, 0)
|
|
up = (0, -1)
|
|
down = (0, 1)
|
|
directions = []
|
|
|
|
if self.x - 1 >= 0:
|
|
if world.board[self.x - 1, self.y] != 0:
|
|
directions.append(left)
|
|
|
|
if self.x + 1 < world.board_shape[0]:
|
|
if world.board[self.x + 1, self.y] != 0:
|
|
directions.append(right)
|
|
|
|
if self.y - 1 >= 0:
|
|
if world.board[self.x, self.y - 1] != 0:
|
|
directions.append(up)
|
|
|
|
if self.y + 1 < world.board_shape[1]:
|
|
if world.board[self.x, self.y + 1] != 0:
|
|
directions.append(down)
|
|
|
|
if len(action) == 2:
|
|
right_kill = left_kill = up_kill = down_kill = False
|
|
if right in directions:
|
|
for sub in world.subjectDict[(self.x + action[0], self.y + action[1])]:
|
|
if sub.alive:
|
|
if sub.col != self.col:
|
|
right_kill = True
|
|
if left in directions:
|
|
for sub in world.subjectDict[(self.x + left[0], self.y + left[1])]:
|
|
if sub.alive:
|
|
if sub.col != self.col:
|
|
left_kill = True
|
|
if up in directions:
|
|
for sub in world.subjectDict[(self.x + up[0], self.y + up[1])]:
|
|
if sub.alive:
|
|
if sub.col != self.col:
|
|
up_kill = True
|
|
if down in directions:
|
|
for sub in world.subjectDict[(self.x + down[0], self.y + down[1])]:
|
|
if sub.alive:
|
|
if sub.col != self.col:
|
|
down_kill = True
|
|
|
|
if len(world.subjectDict[(self.x + action[0], self.y + action[1])]) > 0:
|
|
for sub in world.subjectDict[(self.x + action[0], self.y + action[1])]:
|
|
if sub.alive:
|
|
self.kills += 1
|
|
if sub.col != self.col:
|
|
self.lastReward += 10
|
|
sub.alive = False
|
|
self.alive = True
|
|
|
|
self.lastRewards = []
|
|
if right in directions:
|
|
sub = self.getClosestSubject(world, self.x + 1, self.y)
|
|
dist = self.viewD
|
|
if sub is not None:
|
|
dist = np.sqrt(np.square(self.x + 1 - sub.x) + np.square(self.y - sub.y))
|
|
distReward = self.viewD - dist
|
|
if right_kill:
|
|
self.lastRewards.append(10 + world.trailMix[self.x + 1, self.y] + world.hunter_grass[self.x + 1, self.y] * grass_factor + distReward)
|
|
else:
|
|
self.lastRewards.append(world.trailMix[self.x + 1, self.y] + world.hunter_grass[self.x + 1, self.y] * grass_factor + distReward)
|
|
else:
|
|
self.lastRewards.append(0)
|
|
if left in directions:
|
|
sub = self.getClosestSubject(world, self.x - 1, self.y)
|
|
dist = self.viewD
|
|
if sub is not None:
|
|
dist = np.sqrt(np.square(self.x - 1 - sub.x) + np.square(self.y - sub.y))
|
|
distReward = self.viewD - dist
|
|
if left_kill:
|
|
self.lastRewards.append(10 + world.trailMix[self.x - 1, self.y] + world.hunter_grass[self.x - 1, self.y] * grass_factor + distReward)
|
|
else:
|
|
self.lastRewards.append(world.trailMix[self.x - 1, self.y] + world.hunter_grass[self.x - 1, self.y] * grass_factor + distReward)
|
|
else:
|
|
self.lastRewards.append(0)
|
|
if up in directions:
|
|
sub = self.getClosestSubject(world, self.x, self.y - 1)
|
|
dist = self.viewD
|
|
if sub is not None:
|
|
dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y - sub.y - 1))
|
|
distReward = self.viewD - dist
|
|
if up_kill:
|
|
self.lastRewards.append(10 + world.trailMix[self.x, self.y - 1] + world.hunter_grass[self.x, self.y - 1] * grass_factor + distReward)
|
|
else:
|
|
self.lastRewards.append(world.trailMix[self.x, self.y - 1] + world.hunter_grass[self.x, self.y - 1] * grass_factor + distReward)
|
|
else:
|
|
self.lastRewards.append(0)
|
|
if down in directions:
|
|
sub = self.getClosestSubject(world, self.x, self.y + 1)
|
|
dist = self.viewD
|
|
if sub is not None:
|
|
dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y + 1 - sub.y))
|
|
distReward = self.viewD - dist
|
|
if down_kill:
|
|
self.lastRewards.append(10 + world.trailMix[self.x, self.y + 1] + world.hunter_grass[self.x, self.y + 1] * grass_factor + distReward)
|
|
else:
|
|
self.lastRewards.append(world.trailMix[self.x, self.y + 1] + world.hunter_grass[self.x, self.y + 1] * grass_factor + distReward)
|
|
else:
|
|
self.lastRewards.append(0)
|
|
assert len(self.lastRewards) == 4, 'Last Rewards not filled correctly!'
|
|
|
|
world.subjectDict[(self.x, self.y)].remove(self)
|
|
self.x += action[0]
|
|
self.y += action[1]
|
|
self.lastReward += world.trailMix[self.x, self.y]
|
|
world.subjectDict[(self.x, self.y)].append(self)
|
|
self.lastReward += (world.hunter_grass[self.x, self.y] * 0.1)
|
|
world.hunter_grass[self.x, self.y] = 0
|
|
|
|
sub = self.getClosestSubject(world, self.x, self.y)
|
|
dist = self.viewD
|
|
if sub is not None:
|
|
dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y - sub.y))
|
|
distReward = self.viewD - dist
|
|
|
|
self.lastReward += distReward
|