"""Subjects that populate the LabyrinthWorld: a random walker, tabular Q- and double-Q-learning
agents, and network-based learners (NetLearner and its Herbivore / Hunter subclasses)."""
import random

import numpy as np
import tensorflow as tf
from tensorflow import keras

from labirinth_ai.LabyrinthWorld import LabyrinthWorld
from labirinth_ai.loss import loss2, loss3
from labirinth_ai.Models.BaseModel import BaseModel, train, create_optimizer, device, from_numpy

# import torch
# dtype = torch.float
# device = torch.device("cpu")


class Subject:
    name = 'random'
    col = 8
    num = 0
    random = True
    r = 255
    g = 255
    b = 255

    def __init__(self, x, y):
        self.alive = True
        self.x = x
        self.y = y
        self.kills = 0
        self.lives = 1
        self.tick = 0
        self.id = self.num
        Subject.num += 1

    def update(self, world: LabyrinthWorld):
        # 0, 0 is top left
        right = (1, 0)
        left = (-1, 0)
        up = (0, -1)
        down = (0, 1)
        directions = []

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] != 0:
                directions.append(left)

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] != 0:
                directions.append(right)

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] != 0:
                directions.append(up)

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] != 0:
                directions.append(down)

        if directions != [] and self.alive:
            if len(directions) > 1:
                d = directions[random.randint(0, len(directions) - 1)]
            else:
                d = directions[0]

            if len(world.subjectDict[(self.x + d[0], self.y + d[1])]) > 0:
                for sub in world.subjectDict[(self.x + d[0], self.y + d[1])]:
                    if sub.alive:
                        self.kills += 1
                        sub.alive = False
                self.alive = True

            world.subjectDict[(self.x, self.y)].remove(self)
            world.trailMix[self.x, self.y] += 1
            self.x += d[0]
            self.y += d[1]
            world.subjectDict[(self.x, self.y)].append(self)

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        world.subjectDict[(self.x, self.y)].remove(self)
        self.x = x
        self.y = y
        world.subjectDict[(self.x, self.y)].append(self)
        self.alive = True
        self.lives += 1


class QLearner(Subject):
    name = 'QLearner'
    col = 14
    learningRate = 0.25
    discountFactor = 0.5
    random = False
    Q = {}

    def __init__(self, x, y):
        super(QLearner, self).__init__(x, y)
        # self.Q = {}
        self.viewD = 3
        self.lastAction = None
        self.lastState = None
        self.lastReward = 0

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        super(QLearner, self).respawnUpdate(x, y, world)
        self.lastReward -= 20

    def createState(self, world: LabyrinthWorld):
        state = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), int)  # - 1

        # # floodfill state
        # queued = [(0, 0)]
        # todo = [(0, 0, 0)]
        # while todo != []:
        #     doing = todo.pop(0)
        #
        #     if self.x + doing[0] >= 0 and self.x + doing[0] < 64 and self.y + doing[1] >= 0 and self.y + doing[1] < 64:
        #         value = world.board[self.x + doing[0], self.y + doing[1]]
        #         state[self.viewD + doing[0], self.viewD + doing[1]] = value
        #
        #         # if value == 3:
        #         #     state[self.viewD + doing[0], self.viewD + doing[1]] = value
        #
        #         if value != 0 and doing[2] < self.viewD:
        #             for i in range(-1, 2, 1):
        #                 for j in range(-1, 2, 1):
        #                     # 4-neighbour; without it, it is 8-neighbour
        #                     if abs(i) + abs(j) == 1:
        #                         if (doing[0] + i, doing[1] + j) not in queued:
        #                             queued.append((doing[0] + i, doing[1] + j))
        #                             todo.append((doing[0] + i, doing[1] + j, doing[2] + 1))
        #
        # for sub in world.subjects:
        #     if (sub.x - self.x, sub.y - self.y) in queued and state[
        #             self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] != 3:
        #         state[self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] = state[
        #             self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] * 100 + sub.col

        maxdirleft = self.x - max(self.x - (self.viewD), 0)
        maxdirright = min(self.x + (self.viewD), (world.board_shape[0] - 1)) - self.x
        maxdirup = self.y - max(self.y - (self.viewD), 0)
        maxdirdown = min(self.y + (self.viewD), (world.board_shape[1] - 1)) - self.y

        # state[self.viewD - maxdirleft: self.viewD + maxdirright, self.viewD - maxdirup: self.viewD + maxdirdown] = world.board[self.x - maxdirleft: self.x + maxdirright, self.y - maxdirup: self.y + maxdirdown]

        for sub in world.subjects:
            if abs(sub.x - self.x) < self.viewD and abs(sub.y - self.y) < self.viewD:
                if state[self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] != 3:
                    state[self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] = state[self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] * 100 + 1  # sub.col

        return state

    def update(self, world: LabyrinthWorld):
        # 0, 0 is top left
        right = (1, 0)
        left = (-1, 0)
        up = (0, -1)
        down = (0, 1)
        directions = []

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] != 0:
                directions.append(left)

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] != 0:
                directions.append(right)

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] != 0:
                directions.append(up)

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] != 0:
                directions.append(down)

        if directions != [] and self.alive:
            state = self.createState(world)
            if str(state) not in self.Q.keys():
                self.Q[str(state)] = {}
            for dir in directions:
                if dir not in self.Q[str(state)].keys():
                    self.Q[str(state)][dir] = random.randint(0, 5)

            allowedActions = dict(filter(lambda elem: elem[0] in directions, self.Q[str(state)].items()))
            action = max(allowedActions, key=allowedActions.get)

            if self.learningRate != 0:
                self.Q[str(state)][action] = (1 - self.learningRate) * self.Q[str(state)][action] + self.learningRate * (self.lastReward + self.discountFactor * self.Q[str(state)][action])

            self.lastAction = action
            self.lastState = state
            self.lastReward = 0

            if len(action) == 2:
                if len(world.subjectDict[(self.x + action[0], self.y + action[1])]) > 0:
                    for sub in world.subjectDict[(self.x + action[0], self.y + action[1])]:
                        if sub.alive:
                            self.kills += 1
                            sub.alive = False
                    self.alive = True
                    self.lastReward += 10

                world.subjectDict[(self.x, self.y)].remove(self)
                self.x += action[0]
                self.y += action[1]
                world.subjectDict[(self.x, self.y)].append(self)
        pass


class DoubleQLearner(QLearner):
    name = 'DoubleQLearner'
    col = 11
    learningRate = 0.5
    discountFactor = 0.5
    random = False
    QA = {}
    QB = {}

    def __init__(self, x, y):
        super(DoubleQLearner, self).__init__(x, y)
        self.viewD = 3
        self.lastAction = None
        self.lastState = None
        self.lastReward = 0

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        super(DoubleQLearner, self).respawnUpdate(x, y, world)

    def update(self, world: LabyrinthWorld):
        # 0, 0 is top left
        right = (1, 0)
        left = (-1, 0)
        up = (0, -1)
        down = (0, 1)
        directions = []

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] != 0:
                directions.append(left)

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] != 0:
                directions.append(right)

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] != 0:
                directions.append(up)

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] != 0:
                directions.append(down)

        if directions != [] and self.alive:
            state = self.createState(world)
            if str(state) not in self.QA.keys():
                self.QA[str(state)] = {}
                self.QB[str(state)] = {}
            for dir in directions:
                if dir not in self.QA[str(state)].keys():
                    self.QA[str(state)][dir] = random.randint(0, 5)
                    self.QB[str(state)][dir] = random.randint(0, 5)

            allowedActionsA = dict(filter(lambda elem: elem[0] in directions, self.QA[str(state)].items()))
            allowedActionsB = dict(filter(lambda elem: elem[0] in directions, self.QB[str(state)].items()))
            allowedActions = {}
            for key in allowedActionsA.keys():
                allowedActions[key] = allowedActionsA[key] + allowedActionsB[key]

            actionA = max(allowedActionsA, key=allowedActionsA.get)
            actionB = max(allowedActionsB, key=allowedActionsB.get)
            action = max(allowedActions, key=allowedActions.get)

            if self.learningRate != 0:
                if random.randint(0, 1) == 0:
                    valA = self.QA[str(state)][action]
                    self.QA[str(state)][action] = valA + self.learningRate * (self.lastReward + self.discountFactor * self.QB[str(state)][actionA] - valA)
                else:
                    valB = self.QB[str(state)][action]
                    self.QB[str(state)][action] = valB + self.learningRate * (self.lastReward + self.discountFactor * self.QA[str(state)][actionB] - valB)

            self.lastAction = action
            self.lastState = state
            self.lastReward = 0

            if len(action) == 2:
                if len(world.subjectDict[(self.x + action[0], self.y + action[1])]) > 0:
                    for sub in world.subjectDict[(self.x + action[0], self.y + action[1])]:
                        if sub.alive:
                            self.kills += 1
                            sub.alive = False
                    self.alive = True
                    self.lastReward += 10

                world.subjectDict[(self.x, self.y)].remove(self)
                self.x += action[0]
                self.y += action[1]
                world.subjectDict[(self.x, self.y)].append(self)
        pass


class NetLearner(Subject):
    right = (1, 0)
    left = (-1, 0)
    up = (0, -1)
    down = (0, 1)
    act2IDict = {right: 0, left: 1, up: 2, down: 3}

    name = 'NetLearner'
    col = 15
    viewD = 3
    historyLength = 2
    channels = 4

    learningRate = 0.001
    discountFactor = 0.5
    randomBuffer = 0
    batchsize = 1000
    randomBuffer = max(4 * batchsize, randomBuffer)
    randomChance = 9

    historySizeMul = 20

    # samples = []

    # x_in = keras.Input(shape=(4 * (2 * viewD + 1) * (2 * viewD + 1) + 2))
    # target = keras.Input(shape=(10, 1))
    # inVec = keras.layers.Flatten()(x_in)
    # # kernel_regularizer=keras.regularizers.l2(0.01)
    # actions = keras.layers.Dense((3 * (2 * viewD + 1) * (2 * viewD + 1)), activation='relu')(inVec)
    # actions = keras.layers.Dense(((2 * viewD + 1) * (2 * viewD + 1)), activation='relu')(actions)
    # actions = keras.layers.Dense(8, activation='linear', use_bias=False)(actions)
    #
    # model = keras.Model(inputs=x_in, outputs=actions)
    #
    # # model.compile(optimizer='adam', loss=loss, target_tensors=[target])
    # model.compile(optimizer=tf.keras.optimizers.RMSprop(learningRate), loss=loss, target_tensors=[target])

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        super(NetLearner, self).respawnUpdate(x, y, world)
        # self.lastReward -= 20

        if len(self.samples) < self.randomBuffer or random.randint(0, 10) > self.randomChance:
            self.random = True
            # print('Rando ' + self.name)
            pass
        else:
            self.random = False
            # print('Slau ' + self.name)

        self.strikes = 0

    def __init__(self, x, y):
        super(NetLearner, self).__init__(x, y)

        self.action = None
        self.state = None
        self.actDict = {}

        self.history = []
        self.lastAction = None
        self.lastState = None
        self.lastReward = 0
        self.lastVal = 0
        self.random = False
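        # Replay-buffer bookkeeping: training first runs once the buffer holds randomBuffer
        # samples and is then repeated every batchsize samples (see update()).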
        self.nextTrain = self.randomBuffer
        self.samples = []

        self.x_in = []
        self.actions = []
        self.target = []

        self.model = BaseModel(self.viewD, 4, 4).to(device)
        self.optimizer = create_optimizer(self.model)

        if len(self.samples) < self.randomBuffer:
            self.random = True
        else:
            self.random = False

        self.strikes = 0

        self.lastRewards = []
        self.accumulated_rewards = 0

    def visualize(self):
        print(self.name)
        layers = self.model.get_weights()
        # layers.reverse()

        layersN = [[0, 1, 8, 9, 16],
                   [2, 3, 10, 11, 17],
                   [4, 5, 12, 13, 18],
                   [6, 7, 14, 15, 19]]

        for action in range(8):
            v = np.zeros((1, 2))
            v[0][0 if action < 4 else 1] = 1.0
            layerN = list(layersN[action % 4])
            layerN.reverse()
            for n in layerN:
                l = layers[n]
                if len(l.shape) == 2:
                    layer = np.transpose(l)
                    v = np.dot(v, layer)
                else:
                    layer = np.array([l])
                    v = v + layer

            lastAction = v[0, -2:]
            v = np.reshape(v[0, :-2], (4, (2 * self.viewD + 1), (2 * self.viewD + 1)))

            # right, left, up, down
            dir = {0: 'right', 1: 'left', 2: 'up', 3: 'down'}
            dir = dir[action % 4]

            # 0-3 current, 4-8 future
            if action < 4:
                time = 'current '
            else:
                time = 'future '

            import matplotlib
            import matplotlib.pyplot as plt

            fig, axs = plt.subplots(2, 2, figsize=(5, 5))
            fig.suptitle(time + dir)

            im = axs[0, 0].pcolor(np.rot90(v[0]))
            fig.colorbar(im, ax=axs[0, 0])
            axs[0, 0].set_title('board')

            axs[0, 1].pcolor(np.rot90(v[1]))
            fig.colorbar(im, ax=axs[0, 1])
            axs[0, 1].set_title('subjects')

            axs[1, 0].pcolor(np.rot90(v[2]))
            fig.colorbar(im, ax=axs[1, 0])
            axs[1, 0].set_title('trail')

            axs[1, 1].pcolor(np.rot90(v[3]))
            fig.colorbar(im, ax=axs[1, 1])
            axs[1, 1].set_title('grass')

            plt.show(block=True)

    def createState(self, world: LabyrinthWorld):
        state = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), float)  # - 1
        state2 = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), float)  # - 1
        state3 = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), float)  # - 1
        state4 = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), float)  # - 1

        maxdirleft = self.x - max(self.x - (self.viewD), 0)
        maxdirright = min(self.x + (self.viewD), (world.board_shape[0] - 1)) - self.x
        maxdirup = self.y - max(self.y - (self.viewD), 0)
        maxdirdown = min(self.y + (self.viewD), (world.board_shape[1] - 1)) - self.y

        state[self.viewD - maxdirleft: self.viewD + maxdirright, self.viewD - maxdirup: self.viewD + maxdirdown] = world.board[self.x - maxdirleft: self.x + maxdirright, self.y - maxdirup: self.y + maxdirdown]

        # for sub in world.subjects:
        #     if abs(sub.x - self.x) < self.viewD and abs(sub.y - self.y) < self.viewD:
        #         if state[self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] != 3:
        #             state2[self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] = sub.col

        for x in range(-maxdirleft, maxdirright, 1):
            for y in range(-maxdirup, maxdirdown, 1):
                if world.subjectDict[(self.x + x, self.y + y)] != []:
                    state2[x + maxdirleft, y + maxdirup] = 1  # world.subjectDict[(self.x + x, self.y + y)][0].col

        state3[self.viewD - maxdirleft: self.viewD + maxdirright, self.viewD - maxdirup: self.viewD + maxdirdown] = world.trailMix[self.x - maxdirleft: self.x + maxdirright, self.y - maxdirup: self.y + maxdirdown]

        state4[self.viewD - maxdirleft: self.viewD + maxdirright, self.viewD - maxdirup: self.viewD + maxdirdown] = world.hunter_grass[self.x - maxdirleft: self.x + maxdirright, self.y - maxdirup: self.y + maxdirdown]

        if not self.random:
            test = 1

        area = np.reshape(np.stack((state, state2, state3, state4)), (4 * (2 * self.viewD + 1) * (2 * self.viewD + 1)))
        action = [0, 0]
        if self.lastAction is not None:
            action = self.lastAction
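        # Flatten the four view channels into one vector and append the previous action,
        # matching the model's expected input width of 4 * (2 * viewD + 1)**2 + 2.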
        return np.reshape(np.concatenate((area, action)), (1, 4 * (2 * self.viewD + 1) * (2 * self.viewD + 1) + 2))

    def calculateAction(self, world: LabyrinthWorld, vals=None, state=None):
        # 0, 0 is top left
        directions = []

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] != 0:
                directions.append(self.left)

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] != 0:
                directions.append(self.right)

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] != 0:
                directions.append(self.up)

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] != 0:
                directions.append(self.down)

        if directions == []:
            print('Wut?')

        if directions != [] and self.alive:
            if state is None:
                state = self.createState(world)
            if vals is None:
                vals = self.model(from_numpy(state)).detach().numpy()
            vals = np.reshape(np.transpose(np.reshape(vals, (4, 2)), (1, 0)), (1, 8))

            self.actDict = {self.right: vals[0][0] + vals[0][4],
                            self.left: vals[0][1] + vals[0][5],
                            self.up: vals[0][2] + vals[0][6],
                            self.down: vals[0][3] + vals[0][7]}

            allowedActions = dict(filter(lambda elem: elem[0] in directions, self.actDict.items()))

            # if self.name == 'Herbivore' and self.id == 11 and not self.random:
            #     print(allowedActions)
            #     print(self.lastReward)

            if self.strikes <= 0:
                self.random = False

            if not self.random:
                self.action = max(allowedActions, key=allowedActions.get)
            else:
                self.action = self.randomAct(world)

            self.state = state

    def update(self, world: LabyrinthWorld, doTrain=True):
        if self.lastAction is not None:
            if not self.random:
                if self.lastAction[0] + self.action[0] == 0 and self.lastAction[1] + self.action[1] == 0:
                    self.strikes += 1
                else:
                    self.strikes -= 1
                if self.strikes > 100:
                    self.random = True
            else:
                self.strikes -= 1

            if len(self.history) >= self.historyLength:
                self.history.pop(0)
            self.history.append((self.lastState.copy(),
                                 int(self.act2IDict[self.lastAction]),
                                 int(self.lastVal),
                                 float(self.lastReward),
                                 np.array(self.lastRewards)))

            # if self.lastReward != 0 or random.randint(0, 9) == 0:
            if len(self.history) == self.historyLength:
                self.samples.append(self.history.copy())

            # if len(self.samples) % self.batchsize == 0 and len(self.samples) >= self.randomBuffer:
            if len(self.samples) > self.nextTrain and doTrain:
                print('train')
                self.train()
                self.nextTrain = min(self.batchsize + self.nextTrain, (self.historySizeMul + 1) * self.batchsize)

            self.accumulated_rewards += self.lastReward

        self.lastAction = self.action
        self.lastState = self.state
        self.lastReward = 0
        self.lastVal = self.actDict[self.action]

        maxVal = 0

        self.executeAction(world, self.action)

    def randomAct(self, world: LabyrinthWorld):
        right = (1, 0)
        left = (-1, 0)
        up = (0, -1)
        down = (0, 1)
        directions = []

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] != 0:
                directions.append(left)

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] != 0:
                directions.append(right)

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] != 0:
                directions.append(up)

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] != 0:
                directions.append(down)

        d = random.randint(0, len(directions) - 1)
        action = directions[d]

        return action

    def executeAction(self, world: LabyrinthWorld, action):
        pass

    def generateSamples(self):
        # history element: (self.lastState.copy(), self.act2IDict[self.lastAction], self.lastVal, self.lastReward, np.array(self.lastRewards))
        # history: [t-2, t-1]
        states = []
        targets = []
        for i in range(4):
            true_batch = int(self.batchsize / 4)
            target = np.zeros((true_batch, 2, 1))
            samples = np.array(self.samples[:-self.batchsize])
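            # Restrict the batch to transitions whose stored action index matches direction i
            # (history elements are (state, action, value, reward, per-direction rewards)).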
            # print('Samples for ' + str(i))
            # print(len(samples))
            samples = np.array(list(filter(lambda e: e[0, 1] == i, list(samples))))
            # print(len(samples))

            partTwo = True
            if len(samples) == 0:
                print('No samples for:' + str(i))
                partTwo = False
                samples = np.array(self.samples[:-self.batchsize])

            buffer_size = len(samples)
            index = np.random.choice(np.arange(buffer_size), size=true_batch, replace=True)
            samples = samples[index]
            # self.samples = []

            if partTwo:
                target[:, 1, 0] = samples[:, 1, 3]  # reward t-2 got

                nextState = np.concatenate(samples[:, 1, 0])  # states of t-1
                nextVals = self.model(from_numpy(nextState)).detach().numpy()
                nextVals2 = nextVals[:, i, 0] + nextVals[:, i, 1]
                target[:, 0, 0] = nextVals2  # best q t-1
            else:
                target[:, 1, 0] = np.array(list(map(lambda elem: list(elem), list(np.array(samples[:, 1, 4])))))[:, i]  # reward t-2 got

            targets.append(target)
            states.append(np.concatenate(samples[:, 0, 0]))  # states of t-2

        return states, targets

    def train(self):
        print(self.name)
        states, target = self.generateSamples()
        train(states, target, self.model, self.optimizer)
        self.samples = self.samples[-self.historySizeMul * self.batchsize:]
        # print(self.model.get_weights())
        pass


class Herbivore(NetLearner):
    name = 'Herbivore'
    col = 9
    r = 255
    g = 255
    b = 0
    viewD = 3
    historyLength = 2

    learningRate = 0.001
    discountFactor = 0.5
    randomBuffer = 0
    batchsize = 1000
    randomBuffer = max(2 * batchsize, randomBuffer)
    randomChance = 9

    samples = []

    # x_in = keras.Input(shape=(4 * (2 * viewD + 1) * (2 * viewD + 1) + 2))
    # target = keras.Input(shape=(10, 1))
    # inVec = keras.layers.Flatten()(x_in)
    # # kernel_regularizer=keras.regularizers.l2(0.01)
    # actions = keras.layers.Dense((4 * (2 * viewD + 1) * (2 * viewD + 1)), activation='elu')(inVec)
    # actions = keras.layers.Dense(((2 * viewD + 1) * (2 * viewD + 1)), activation='elu')(actions)
    # actions = keras.layers.Dense(8, activation='linear', use_bias=False)(actions)
    # # actions = keras.layers.Dense(4, activation='linear', use_bias=False)(inVec)
    #
    # model = keras.Model(inputs=x_in, outputs=actions)
    #
    # # model.compile(optimizer='adam', loss=loss2, target_tensors=[target])
    # model.compile(optimizer=tf.keras.optimizers.RMSprop(learningRate), loss=loss2, target_tensors=[target])

    # def __init__(self, x, y):
    #     super(Herbivore, self).__init__(x, y)

    def createState(self, world: LabyrinthWorld):
        state = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), float)  # - 1
        state2 = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), float)  # - 1
        state3 = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), float)  # - 1
        state4 = np.zeros((2 * self.viewD + 1, 2 * self.viewD + 1), float)  # - 1

        maxdirleft = self.x - max(self.x - (self.viewD), 0)
        maxdirright = min(self.x + (self.viewD), (world.board_shape[0] - 1)) - self.x
        maxdirup = self.y - max(self.y - (self.viewD), 0)
        maxdirdown = min(self.y + (self.viewD), (world.board_shape[1] - 1)) - self.y

        state[self.viewD - maxdirleft: self.viewD + maxdirright, self.viewD - maxdirup: self.viewD + maxdirdown] = world.board[self.x - maxdirleft: self.x + maxdirright, self.y - maxdirup: self.y + maxdirdown]

        # for sub in world.subjects:
        #     if abs(sub.x - self.x) < self.viewD and abs(sub.y - self.y) < self.viewD:
        #         if state[self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] != 3:
        #             state2[self.viewD + sub.x - self.x, self.viewD + sub.y - self.y] = sub.col

        for x in range(-maxdirleft, maxdirright, 1):
            for y in range(-maxdirup, maxdirdown, 1):
                if world.subjectDict[(self.x + x, self.y + y)] != []:
                    state2[x + maxdirleft, y + maxdirup] = 1  # world.subjectDict[(self.x + x, self.y + y)][0].col

        state3[self.viewD - maxdirleft: self.viewD + maxdirright, self.viewD - maxdirup: self.viewD + maxdirdown] = world.trailMix[self.x - maxdirleft: self.x + maxdirright, self.y - maxdirup: self.y + maxdirdown]

        state4[self.viewD - maxdirleft: self.viewD + maxdirright, self.viewD - maxdirup: self.viewD + maxdirdown] = world.grass[self.x - maxdirleft: self.x + maxdirright, self.y - maxdirup: self.y + maxdirdown]

        if not self.random:
            test = 1

        area = np.reshape(np.stack((state, state2, state3, state4)), (4 * (2 * self.viewD + 1) * (2 * self.viewD + 1)))
        action = [0, 0]
        if self.lastAction is not None:
            action = self.lastAction
        return np.reshape(np.concatenate((area, action)), (1, 4 * (2 * self.viewD + 1) * (2 * self.viewD + 1) + 2))

    def executeAction(self, world: LabyrinthWorld, action):
        right = (1, 0)
        left = (-1, 0)
        up = (0, -1)
        down = (0, 1)
        directions = []

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] != 0:
                directions.append(left)

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] != 0:
                directions.append(right)

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] != 0:
                directions.append(up)

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] != 0:
                directions.append(down)

        if len(action) == 2:
            if len(world.subjectDict[(self.x + action[0], self.y + action[1])]) > 0:
                for sub in world.subjectDict[(self.x + action[0], self.y + action[1])]:
                    if isinstance(sub, Hunter):
                        if sub.alive:
                            sub.kills += 1
                            sub.alive = True
                            sub.lastReward += 10
                            self.alive = False

            self.lastRewards = []
            if right in directions:
                self.lastRewards.append(world.grass[self.x + 1, self.y])
            else:
                self.lastRewards.append(0)
            if left in directions:
                self.lastRewards.append(world.grass[self.x - 1, self.y])
            else:
                self.lastRewards.append(0)
            if up in directions:
                self.lastRewards.append(world.grass[self.x, self.y - 1])
            else:
                self.lastRewards.append(0)
            if down in directions:
                self.lastRewards.append(world.grass[self.x, self.y + 1])
            else:
                self.lastRewards.append(0)
            assert len(self.lastRewards) == 4, 'Last Rewards not filled correctly!'
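            # Move onto the chosen cell, mark the trail there, and consume its grass.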
            world.subjectDict[(self.x, self.y)].remove(self)
            self.lastReward += world.trailMix[self.x, self.y]
            self.x += action[0]
            self.y += action[1]
            world.subjectDict[(self.x, self.y)].append(self)
            world.trailMix[self.x, self.y] = max(1.0, world.trailMix[self.x, self.y])

            self.lastReward += (world.grass[self.x, self.y] - 0.0)
            world.grass[self.x, self.y] = 0
            world.hunter_grass[self.x, self.y] = 0

    def randomAct(self, world: LabyrinthWorld):
        right = (1, 0)
        left = (-1, 0)
        up = (0, -1)
        down = (0, 1)
        directions = []
        actDict = {}

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] != 0:
                directions.append(left)
                actDict[left] = world.grass[self.x - 1, self.y]

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] != 0:
                directions.append(right)
                actDict[right] = world.grass[self.x + 1, self.y]

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] != 0:
                directions.append(up)
                actDict[up] = world.grass[self.x, self.y - 1]

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] != 0:
                directions.append(down)
                actDict[down] = world.grass[self.x, self.y + 1]

        allowedActions = dict(filter(lambda elem: elem[0] in directions, actDict.items()))
        action = max(allowedActions, key=allowedActions.get)

        return action

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        super(Herbivore, self).respawnUpdate(x, y, world)
        self.lastReward -= 1


class Hunter(NetLearner):
    name = 'Hunter'
    hunterGrassScale = 0.5
    r = 0
    g = 255
    b = 255

    def randomAct(self, world: LabyrinthWorld):
        right = (1, 0)
        left = (-1, 0)
        up = (0, -1)
        down = (0, 1)
        directions = []
        actDict = {}

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] > 0.01:
                directions.append(left)

                sub = self.getClosestSubject(world, self.x - 1, self.y)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - 1 - sub.x) + np.square(self.y - sub.y))
                distReward = self.viewD - dist

                actDict[left] = world.trailMix[self.x - 1, self.y] + world.hunter_grass[self.x - 1, self.y] * self.hunterGrassScale + distReward

                if len(world.subjectDict[(self.x + left[0], self.y + left[1])]) > 0:
                    for sub in world.subjectDict[(self.x + left[0], self.y + left[1])]:
                        if sub.col != self.col:
                            actDict[left] += 10

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] > 0.01:
                directions.append(right)

                sub = self.getClosestSubject(world, self.x + 1, self.y)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x + 1 - sub.x) + np.square(self.y - sub.y))
                distReward = self.viewD - dist

                actDict[right] = world.trailMix[self.x + 1, self.y] + world.hunter_grass[self.x + 1, self.y] * self.hunterGrassScale + distReward

                if len(world.subjectDict[(self.x + right[0], self.y + right[1])]) > 0:
                    for sub in world.subjectDict[(self.x + right[0], self.y + right[1])]:
                        if sub.col != self.col:
                            actDict[right] += 10

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] > 0.01:
                directions.append(up)

                sub = self.getClosestSubject(world, self.x, self.y - 1)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y - 1 - sub.y))
                distReward = self.viewD - dist

                actDict[up] = world.trailMix[self.x, self.y - 1] + world.hunter_grass[self.x, self.y - 1] * self.hunterGrassScale + distReward

                if len(world.subjectDict[(self.x + up[0], self.y + up[1])]) > 0:
                    for sub in world.subjectDict[(self.x + up[0], self.y + up[1])]:
                        if sub.col != self.col:
                            actDict[up] += 10

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] > 0.01:
                directions.append(down)

                sub = self.getClosestSubject(world, self.x, self.y + 1)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y + 1 - sub.y))
                distReward = self.viewD - dist

                actDict[down] = world.trailMix[self.x, self.y + 1] + world.hunter_grass[self.x, self.y + 1] * self.hunterGrassScale + distReward

                if len(world.subjectDict[(self.x + down[0], self.y + down[1])]) > 0:
                    for sub in world.subjectDict[(self.x + down[0], self.y + down[1])]:
                        if sub.col != self.col:
                            actDict[down] += 10

        if len(actDict) > 0:
            allowedActions = dict(filter(lambda elem: elem[0] in directions, actDict.items()))
        else:
            return super(Hunter, self).randomAct(world)
        action = max(allowedActions, key=allowedActions.get)

        return action

    def respawnUpdate(self, x, y, world: LabyrinthWorld):
        super(Hunter, self).respawnUpdate(x, y, world)
        self.lastReward -= 1

    def getClosestSubject(self, world, x, y):
        for dist in range(1, self.viewD):
            dy = dist
            for dx in range(-dist, dist):
                if world.board_shape[0] > x + dx >= 0 and world.board_shape[1] > y + dy >= 0:
                    for sub in world.subjectDict[(x + dx, y + dy)]:
                        if sub.alive and sub.col != self.col:
                            return sub

            dy = -dist
            for dx in range(-dist, dist):
                if world.board_shape[0] > x + dx >= 0 and world.board_shape[1] > y + dy >= 0:
                    for sub in world.subjectDict[(x + dx, y + dy)]:
                        if sub.alive and sub.col != self.col:
                            return sub

            dx = dist
            for dy in range(-dist, dist):
                if world.board_shape[0] > x + dx >= 0 and world.board_shape[1] > y + dy >= 0:
                    for sub in world.subjectDict[(x + dx, y + dy)]:
                        if sub.alive and sub.col != self.col:
                            return sub

            dx = -dist
            for dy in range(-dist, dist):
                if world.board_shape[0] > x + dx >= 0 and world.board_shape[1] > y + dy >= 0:
                    for sub in world.subjectDict[(x + dx, y + dy)]:
                        if sub.alive and sub.col != self.col:
                            return sub
        return None

    def executeAction(self, world: LabyrinthWorld, action):
        grass_factor = 0.5
        right = (1, 0)
        left = (-1, 0)
        up = (0, -1)
        down = (0, 1)
        directions = []

        if self.x - 1 >= 0:
            if world.board[self.x - 1, self.y] != 0:
                directions.append(left)

        if self.x + 1 < world.board_shape[0]:
            if world.board[self.x + 1, self.y] != 0:
                directions.append(right)

        if self.y - 1 >= 0:
            if world.board[self.x, self.y - 1] != 0:
                directions.append(up)

        if self.y + 1 < world.board_shape[1]:
            if world.board[self.x, self.y + 1] != 0:
                directions.append(down)

        if len(action) == 2:
            right_kill = left_kill = up_kill = down_kill = False
            if right in directions:
                for sub in world.subjectDict[(self.x + action[0], self.y + action[1])]:
                    if sub.alive:
                        if sub.col != self.col:
                            right_kill = True
            if left in directions:
                for sub in world.subjectDict[(self.x + left[0], self.y + left[1])]:
                    if sub.alive:
                        if sub.col != self.col:
                            left_kill = True
            if up in directions:
                for sub in world.subjectDict[(self.x + up[0], self.y + up[1])]:
                    if sub.alive:
                        if sub.col != self.col:
                            up_kill = True
            if down in directions:
                for sub in world.subjectDict[(self.x + down[0], self.y + down[1])]:
                    if sub.alive:
                        if sub.col != self.col:
                            down_kill = True

            if len(world.subjectDict[(self.x + action[0], self.y + action[1])]) > 0:
                for sub in world.subjectDict[(self.x + action[0], self.y + action[1])]:
                    if sub.alive:
                        self.kills += 1
                        if sub.col != self.col:
                            self.lastReward += 10
                        sub.alive = False
                self.alive = True

            self.lastRewards = []
            if right in directions:
                sub = self.getClosestSubject(world, self.x + 1, self.y)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x + 1 - sub.x) + np.square(self.y - sub.y))
                distReward = self.viewD - dist
                if right_kill:
                    self.lastRewards.append(10 + world.trailMix[self.x + 1, self.y] + world.hunter_grass[self.x + 1, self.y] * grass_factor + distReward)
                else:
                    self.lastRewards.append(world.trailMix[self.x + 1, self.y] + world.hunter_grass[self.x + 1, self.y] * grass_factor + distReward)
            else:
                self.lastRewards.append(0)

            if left in directions:
                sub = self.getClosestSubject(world, self.x - 1, self.y)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - 1 - sub.x) + np.square(self.y - sub.y))
                distReward = self.viewD - dist
                if left_kill:
                    self.lastRewards.append(10 + world.trailMix[self.x - 1, self.y] + world.hunter_grass[self.x - 1, self.y] * grass_factor + distReward)
                else:
                    self.lastRewards.append(world.trailMix[self.x - 1, self.y] + world.hunter_grass[self.x - 1, self.y] * grass_factor + distReward)
            else:
                self.lastRewards.append(0)

            if up in directions:
                sub = self.getClosestSubject(world, self.x, self.y - 1)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y - sub.y - 1))
                distReward = self.viewD - dist
                if up_kill:
                    self.lastRewards.append(10 + world.trailMix[self.x, self.y - 1] + world.hunter_grass[self.x, self.y - 1] * grass_factor + distReward)
                else:
                    self.lastRewards.append(world.trailMix[self.x, self.y - 1] + world.hunter_grass[self.x, self.y - 1] * grass_factor + distReward)
            else:
                self.lastRewards.append(0)

            if down in directions:
                sub = self.getClosestSubject(world, self.x, self.y + 1)
                dist = self.viewD
                if sub is not None:
                    dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y + 1 - sub.y))
                distReward = self.viewD - dist
                if down_kill:
                    self.lastRewards.append(10 + world.trailMix[self.x, self.y + 1] + world.hunter_grass[self.x, self.y + 1] * grass_factor + distReward)
                else:
                    self.lastRewards.append(world.trailMix[self.x, self.y + 1] + world.hunter_grass[self.x, self.y + 1] * grass_factor + distReward)
            else:
                self.lastRewards.append(0)
            assert len(self.lastRewards) == 4, 'Last Rewards not filled correctly!'

            world.subjectDict[(self.x, self.y)].remove(self)
            self.x += action[0]
            self.y += action[1]
            self.lastReward += world.trailMix[self.x, self.y]
            world.subjectDict[(self.x, self.y)].append(self)

            self.lastReward += (world.hunter_grass[self.x, self.y] * 0.1)
            world.hunter_grass[self.x, self.y] = 0

            sub = self.getClosestSubject(world, self.x, self.y)
            dist = self.viewD
            if sub is not None:
                dist = np.sqrt(np.square(self.x - sub.x) + np.square(self.y - sub.y))
            distReward = self.viewD - dist

            self.lastReward += distReward
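

if __name__ == '__main__':
    # Illustrative sketch, not part of the simulation: how NetLearner.calculateAction folds the
    # model's 8 outputs into one score per direction. The network is assumed to return 4 rows of
    # (current, future) value pairs; after the reshape/transpose, entries 0-3 are the "current"
    # values and entries 4-7 the "future" values, which are summed per direction.
    dummy_vals = np.arange(8, dtype=np.float32).reshape(4, 2)  # rows: right, left, up, down
    folded = np.reshape(np.transpose(np.reshape(dummy_vals, (4, 2)), (1, 0)), (1, 8))
    scores = {direction: folded[0][i] + folded[0][i + 4]
              for i, direction in enumerate(['right', 'left', 'up', 'down'])}
    print(scores)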