From a5a098e0ef116d0045c42b155e21ffc28330710a Mon Sep 17 00:00:00 2001 From: Pavlo-Andrianatos Date: Sat, 11 Feb 2023 15:53:22 +0200 Subject: [PATCH] First commit, added all needed files --- .../analysis/cpandrianatos/DataEntry.py | 52 ++ .../analysis/cpandrianatos/DataReader.py | 31 ++ .../analysis/cpandrianatos/FunctionalNode.py | 47 ++ .../analysis/cpandrianatos/Genetic_Program.py | 452 ++++++++++++++++ .../analysis/cpandrianatos/Individual.py | 77 +++ .../analysis/cpandrianatos/LoadParameters.py | 43 ++ challenge1/analysis/cpandrianatos/Main.py | 53 ++ challenge1/analysis/cpandrianatos/Node.py | 29 + .../cpandrianatos/OutputPredictions.py | 54 ++ challenge1/analysis/cpandrianatos/README.md | 72 +++ .../analysis/cpandrianatos/Run_script.bat | 3 + .../analysis/cpandrianatos/TerminalNode.py | 34 ++ .../analysis/cpandrianatos/environment.yml | 20 + challenge1/analysis/cpandrianatos/mape.txt | 1 + .../analysis/cpandrianatos/parameters.config | 15 + .../predicted_energy_production.csv | 501 ++++++++++++++++++ 16 files changed, 1484 insertions(+) create mode 100644 challenge1/analysis/cpandrianatos/DataEntry.py create mode 100644 challenge1/analysis/cpandrianatos/DataReader.py create mode 100644 challenge1/analysis/cpandrianatos/FunctionalNode.py create mode 100644 challenge1/analysis/cpandrianatos/Genetic_Program.py create mode 100644 challenge1/analysis/cpandrianatos/Individual.py create mode 100644 challenge1/analysis/cpandrianatos/LoadParameters.py create mode 100644 challenge1/analysis/cpandrianatos/Main.py create mode 100644 challenge1/analysis/cpandrianatos/Node.py create mode 100644 challenge1/analysis/cpandrianatos/OutputPredictions.py create mode 100644 challenge1/analysis/cpandrianatos/README.md create mode 100644 challenge1/analysis/cpandrianatos/Run_script.bat create mode 100644 challenge1/analysis/cpandrianatos/TerminalNode.py create mode 100644 challenge1/analysis/cpandrianatos/environment.yml create mode 100644 
class DataEntry:
    """
    One row of the dataset.

    The eight column values are stored exactly as they are handed to the
    constructor; nothing is converted or validated here, so a caller that
    needs numbers has to convert the values itself.
    """

    ID: float
    Label: float
    House: float
    Year: float
    Month: float
    Temperature: float
    Daylight: float
    EnergyProduction: float

    def __init__(self, id_num, label, house, year, month, temperature, daylight, energy_production):
        # Assign all columns in one unpacking; the order mirrors the parameter list
        (self.ID, self.Label, self.House, self.Year,
         self.Month, self.Temperature, self.Daylight,
         self.EnergyProduction) = (id_num, label, house, year,
                                   month, temperature, daylight,
                                   energy_production)

    # --- identification columns -------------------------------------------
    def getID(self):
        return self.ID

    def getLabel(self):
        return self.Label

    def getHouse(self):
        return self.House

    # --- date columns -----------------------------------------------------
    def getYear(self):
        return self.Year

    def getMonth(self):
        return self.Month

    # --- measurement columns ----------------------------------------------
    def getTemperature(self):
        return self.Temperature

    def getDaylight(self):
        return self.Daylight

    def getEnergyProduction(self):
        return self.EnergyProduction
class FunctionalNode(Node):
    """
    Inner node of an expression tree.

    The label holds the operator of the node (one of + - * /) and the
    children list holds the operands (at most two). A functional node is
    either the root of a tree or sits above the terminal nodes.
    """

    label: str

    children: list

    def __init__(self, label: str, children: list):
        super().__init__()
        self.children = children
        self.label = label

    # Operator accessors
    def getLabel(self) -> str:
        return self.label

    def setLabel(self, newLabel: str):
        self.label = newLabel

    # Operand accessors
    def getChildren(self) -> list:
        return self.children

    def setChildren(self, newChildren: list):
        self.children = newChildren

    def CountNodes(self) -> int:
        """Return the size of the subtree rooted here, this node included."""
        # A node without children counts only itself
        if not self.children:
            return 1
        # Missing (None) child slots are skipped; every other child reports
        # the size of its own subtree recursively
        return 1 + sum(child.CountNodes() for child in self.children if child is not None)
def Perform_Run(self, filename, seed, maxGen, popSize, maxDepth, cross, mutate, reproduction,
                ramped_half_half, random_seed):
    """
    Run one complete training session and return the best tree found.

    The training file is read and shuffled, an initial population is generated
    and then evolved for ``maxGen`` generations. In every generation each
    individual is scored with the mean absolute percentage error (times 100)
    over all training rows, the mating pool is rebuilt, and the next
    population is produced with crossover, mutation and reproduction.

    Parameters:
        filename: name of the training dataset handed to DataReader
        seed: seed for the random module (ignored when random_seed is truthy)
        maxGen: number of generations to evolve
        popSize: number of individuals per generation
        maxDepth: maximum depth used when generating trees
        cross: probability of picking crossover for the next offspring
        mutate: probability of picking mutation for the next offspring
        reproduction: probability of picking reproduction; in practice
            reproduction receives whatever probability is left after
            crossover and mutation
        ramped_half_half: chance that a branch of a grown tree ends early
        random_seed: when truthy a fresh random seed replaces ``seed``

    Returns:
        list: ``[bestIndividual, seed]`` where seed is the seed actually used

    Raises:
        ValueError: if the training dataset has no data rows
        ZeroDivisionError: if a row has an EnergyProduction of 0
    """
    self.seed = seed
    # A truthy random_seed replaces the user-provided seed with a fresh one
    if random_seed:
        self.seed = random.randrange(sys.maxsize)
    random.seed(self.seed)
    self.max_generations = maxGen
    self.pop_size = popSize
    self.max_depth = maxDepth
    self.crossover_chance = cross
    self.mutation_chance = mutate
    self.reproduction_chance = reproduction
    self.ramped_half_half = ramped_half_half

    # Data is read in from the training dataset
    data = DataReader(filename).ReadInData()
    if not data:
        raise ValueError("Training dataset '" + str(filename) + "' contains no data rows")

    # Data is shuffled to try and make each training unique
    random.shuffle(data)

    # The targets never change, so they are parsed once instead of once per individual
    targets = [float(entry.getEnergyProduction()) for entry in data]

    # Generating the initial population of trees
    self.InitialPopulationGeneration()

    # Cumulative threshold: one draw below crossover_chance means crossover, a draw
    # below crossover_chance + mutation_chance means mutation, anything else means
    # reproduction. Testing the same draw against each chance on its own would make
    # the smaller chances unreachable.
    mutationThreshold = self.crossover_chance + self.mutation_chance

    for currentGeneration in range(self.max_generations):
        print("Generation: ", currentGeneration)

        for candidate in self.population:
            totalError = 0.0
            # Every row is paired with its own target value
            for entry, actual in zip(data, targets):
                prediction = float(self.RunGP(candidate, candidate.getRoot(), entry))
                totalError += abs((actual - prediction) / actual)

            # The fitness is the MAPE multiplied by 100; lower is better
            fitness = totalError / len(data) * 100
            candidate.setFitness(fitness)

            # Update best individual
            if self.bestIndividual.getFitness() > fitness:
                self.bestIndividual = copy.deepcopy(candidate)

            # The fitnesses used in Fitness Proportionate Selection are calculated
            candidate.setStandardisedFitness(fitness)
            candidate.setAdjustedFitness(1 / (1 + candidate.getStandardisedFitness()))
            self.totalAdjusted += candidate.getAdjustedFitness()

        self.CreateFitnessProportionateSelection()

        # These genetic operators populate the population used in the next generation
        while len(self.newPopulation) < self.pop_size:
            rand = random.random()
            # Crossover adds two individuals, so it needs two free slots
            roomForPair = len(self.newPopulation) <= self.pop_size - 2
            if rand < self.crossover_chance and roomForPair:
                self.CrossOver()
            elif rand < mutationThreshold:
                self.Mutation()
            else:
                self.Reproduction()

        # The offspring are freshly created objects, so they can be adopted directly
        self.population = self.newPopulation
        self.newPopulation = []
        self.totalAdjusted = 0.0
        self.matingPool = []

        print("Best Individual Fitness: ", self.bestIndividual.getFitness())
        self.bestIndividual.printTree(self.bestIndividual.getRoot(), "-")

    print("Best Final Individual Fitness: ", self.bestIndividual.getFitness())
    self.bestIndividual.printTree(self.bestIndividual.getRoot(), "-")
    print("Seed: ", self.seed)

    return [self.bestIndividual, self.seed]
def GenRndExpr(self, maxDepth, method) -> Node:
    """
    Recursively build the part of a tree below the root created in GenerateTree.

    A terminal node is produced when maxDepth reaches 0, or - for METHOD_GROW
    only - when a random draw falls below ramped_half_half. Otherwise a
    functional node with two recursively generated children is produced.
    """
    # The short-circuit is deliberate: the extra random draw only happens for
    # METHOD_GROW while there is still depth left
    makeLeaf = maxDepth == 0 or (method == "METHOD_GROW" and random.random() < self.ramped_half_half)

    if makeLeaf:
        leaf = TerminalNode("")
        roll = random.random()
        if roll <= 0.16:
            # Will make a constant between 0 and 1000 inclusive
            leaf.setLabel(str(float(random.randint(0, 1000))))
        else:
            # Upper bound of each band paired with the variable it selects
            variableBands = ((0.30, "Label"), (0.44, "House"), (0.58, "Year"),
                             (0.72, "Month"), (0.86, "Temperature"), (1.0, "Daylight"))
            for upperBound, variable in variableBands:
                if roll <= upperBound:
                    leaf.setLabel(variable)
                    break
        return leaf

    branch = FunctionalNode("", [])
    roll = random.random()
    # Four equally wide bands, one per operator
    for upperBound, symbol in ((0.25, "+"), (0.50, "-"), (0.75, "*"), (1.0, "/")):
        if roll <= upperBound:
            branch.setLabel(symbol)
            break

    # Left operand is generated before the right one
    leftOperand = self.GenRndExpr(maxDepth - 1, method)
    rightOperand = self.GenRndExpr(maxDepth - 1, method)
    branch.setChildren([leftOperand, rightOperand])

    return branch
def RunGP(self, individual, node, dataEntry) -> float:
    """
    Interpret the subtree rooted at ``node`` for one data entry.

    Parameters:
        individual: the individual that owns the tree; only passed along to
            the recursive calls
        node: the node to evaluate
        dataEntry: the row supplying the values of the variable terminals

    Returns:
        float: the value of the subtree. A division whose divisor evaluates
        to 0 yields 100000.0 instead of raising.

    Raises:
        ValueError: if the label is neither a variable name, an operator,
            nor a numeric constant
    """
    label = node.getLabel()

    # Variable terminals: the label is the column name, read through its getter.
    # The value is converted here so every path returns a float.
    if label in ("Label", "House", "Year", "Month", "Temperature", "Daylight"):
        return float(getattr(dataEntry, "get" + label)())

    if label in ("+", "-", "*", "/"):
        children = node.getChildren()
        if label == "/":
            # Check if the denominator is 0, can't divide by 0
            divisor = float(self.RunGP(individual, children[1], dataEntry))
            if divisor == 0:
                return 100000.0
            return float(self.RunGP(individual, children[0], dataEntry)) / divisor

        left = float(self.RunGP(individual, children[0], dataEntry))
        right = float(self.RunGP(individual, children[1], dataEntry))
        if label == "+":
            return left + right
        if label == "-":
            return left - right
        return left * right

    # Anything else has to be a numeric constant such as "557.0"
    try:
        return float(label)
    except (TypeError, ValueError):
        raise ValueError("Unknown node label: " + repr(label)) from None
def CreateFitnessProportionateSelection(self):
    """
    Fill the mating pool in proportion to each individual's normalised fitness.

    The normalised fitness is the adjusted fitness divided by totalAdjusted.
    An individual is copied into the pool round(normalisedFitness * pop_size)
    times, so better individuals occur more often and are more likely to be
    drawn by FitnessProportionateSelection.
    """
    for candidate in self.population:
        candidate.setNormalisedFitness(candidate.getAdjustedFitness() / self.totalAdjusted)
        numberOfOccurrences = round(candidate.getNormalisedFitness() * self.pop_size)
        for _ in range(numberOfOccurrences):
            self.matingPool.append(copy.deepcopy(candidate))


def CrossOver(self):
    """
    Create two offspring by exchanging one random subtree between two parents.

    Two individuals are drawn from the mating pool and copied, one non-root
    node is picked in each copy, and the subtrees rooted at those nodes trade
    places. Both offspring have their fitness values reset and are added to
    the next population.
    """
    parentOne = copy.deepcopy(self.FitnessProportionateSelection())
    parentTwo = copy.deepcopy(self.FitnessProportionateSelection())

    rootOne = parentOne.getRoot()
    rootTwo = parentTwo.getRoot()

    numNodesOne = rootOne.CountNodes()
    numNodesTwo = rootTwo.CountNodes()

    # Position 1 is the root, which has no parent slot to swap through, so only
    # positions 2..n are eligible. This also keeps the root of every offspring
    # the node it was generated with.
    if numNodesOne > 1 and numNodesTwo > 1:
        nodeOne = self.CrossOverHelper(rootOne, random.randint(2, numNodesOne))
        nodeTwo = self.CrossOverHelper(rootTwo, random.randint(2, numNodesTwo))

        # Both owners are looked up before anything is moved
        ownerOne = self.CrossOverParentHelper(rootOne, nodeOne)
        ownerTwo = self.CrossOverParentHelper(rootTwo, nodeTwo)

        # The parents are independent deep copies, so the subtrees can simply
        # change places without further copying
        self.ReplaceSubTree(ownerOne, nodeOne, nodeTwo)
        self.ReplaceSubTree(ownerTwo, nodeTwo, nodeOne)

    parentOne.resetValues()
    parentTwo.resetValues()
    self.newPopulation.append(parentOne)
    self.newPopulation.append(parentTwo)


def ReplaceSubTree(self, parent, oldChild, newChild) -> bool:
    """
    Put ``newChild`` into the child slot of ``parent`` that holds ``oldChild``.

    The slot is found by identity, not by label, because several nodes of a
    tree can carry the same label. Returns True when a slot was replaced and
    False when ``parent`` is None or does not hold ``oldChild``.
    """
    if parent is None:
        return False
    children = parent.getChildren() or []
    for index, child in enumerate(children):
        if child is oldChild:
            children[index] = newChild
            return True
    return False


def CrossOverHelper(self, node, point) -> "Node":
    """
    Return the node at position ``point`` of the tree rooted at ``node``.

    Positions are 1-based and follow a breadth-first walk, so position 1 (or
    anything lower) is ``node`` itself. None is returned when ``point`` is
    larger than the number of nodes in the tree.
    """
    if point <= 1:
        return node

    # The list doubles as the queue; a moving index avoids popping from the front
    pending = [node]
    position = 0
    while position < len(pending):
        current = pending[position]
        position += 1
        if position == point:
            return current
        # Empty child slots are skipped, matching how CountNodes counts
        pending.extend(child for child in (current.getChildren() or []) if child is not None)
    return None


def CrossOverParentHelper(self, nodeParent, node) -> "Node":
    """
    Return the node that directly holds ``node`` as a child.

    The tree rooted at ``nodeParent`` is searched breadth-first and children
    are compared by identity. None is returned when ``node`` is the root
    itself or is not part of the tree.
    """
    pending = [nodeParent]
    position = 0
    while position < len(pending):
        current = pending[position]
        position += 1
        children = current.getChildren()
        if not children:
            continue
        for child in children:
            if child is node:
                return current
        pending.extend(child for child in children if child is not None)
    return None


def Mutation(self):
    """
    Create one offspring by replacing a random subtree with a new one.

    An individual is drawn from the mating pool and copied, one non-root node
    is picked, and the subtree rooted there is replaced by a tree generated
    with METHOD_GROW. The offspring has its fitness values reset and is added
    to the next population.
    """
    parent = copy.deepcopy(self.FitnessProportionateSelection())

    root = parent.getRoot()
    numNodes = root.CountNodes()

    # Position 1 is the root; only nodes below it have a slot that can be replaced
    if numNodes > 1:
        node = self.CrossOverHelper(root, random.randint(2, numNodes))
        nodeParent = self.CrossOverParentHelper(root, node)

        newSubTree = self.GenerateTree(self.max_depth, "METHOD_GROW")
        self.ReplaceSubTree(nodeParent, node, newSubTree.getRoot())

    parent.resetValues()

    self.newPopulation.append(parent)
class Individual:
    """
    One tree generated by the Genetic Program (GP) together with its scores.

    Besides the raw fitness, the standardised, adjusted and normalised fitness
    are kept on the individual; those three are used during fitness
    proportionate selection to pick individuals for the next generation.
    The fitness is the MAPE over every data entry multiplied by 100.
    """

    root: Node
    fitness: float

    standardisedFitness: float
    adjustedFitness: float
    normalizedFitness: float

    def __init__(self, newRoot: Node, newFitness: float):
        self.root = newRoot
        self.fitness = newFitness
        # The selection scores start at zero until they are calculated
        self.standardisedFitness = self.adjustedFitness = self.normalizedFitness = 0

    def getRoot(self):
        return self.root

    # Raw fitness
    def getFitness(self):
        return self.fitness

    def setFitness(self, newFitness):
        self.fitness = newFitness

    # Standardised fitness
    def getStandardisedFitness(self):
        return self.standardisedFitness

    def setStandardisedFitness(self, newStandardisedFitness):
        self.standardisedFitness = newStandardisedFitness

    # Adjusted fitness
    def getAdjustedFitness(self):
        return self.adjustedFitness

    def setAdjustedFitness(self, newAdjustedFitness):
        self.adjustedFitness = newAdjustedFitness

    # Normalised fitness (stored in the normalizedFitness attribute)
    def getNormalisedFitness(self):
        return self.normalizedFitness

    def setNormalisedFitness(self, newNormalisedFitness):
        self.normalizedFitness = newNormalisedFitness

    def resetValues(self):
        """Set the fitness to 0.0 and the three selection scores to 0."""
        self.fitness = 0.0
        self.standardisedFitness = self.adjustedFitness = self.normalizedFitness = 0

    def printTree(self, node, appender):
        """Print the subtree at node, one label per line, prefixed by appender."""
        if node is None:
            return
        print(appender, node.getLabel())
        # Nothing below a node without a label or without children
        if node.getLabel() is None or node.getChildren() is None:
            return
        for child in node.getChildren():
            if child is None:
                continue
            # Each level deeper gets one more dash in front of the label
            self.printTree(child, appender + "-")
def ReadInParameters(self):
    """
    Load every run parameter from parameters.config into this object.

    The file is looked up in the current working directory. Values of the
    [Training] section are converted to their numeric types; random_seed is
    treated as enabled for "True", "1", "yes" or "on" (case-insensitive) and
    as disabled for anything else. The test file name is read from the
    [Testing] section.

    Raises:
        FileNotFoundError: if parameters.config cannot be read
        KeyError: if a section or option is missing
        ValueError: if a numeric option cannot be converted
    """
    config = configparser.ConfigParser()

    # read() silently skips files it cannot open and returns the ones it did
    # read, so an empty result means the file is missing or unreadable
    if not config.read("parameters.config"):
        raise FileNotFoundError("parameters.config could not be read from the current working directory")

    training = config["Training"]

    self.fileNameTrain = training["filename"]
    # Accept the usual spellings of "enabled", not only the exact string "True"
    self.random_seed = training["random_seed"].strip().lower() in ("true", "1", "yes", "on")
    self.seed = int(training["seed"])
    self.max_generations = int(training["max_generations"])
    self.pop_size = int(training["pop_size"])
    self.max_depth = int(training["max_depth"])
    self.crossover_chance = float(training["crossover_chance"])
    self.mutation_chance = float(training["mutation_chance"])
    self.reproduction_chance = float(training["reproduction_chance"])
    self.num_of_runs = int(training["num_of_runs"])
    self.ramped_half_half = float(training["ramped_half_half"])

    self.fileNameTest = config["Testing"]["filename"]
class OutputPredictions:
    """
    Evaluates the best individual on the test dataset and writes the results.

    The predicted EnergyProduction of every test row is saved in
    predicted_energy_production.csv and the Mean Absolute Percentage Error
    (MAPE) over the test rows is written to mape.txt.
    """

    def OutputEneryPredictions(self, lp, bestIndividual, bestSeed):
        """
        Predict every row of the test dataset and write both output files.

        Parameters:
            lp: the loaded parameters; only fileNameTest is read
            bestIndividual: the individual whose tree is used for predicting
            bestSeed: the seed that produced bestIndividual (only printed)

        The MAPE written here is the plain mean of the absolute relative
        errors; it is not multiplied by 100.

        Raises:
            ValueError: if the test dataset has no data rows
            ZeroDivisionError: if a row has an EnergyProduction of 0
        """
        data = DataReader(lp.fileNameTest).ReadInData()
        if not data:
            raise ValueError("Test dataset '" + str(lp.fileNameTest) + "' contains no data rows")

        # Only the interpreter (RunGP) of the genetic program is needed here
        gpTest = Genetic_Program()

        totalError = 0.0

        output_house_energy = []

        for entry in data:
            prediction = float(gpTest.RunGP(bestIndividual, bestIndividual.getRoot(), entry))
            actual = float(entry.getEnergyProduction())

            totalError += abs((actual - prediction) / actual)

            # The file holds the predictions, so the predicted value is stored,
            # not the actual value read from the test dataset
            output_house_energy.append([entry.getHouse(), prediction])

        testFitness = totalError / len(data)

        bestIndividual.printTree(bestIndividual.getRoot(), "-")

        print("MAPE: ", testFitness)
        print("Seed: ", bestSeed)

        header = ["House", "EnergyProduction"]

        with open("predicted_energy_production.csv", "w", encoding='UTF8', newline='') as CSV_file:
            writer = csv.writer(CSV_file)
            writer.writerow(header)
            writer.writerows(output_house_energy)

        with open("mape.txt", "w", encoding='UTF8', newline='') as MAPE_txt:
            MAPE_txt.write(str(testFitness))
+ +The program can be run by running Run_script.bat, which runs Main.py + +I opted to use Genetic Programming to solve challenge 1; this uses symbolic regression to obtain an optimal mathematical function that +predicts the EnergyProduction. + +This involves generating trees that get interpreted; using the values of each data entry (Label, House, Year, Month, Temperature, Daylight), a tree predicts an +EnergyProduction. + +There is a parameters.config file where parameters can be changed to alter the generation of solutions. + +The parameters are as follows: + +[Training] +filename = training_dataset_500.csv (Filename of training dataset) +random_seed = True (If True the program will use a random seed) +seed = 123 (Seed to use during training, if random_seed is set to True the user-provided seed is ignored) +max_generations = 5 (Number of generations in each run, will perform crossover, mutation, and reproduction at the end of each generation) +pop_size = 10 (Number of trees in each generation) +max_depth = 4 (The max depth of trees, Mutation can cause a tree to exceed the max_depth. Recommended 2 - 6, can make it more + but the trees become quite big) +crossover_chance = 0.7 (Chance crossover will be used to create a new tree for the next generation) +mutation_chance = 0.2 (Chance mutation will be used to create a new tree for the next generation) +reproduction_chance = 0.1 (Chance reproduction will be used to create a new tree for the next generation. Don't recommend setting + this too high, because then you are just copying over the last population to the next. We want to apply + selection pressure) +num_of_runs = 3 (Number of runs) +ramped_half_half = 0.5 (Chance a tree will stop generating a full tree and the generating branch terminates with a terminal node) + +[Testing] +filename = test_dataset_500.csv (Filename of test dataset) + +The fitness for each individual in the population is the MAPE for the individual given all training data and multiplied by 100. 
class TerminalNode(Node):
    """
    Leaf node of an expression tree.

    The label names one variable of a data entry (row in dataset): Label,
    House, Year, Month, Temperature or Daylight. ID and EnergyProduction are
    excluded; EnergyProduction cannot be used during training because it is
    the value being predicted. Terminal nodes do not have children.
    """

    # variables in data
    # ID Label House Year Month Temperature Daylight EnergyProduction
    label: str

    def __init__(self, label: str):
        super().__init__()
        self.label = label

    def setLabel(self, newLabel: str):
        self.label = newLabel

    def getLabel(self) -> str:
        return self.label

    def CountNodes(self) -> int:
        """Return 1, because a leaf is a subtree consisting of itself only."""
        return 1
+[Testing] +filename = test_dataset_500.csv \ No newline at end of file diff --git a/challenge1/analysis/cpandrianatos/predicted_energy_production.csv b/challenge1/analysis/cpandrianatos/predicted_energy_production.csv new file mode 100644 index 000000000..9956382fe --- /dev/null +++ b/challenge1/analysis/cpandrianatos/predicted_energy_production.csv @@ -0,0 +1,501 @@ +House,EnergyProduction +1,778 +2,627 +3,735 +4,533 +5,533 +6,670 +7,673 +8,560 +9,517 +10,455 +11,455 +12,670 +13,565 +14,455 +15,778 +16,627 +17,586 +18,518 +19,627 +20,467 +21,684 +22,560 +23,471 +24,670 +25,471 +26,668 +27,673 +28,736 +29,627 +30,523 +31,736 +32,523 +33,736 +34,584 +35,584 +36,565 +37,735 +38,673 +39,518 +40,735 +41,523 +42,471 +43,684 +44,735 +45,668 +46,522 +47,455 +48,471 +49,455 +50,668 +51,455 +52,684 +53,627 +54,517 +55,735 +56,471 +57,585 +58,534 +59,627 +60,736 +61,628 +62,523 +63,670 +64,455 +65,517 +66,565 +67,778 +68,523 +69,467 +70,565 +71,467 +72,586 +73,522 +74,585 +75,534 +76,627 +77,670 +78,534 +79,467 +80,534 +81,778 +82,670 +83,627 +84,627 +85,628 +86,886 +87,455 +88,455 +89,451 +90,670 +91,565 +92,533 +93,533 +94,736 +95,668 +96,886 +97,467 +98,584 +99,517 +100,534 +101,518 +102,460 +103,684 +104,735 +105,586 +106,736 +107,467 +108,517 +109,886 +110,778 +111,533 +112,560 +113,517 +114,627 +115,460 +116,471 +117,451 +118,670 +119,533 +120,533 +121,576 +122,684 +123,523 +124,586 +125,560 +126,585 +127,517 +128,673 +129,560 +130,627 +131,576 +132,684 +133,585 +134,534 +135,565 +136,576 +137,471 +138,533 +139,628 +140,585 +141,673 +142,523 +143,467 +144,518 +145,565 +146,518 +147,576 +148,467 +149,534 +150,576 +151,560 +152,886 +153,668 +154,534 +155,471 +156,483 +157,627 +158,565 +159,522 +160,886 +161,736 +162,517 +163,586 +164,460 +165,467 +166,684 +167,517 +168,460 +169,565 +170,560 +171,576 +172,451 +173,670 +174,736 +175,455 +176,471 +177,778 +178,534 +179,736 +180,534 +181,627 +182,576 +183,585 +184,668 +185,778 +186,522 +187,736 +188,518 +189,735 +190,778 
+191,534 +192,735 +193,627 +194,585 +195,673 +196,576 +197,467 +198,471 +199,518 +200,673 +201,517 +202,467 +203,533 +204,560 +205,560 +206,455 +207,460 +208,673 +209,483 +210,735 +211,471 +212,778 +213,460 +214,517 +215,684 +216,522 +217,522 +218,534 +219,455 +220,455 +221,736 +222,673 +223,560 +224,522 +225,518 +226,467 +227,483 +228,523 +229,451 +230,736 +231,533 +232,627 +233,523 +234,668 +235,523 +236,735 +237,517 +238,584 +239,518 +240,517 +241,455 +242,533 +243,518 +244,467 +245,460 +246,522 +247,534 +248,560 +249,576 +250,523 +251,560 +252,460 +253,523 +254,523 +255,455 +256,586 +257,518 +258,684 +259,460 +260,560 +261,517 +262,778 +263,483 +264,455 +265,735 +266,735 +267,455 +268,455 +269,576 +270,455 +271,673 +272,736 +273,565 +274,455 +275,483 +276,560 +277,886 +278,565 +279,523 +280,735 +281,778 +282,627 +283,668 +284,523 +285,517 +286,576 +287,483 +288,523 +289,736 +290,684 +291,735 +292,533 +293,522 +294,471 +295,565 +296,455 +297,585 +298,584 +299,668 +300,628 +301,451 +302,467 +303,560 +304,735 +305,576 +306,565 +307,627 +308,565 +309,460 +310,451 +311,560 +312,517 +313,673 +314,670 +315,628 +316,517 +317,628 +318,628 +319,628 +320,451 +321,460 +322,585 +323,585 +324,627 +325,673 +326,534 +327,565 +328,518 +329,628 +330,523 +331,670 +332,455 +333,736 +334,668 +335,467 +336,627 +337,576 +338,778 +339,628 +340,673 +341,778 +342,736 +343,523 +344,533 +345,467 +346,533 +347,586 +348,736 +349,584 +350,684 +351,735 +352,471 +353,586 +354,523 +355,628 +356,585 +357,483 +358,585 +359,483 +360,736 +361,467 +362,735 +363,628 +364,668 +365,886 +366,668 +367,467 +368,455 +369,460 +370,518 +371,735 +372,627 +373,518 +374,684 +375,670 +376,684 +377,585 +378,628 +379,455 +380,523 +381,735 +382,673 +383,471 +384,684 +385,586 +386,778 +387,565 +388,673 +389,668 +390,522 +391,778 +392,560 +393,673 +394,584 +395,886 +396,627 +397,584 +398,517 +399,735 +400,522 +401,627 +402,455 +403,565 +404,627 +405,627 +406,533 +407,670 +408,684 +409,668 +410,533 +411,673 +412,585 
+413,628 +414,576 +415,585 +416,534 +417,735 +418,576 +419,684 +420,523 +421,585 +422,684 +423,735 +424,455 +425,467 +426,670 +427,668 +428,522 +429,522 +430,523 +431,560 +432,517 +433,586 +434,673 +435,522 +436,533 +437,628 +438,483 +439,483 +440,584 +441,483 +442,560 +443,886 +444,451 +445,585 +446,522 +447,778 +448,586 +449,451 +450,584 +451,522 +452,627 +453,684 +454,736 +455,585 +456,534 +457,628 +458,736 +459,668 +460,522 +461,778 +462,471 +463,455 +464,533 +465,627 +466,460 +467,565 +468,576 +469,483 +470,534 +471,736 +472,533 +473,471 +474,584 +475,522 +476,576 +477,684 +478,576 +479,451 +480,585 +481,517 +482,585 +483,471 +484,518 +485,668 +486,560 +487,627 +488,585 +489,560 +490,584 +491,534 +492,670 +493,736 +494,533 +495,455 +496,483 +497,628 +498,673 +499,735 +500,586