| 1 | package de.ugoe.cs.cpdp.training; |
|---|
| 2 | |
|---|
| 3 | import java.util.LinkedList; |
|---|
| 4 | import java.util.List; |
|---|
| 5 | |
|---|
| 6 | import org.apache.commons.collections4.list.SetUniqueList; |
|---|
| 7 | |
|---|
| 8 | import weka.classifiers.AbstractClassifier; |
|---|
| 9 | import weka.classifiers.Classifier; |
|---|
| 10 | import weka.core.Instance; |
|---|
| 11 | import weka.core.Instances; |
|---|
| 12 | import org.apache.commons.lang3.ArrayUtils; |
|---|
| 13 | import org.jgap.Configuration; |
|---|
| 14 | import org.jgap.InvalidConfigurationException; |
|---|
| 15 | import org.jgap.gp.CommandGene; |
|---|
| 16 | import org.jgap.gp.GPProblem; |
|---|
| 17 | |
|---|
| 18 | import org.jgap.gp.function.Add; |
|---|
| 19 | import org.jgap.gp.function.Multiply; |
|---|
| 20 | import org.jgap.gp.function.Log; |
|---|
| 21 | import org.jgap.gp.function.Subtract; |
|---|
| 22 | import org.jgap.gp.function.Divide; |
|---|
| 23 | import org.jgap.gp.function.Sine; |
|---|
| 24 | import org.jgap.gp.function.Cosine; |
|---|
| 25 | import org.jgap.gp.function.Max; |
|---|
| 26 | import org.jgap.gp.function.Exp; |
|---|
| 27 | |
|---|
| 28 | import org.jgap.gp.impl.DeltaGPFitnessEvaluator; |
|---|
| 29 | import org.jgap.gp.impl.GPConfiguration; |
|---|
| 30 | import org.jgap.gp.impl.GPGenotype; |
|---|
| 31 | import org.jgap.gp.impl.TournamentSelector; |
|---|
| 32 | import org.jgap.gp.terminal.Terminal; |
|---|
| 33 | import org.jgap.gp.GPFitnessFunction; |
|---|
| 34 | import org.jgap.gp.IGPProgram; |
|---|
| 35 | import org.jgap.gp.terminal.Variable; |
|---|
| 36 | import org.jgap.gp.MathCommand; |
|---|
| 37 | import org.jgap.util.ICloneable; |
|---|
| 38 | |
|---|
| 39 | import de.ugoe.cs.cpdp.util.WekaUtils; |
|---|
| 40 | |
|---|
| 41 | import org.jgap.gp.impl.ProgramChromosome; |
|---|
| 42 | import org.jgap.util.CloneException; |
|---|
| 43 | |
|---|
| 44 | /** |
|---|
| 45 | * Genetic Programming Trainer |
|---|
| 46 | * |
|---|
| 47 | * Implementation (mostly) according to Liu et al. Evolutionary Optimization of Software Quality Modeling with Multiple Repositories. |
|---|
| 48 | * |
|---|
| 49 | * - GPRun is a run of a complete genetic program evolution; we want several complete runs. |
|---|
| 50 | * - GPVClassifier is the Validation Classifier |
|---|
| 51 | * - GPVVClassifier is the Validation-Voting Classifier |
|---|
| 52 | * |
|---|
| 53 | * config: <setwisetrainer name="GPTraining" param="populationSize:1000,numberRuns:10" /> |
|---|
| 54 | */ |
|---|
| 55 | public class GPTraining implements ISetWiseTrainingStrategy, IWekaCompatibleTrainer { |
|---|
| 56 | |
|---|
| 57 | private GPVVClassifier classifier = null; |
|---|
| 58 | |
|---|
| 59 | // default values from the paper |
|---|
| 60 | private int populationSize = 1000; |
|---|
| 61 | private int initMinDepth = 2; |
|---|
| 62 | private int initMaxDepth = 6; |
|---|
| 63 | private int tournamentSize = 7; |
|---|
| 64 | private int maxGenerations = 50; |
|---|
| 65 | private double errorType2Weight = 15; |
|---|
| 66 | private int numberRuns = 20; // im paper 20 per errorType2Weight then additional 20 |
|---|
| 67 | private int maxDepth = 20; // max depth within one program |
|---|
| 68 | private int maxNodes = 100; // max nodes within one program |
|---|
| 69 | |
|---|
| 70 | @Override |
|---|
| 71 | public void setParameter(String parameters) { |
|---|
| 72 | |
|---|
| 73 | String[] params = parameters.split(","); |
|---|
| 74 | String[] keyvalue = new String[2]; |
|---|
| 75 | |
|---|
| 76 | for(int i=0; i < params.length; i++) { |
|---|
| 77 | keyvalue = params[i].split(":"); |
|---|
| 78 | |
|---|
| 79 | switch(keyvalue[0]) { |
|---|
| 80 | case "populationSize": |
|---|
| 81 | this.populationSize = Integer.parseInt(keyvalue[1]); |
|---|
| 82 | break; |
|---|
| 83 | |
|---|
| 84 | case "initMinDepth": |
|---|
| 85 | this.initMinDepth = Integer.parseInt(keyvalue[1]); |
|---|
| 86 | break; |
|---|
| 87 | |
|---|
| 88 | case "tournamentSize": |
|---|
| 89 | this.tournamentSize = Integer.parseInt(keyvalue[1]); |
|---|
| 90 | break; |
|---|
| 91 | |
|---|
| 92 | case "maxGenerations": |
|---|
| 93 | this.maxGenerations = Integer.parseInt(keyvalue[1]); |
|---|
| 94 | break; |
|---|
| 95 | |
|---|
| 96 | case "errorType2Weight": |
|---|
| 97 | this.errorType2Weight = Double.parseDouble(keyvalue[1]); |
|---|
| 98 | break; |
|---|
| 99 | |
|---|
| 100 | case "numberRuns": |
|---|
| 101 | this.numberRuns = Integer.parseInt(keyvalue[1]); |
|---|
| 102 | break; |
|---|
| 103 | |
|---|
| 104 | case "maxDepth": |
|---|
| 105 | this.maxDepth = Integer.parseInt(keyvalue[1]); |
|---|
| 106 | break; |
|---|
| 107 | |
|---|
| 108 | case "maxNodes": |
|---|
| 109 | this.maxNodes = Integer.parseInt(keyvalue[1]); |
|---|
| 110 | break; |
|---|
| 111 | } |
|---|
| 112 | } |
|---|
| 113 | |
|---|
| 114 | this.classifier = new GPVVClassifier(); |
|---|
| 115 | ((GPVClassifier)this.classifier).configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, errorType2Weight, numberRuns, maxDepth, maxNodes); |
|---|
| 116 | } |
|---|
| 117 | |
|---|
| 118 | @Override |
|---|
| 119 | public void apply(SetUniqueList<Instances> traindataSet) { |
|---|
| 120 | try { |
|---|
| 121 | classifier.buildClassifier(traindataSet); |
|---|
| 122 | }catch(Exception e) { |
|---|
| 123 | throw new RuntimeException(e); |
|---|
| 124 | } |
|---|
| 125 | } |
|---|
| 126 | |
|---|
| 127 | @Override |
|---|
| 128 | public String getName() { |
|---|
| 129 | return "GPTraining"; |
|---|
| 130 | } |
|---|
| 131 | |
|---|
| 132 | @Override |
|---|
| 133 | public Classifier getClassifier() { |
|---|
| 134 | return this.classifier; |
|---|
| 135 | } |
|---|
| 136 | |
|---|
| 137 | public class InstanceData { |
|---|
| 138 | private double[][] instances_x; |
|---|
| 139 | private boolean[] instances_y; |
|---|
| 140 | |
|---|
| 141 | public InstanceData(Instances instances) { |
|---|
| 142 | this.instances_x = new double[instances.numInstances()][instances.numAttributes()-1]; |
|---|
| 143 | this.instances_y = new boolean[instances.numInstances()]; |
|---|
| 144 | |
|---|
| 145 | Instance current; |
|---|
| 146 | for(int i=0; i < this.instances_x.length; i++) { |
|---|
| 147 | current = instances.get(i); |
|---|
| 148 | this.instances_x[i] = WekaUtils.instanceValues(current); |
|---|
| 149 | this.instances_y[i] = 1.0 == current.classValue(); |
|---|
| 150 | } |
|---|
| 151 | } |
|---|
| 152 | |
|---|
| 153 | public double[][] getX() { |
|---|
| 154 | return instances_x; |
|---|
| 155 | } |
|---|
| 156 | public boolean[] getY() { |
|---|
| 157 | return instances_y; |
|---|
| 158 | } |
|---|
| 159 | } |
|---|
| 160 | |
|---|
| 161 | /** |
|---|
| 162 | * One Run executed by a GP Classifier |
|---|
| 163 | */ |
|---|
| 164 | public class GPRun extends AbstractClassifier { |
|---|
| 165 | private static final long serialVersionUID = -4250422550107888789L; |
|---|
| 166 | |
|---|
| 167 | private int populationSize; |
|---|
| 168 | private int initMinDepth; |
|---|
| 169 | private int initMaxDepth; |
|---|
| 170 | private int tournamentSize; |
|---|
| 171 | private int maxGenerations; |
|---|
| 172 | private double errorType2Weight; |
|---|
| 173 | private int maxDepth; |
|---|
| 174 | private int maxNodes; |
|---|
| 175 | |
|---|
| 176 | private GPGenotype gp; |
|---|
| 177 | private GPProblem problem; |
|---|
| 178 | |
|---|
        /**
         * Stores the GP parameters for the next {@link #buildClassifier(Instances)} call.
         *
         * @param populationSize size of the GP population
         * @param initMinDepth minimum depth of initially generated programs
         * @param initMaxDepth maximum depth of initially generated programs
         * @param tournamentSize size used by the tournament selector
         * @param maxGenerations number of generations the run evolves
         * @param errorType2Weight weight of type-2 errors in the fitness function
         * @param maxDepth maximum depth of one evolved program
         * @param maxNodes maximum number of nodes of one evolved program
         */
        public void configure(int populationSize, int initMinDepth, int initMaxDepth, int tournamentSize, int maxGenerations, double errorType2Weight, int maxDepth, int maxNodes) {
            this.populationSize = populationSize;
            this.initMinDepth = initMinDepth;
            this.initMaxDepth = initMaxDepth;
            this.tournamentSize = tournamentSize;
            this.maxGenerations = maxGenerations;
            this.errorType2Weight = errorType2Weight;
            this.maxDepth = maxDepth;
            this.maxNodes = maxNodes;
        }
|---|
| 189 | |
|---|
| 190 | public GPGenotype getGp() { |
|---|
| 191 | return this.gp; |
|---|
| 192 | } |
|---|
| 193 | |
|---|
| 194 | public Variable[] getVariables() { |
|---|
| 195 | return ((CrossPareGP)this.problem).getVariables(); |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | @Override |
|---|
| 199 | public void buildClassifier(Instances traindata) throws Exception { |
|---|
| 200 | InstanceData train = new InstanceData(traindata); |
|---|
| 201 | this.problem = new CrossPareGP(train.getX(), train.getY(), this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.errorType2Weight, this.maxDepth, this.maxNodes); |
|---|
| 202 | this.gp = problem.create(); |
|---|
| 203 | this.gp.evolve(this.maxGenerations); |
|---|
| 204 | } |
|---|
| 205 | |
|---|
        /**
         * GPProblem implementation that sets up the JGAP run: one terminal variable
         * per instance attribute, the arithmetic function set, the GP operator
         * probabilities, and the fitness function.
         */
        class CrossPareGP extends GPProblem {
            // feature matrix of the training data (class attribute excluded)
            private double[][] instances;
            // true iff the corresponding instance is labeled defective
            private boolean[] output;

            // maximum tree depth of one evolved program
            private int maxDepth;
            // maximum number of nodes of one evolved program
            private int maxNodes;

            // terminal variables X0..Xn-1, shared with the fitness function
            private Variable[] x;

            /**
             * Builds the JGAP configuration. Note the order is significant: the
             * static configuration is reset before variables are created, and the
             * fitness function is wired to the same variable array.
             *
             * @param instances feature matrix, one row per training instance
             * @param output defect labels matching the rows of instances
             * @param populationSize size of the GP population
             * @param minInitDept minimum depth of initially generated programs
             * @param maxInitDepth maximum depth of initially generated programs
             * @param tournamentSize size used by the tournament selector
             * @param errorType2Weight weight of type-2 errors in the fitness
             * @param maxDepth maximum depth of one evolved program
             * @param maxNodes maximum number of nodes of one evolved program
             * @throws InvalidConfigurationException if the JGAP configuration is invalid
             */
            public CrossPareGP(double[][] instances, boolean[] output, int populationSize, int minInitDept, int maxInitDepth, int tournamentSize, double errorType2Weight, int maxDepth, int maxNodes) throws InvalidConfigurationException {
                super(new GPConfiguration());

                this.instances = instances;
                this.output = output;
                this.maxDepth = maxDepth;
                this.maxNodes = maxNodes;

                // JGAP keeps its configuration in a static singleton; reset it so
                // repeated runs within one JVM do not collide
                Configuration.reset();
                GPConfiguration config = this.getGPConfiguration();

                // one variable per attribute of the training data
                this.x = new Variable[this.instances[0].length];

                for(int j=0; j < this.x.length; j++) {
                    this.x[j] = Variable.create(config, "X"+j, CommandGene.DoubleClass);
                }

                config.setGPFitnessEvaluator(new DeltaGPFitnessEvaluator()); // smaller fitness is better
                //config.setGPFitnessEvaluator(new DefaultGPFitnessEvaluator()); // bigger fitness is better

                config.setMinInitDepth(minInitDept);
                config.setMaxInitDepth(maxInitDepth);

                // operator probabilities: 60% crossover, 10% reproduction, 30% mutation
                config.setCrossoverProb((float)0.60);
                config.setReproductionProb((float)0.10);
                config.setMutationProb((float)0.30);

                config.setSelectionMethod(new TournamentSelector(tournamentSize));

                config.setPopulationSize(populationSize);

                config.setMaxCrossoverDepth(4);
                config.setFitnessFunction(new CrossPareFitness(this.x, this.instances, this.output, errorType2Weight));
                config.setStrictProgramCreation(true);
            }

            // used for running the fitness function again for testing
            public Variable[] getVariables() {
                return this.x;
            }

            /**
             * Creates the initial random genotype from the variables, the arithmetic
             * function set, and a whole-number constant terminal in [-100, 100].
             *
             * @return the randomly initialized genotype
             * @throws InvalidConfigurationException if the node sets are inconsistent
             */
            public GPGenotype create() throws InvalidConfigurationException {
                GPConfiguration config = this.getGPConfiguration();

                // return type
                Class[] types = {CommandGene.DoubleClass};

                // Arguments of result-producing chromosome: none
                Class[][] argTypes = { {} };

                // variables + functions, we set the variables with the values of the instances here
                CommandGene[] vars = new CommandGene[this.instances[0].length];
                for(int j=0; j < this.instances[0].length; j++) {
                    vars[j] = this.x[j];
                }
                CommandGene[] funcs = {
                    new Add(config, CommandGene.DoubleClass),
                    new Subtract(config, CommandGene.DoubleClass),
                    new Multiply(config, CommandGene.DoubleClass),
                    new Divide(config, CommandGene.DoubleClass),
                    new Sine(config, CommandGene.DoubleClass),
                    new Cosine(config, CommandGene.DoubleClass),
                    new Exp(config, CommandGene.DoubleClass),
                    new Log(config, CommandGene.DoubleClass),
                    new GT(config, CommandGene.DoubleClass),
                    new Max(config, CommandGene.DoubleClass),
                    new Terminal(config, CommandGene.DoubleClass, -100.0, 100.0, true), // min, max, whole numbers
                };

                CommandGene[] comb = (CommandGene[])ArrayUtils.addAll(vars, funcs);
                CommandGene[][] nodeSets = {
                    comb,
                };

                // we only have one chromosome so this suffices
                int minDepths[] = {config.getMinInitDepth()};
                int maxDepths[] = {this.maxDepth};
                // last arguments: node limit = this.maxNodes, verbose output disabled
                GPGenotype result = GPGenotype.randomInitialGenotype(config, types, argTypes, nodeSets, minDepths, maxDepths, this.maxNodes, false);

                return result;
            }
        }
|---|
| 301 | |
|---|
| 302 | |
|---|
| 303 | /** |
|---|
| 304 | * Fitness function |
|---|
| 305 | */ |
|---|
| 306 | class CrossPareFitness extends GPFitnessFunction { |
|---|
| 307 | |
|---|
| 308 | private static final long serialVersionUID = 75234832484387L; |
|---|
| 309 | |
|---|
| 310 | private Variable[] x; |
|---|
| 311 | |
|---|
| 312 | private double[][] instances; |
|---|
| 313 | private boolean[] output; |
|---|
| 314 | |
|---|
| 315 | private double errorType2Weight = 1.0; |
|---|
| 316 | |
|---|
| 317 | // needed in evaluate |
|---|
| 318 | //private Object[] NO_ARGS = new Object[0]; |
|---|
| 319 | |
|---|
| 320 | private double sfitness = 0.0f; |
|---|
| 321 | private int errorType1 = 0; |
|---|
| 322 | private int errorType2 = 0; |
|---|
| 323 | |
|---|
| 324 | public CrossPareFitness(Variable[] x, double[][] instances, boolean[] output, double errorType2Weight) { |
|---|
| 325 | this.x = x; |
|---|
| 326 | this.instances = instances; |
|---|
| 327 | this.output = output; |
|---|
| 328 | this.errorType2Weight = errorType2Weight; |
|---|
| 329 | } |
|---|
| 330 | |
|---|
| 331 | public int getErrorType1() { |
|---|
| 332 | return this.errorType1; |
|---|
| 333 | } |
|---|
| 334 | |
|---|
| 335 | public int getErrorType2() { |
|---|
| 336 | return this.errorType2; |
|---|
| 337 | } |
|---|
| 338 | |
|---|
| 339 | public double getSecondFitness() { |
|---|
| 340 | return this.sfitness; |
|---|
| 341 | } |
|---|
| 342 | |
|---|
| 343 | public int getNumInstances() { |
|---|
| 344 | return this.instances.length; |
|---|
| 345 | } |
|---|
| 346 | |
|---|
| 347 | /** |
|---|
| 348 | * This is the fitness function |
|---|
| 349 | * |
|---|
| 350 | * Our fitness is best if we have the less wrong classifications, this includes a weight for type2 errors |
|---|
| 351 | */ |
|---|
| 352 | @Override |
|---|
| 353 | protected double evaluate(final IGPProgram program) { |
|---|
| 354 | double pfitness = 0.0f; |
|---|
| 355 | this.sfitness = 0.0f; |
|---|
| 356 | double value = 0.0f; |
|---|
| 357 | |
|---|
| 358 | // count classification errors |
|---|
| 359 | this.errorType1 = 0; |
|---|
| 360 | this.errorType2 = 0; |
|---|
| 361 | |
|---|
| 362 | for(int i=0; i < this.instances.length; i++) { |
|---|
| 363 | |
|---|
| 364 | // requires that we have a variable for each column of our dataset (attribute of instance) |
|---|
| 365 | for(int j=0; j < this.x.length; j++) { |
|---|
| 366 | this.x[j].set(this.instances[i][j]); |
|---|
| 367 | } |
|---|
| 368 | |
|---|
| 369 | // value gives us a double, if < 0.5 we set this instance as faulty |
|---|
| 370 | value = program.execute_double(0, this.x); |
|---|
| 371 | |
|---|
| 372 | if(value < 0.5) { |
|---|
| 373 | if(this.output[i] != true) { |
|---|
| 374 | this.errorType1 += 1; |
|---|
| 375 | } |
|---|
| 376 | }else { |
|---|
| 377 | if(this.output[i] == true) { |
|---|
| 378 | this.errorType2 += 1; |
|---|
| 379 | } |
|---|
| 380 | } |
|---|
| 381 | } |
|---|
| 382 | |
|---|
| 383 | // now calc pfitness |
|---|
| 384 | pfitness = (this.errorType1 + this.errorType2Weight * this.errorType2) / this.instances.length; |
|---|
| 385 | |
|---|
| 386 | // number of nodes in the programm, if lower then 10 we assign sFitness of 10 |
|---|
| 387 | // we can set metadata with setProgramData to save this |
|---|
| 388 | if(program.getChromosome(0).getSize(0) < 10) { |
|---|
| 389 | program.setApplicationData(10.0f); |
|---|
| 390 | } |
|---|
| 391 | |
|---|
| 392 | return pfitness; |
|---|
| 393 | } |
|---|
| 394 | } |
|---|
| 395 | |
|---|
| 396 | /** |
|---|
| 397 | * Custom GT implementation used in the GP Algorithm. |
|---|
| 398 | */ |
|---|
| 399 | public class GT extends MathCommand implements ICloneable { |
|---|
| 400 | |
|---|
| 401 | private static final long serialVersionUID = 113454184817L; |
|---|
| 402 | |
|---|
| 403 | public GT(final GPConfiguration a_conf, java.lang.Class a_returnType) throws InvalidConfigurationException { |
|---|
| 404 | super(a_conf, 2, a_returnType); |
|---|
| 405 | } |
|---|
| 406 | |
|---|
| 407 | public String toString() { |
|---|
| 408 | return "GT(&1, &2)"; |
|---|
| 409 | } |
|---|
| 410 | |
|---|
| 411 | public String getName() { |
|---|
| 412 | return "GT"; |
|---|
| 413 | } |
|---|
| 414 | |
|---|
| 415 | public float execute_float(ProgramChromosome c, int n, Object[] args) { |
|---|
| 416 | float f1 = c.execute_float(n, 0, args); |
|---|
| 417 | float f2 = c.execute_float(n, 1, args); |
|---|
| 418 | |
|---|
| 419 | float ret = 1.0f; |
|---|
| 420 | if(f1 > f2) { |
|---|
| 421 | ret = 0.0f; |
|---|
| 422 | } |
|---|
| 423 | |
|---|
| 424 | return ret; |
|---|
| 425 | } |
|---|
| 426 | |
|---|
| 427 | public double execute_double(ProgramChromosome c, int n, Object[] args) { |
|---|
| 428 | double f1 = c.execute_double(n, 0, args); |
|---|
| 429 | double f2 = c.execute_double(n, 1, args); |
|---|
| 430 | |
|---|
| 431 | double ret = 1; |
|---|
| 432 | if(f1 > f2) { |
|---|
| 433 | ret = 0; |
|---|
| 434 | } |
|---|
| 435 | return ret; |
|---|
| 436 | } |
|---|
| 437 | |
|---|
| 438 | public Object clone() { |
|---|
| 439 | try { |
|---|
| 440 | GT result = new GT(getGPConfiguration(), getReturnType()); |
|---|
| 441 | return result; |
|---|
| 442 | }catch(Exception ex) { |
|---|
| 443 | throw new CloneException(ex); |
|---|
| 444 | } |
|---|
| 445 | } |
|---|
| 446 | } |
|---|
| 447 | } |
|---|
| 448 | |
|---|
| 449 | /** |
|---|
| 450 | * GP Multiple Data Sets Validation-Voting Classifier |
|---|
| 451 | * |
|---|
| 452 | * Basically the same as the GP Multiple Data Sets Validation Classifier. |
|---|
| 453 | * But here we do keep a model candidate for each training set which may later vote |
|---|
| 454 | * |
|---|
| 455 | */ |
|---|
| 456 | public class GPVVClassifier extends GPVClassifier { |
|---|
| 457 | |
|---|
| 458 | private static final long serialVersionUID = -654710583852839901L; |
|---|
| 459 | private List<Classifier> classifiers = null; |
|---|
| 460 | |
|---|
        /**
         * Deliberate no-op: the voting classifier can only be trained set-wise via
         * {@code buildClassifier(SetUniqueList<Instances>)}, which needs the whole
         * set of training projects.
         */
        @Override
        public void buildClassifier(Instances arg0) throws Exception {
            // intentionally empty, see javadoc
        }
|---|
| 466 | |
|---|
| 467 | /** Build the GP Multiple Data Sets Validation-Voting Classifier |
|---|
| 468 | * |
|---|
| 469 | * This is according to Section 6 of the Paper by Liu et al. |
|---|
| 470 | * It is basically the Multiple Data Sets Validation Classifier but here we keep the best models an let them vote. |
|---|
| 471 | * |
|---|
| 472 | * @param traindataSet |
|---|
| 473 | * @throws Exception |
|---|
| 474 | */ |
|---|
| 475 | public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { |
|---|
| 476 | |
|---|
| 477 | // each classifier is trained with one project from the set |
|---|
| 478 | // then is evaluated on the rest |
|---|
| 479 | classifiers = new LinkedList<>(); |
|---|
| 480 | for(int i=0; i < traindataSet.size(); i++) { |
|---|
| 481 | |
|---|
| 482 | // candidates we get out of evaluation |
|---|
| 483 | LinkedList<Classifier> candidates = new LinkedList<>(); |
|---|
| 484 | |
|---|
| 485 | // number of runs, yields the best of these |
|---|
| 486 | for(int k=0; k < this.numberRuns; k++) { |
|---|
| 487 | Classifier classifier = new GPRun(); |
|---|
| 488 | ((GPRun)classifier).configure(this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.maxGenerations, this.errorType2Weight, this.maxDepth, this.maxNodes); |
|---|
| 489 | |
|---|
| 490 | // one project is training data |
|---|
| 491 | classifier.buildClassifier(traindataSet.get(i)); |
|---|
| 492 | |
|---|
| 493 | double[] errors; |
|---|
| 494 | // rest of the set is evaluation data, we evaluate now |
|---|
| 495 | for(int j=0; j < traindataSet.size(); j++) { |
|---|
| 496 | if(j != i) { |
|---|
| 497 | // if type1 and type2 errors are < 0.5 we allow the model in the candidates |
|---|
| 498 | errors = this.evaluate((GPRun)classifier, traindataSet.get(j)); |
|---|
| 499 | if((errors[0] < 0.5) && (errors[1] < 0.5)) { |
|---|
| 500 | candidates.add(classifier); |
|---|
| 501 | } |
|---|
| 502 | } |
|---|
| 503 | } |
|---|
| 504 | } |
|---|
| 505 | |
|---|
| 506 | // now after the evaluation we do a model selection where only one model remains for the given training data |
|---|
| 507 | // we select the model which is best on all evaluation data |
|---|
| 508 | double smallest_error_count = Double.MAX_VALUE; |
|---|
| 509 | double[] errors; |
|---|
| 510 | Classifier best = null; |
|---|
| 511 | for(int ii=0; ii < candidates.size(); ii++) { |
|---|
| 512 | double[] errors_eval = {0.0, 0.0}; |
|---|
| 513 | |
|---|
| 514 | // we add the errors the candidate makes over the evaldata |
|---|
| 515 | for(int j=0; j < traindataSet.size(); j++) { |
|---|
| 516 | if(j != i) { |
|---|
| 517 | errors = this.evaluate((GPRun)candidates.get(ii), traindataSet.get(j)); |
|---|
| 518 | errors_eval[0] += errors[0]; |
|---|
| 519 | errors_eval[1] += errors[1]; |
|---|
| 520 | } |
|---|
| 521 | } |
|---|
| 522 | |
|---|
| 523 | // if the candidate made fewer errors it is now the best |
|---|
| 524 | if(errors_eval[0] + errors_eval[1] < smallest_error_count) { |
|---|
| 525 | best = candidates.get(ii); |
|---|
| 526 | smallest_error_count = errors_eval[0] + errors_eval[1]; |
|---|
| 527 | } |
|---|
| 528 | } |
|---|
| 529 | |
|---|
| 530 | |
|---|
| 531 | // now we have the best classifier for this training data |
|---|
| 532 | classifiers.add(best); |
|---|
| 533 | } |
|---|
| 534 | } |
|---|
| 535 | |
|---|
| 536 | /** |
|---|
| 537 | * Use the best classifiers for each training data in a majority voting |
|---|
| 538 | */ |
|---|
| 539 | @Override |
|---|
| 540 | public double classifyInstance(Instance instance) { |
|---|
| 541 | |
|---|
| 542 | int vote_positive = 0; |
|---|
| 543 | |
|---|
| 544 | for (int i = 0; i < classifiers.size(); i++) { |
|---|
| 545 | Classifier classifier = classifiers.get(i); |
|---|
| 546 | |
|---|
| 547 | GPGenotype gp = ((GPRun)classifier).getGp(); |
|---|
| 548 | Variable[] vars = ((GPRun)classifier).getVariables(); |
|---|
| 549 | |
|---|
| 550 | IGPProgram fitest = gp.getAllTimeBest(); // all time fitest |
|---|
| 551 | for(int j = 0; j < instance.numAttributes()-1; j++) { |
|---|
| 552 | vars[j].set(instance.value(j)); |
|---|
| 553 | } |
|---|
| 554 | |
|---|
| 555 | if(fitest.execute_double(0, vars) < 0.5) { |
|---|
| 556 | vote_positive += 1; |
|---|
| 557 | } |
|---|
| 558 | } |
|---|
| 559 | |
|---|
| 560 | if(vote_positive >= (classifiers.size()/2)) { |
|---|
| 561 | return 1.0; |
|---|
| 562 | }else { |
|---|
| 563 | return 0.0; |
|---|
| 564 | } |
|---|
| 565 | } |
|---|
| 566 | } |
|---|
| 567 | |
|---|
| 568 | /** |
|---|
| 569 | * GP Multiple Data Sets Validation Classifier |
|---|
| 570 | * |
|---|
| 571 | * We train a Classifier with one training project $numberRun times. |
|---|
| 572 | * Then we evaluate the classifier on the rest of the training projects and keep the best classifier. |
|---|
| 573 | * After that we have for each training project the best classifier as per the evaluation on the rest of the data set. |
|---|
| 574 | * Then we determine the best classifier from these candidates and keep it to be used later. |
|---|
| 575 | */ |
|---|
| 576 | public class GPVClassifier extends AbstractClassifier { |
|---|
| 577 | |
|---|
| 578 | private List<Classifier> classifiers = null; |
|---|
| 579 | private Classifier best = null; |
|---|
| 580 | |
|---|
| 581 | private static final long serialVersionUID = 3708714057579101522L; |
|---|
| 582 | |
|---|
| 583 | protected int populationSize; |
|---|
| 584 | protected int initMinDepth; |
|---|
| 585 | protected int initMaxDepth; |
|---|
| 586 | protected int tournamentSize; |
|---|
| 587 | protected int maxGenerations; |
|---|
| 588 | protected double errorType2Weight; |
|---|
| 589 | protected int numberRuns; |
|---|
| 590 | protected int maxDepth; |
|---|
| 591 | protected int maxNodes; |
|---|
| 592 | |
|---|
        /**
         * Configure the GP parameters and the number of runs.
         *
         * @param populationSize size of the GP population
         * @param initMinDepth minimum depth of initially generated programs
         * @param initMaxDepth maximum depth of initially generated programs
         * @param tournamentSize size used by the tournament selector
         * @param maxGenerations number of generations each GP run evolves
         * @param errorType2Weight weight of type-2 errors in the fitness function
         * @param numberRuns number of independent GP runs per training project
         * @param maxDepth maximum depth of one evolved program
         * @param maxNodes maximum number of nodes of one evolved program
         */
        public void configure(int populationSize, int initMinDepth, int initMaxDepth, int tournamentSize, int maxGenerations, double errorType2Weight, int numberRuns, int maxDepth, int maxNodes) {
            this.populationSize = populationSize;
            this.initMinDepth = initMinDepth;
            this.initMaxDepth = initMaxDepth;
            this.tournamentSize = tournamentSize;
            this.maxGenerations = maxGenerations;
            this.errorType2Weight = errorType2Weight;
            this.numberRuns = numberRuns;
            this.maxDepth = maxDepth;
            this.maxNodes = maxNodes;
        }
|---|
| 614 | |
|---|
| 615 | /** Build the GP Multiple Data Sets Validation Classifier |
|---|
| 616 | * |
|---|
| 617 | * This is according to Section 6 of the Paper by Liu et al. except for the selection of the best model. |
|---|
| 618 | * Section 4 describes a slightly different approach. |
|---|
| 619 | * |
|---|
| 620 | * @param traindataSet |
|---|
| 621 | * @throws Exception |
|---|
| 622 | */ |
|---|
| 623 | public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { |
|---|
| 624 | |
|---|
| 625 | // each classifier is trained with one project from the set |
|---|
| 626 | // then is evaluated on the rest |
|---|
| 627 | for(int i=0; i < traindataSet.size(); i++) { |
|---|
| 628 | |
|---|
| 629 | // candidates we get out of evaluation |
|---|
| 630 | LinkedList<Classifier> candidates = new LinkedList<>(); |
|---|
| 631 | |
|---|
| 632 | // numberRuns full GPRuns, we generate numberRuns models for each traindata |
|---|
| 633 | for(int k=0; k < this.numberRuns; k++) { |
|---|
| 634 | Classifier classifier = new GPRun(); |
|---|
| 635 | ((GPRun)classifier).configure(this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.maxGenerations, this.errorType2Weight, this.maxDepth, this.maxNodes); |
|---|
| 636 | |
|---|
| 637 | classifier.buildClassifier(traindataSet.get(i)); |
|---|
| 638 | |
|---|
| 639 | double[] errors; |
|---|
| 640 | |
|---|
| 641 | // rest of the set is evaluation data, we evaluate now |
|---|
| 642 | for(int j=0; j < traindataSet.size(); j++) { |
|---|
| 643 | if(j != i) { |
|---|
| 644 | // if type1 and type2 errors are < 0.5 we allow the model in the candidate list |
|---|
| 645 | errors = this.evaluate((GPRun)classifier, traindataSet.get(j)); |
|---|
| 646 | if((errors[0] < 0.5) && (errors[1] < 0.5)) { |
|---|
| 647 | candidates.add(classifier); |
|---|
| 648 | } |
|---|
| 649 | } |
|---|
| 650 | } |
|---|
| 651 | } |
|---|
| 652 | |
|---|
| 653 | // now after the evaluation we do a model selection where only one model remains for the given training data |
|---|
| 654 | // we select the model which is best on all evaluation data |
|---|
| 655 | double smallest_error_count = Double.MAX_VALUE; |
|---|
| 656 | double[] errors; |
|---|
| 657 | Classifier best = null; |
|---|
| 658 | for(int ii=0; ii < candidates.size(); ii++) { |
|---|
| 659 | double[] errors_eval = {0.0, 0.0}; |
|---|
| 660 | |
|---|
| 661 | // we add the errors the candidate makes over the evaldata |
|---|
| 662 | for(int j=0; j < traindataSet.size(); j++) { |
|---|
| 663 | if(j != i) { |
|---|
| 664 | errors = this.evaluate((GPRun)candidates.get(ii), traindataSet.get(j)); |
|---|
| 665 | errors_eval[0] += errors[0]; |
|---|
| 666 | errors_eval[1] += errors[1]; |
|---|
| 667 | } |
|---|
| 668 | } |
|---|
| 669 | |
|---|
| 670 | // if the candidate made fewer errors it is now the best |
|---|
| 671 | if(errors_eval[0] + errors_eval[1] < smallest_error_count) { |
|---|
| 672 | best = candidates.get(ii); |
|---|
| 673 | smallest_error_count = errors_eval[0] + errors_eval[1]; |
|---|
| 674 | } |
|---|
| 675 | } |
|---|
| 676 | |
|---|
| 677 | |
|---|
| 678 | // now we have the best classifier for this training data |
|---|
| 679 | classifiers.add(best); |
|---|
| 680 | |
|---|
| 681 | } /* endfor trainData */ |
|---|
| 682 | |
|---|
| 683 | // now we have one best classifier for each trainData |
|---|
| 684 | // we evaluate again to find the best classifier of all time |
|---|
| 685 | // this selection is now according to section 4 of the paper and not 6 where an average of the 6 models is build |
|---|
| 686 | double smallest_error_count = Double.MAX_VALUE; |
|---|
| 687 | double error_count; |
|---|
| 688 | double errors[]; |
|---|
| 689 | for(int j=0; j < classifiers.size(); j++) { |
|---|
| 690 | error_count = 0; |
|---|
| 691 | Classifier current = classifiers.get(j); |
|---|
| 692 | for(int i=0; i < traindataSet.size(); i++) { |
|---|
| 693 | errors = this.evaluate((GPRun)current, traindataSet.get(i)); |
|---|
| 694 | error_count = errors[0] + errors[1]; |
|---|
| 695 | } |
|---|
| 696 | |
|---|
| 697 | if(error_count < smallest_error_count) { |
|---|
| 698 | best = current; |
|---|
| 699 | } |
|---|
| 700 | } |
|---|
| 701 | } |
|---|
| 702 | |
|---|
| 703 | @Override |
|---|
| 704 | public void buildClassifier(Instances traindata) throws Exception { |
|---|
| 705 | final Classifier classifier = new GPRun(); |
|---|
| 706 | ((GPRun)classifier).configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, errorType2Weight, this.maxDepth, this.maxNodes); |
|---|
| 707 | classifier.buildClassifier(traindata); |
|---|
| 708 | classifiers.add(classifier); |
|---|
| 709 | } |
|---|
| 710 | |
|---|
| 711 | /** |
|---|
| 712 | * Evaluation of the Classifier |
|---|
| 713 | * |
|---|
| 714 | * We evaluate the classifier with the Instances of the evalData. |
|---|
| 715 | * It basically assigns the instance attribute values to the variables of the s-expression-tree and |
|---|
| 716 | * then counts the missclassifications. |
|---|
| 717 | * |
|---|
| 718 | * @param classifier |
|---|
| 719 | * @param evalData |
|---|
| 720 | * @return |
|---|
| 721 | */ |
|---|
| 722 | public double[] evaluate(GPRun classifier, Instances evalData) { |
|---|
| 723 | GPGenotype gp = classifier.getGp(); |
|---|
| 724 | Variable[] vars = classifier.getVariables(); |
|---|
| 725 | |
|---|
| 726 | IGPProgram fitest = gp.getAllTimeBest(); // selects the fitest of all not just the last generation |
|---|
| 727 | |
|---|
| 728 | double classification; |
|---|
| 729 | int error_type1 = 0; |
|---|
| 730 | int error_type2 = 0; |
|---|
| 731 | int positive = 0; |
|---|
| 732 | int negative = 0; |
|---|
| 733 | |
|---|
| 734 | for(Instance instance: evalData) { |
|---|
| 735 | |
|---|
| 736 | // assign instance attribute values to the variables of the s-expression-tree |
|---|
| 737 | double[] tmp = WekaUtils.instanceValues(instance); |
|---|
| 738 | for(int i = 0; i < tmp.length; i++) { |
|---|
| 739 | vars[i].set(tmp[i]); |
|---|
| 740 | } |
|---|
| 741 | |
|---|
| 742 | classification = fitest.execute_double(0, vars); |
|---|
| 743 | |
|---|
| 744 | // we need to count the absolutes of positives for percentage |
|---|
| 745 | if(instance.classValue() == 1.0) { |
|---|
| 746 | positive +=1; |
|---|
| 747 | }else { |
|---|
| 748 | negative +=1; |
|---|
| 749 | } |
|---|
| 750 | |
|---|
| 751 | // classification < 0.5 we say defective |
|---|
| 752 | if(classification < 0.5) { |
|---|
| 753 | if(instance.classValue() != 1.0) { |
|---|
| 754 | error_type1 += 1; |
|---|
| 755 | } |
|---|
| 756 | }else { |
|---|
| 757 | if(instance.classValue() == 1.0) { |
|---|
| 758 | error_type2 += 1; |
|---|
| 759 | } |
|---|
| 760 | } |
|---|
| 761 | } |
|---|
| 762 | |
|---|
| 763 | // return error types percentages for the types |
|---|
| 764 | double et1_per = error_type1 / negative; |
|---|
| 765 | double et2_per = error_type2 / positive; |
|---|
| 766 | return new double[]{et1_per, et2_per}; |
|---|
| 767 | } |
|---|
| 768 | |
|---|
| 769 | /** |
|---|
| 770 | * Use only the best classifier from our evaluation phase |
|---|
| 771 | */ |
|---|
| 772 | @Override |
|---|
| 773 | public double classifyInstance(Instance instance) { |
|---|
| 774 | GPGenotype gp = ((GPRun)best).getGp(); |
|---|
| 775 | Variable[] vars = ((GPRun)best).getVariables(); |
|---|
| 776 | |
|---|
| 777 | IGPProgram fitest = gp.getAllTimeBest(); // all time fitest |
|---|
| 778 | for(int i = 0; i < instance.numAttributes()-1; i++) { |
|---|
| 779 | vars[i].set(instance.value(i)); |
|---|
| 780 | } |
|---|
| 781 | |
|---|
| 782 | double classification = fitest.execute_double(0, vars); |
|---|
| 783 | |
|---|
| 784 | if(classification < 0.5) { |
|---|
| 785 | return 1.0; |
|---|
| 786 | }else { |
|---|
| 787 | return 0.0; |
|---|
| 788 | } |
|---|
| 789 | } |
|---|
| 790 | } |
|---|
| 791 | } |
|---|