| 1 | package de.ugoe.cs.cpdp.training; |
|---|
| 2 | |
|---|
| 3 | import java.util.LinkedList; |
|---|
| 4 | import java.util.List; |
|---|
| 5 | |
|---|
| 6 | import org.apache.commons.collections4.list.SetUniqueList; |
|---|
| 7 | |
|---|
| 8 | import weka.classifiers.AbstractClassifier; |
|---|
| 9 | import weka.classifiers.Classifier; |
|---|
| 10 | import weka.core.Instance; |
|---|
| 11 | import weka.core.Instances; |
|---|
| 12 | import org.apache.commons.lang3.ArrayUtils; |
|---|
| 13 | import org.jgap.Configuration; |
|---|
| 14 | import org.jgap.InvalidConfigurationException; |
|---|
| 15 | import org.jgap.gp.CommandGene; |
|---|
| 16 | import org.jgap.gp.GPProblem; |
|---|
| 17 | |
|---|
| 18 | import org.jgap.gp.function.Add; |
|---|
| 19 | import org.jgap.gp.function.Multiply; |
|---|
| 20 | import org.jgap.gp.function.Log; |
|---|
| 21 | import org.jgap.gp.function.Subtract; |
|---|
| 22 | import org.jgap.gp.function.Divide; |
|---|
| 23 | import org.jgap.gp.function.Sine; |
|---|
| 24 | import org.jgap.gp.function.Cosine; |
|---|
| 25 | import org.jgap.gp.function.Max; |
|---|
| 26 | import org.jgap.gp.function.Exp; |
|---|
| 27 | |
|---|
| 28 | import org.jgap.gp.impl.DeltaGPFitnessEvaluator; |
|---|
| 29 | import org.jgap.gp.impl.GPConfiguration; |
|---|
| 30 | import org.jgap.gp.impl.GPGenotype; |
|---|
| 31 | import org.jgap.gp.impl.TournamentSelector; |
|---|
| 32 | import org.jgap.gp.terminal.Terminal; |
|---|
| 33 | import org.jgap.gp.GPFitnessFunction; |
|---|
| 34 | import org.jgap.gp.IGPProgram; |
|---|
| 35 | import org.jgap.gp.terminal.Variable; |
|---|
| 36 | import org.jgap.gp.MathCommand; |
|---|
| 37 | import org.jgap.util.ICloneable; |
|---|
| 38 | |
|---|
| 39 | import de.ugoe.cs.cpdp.util.WekaUtils; |
|---|
| 40 | |
|---|
| 41 | import org.jgap.gp.impl.ProgramChromosome; |
|---|
| 42 | import org.jgap.util.CloneException; |
|---|
| 43 | |
|---|
| 44 | /** |
|---|
| 45 | * Genetic Programming Trainer |
|---|
| 46 | * |
|---|
| 47 | * |
|---|
| 48 | * - GPRun is a Run of a complete Genetic Programm Evolution, we want several complete runs. |
|---|
| 49 | * - GPVClassifier is the Validation Classifier |
|---|
| 50 | * - GPVVClassifier is the Validation-Voting Classifier |
|---|
| 51 | * |
|---|
| 52 | * config: <setwisetrainer name="GPTraining" param="GPVVClassifier" /> |
|---|
| 53 | */ |
|---|
| 54 | public class GPTraining implements ISetWiseTrainingStrategy, IWekaCompatibleTrainer { |
|---|
| 55 | |
|---|
| 56 | private GPVClassifier classifier = null; |
|---|
| 57 | |
|---|
| 58 | private int populationSize = 1000; |
|---|
| 59 | private int initMinDepth = 2; |
|---|
| 60 | private int initMaxDepth = 6; |
|---|
| 61 | private int tournamentSize = 7; |
|---|
| 62 | private int maxGenerations = 50; |
|---|
| 63 | private double errorType2Weight = 15; |
|---|
| 64 | private int numberRuns = 1; // 200 in the paper |
|---|
| 65 | private int maxDepth = 20; // max depth within one program |
|---|
| 66 | private int maxNodes = 100; // max nodes within one program |
|---|
| 67 | |
|---|
| 68 | @Override |
|---|
| 69 | public void setParameter(String parameters) { |
|---|
| 70 | |
|---|
| 71 | // todo: split parameters to get classifier and the configuration variables for the gprun |
|---|
| 72 | if(parameters.equals("GPVVClassifier")) { |
|---|
| 73 | this.classifier = new GPVVClassifier(); |
|---|
| 74 | ((GPVVClassifier)this.classifier).configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, errorType2Weight, numberRuns, maxDepth, maxNodes); |
|---|
| 75 | }else if(parameters.equals("GPVClassifier")) { |
|---|
| 76 | this.classifier = new GPVClassifier(); |
|---|
| 77 | ((GPVClassifier)this.classifier).configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, errorType2Weight, numberRuns, maxDepth, maxNodes); |
|---|
| 78 | }else { |
|---|
| 79 | // default |
|---|
| 80 | this.classifier = new GPVVClassifier(); |
|---|
| 81 | ((GPVVClassifier)this.classifier).configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, errorType2Weight, numberRuns, maxDepth, maxNodes); |
|---|
| 82 | } |
|---|
| 83 | } |
|---|
| 84 | |
|---|
    /**
     * Trains the selected GP classifier on the whole set of training projects.
     * Checked exceptions are wrapped because the interface does not declare any.
     *
     * @param traindataSet set of training data per project
     */
    @Override
    public void apply(SetUniqueList<Instances> traindataSet) {
        try {
            classifier.buildClassifier(traindataSet);
        }catch(Exception e) {
            throw new RuntimeException(e);
        }
    }
|---|
| 93 | |
|---|
    /**
     * Returns the name of this training strategy as used in the configuration.
     *
     * @return the strategy name "GPTraining"
     */
    @Override
    public String getName() {
        return "GPTraining";
    }
|---|
| 98 | |
|---|
    /**
     * Returns the classifier created by {@code setParameter}; {@code null} if
     * {@code setParameter} has not been called yet.
     *
     * @return the configured GP classifier
     */
    @Override
    public Classifier getClassifier() {
        return this.classifier;
    }
|---|
| 103 | |
|---|
| 104 | public class InstanceData { |
|---|
| 105 | private double[][] instances_x; |
|---|
| 106 | private boolean[] instances_y; |
|---|
| 107 | |
|---|
| 108 | public InstanceData(Instances instances) { |
|---|
| 109 | this.instances_x = new double[instances.numInstances()][instances.numAttributes()-1]; |
|---|
| 110 | this.instances_y = new boolean[instances.numInstances()]; |
|---|
| 111 | |
|---|
| 112 | Instance current; |
|---|
| 113 | for(int i=0; i < this.instances_x.length; i++) { |
|---|
| 114 | current = instances.get(i); |
|---|
| 115 | this.instances_x[i] = WekaUtils.instanceValues(current); |
|---|
| 116 | this.instances_y[i] = 1.0 == current.classValue(); |
|---|
| 117 | } |
|---|
| 118 | } |
|---|
| 119 | |
|---|
| 120 | public double[][] getX() { |
|---|
| 121 | return instances_x; |
|---|
| 122 | } |
|---|
| 123 | public boolean[] getY() { |
|---|
| 124 | return instances_y; |
|---|
| 125 | } |
|---|
| 126 | } |
|---|
| 127 | |
|---|
| 128 | /** |
|---|
| 129 | * One Run executed by a GP Classifier |
|---|
| 130 | */ |
|---|
| 131 | public class GPRun extends AbstractClassifier { |
|---|
| 132 | private static final long serialVersionUID = -4250422550107888789L; |
|---|
| 133 | |
|---|
| 134 | private int populationSize; |
|---|
| 135 | private int initMinDepth; |
|---|
| 136 | private int initMaxDepth; |
|---|
| 137 | private int tournamentSize; |
|---|
| 138 | private int maxGenerations; |
|---|
| 139 | private double errorType2Weight; |
|---|
| 140 | private int maxDepth; |
|---|
| 141 | private int maxNodes; |
|---|
| 142 | |
|---|
| 143 | private GPGenotype gp; |
|---|
| 144 | private GPProblem problem; |
|---|
| 145 | |
|---|
        /**
         * Sets all GP parameters for this run.
         *
         * @param populationSize number of programs in the population
         * @param initMinDepth minimal depth of initially generated programs
         * @param initMaxDepth maximal depth of initially generated programs
         * @param tournamentSize size of the tournament selection
         * @param maxGenerations number of generations to evolve
         * @param errorType2Weight weight of type-2 errors in the fitness function
         * @param maxDepth maximal depth within one program
         * @param maxNodes maximal number of nodes within one program
         */
        public void configure(int populationSize, int initMinDepth, int initMaxDepth, int tournamentSize, int maxGenerations, double errorType2Weight, int maxDepth, int maxNodes) {
            this.populationSize = populationSize;
            this.initMinDepth = initMinDepth;
            this.initMaxDepth = initMaxDepth;
            this.tournamentSize = tournamentSize;
            this.maxGenerations = maxGenerations;
            this.errorType2Weight = errorType2Weight;
            this.maxDepth = maxDepth;
            this.maxNodes = maxNodes;
        }
|---|
| 156 | |
|---|
        /**
         * Returns the evolved genotype of this run; only valid after
         * {@code buildClassifier} has been called.
         */
        public GPGenotype getGp() {
            return this.gp;
        }

        /**
         * Returns the variables of the underlying GP problem so that instance
         * attribute values can be bound for evaluation and classification.
         */
        public Variable[] getVariables() {
            return ((CrossPareGP)this.problem).getVariables();
        }
|---|
| 164 | |
|---|
        /**
         * Runs one full GP evolution: converts the Weka data to flat arrays,
         * builds the GP problem and evolves it for {@code maxGenerations}.
         *
         * @param traindata training instances
         * @throws Exception if the GP configuration is invalid
         */
        @Override
        public void buildClassifier(Instances traindata) throws Exception {
            InstanceData train = new InstanceData(traindata);
            this.problem = new CrossPareGP(train.getX(), train.getY(), this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.errorType2Weight, this.maxDepth, this.maxNodes);
            this.gp = problem.create();
            this.gp.evolve(this.maxGenerations);
        }
|---|
| 172 | |
|---|
        /**
         * GPProblem implementation: builds the JGAP configuration (variables,
         * function set, GP parameters) for one run on one training project.
         */
        class CrossPareGP extends GPProblem {
            // attribute values of the training instances
            private double[][] instances;
            // boolean class labels (true = class value 1.0)
            private boolean[] output;

            // maximal depth within one program
            private int maxDepth;
            // maximal number of nodes within one program
            private int maxNodes;

            // one variable per attribute, shared with the fitness function
            private Variable[] x;

            public CrossPareGP(double[][] instances, boolean[] output, int populationSize, int minInitDept, int maxInitDepth, int tournamentSize, double errorType2Weight, int maxDepth, int maxNodes) throws InvalidConfigurationException {
                super(new GPConfiguration());

                this.instances = instances;
                this.output = output;
                this.maxDepth = maxDepth;
                this.maxNodes = maxNodes;

                // reset JGAP's static configuration so repeated runs do not clash
                Configuration.reset();
                GPConfiguration config = this.getGPConfiguration();

                // one variable per attribute column of the training data
                this.x = new Variable[this.instances[0].length];

                for(int j=0; j < this.x.length; j++) {
                    this.x[j] = Variable.create(config, "X"+j, CommandGene.DoubleClass);
                }

                config.setGPFitnessEvaluator(new DeltaGPFitnessEvaluator()); // smaller fitness is better
                //config.setGPFitnessEvaluator(new DefaultGPFitnessEvaluator()); // bigger fitness is better

                config.setMinInitDepth(minInitDept);
                config.setMaxInitDepth(maxInitDepth);

                // genetic operator probabilities, sum to 1.0
                config.setCrossoverProb((float)0.60);
                config.setReproductionProb((float)0.10);
                config.setMutationProb((float)0.30);

                config.setSelectionMethod(new TournamentSelector(tournamentSize));

                config.setPopulationSize(populationSize);

                config.setMaxCrossoverDepth(4);
                config.setFitnessFunction(new CrossPareFitness(this.x, this.instances, this.output, errorType2Weight));
                config.setStrictProgramCreation(true);
            }

            // used for running the fitness function again for testing
            public Variable[] getVariables() {
                return this.x;
            }

            /**
             * Creates the initial random genotype from the configured variables,
             * function set and depth/node limits.
             *
             * @return the initial GP population
             * @throws InvalidConfigurationException on an inconsistent configuration
             */
            public GPGenotype create() throws InvalidConfigurationException {
                GPConfiguration config = this.getGPConfiguration();

                // return type of the program
                Class[] types = {CommandGene.DoubleClass};

                // Arguments of result-producing chromosome: none
                Class[][] argTypes = { {} };

                // variables + functions, we set the variables with the values of the instances here
                CommandGene[] vars = new CommandGene[this.instances[0].length];
                for(int j=0; j < this.instances[0].length; j++) {
                    vars[j] = this.x[j];
                }
                CommandGene[] funcs = {
                    new Add(config, CommandGene.DoubleClass),
                    new Subtract(config, CommandGene.DoubleClass),
                    new Multiply(config, CommandGene.DoubleClass),
                    new Divide(config, CommandGene.DoubleClass),
                    new Sine(config, CommandGene.DoubleClass),
                    new Cosine(config, CommandGene.DoubleClass),
                    new Exp(config, CommandGene.DoubleClass),
                    new Log(config, CommandGene.DoubleClass),
                    new GT(config, CommandGene.DoubleClass),
                    new Max(config, CommandGene.DoubleClass),
                    new Terminal(config, CommandGene.DoubleClass, -100.0, 100.0, true), // min, max, whole numbers
                };

                CommandGene[] comb = (CommandGene[])ArrayUtils.addAll(vars, funcs);
                CommandGene[][] nodeSets = {
                    comb,
                };

                // we only have one chromosome so this suffices
                int minDepths[] = {config.getMinInitDepth()};
                int maxDepths[] = {this.maxDepth};
                // node limit is this.maxNodes, verbose output is disabled
                GPGenotype result = GPGenotype.randomInitialGenotype(config, types, argTypes, nodeSets, minDepths, maxDepths, this.maxNodes, false);

                return result;
            }
        }
|---|
| 268 | |
|---|
| 269 | |
|---|
| 270 | /** |
|---|
| 271 | * Fitness function |
|---|
| 272 | */ |
|---|
| 273 | class CrossPareFitness extends GPFitnessFunction { |
|---|
| 274 | |
|---|
| 275 | private static final long serialVersionUID = 75234832484387L; |
|---|
| 276 | |
|---|
| 277 | private Variable[] x; |
|---|
| 278 | |
|---|
| 279 | private double[][] instances; |
|---|
| 280 | private boolean[] output; |
|---|
| 281 | |
|---|
| 282 | private double errorType2Weight = 1.0; |
|---|
| 283 | |
|---|
| 284 | // needed in evaluate |
|---|
| 285 | //private Object[] NO_ARGS = new Object[0]; |
|---|
| 286 | |
|---|
| 287 | private double sfitness = 0.0f; |
|---|
| 288 | private int errorType1 = 0; |
|---|
| 289 | private int errorType2 = 0; |
|---|
| 290 | |
|---|
| 291 | public CrossPareFitness(Variable[] x, double[][] instances, boolean[] output, double errorType2Weight) { |
|---|
| 292 | this.x = x; |
|---|
| 293 | this.instances = instances; |
|---|
| 294 | this.output = output; |
|---|
| 295 | this.errorType2Weight = errorType2Weight; |
|---|
| 296 | } |
|---|
| 297 | |
|---|
| 298 | public int getErrorType1() { |
|---|
| 299 | return this.errorType1; |
|---|
| 300 | } |
|---|
| 301 | |
|---|
| 302 | public int getErrorType2() { |
|---|
| 303 | return this.errorType2; |
|---|
| 304 | } |
|---|
| 305 | |
|---|
| 306 | public double getSecondFitness() { |
|---|
| 307 | return this.sfitness; |
|---|
| 308 | } |
|---|
| 309 | |
|---|
| 310 | public int getNumInstances() { |
|---|
| 311 | return this.instances.length; |
|---|
| 312 | } |
|---|
| 313 | |
|---|
| 314 | /** |
|---|
| 315 | * This is the fitness function |
|---|
| 316 | * |
|---|
| 317 | * Our fitness is best if we have the less wrong classifications, this includes a weight for type2 errors |
|---|
| 318 | */ |
|---|
| 319 | @Override |
|---|
| 320 | protected double evaluate(final IGPProgram program) { |
|---|
| 321 | double pfitness = 0.0f; |
|---|
| 322 | this.sfitness = 0.0f; |
|---|
| 323 | double value = 0.0f; |
|---|
| 324 | |
|---|
| 325 | // count classification errors |
|---|
| 326 | this.errorType1 = 0; |
|---|
| 327 | this.errorType2 = 0; |
|---|
| 328 | |
|---|
| 329 | for(int i=0; i < this.instances.length; i++) { |
|---|
| 330 | |
|---|
| 331 | // requires that we have a variable for each column of our dataset (attribute of instance) |
|---|
| 332 | for(int j=0; j < this.x.length; j++) { |
|---|
| 333 | this.x[j].set(this.instances[i][j]); |
|---|
| 334 | } |
|---|
| 335 | |
|---|
| 336 | // value gives us a double, if < 0.5 we set this instance as faulty |
|---|
| 337 | value = program.execute_double(0, this.x); |
|---|
| 338 | |
|---|
| 339 | if(value < 0.5) { |
|---|
| 340 | if(this.output[i] != true) { |
|---|
| 341 | this.errorType1 += 1; |
|---|
| 342 | } |
|---|
| 343 | }else { |
|---|
| 344 | if(this.output[i] == true) { |
|---|
| 345 | this.errorType2 += 1; |
|---|
| 346 | } |
|---|
| 347 | } |
|---|
| 348 | } |
|---|
| 349 | |
|---|
| 350 | // now calc pfitness |
|---|
| 351 | pfitness = (this.errorType1 + this.errorType2Weight * this.errorType2) / this.instances.length; |
|---|
| 352 | |
|---|
| 353 | // number of nodes in the programm, if lower then 10 we assign sFitness of 10 |
|---|
| 354 | // we can set metadata with setProgramData to save this |
|---|
| 355 | if(program.getChromosome(0).getSize(0) < 10) { |
|---|
| 356 | program.setApplicationData(10.0f); |
|---|
| 357 | } |
|---|
| 358 | |
|---|
| 359 | return pfitness; |
|---|
| 360 | } |
|---|
| 361 | } |
|---|
| 362 | |
|---|
| 363 | /** |
|---|
| 364 | * Custom GT implementation used in the GP Algorithm. |
|---|
| 365 | */ |
|---|
| 366 | public class GT extends MathCommand implements ICloneable { |
|---|
| 367 | |
|---|
| 368 | private static final long serialVersionUID = 113454184817L; |
|---|
| 369 | |
|---|
| 370 | public GT(final GPConfiguration a_conf, java.lang.Class a_returnType) throws InvalidConfigurationException { |
|---|
| 371 | super(a_conf, 2, a_returnType); |
|---|
| 372 | } |
|---|
| 373 | |
|---|
| 374 | public String toString() { |
|---|
| 375 | return "GT(&1, &2)"; |
|---|
| 376 | } |
|---|
| 377 | |
|---|
| 378 | public String getName() { |
|---|
| 379 | return "GT"; |
|---|
| 380 | } |
|---|
| 381 | |
|---|
| 382 | public float execute_float(ProgramChromosome c, int n, Object[] args) { |
|---|
| 383 | float f1 = c.execute_float(n, 0, args); |
|---|
| 384 | float f2 = c.execute_float(n, 1, args); |
|---|
| 385 | |
|---|
| 386 | float ret = 1.0f; |
|---|
| 387 | if(f1 > f2) { |
|---|
| 388 | ret = 0.0f; |
|---|
| 389 | } |
|---|
| 390 | |
|---|
| 391 | return ret; |
|---|
| 392 | } |
|---|
| 393 | |
|---|
| 394 | public double execute_double(ProgramChromosome c, int n, Object[] args) { |
|---|
| 395 | double f1 = c.execute_double(n, 0, args); |
|---|
| 396 | double f2 = c.execute_double(n, 1, args); |
|---|
| 397 | |
|---|
| 398 | double ret = 1; |
|---|
| 399 | if(f1 > f2) { |
|---|
| 400 | ret = 0; |
|---|
| 401 | } |
|---|
| 402 | return ret; |
|---|
| 403 | } |
|---|
| 404 | |
|---|
| 405 | public Object clone() { |
|---|
| 406 | try { |
|---|
| 407 | GT result = new GT(getGPConfiguration(), getReturnType()); |
|---|
| 408 | return result; |
|---|
| 409 | }catch(Exception ex) { |
|---|
| 410 | throw new CloneException(ex); |
|---|
| 411 | } |
|---|
| 412 | } |
|---|
| 413 | } |
|---|
| 414 | } |
|---|
| 415 | |
|---|
| 416 | /** |
|---|
| 417 | * GP Multiple Data Sets Validation-Voting Classifier |
|---|
| 418 | * |
|---|
| 419 | * Basically the same as the GP Multiple Data Sets Validation Classifier. |
|---|
| 420 | * But here we do keep a model candidate for each training set which may later vote |
|---|
| 421 | * |
|---|
| 422 | */ |
|---|
| 423 | public class GPVVClassifier extends GPVClassifier { |
|---|
| 424 | |
|---|
| 425 | private static final long serialVersionUID = -654710583852839901L; |
|---|
| 426 | private List<Classifier> classifiers = null; |
|---|
| 427 | |
|---|
        /**
         * Not implemented for the voting classifier; training happens per
         * project set via the SetUniqueList overload of buildClassifier.
         */
        @Override
        public void buildClassifier(Instances arg0) throws Exception {
            // TODO Auto-generated method stub

        }
|---|
| 433 | |
|---|
| 434 | /** Build the GP Multiple Data Sets Validation-Voting Classifier |
|---|
| 435 | * |
|---|
| 436 | * This is according to Section 6 of the Paper by Liu et al. |
|---|
| 437 | * It is basically the Multiple Data Sets Validation Classifier but here we keep the best models an let them vote. |
|---|
| 438 | * |
|---|
| 439 | * @param traindataSet |
|---|
| 440 | * @throws Exception |
|---|
| 441 | */ |
|---|
| 442 | public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { |
|---|
| 443 | |
|---|
| 444 | // each classifier is trained with one project from the set |
|---|
| 445 | // then is evaluated on the rest |
|---|
| 446 | classifiers = new LinkedList<>(); |
|---|
| 447 | for(int i=0; i < traindataSet.size(); i++) { |
|---|
| 448 | |
|---|
| 449 | // candidates we get out of evaluation |
|---|
| 450 | LinkedList<Classifier> candidates = new LinkedList<>(); |
|---|
| 451 | |
|---|
| 452 | // number of runs |
|---|
| 453 | for(int k=0; k < this.numberRuns; k++) { |
|---|
| 454 | Classifier classifier = new GPRun(); |
|---|
| 455 | ((GPRun)classifier).configure(this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.maxGenerations, this.errorType2Weight, this.maxDepth, this.maxNodes); |
|---|
| 456 | |
|---|
| 457 | // one project is training data |
|---|
| 458 | classifier.buildClassifier(traindataSet.get(i)); |
|---|
| 459 | |
|---|
| 460 | double[] errors; |
|---|
| 461 | // rest of the set is evaluation data, we evaluate now |
|---|
| 462 | for(int j=0; j < traindataSet.size(); j++) { |
|---|
| 463 | if(j != i) { |
|---|
| 464 | // if type1 and type2 errors are < 0.5 we allow the model in the candidates |
|---|
| 465 | errors = this.evaluate((GPRun)classifier, traindataSet.get(j)); |
|---|
| 466 | if((errors[0] < 0.5) && (errors[0] < 0.5)) { |
|---|
| 467 | candidates.add(classifier); |
|---|
| 468 | } |
|---|
| 469 | } |
|---|
| 470 | } |
|---|
| 471 | } |
|---|
| 472 | |
|---|
| 473 | // now after the evaluation we do a model selection where only one model remains for the given training data |
|---|
| 474 | double smallest_error_count = Double.MAX_VALUE; |
|---|
| 475 | double[] errors; |
|---|
| 476 | Classifier best = null; |
|---|
| 477 | for(int ii=0; ii < candidates.size(); ii++) { |
|---|
| 478 | for(int j=0; j < traindataSet.size(); j++) { |
|---|
| 479 | if(j != i) { |
|---|
| 480 | errors = this.evaluate((GPRun)candidates.get(ii), traindataSet.get(j)); |
|---|
| 481 | |
|---|
| 482 | if(errors[0]+errors[1] < smallest_error_count) { |
|---|
| 483 | best = candidates.get(ii); |
|---|
| 484 | } |
|---|
| 485 | } |
|---|
| 486 | } |
|---|
| 487 | } |
|---|
| 488 | |
|---|
| 489 | // now we have the best classifier for this training data |
|---|
| 490 | classifiers.add(best); |
|---|
| 491 | } |
|---|
| 492 | } |
|---|
| 493 | |
|---|
| 494 | /** |
|---|
| 495 | * Use the best classifiers for each training data in a majority voting |
|---|
| 496 | */ |
|---|
| 497 | @Override |
|---|
| 498 | public double classifyInstance(Instance instance) { |
|---|
| 499 | |
|---|
| 500 | int vote_positive = 0; |
|---|
| 501 | |
|---|
| 502 | for (int i = 0; i < classifiers.size(); i++) { |
|---|
| 503 | Classifier classifier = classifiers.get(i); |
|---|
| 504 | |
|---|
| 505 | GPGenotype gp = ((GPRun)classifier).getGp(); |
|---|
| 506 | Variable[] vars = ((GPRun)classifier).getVariables(); |
|---|
| 507 | |
|---|
| 508 | IGPProgram fitest = gp.getAllTimeBest(); // all time fitest |
|---|
| 509 | for(int j = 0; j < instance.numAttributes()-1; j++) { |
|---|
| 510 | vars[j].set(instance.value(j)); |
|---|
| 511 | } |
|---|
| 512 | |
|---|
| 513 | if(fitest.execute_double(0, vars) < 0.5) { |
|---|
| 514 | vote_positive += 1; |
|---|
| 515 | } |
|---|
| 516 | } |
|---|
| 517 | |
|---|
| 518 | if(vote_positive >= (classifiers.size()/2)) { |
|---|
| 519 | return 1.0; |
|---|
| 520 | }else { |
|---|
| 521 | return 0.0; |
|---|
| 522 | } |
|---|
| 523 | } |
|---|
| 524 | } |
|---|
| 525 | |
|---|
| 526 | /** |
|---|
| 527 | * GP Multiple Data Sets Validation Classifier |
|---|
| 528 | * |
|---|
| 529 | * We train a Classifier with one training project $numberRun times. |
|---|
| 530 | * Then we evaluate the classifier on the rest of the training projects and keep the best classifier. |
|---|
| 531 | * After that we have for each training project the best classifier as per the evaluation on the rest of the data set. |
|---|
| 532 | * Then we determine the best classifier from these candidates and keep it to be used later. |
|---|
| 533 | */ |
|---|
| 534 | public class GPVClassifier extends AbstractClassifier { |
|---|
| 535 | |
|---|
| 536 | private List<Classifier> classifiers = null; |
|---|
| 537 | private Classifier best = null; |
|---|
| 538 | |
|---|
| 539 | private static final long serialVersionUID = 3708714057579101522L; |
|---|
| 540 | |
|---|
| 541 | protected int populationSize; |
|---|
| 542 | protected int initMinDepth; |
|---|
| 543 | protected int initMaxDepth; |
|---|
| 544 | protected int tournamentSize; |
|---|
| 545 | protected int maxGenerations; |
|---|
| 546 | protected double errorType2Weight; |
|---|
| 547 | protected int numberRuns; |
|---|
| 548 | protected int maxDepth; |
|---|
| 549 | protected int maxNodes; |
|---|
| 550 | |
|---|
        /**
         * Configure the GP Params and number of Runs
         *
         * @param populationSize number of programs in the population
         * @param initMinDepth minimal depth of initially generated programs
         * @param initMaxDepth maximal depth of initially generated programs
         * @param tournamentSize size of the tournament selection
         * @param maxGenerations number of generations to evolve
         * @param errorType2Weight weight of type-2 errors in the fitness function
         * @param numberRuns number of full GP runs per training project
         * @param maxDepth maximal depth within one program
         * @param maxNodes maximal number of nodes within one program
         */
        public void configure(int populationSize, int initMinDepth, int initMaxDepth, int tournamentSize, int maxGenerations, double errorType2Weight, int numberRuns, int maxDepth, int maxNodes) {
            this.populationSize = populationSize;
            this.initMinDepth = initMinDepth;
            this.initMaxDepth = initMaxDepth;
            this.tournamentSize = tournamentSize;
            this.maxGenerations = maxGenerations;
            this.errorType2Weight = errorType2Weight;
            this.numberRuns = numberRuns;
            this.maxDepth = maxDepth;
            this.maxNodes = maxNodes;
        }
|---|
| 572 | |
|---|
| 573 | /** Build the GP Multiple Data Sets Validation Classifier |
|---|
| 574 | * |
|---|
| 575 | * This is according to Section 6 of the Paper by Liu et al. except for the selection of the best model. |
|---|
| 576 | * Section 4 describes a slightly different approach. |
|---|
| 577 | * |
|---|
| 578 | * @param traindataSet |
|---|
| 579 | * @throws Exception |
|---|
| 580 | */ |
|---|
| 581 | public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { |
|---|
| 582 | |
|---|
| 583 | // each classifier is trained with one project from the set |
|---|
| 584 | // then is evaluated on the rest |
|---|
| 585 | for(int i=0; i < traindataSet.size(); i++) { |
|---|
| 586 | |
|---|
| 587 | // candidates we get out of evaluation |
|---|
| 588 | LinkedList<Classifier> candidates = new LinkedList<>(); |
|---|
| 589 | |
|---|
| 590 | // numberRuns full GPRuns, we generate numberRuns models for each traindata |
|---|
| 591 | for(int k=0; k < this.numberRuns; k++) { |
|---|
| 592 | Classifier classifier = new GPRun(); |
|---|
| 593 | ((GPRun)classifier).configure(this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.maxGenerations, this.errorType2Weight, this.maxDepth, this.maxNodes); |
|---|
| 594 | |
|---|
| 595 | classifier.buildClassifier(traindataSet.get(i)); |
|---|
| 596 | |
|---|
| 597 | double[] errors; |
|---|
| 598 | |
|---|
| 599 | // rest of the set is evaluation data, we evaluate now |
|---|
| 600 | for(int j=0; j < traindataSet.size(); j++) { |
|---|
| 601 | if(j != i) { |
|---|
| 602 | // if type1 and type2 errors are < 0.5 we allow the model in the candidate list |
|---|
| 603 | errors = this.evaluate((GPRun)classifier, traindataSet.get(j)); |
|---|
| 604 | if((errors[0] < 0.5) && (errors[0] < 0.5)) { |
|---|
| 605 | candidates.add(classifier); |
|---|
| 606 | } |
|---|
| 607 | } |
|---|
| 608 | } |
|---|
| 609 | } |
|---|
| 610 | |
|---|
| 611 | // after the numberRuns we have < numberRuns candidate models for this trainData |
|---|
| 612 | // we now evaluate the candidates |
|---|
| 613 | // finding the best model is not really described in the paper we go with least errors |
|---|
| 614 | double smallest_error_count = Double.MAX_VALUE; |
|---|
| 615 | double[] errors; |
|---|
| 616 | Classifier best = null; |
|---|
| 617 | for(int ii=0; ii < candidates.size(); ii++) { |
|---|
| 618 | for(int j=0; j < traindataSet.size(); j++) { |
|---|
| 619 | if(j != i) { |
|---|
| 620 | errors = this.evaluate((GPRun)candidates.get(ii), traindataSet.get(j)); |
|---|
| 621 | |
|---|
| 622 | if(errors[0]+errors[1] < smallest_error_count) { |
|---|
| 623 | best = candidates.get(ii); |
|---|
| 624 | } |
|---|
| 625 | } |
|---|
| 626 | } |
|---|
| 627 | } |
|---|
| 628 | |
|---|
| 629 | // now we have the best classifier for this training data |
|---|
| 630 | classifiers.add(best); |
|---|
| 631 | } /* endfor trainData */ |
|---|
| 632 | |
|---|
| 633 | // now we have one best classifier for each trainData |
|---|
| 634 | // we evaluate again to find the best classifier of all time |
|---|
| 635 | // this selection is now according to section 4 of the paper and not 6 where an average of the 6 models is build |
|---|
| 636 | double smallest_error_count = Double.MAX_VALUE; |
|---|
| 637 | double error_count; |
|---|
| 638 | double errors[]; |
|---|
| 639 | for(int j=0; j < classifiers.size(); j++) { |
|---|
| 640 | error_count = 0; |
|---|
| 641 | Classifier current = classifiers.get(j); |
|---|
| 642 | for(int i=0; i < traindataSet.size(); i++) { |
|---|
| 643 | errors = this.evaluate((GPRun)current, traindataSet.get(i)); |
|---|
| 644 | error_count = errors[0] + errors[1]; |
|---|
| 645 | } |
|---|
| 646 | |
|---|
| 647 | if(error_count < smallest_error_count) { |
|---|
| 648 | best = current; |
|---|
| 649 | } |
|---|
| 650 | } |
|---|
| 651 | } |
|---|
| 652 | |
|---|
| 653 | @Override |
|---|
| 654 | public void buildClassifier(Instances traindata) throws Exception { |
|---|
| 655 | final Classifier classifier = new GPRun(); |
|---|
| 656 | ((GPRun)classifier).configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, errorType2Weight, this.maxDepth, this.maxNodes); |
|---|
| 657 | classifier.buildClassifier(traindata); |
|---|
| 658 | classifiers.add(classifier); |
|---|
| 659 | } |
|---|
| 660 | |
|---|
| 661 | /** |
|---|
| 662 | * Evaluation of the Classifier |
|---|
| 663 | * |
|---|
| 664 | * We evaluate the classifier with the Instances of the evalData. |
|---|
| 665 | * It basically assigns the instance attribute values to the variables of the s-expression-tree and |
|---|
| 666 | * then counts the missclassifications. |
|---|
| 667 | * |
|---|
| 668 | * @param classifier |
|---|
| 669 | * @param evalData |
|---|
| 670 | * @return |
|---|
| 671 | */ |
|---|
| 672 | public double[] evaluate(GPRun classifier, Instances evalData) { |
|---|
| 673 | GPGenotype gp = classifier.getGp(); |
|---|
| 674 | Variable[] vars = classifier.getVariables(); |
|---|
| 675 | |
|---|
| 676 | IGPProgram fitest = gp.getAllTimeBest(); // selects the fitest of all not just the last generation |
|---|
| 677 | |
|---|
| 678 | double classification; |
|---|
| 679 | int error_type1 = 0; |
|---|
| 680 | int error_type2 = 0; |
|---|
| 681 | int positive = 0; |
|---|
| 682 | int negative = 0; |
|---|
| 683 | |
|---|
| 684 | for(Instance instance: evalData) { |
|---|
| 685 | |
|---|
| 686 | // assign instance attribute values to the variables of the s-expression-tree |
|---|
| 687 | double[] tmp = WekaUtils.instanceValues(instance); |
|---|
| 688 | for(int i = 0; i < tmp.length; i++) { |
|---|
| 689 | vars[i].set(tmp[i]); |
|---|
| 690 | } |
|---|
| 691 | |
|---|
| 692 | classification = fitest.execute_double(0, vars); |
|---|
| 693 | |
|---|
| 694 | // we need to count the absolutes of positives for percentage |
|---|
| 695 | if(instance.classValue() == 1.0) { |
|---|
| 696 | positive +=1; |
|---|
| 697 | }else { |
|---|
| 698 | negative +=1; |
|---|
| 699 | } |
|---|
| 700 | |
|---|
| 701 | // classification < 0.5 we say defective |
|---|
| 702 | if(classification < 0.5) { |
|---|
| 703 | if(instance.classValue() != 1.0) { |
|---|
| 704 | error_type1 += 1; |
|---|
| 705 | } |
|---|
| 706 | }else { |
|---|
| 707 | if(instance.classValue() == 1.0) { |
|---|
| 708 | error_type2 += 1; |
|---|
| 709 | } |
|---|
| 710 | } |
|---|
| 711 | } |
|---|
| 712 | |
|---|
| 713 | // return error types percentages for the types |
|---|
| 714 | double et1_per = error_type1 / negative; |
|---|
| 715 | double et2_per = error_type2 / positive; |
|---|
| 716 | return new double[]{et1_per, et2_per}; |
|---|
| 717 | } |
|---|
| 718 | |
|---|
| 719 | /** |
|---|
| 720 | * Use only the best classifier from our evaluation phase |
|---|
| 721 | */ |
|---|
| 722 | @Override |
|---|
| 723 | public double classifyInstance(Instance instance) { |
|---|
| 724 | GPGenotype gp = ((GPRun)best).getGp(); |
|---|
| 725 | Variable[] vars = ((GPRun)best).getVariables(); |
|---|
| 726 | |
|---|
| 727 | IGPProgram fitest = gp.getAllTimeBest(); // all time fitest |
|---|
| 728 | for(int i = 0; i < instance.numAttributes()-1; i++) { |
|---|
| 729 | vars[i].set(instance.value(i)); |
|---|
| 730 | } |
|---|
| 731 | |
|---|
| 732 | double classification = fitest.execute_double(0, vars); |
|---|
| 733 | |
|---|
| 734 | if(classification < 0.5) { |
|---|
| 735 | return 1.0; |
|---|
| 736 | }else { |
|---|
| 737 | return 0.0; |
|---|
| 738 | } |
|---|
| 739 | } |
|---|
| 740 | } |
|---|
| 741 | } |
|---|