Context Navigation

GPTraining.java @ 139

Last change on this file since 139 was 136, checked in by sherbold, 8 years ago
more code documentation
File size: 45.7 KB

Line
1
2	package de.ugoe.cs.cpdp.training;
3
4	import java.util.LinkedList;
5	import java.util.List;
6
7	import org.apache.commons.collections4.list.SetUniqueList;
8
9	import weka.classifiers.AbstractClassifier;
10	import weka.classifiers.Classifier;
11	import weka.core.Instance;
12	import weka.core.Instances;
13	import org.apache.commons.lang3.ArrayUtils;
14	import org.jgap.Configuration;
15	import org.jgap.InvalidConfigurationException;
16	import org.jgap.gp.CommandGene;
17	import org.jgap.gp.GPProblem;
18
19	import org.jgap.gp.function.Add;
20	import org.jgap.gp.function.Multiply;
21	import org.jgap.gp.function.Log;
22	import org.jgap.gp.function.Subtract;
23	import org.jgap.gp.function.Divide;
24	import org.jgap.gp.function.Sine;
25	import org.jgap.gp.function.Cosine;
26	import org.jgap.gp.function.Max;
27	import org.jgap.gp.function.Exp;
28
29	import org.jgap.gp.impl.DeltaGPFitnessEvaluator;
30	import org.jgap.gp.impl.GPConfiguration;
31	import org.jgap.gp.impl.GPGenotype;
32	import org.jgap.gp.impl.TournamentSelector;
33	import org.jgap.gp.terminal.Terminal;
34	import org.jgap.gp.GPFitnessFunction;
35	import org.jgap.gp.IGPProgram;
36	import org.jgap.gp.terminal.Variable;
37	import org.jgap.gp.MathCommand;
38	import org.jgap.util.ICloneable;
39
40	import de.ugoe.cs.cpdp.util.WekaUtils;
41
42	import org.jgap.gp.impl.ProgramChromosome;
43	import org.jgap.util.CloneException;
44
45	/**
46	* Genetic Programming Trainer
47	*
48	* Implementation (mostly) according to Liu et al. Evolutionary Optimization of Software Quality
49	* Modeling with Multiple Repositories.
50	*
51	* - GPRun is a Run of a complete Genetic Programm Evolution, we want several complete runs. -
52	* GPVClassifier is the Validation Classifier - GPVVClassifier is the Validation-Voting Classifier
53	*
54	* config: <setwisetrainer name="GPTraining" param="populationSize:1000,numberRuns:10" />
55	*
56	* @author Alexander Trautsch
57	*/
58	public class GPTraining implements ISetWiseTrainingStrategy, IWekaCompatibleTrainer {
59
60	/**
61	* the interal validation-and-voting classifier
62	*/
63	private GPVVClassifier classifier = null;
64
65	/**
66	* size of the population of the genetic program; default from the paper is 1000
67	*/
68	private int populationSize = 1000;
69
70	/**
71	* minimal depth of the S-expression tree at the start of the training; default from the paper
72	* is 2
73	*/
74	private int initMinDepth = 2;
75
76	/**
77	* maximal depth of the S-expression tree at the start of the training; default from the paper
78	* is 6
79	*/
80	private int initMaxDepth = 6;
81
82	/**
83	* size of the tournaments used for selection; default from the paper is 7
84	*/
85	private int tournamentSize = 7;
86
87	/**
88	* number of genetic generations considered (i.e., number of iterations; default from the paper
89	* is 50
90	*/
91	private int maxGenerations = 50;
92
93	/**
94	* weight factor for the prediction errors for cost estimation; default from the paper is 15
95	*/
96	private double errorType2Weight = 15;
97
98	/**
99	* number of internal replications from which the best result is picked; default from the paper
100	* is 20
101	*/
102	private int numberRuns = 20;
103
104	/**
105	* maximal depth of the S-expression tree; default from the paper is 20
106	*/
107	private int maxDepth = 20;
108
109	/**
110	* maximal number of nodes of the S-expression tree; default from the paper is 100
111	*/
112	private int maxNodes = 100;
113
114	/*
115	* (non-Javadoc)
116	*
117	* @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
118	*/
119	@Override
120	public void setParameter(String parameters) {
121
122	String[] params = parameters.split(",");
123	String[] keyvalue = new String[2];
124
125	for (int i = 0; i < params.length; i++) {
126	keyvalue = params[i].split(":");
127
128	switch (keyvalue[0])
129	{
130	case "populationSize":
131	this.populationSize = Integer.parseInt(keyvalue[1]);
132	break;
133
134	case "initMinDepth":
135	this.initMinDepth = Integer.parseInt(keyvalue[1]);
136	break;
137
138	case "tournamentSize":
139	this.tournamentSize = Integer.parseInt(keyvalue[1]);
140	break;
141
142	case "maxGenerations":
143	this.maxGenerations = Integer.parseInt(keyvalue[1]);
144	break;
145
146	case "errorType2Weight":
147	this.errorType2Weight = Double.parseDouble(keyvalue[1]);
148	break;
149
150	case "numberRuns":
151	this.numberRuns = Integer.parseInt(keyvalue[1]);
152	break;
153
154	case "maxDepth":
155	this.maxDepth = Integer.parseInt(keyvalue[1]);
156	break;
157
158	case "maxNodes":
159	this.maxNodes = Integer.parseInt(keyvalue[1]);
160	break;
161	}
162	}
163
164	this.classifier = new GPVVClassifier();
165	((GPVClassifier) this.classifier)
166	.configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations,
167	errorType2Weight, numberRuns, maxDepth, maxNodes);
168	}
169
170	/*
171	* (non-Javadoc)
172	*
173	* @see
174	* de.ugoe.cs.cpdp.training.ISetWiseTrainingStrategy#apply(org.apache.commons.collections4.list.
175	* SetUniqueList)
176	*/
177	@Override
178	public void apply(SetUniqueList<Instances> traindataSet) {
179	try {
180	classifier.buildClassifier(traindataSet);
181	}
182	catch (Exception e) {
183	throw new RuntimeException(e);
184	}
185	}
186
187	/*
188	* (non-Javadoc)
189	*
190	* @see de.ugoe.cs.cpdp.training.ISetWiseTrainingStrategy#getName()
191	*/
192	@Override
193	public String getName() {
194	return "GPTraining";
195	}
196
197	/*
198	* (non-Javadoc)
199	*
200	* @see de.ugoe.cs.cpdp.training.IWekaCompatibleTrainer#getClassifier()
201	*/
202	@Override
203	public Classifier getClassifier() {
204	return this.classifier;
205	}
206
207	/**
208	* <p>
209	* Internal helper class that stores the data in a format that can be used by the genetic
210	* program.
211	* </p>
212	*
213	* @author Alexander Trautsch
214	*/
215	public class InstanceData {
216
217	/**
218	* instances values
219	*/
220	private double[][] instances_x;
221
222	/**
223	* class labels
224	*/
225	private boolean[] instances_y;
226
227	/**
228	* <p>
229	* Constructor. Creates the internal data representation.
230	* </p>
231	*
232	* @param instances
233	*/
234	public InstanceData(Instances instances) {
235	this.instances_x = new double[instances.numInstances()][instances.numAttributes() - 1];
236	this.instances_y = new boolean[instances.numInstances()];
237
238	Instance current;
239	for (int i = 0; i < this.instances_x.length; i++) {
240	current = instances.get(i);
241	this.instances_x[i] = WekaUtils.instanceValues(current);
242	this.instances_y[i] = 1.0 == current.classValue();
243	}
244	}
245
246	/**
247	* <p>
248	* returns the instance values
249	* </p>
250	*
251	* @return the instance values
252	*/
253	public double[][] getX() {
254	return instances_x;
255	}
256
257	/**
258	* <p>
259	* returns the instance labels
260	* </p>
261	*
262	* @return the instance labels
263	*/
264	public boolean[] getY() {
265	return instances_y;
266	}
267	}
268
269	/**
270	* One Run executed by a GP Classifier
271	*/
272	public class GPRun extends AbstractClassifier {
273
274	/**
275	* generated serialization ID
276	*/
277	private static final long serialVersionUID = -4250422550107888789L;
278
279	/**
280	* size of the population of the genetic program
281	*/
282	private int populationSize;
283
284	/**
285	* minimal depth of the S-expression tree at the start of the training
286	*/
287	private int initMinDepth;
288
289	/**
290	* maximal depth of the S-expression tree at the start of the training
291	*/
292	private int initMaxDepth;
293
294	/**
295	* size of the tournaments used for selection
296	*/
297	private int tournamentSize;
298
299	/**
300	* number of genetic generations considered (i.e., number of iterations
301	*/
302	private int maxGenerations;
303
304	/**
305	* weight factor for the prediction errors for cost estimation
306	*/
307	private double errorType2Weight;
308
309	/**
310	* maximal depth of the S-expression tree
311	*/
312	private int maxDepth;
313
314	/**
315	* maximal number of nodes of the S-expression tree
316	*/
317	private int maxNodes;
318
319	/**
320	* genetic program
321	*/
322	private GPGenotype gp;
323
324	/**
325	* description of the problem to be solved by the genetic program
326	*/
327	private GPProblem problem;
328
329	/**
330	* <p>
331	* Configures the runner
332	* </p>
333	*
334	* @param populationSize
335	* the population size
336	* @param initMinDepth
337	* the initial minimal depth of the S-expression tree
338	* @param initMaxDepth
339	* the initial maximal depth of the S-expression tree
340	* @param tournamentSize
341	* the tournament size for selection
342	* @param maxGenerations
343	* the number of generations created
344	* @param errorType2Weight
345	* weigth factor for the prediction errors
346	* @param maxDepth
347	* maximal depth of the S-expression tree
348	* @param maxNodes
349	* maximal number of nodes of the S-expression tree
350	*/
351	public void configure(int populationSize,
352	int initMinDepth,
353	int initMaxDepth,
354	int tournamentSize,
355	int maxGenerations,
356	double errorType2Weight,
357	int maxDepth,
358	int maxNodes)
359	{
360	this.populationSize = populationSize;
361	this.initMinDepth = initMinDepth;
362	this.initMaxDepth = initMaxDepth;
363	this.tournamentSize = tournamentSize;
364	this.maxGenerations = maxGenerations;
365	this.errorType2Weight = errorType2Weight;
366	this.maxDepth = maxDepth;
367	this.maxNodes = maxNodes;
368	}
369
370	/**
371	* <p>
372	* returns the genetic program
373	* </p>
374	*
375	* @return the genetic program
376	*/
377	public GPGenotype getGp() {
378	return this.gp;
379	}
380
381	/**
382	* <p>
383	* returns the variables of the genetic program
384	* </p>
385	*
386	* @return the variables
387	*/
388	public Variable[] getVariables() {
389	return ((CrossPareGP) this.problem).getVariables();
390	}
391
392	/*
393	* (non-Javadoc)
394	*
395	* @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances)
396	*/
397	@Override
398	public void buildClassifier(Instances traindata) throws Exception {
399	InstanceData train = new InstanceData(traindata);
400	this.problem =
401	new CrossPareGP(train.getX(), train.getY(), this.populationSize, this.initMinDepth,
402	this.initMaxDepth, this.tournamentSize, this.errorType2Weight,
403	this.maxDepth, this.maxNodes);
404	this.gp = problem.create();
405	this.gp.evolve(this.maxGenerations);
406	}
407
408	/**
409	* GPProblem implementation
410	*
411	* @author Alexander Trautsch
412	*/
413	class CrossPareGP extends GPProblem {
414
415	/**
416	* Instance values of the training data
417	*/
418	private double[][] instances;
419
420	/**
421	* Classifications of the training data
422	*/
423	private boolean[] output;
424
425	/**
426	* maximal depth of the S-expression tree
427	*/
428	private int maxDepth;
429
430	/**
431	* maximal number of nodes of the S-expression tree
432	*/
433	private int maxNodes;
434
435	/**
436	* variables of the genetic program
437	*/
438	private Variable[] x;
439
440	/**
441	*
442	* <p>
443	* Constructor. Creates a new genetic program.
444	* </p>
445	*
446	* @param instances
447	* instance values of the training data
448	* @param output
449	* classifications of the training data
450	* @param populationSize
451	* the population size
452	* @param minInitDept
453	* the initial minimal depth of the S-expression tree
454	* @param maxInitDepth
455	* the initial maximal depth of the S-expression tree
456	* @param tournamentSize
457	* the tournament size for selection
458	* @param maxGenerations
459	* the number of generations created
460	* @param errorType2Weight
461	* weigth factor for the prediction errors
462	* @param maxDepth
463	* maximal depth of the S-expression tree
464	* @param maxNodes
465	* maximal number of nodes of the S-expression tree
466	* @throws InvalidConfigurationException
467	* thrown in case the problem cannot be created
468	*/
469	public CrossPareGP(double[][] instances,
470	boolean[] output,
471	int populationSize,
472	int minInitDept,
473	int maxInitDepth,
474	int tournamentSize,
475	double errorType2Weight,
476	int maxDepth,
477	int maxNodes)
478	throws InvalidConfigurationException
479	{
480	super(new GPConfiguration());
481
482	this.instances = instances;
483	this.output = output;
484	this.maxDepth = maxDepth;
485	this.maxNodes = maxNodes;
486
487	Configuration.reset();
488	GPConfiguration config = this.getGPConfiguration();
489
490	this.x = new Variable[this.instances[0].length];
491
492	for (int j = 0; j < this.x.length; j++) {
493	this.x[j] = Variable.create(config, "X" + j, CommandGene.DoubleClass);
494	}
495
496	config.setGPFitnessEvaluator(new DeltaGPFitnessEvaluator()); // smaller fitness is
497	// better
498	// config.setGPFitnessEvaluator(new DefaultGPFitnessEvaluator()); // bigger fitness
499	// is better
500
501	config.setMinInitDepth(minInitDept);
502	config.setMaxInitDepth(maxInitDepth);
503
504	config.setCrossoverProb((float) 0.60);
505	config.setReproductionProb((float) 0.10);
506	config.setMutationProb((float) 0.30);
507
508	config.setSelectionMethod(new TournamentSelector(tournamentSize));
509
510	config.setPopulationSize(populationSize);
511
512	config.setMaxCrossoverDepth(4);
513	config.setFitnessFunction(new CrossPareFitness(this.x, this.instances, this.output,
514	errorType2Weight));
515	config.setStrictProgramCreation(true);
516	}
517
518	/**
519	* <p>
520	* Returns the variables of the problem. Used for running the fitness function again for
521	* testing.
522	* </p>
523	*
524	* @return the variables
525	*/
526	public Variable[] getVariables() {
527	return this.x;
528	}
529
530	/**
531	* creates the genetic program
532	*/
533	@SuppressWarnings("rawtypes")
534	public GPGenotype create() throws InvalidConfigurationException {
535	GPConfiguration config = this.getGPConfiguration();
536
537	// return type
538	Class[] types =
539	{ CommandGene.DoubleClass };
540
541	// Arguments of result-producing chromosome: none
542	Class[][] argTypes =
543	{ { } };
544
545	// variables + functions, we set the variables with the values of the instances here
546	CommandGene[] vars = new CommandGene[this.instances[0].length];
547	for (int j = 0; j < this.instances[0].length; j++) {
548	vars[j] = this.x[j];
549	}
550	CommandGene[] funcs =
551	{ new Add(config, CommandGene.DoubleClass),
552	new Subtract(config, CommandGene.DoubleClass),
553	new Multiply(config, CommandGene.DoubleClass),
554	new Divide(config, CommandGene.DoubleClass),
555	new Sine(config, CommandGene.DoubleClass),
556	new Cosine(config, CommandGene.DoubleClass),
557	new Exp(config, CommandGene.DoubleClass),
558	new Log(config, CommandGene.DoubleClass),
559	new GT(config, CommandGene.DoubleClass),
560	new Max(config, CommandGene.DoubleClass),
561	new Terminal(config, CommandGene.DoubleClass, -100.0, 100.0, true), // min,
562	// max,
563	// whole
564	// numbers
565	};
566
567	CommandGene[] comb = (CommandGene[]) ArrayUtils.addAll(vars, funcs);
568	CommandGene[][] nodeSets =
569	{ comb, };
570
571	// we only have one chromosome so this suffices
572	int minDepths[] =
573	{ config.getMinInitDepth() };
574	int maxDepths[] =
575	{ this.maxDepth };
576	GPGenotype result =
577	GPGenotype.randomInitialGenotype(config, types, argTypes, nodeSets, minDepths,
578	maxDepths, this.maxNodes, false); // 40 =
579	// maxNodes,
580	// true =
581	// verbose
582	// output
583
584	return result;
585	}
586	}
587
588	/**
589	* Internal helper class for the fitness function.
590	*
591	* @author Alexander Trautsch
592	*/
593	class CrossPareFitness extends GPFitnessFunction {
594
595	/**
596	* generated serialization ID
597	*/
598	private static final long serialVersionUID = 75234832484387L;
599
600	/**
601	* variables of the genetic program
602	*/
603	private Variable[] x;
604
605	/**
606	* instance values of the training data
607	*/
608	private double[][] instances;
609
610	/**
611	* classifications of the training data
612	*/
613	private boolean[] output;
614
615	/**
616	* weight of the error costs
617	*/
618	private double errorType2Weight = 1.0;
619
620	// needed in evaluate
621	// private Object[] NO_ARGS = new Object[0];
622
623	/**
624	* fitness value
625	*/
626	private double sfitness = 0.0f;
627
628	/**
629	* type I error
630	*/
631	private int errorType1 = 0;
632
633	/**
634	* type II error
635	*/
636	private int errorType2 = 0;
637
638	/**
639	* <p>
640	* Constructor. Creates a new fitness function.
641	* </p>
642	*
643	* @param x
644	* variables of the genetic program
645	* @param instances
646	* instance values of the training data
647	* @param output
648	* classification of the training data
649	* @param errorType2Weight
650	* weight of the error costs
651	*/
652	public CrossPareFitness(Variable[] x,
653	double[][] instances,
654	boolean[] output,
655	double errorType2Weight)
656	{
657	this.x = x;
658	this.instances = instances;
659	this.output = output;
660	this.errorType2Weight = errorType2Weight;
661	}
662
663	/**
664	* <p>
665	* returns the type I error
666	* </p>
667	*
668	* @return type I error
669	*/
670	public int getErrorType1() {
671	return this.errorType1;
672	}
673
674	/**
675	* <p>
676	* returns the type II error
677	* </p>
678	*
679	* @return type II error
680	*/
681	public int getErrorType2() {
682	return this.errorType2;
683	}
684
685	/**
686	* <p>
687	* returns the value of the secondary fitness function
688	* </p>
689	*
690	* @return secondary fitness
691	*/
692	public double getSecondFitness() {
693	return this.sfitness;
694	}
695
696	/**
697	* <p>
698	* returns the number of training instances
699	* </p>
700	*
701	* @return number of instances
702	*/
703	public int getNumInstances() {
704	return this.instances.length;
705	}
706
707	/**
708	* <p>
709	* The fitness function. Our fitness is best if we have the less wrong classifications,
710	* this includes a weight for type2 errors.
711	* </p>
712	*
713	* @param program
714	* the genetic program whose fitness is evaluated.
715	*
716	* @see org.jgap.gp.GPFitnessFunction#evaluate(org.jgap.gp.IGPProgram)
717	*/
718	@Override
719	protected double evaluate(final IGPProgram program) {
720	double pfitness = 0.0f;
721	this.sfitness = 0.0f;
722	double value = 0.0f;
723
724	// count classification errors
725	this.errorType1 = 0;
726	this.errorType2 = 0;
727
728	for (int i = 0; i < this.instances.length; i++) {
729
730	// requires that we have a variable for each column of our dataset (attribute of
731	// instance)
732	for (int j = 0; j < this.x.length; j++) {
733	this.x[j].set(this.instances[i][j]);
734	}
735
736	// value gives us a double, if < 0.5 we set this instance as faulty
737	value = program.execute_double(0, this.x);
738
739	if (value < 0.5) {
740	if (this.output[i] != true) {
741	this.errorType1 += 1;
742	}
743	}
744	else {
745	if (this.output[i] == true) {
746	this.errorType2 += 1;
747	}
748	}
749	}
750
751	// now calc pfitness
752	pfitness = (this.errorType1 + this.errorType2Weight * this.errorType2) /
753	this.instances.length;
754
755	// number of nodes in the programm, if lower then 10 we assign sFitness of 10
756	// we can set metadata with setProgramData to save this
757	if (program.getChromosome(0).getSize(0) < 10) {
758	program.setApplicationData(10.0f);
759	}
760
761	return pfitness;
762	}
763	}
764
765	/**
766	* Custom GT implementation used in the GP Algorithm.
767	*
768	* @author Alexander Trautsch
769	*/
770	public class GT extends MathCommand implements ICloneable {
771
772	/**
773	* generated serialization ID.
774	*/
775	private static final long serialVersionUID = 113454184817L;
776
777	/**
778	* <p>
779	* Constructor. Creates a new GT.
780	* </p>
781	*
782	* @param a_conf
783	* the configuration of the genetic program
784	* @param a_returnType
785	* the return type
786	* @throws InvalidConfigurationException
787	* thrown is there is a problem during the initialization of the super class
788	*
789	* @see MathCommand
790	*/
791	public GT(final GPConfiguration a_conf, @SuppressWarnings("rawtypes") java.lang.Class a_returnType)
792	throws InvalidConfigurationException
793	{
794	super(a_conf, 2, a_returnType);
795	}
796
797	/*
798	* (non-Javadoc)
799	*
800	* @see org.jgap.gp.CommandGene#toString()
801	*/
802	@Override
803	public String toString() {
804	return "GT(&1, &2)";
805	}
806
807	/*
808	* (non-Javadoc)
809	*
810	* @see org.jgap.gp.CommandGene#getName()
811	*/
812	@Override
813	public String getName() {
814	return "GT";
815	}
816
817	/*
818	* (non-Javadoc)
819	*
820	* @see org.jgap.gp.CommandGene#execute_float(org.jgap.gp.impl.ProgramChromosome, int,
821	* java.lang.Object[])
822	*/
823	@Override
824	public float execute_float(ProgramChromosome c, int n, Object[] args) {
825	float f1 = c.execute_float(n, 0, args);
826	float f2 = c.execute_float(n, 1, args);
827
828	float ret = 1.0f;
829	if (f1 > f2) {
830	ret = 0.0f;
831	}
832
833	return ret;
834	}
835
836	/*
837	* (non-Javadoc)
838	*
839	* @see org.jgap.gp.CommandGene#execute_double(org.jgap.gp.impl.ProgramChromosome, int,
840	* java.lang.Object[])
841	*/
842	@Override
843	public double execute_double(ProgramChromosome c, int n, Object[] args) {
844	double f1 = c.execute_double(n, 0, args);
845	double f2 = c.execute_double(n, 1, args);
846
847	double ret = 1;
848	if (f1 > f2) {
849	ret = 0;
850	}
851	return ret;
852	}
853
854	/*
855	* (non-Javadoc)
856	*
857	* @see java.lang.Object#clone()
858	*/
859	@Override
860	public Object clone() {
861	try {
862	GT result = new GT(getGPConfiguration(), getReturnType());
863	return result;
864	}
865	catch (Exception ex) {
866	throw new CloneException(ex);
867	}
868	}
869	}
870	}
871
872	/**
873	* GP Multiple Data Sets Validation-Voting Classifier
874	*
875	* Basically the same as the GP Multiple Data Sets Validation Classifier. But here we do keep a
876	* model candidate for each training set which may later vote
877	*
878	*/
879	public class GPVVClassifier extends GPVClassifier {
880
881	/**
882	* generated serialization ID
883	*/
884	private static final long serialVersionUID = -654710583852839901L;
885
886	/**
887	* classifiers for each validation set
888	*/
889	private List<Classifier> classifiers = null;
890
891	/*
892	* (non-Javadoc)
893	*
894	* @see
895	* de.ugoe.cs.cpdp.training.GPTraining.GPVClassifier#buildClassifier(weka.core.Instances)
896	*/
897	@Override
898	public void buildClassifier(Instances arg0) throws Exception {
899	// TODO Auto-generated method stub
900	}
901
902	/**
903	* Build the GP Multiple Data Sets Validation-Voting Classifier
904	*
905	* This is according to Section 6 of the Paper by Liu et al. It is basically the Multiple
906	* Data Sets Validation Classifier but here we keep the best models an let them vote.
907	*
908	* @param traindataSet
909	* the training data
910	* @throws Exception
911	* thrown in case of a problem with the training
912	*/
913	public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception {
914
915	// each classifier is trained with one project from the set
916	// then is evaluated on the rest
917	classifiers = new LinkedList<>();
918	for (int i = 0; i < traindataSet.size(); i++) {
919
920	// candidates we get out of evaluation
921	LinkedList<Classifier> candidates = new LinkedList<>();
922
923	// number of runs, yields the best of these
924	double smallest_error_count_train = Double.MAX_VALUE;
925	Classifier bestTrain = null;
926	for (int k = 0; k < this.numberRuns; k++) {
927	double[] errors_eval =
928	{ 0.0, 0.0 };
929	Classifier classifier = new GPRun();
930	((GPRun) classifier).configure(this.populationSize, this.initMinDepth,
931	this.initMaxDepth, this.tournamentSize,
932	this.maxGenerations, this.errorType2Weight,
933	this.maxDepth, this.maxNodes);
934
935	// one project is training data
936	classifier.buildClassifier(traindataSet.get(i));
937
938	double[] errors;
939	// rest of the set is evaluation data, we evaluate now
940	for (int j = 0; j < traindataSet.size(); j++) {
941	if (j != i) {
942	// if type1 and type2 errors are < 0.5 we allow the model in the
943	// candidates
944	errors = this.evaluate((GPRun) classifier, traindataSet.get(j));
945	errors_eval[0] += errors[0];
946	errors_eval[1] += errors[1];
947	if ((errors[0] < 0.5) && (errors[1] < 0.5)) {
948	candidates.add(classifier);
949	}
950	}
951	}
952
953	// if the candidate made fewer errors it is now the best
954	if (errors_eval[0] + errors_eval[1] < smallest_error_count_train) {
955	bestTrain = classifier;
956	smallest_error_count_train = errors_eval[0] + errors_eval[1];
957	}
958	}
959
960	// now after the evaluation we do a model selection where only one model remains for
961	// the given training data
962	// we select the model which is best on all evaluation data
963	double smallest_error_count = Double.MAX_VALUE;
964	double[] errors;
965	Classifier best = null;
966	for (int ii = 0; ii < candidates.size(); ii++) {
967	double[] errors_eval =
968	{ 0.0, 0.0 };
969
970	// we add the errors the candidate makes over the evaldata
971	for (int j = 0; j < traindataSet.size(); j++) {
972	if (j != i) {
973	errors = this.evaluate((GPRun) candidates.get(ii), traindataSet.get(j));
974	errors_eval[0] += errors[0];
975	errors_eval[1] += errors[1];
976	}
977	}
978
979	// if the candidate made fewer errors it is now the best
980	if (errors_eval[0] + errors_eval[1] < smallest_error_count) {
981	best = candidates.get(ii);
982	smallest_error_count = errors_eval[0] + errors_eval[1];
983	}
984	}
985
986	if (best == null) {
987	best = bestTrain;
988	}
989	// now we have the best classifier for this training data
990	classifiers.add(best);
991	}
992	}
993
994	/**
995	* Use the best classifiers for each training data in a majority voting
996	*
997	* @param instance
998	* instance that is classified
999	*
1000	* @see de.ugoe.cs.cpdp.training.GPTraining.GPVClassifier#classifyInstance(weka.core.Instance)
1001	*/
1002	@Override
1003	public double classifyInstance(Instance instance) {
1004
1005	int vote_positive = 0;
1006
1007	for (int i = 0; i < classifiers.size(); i++) {
1008	Classifier classifier = classifiers.get(i);
1009
1010	GPGenotype gp = ((GPRun) classifier).getGp();
1011	Variable[] vars = ((GPRun) classifier).getVariables();
1012
1013	IGPProgram fitest = gp.getAllTimeBest(); // all time fitest
1014	for (int j = 0; j < instance.numAttributes() - 1; j++) {
1015	vars[j].set(instance.value(j));
1016	}
1017
1018	if (fitest.execute_double(0, vars) < 0.5) {
1019	vote_positive += 1;
1020	}
1021	}
1022
1023	if (vote_positive >= (classifiers.size() / 2)) {
1024	return 1.0;
1025	}
1026	else {
1027	return 0.0;
1028	}
1029	}
1030	}
1031
1032	/**
1033	* GP Multiple Data Sets Validation Classifier
1034	*
1035	* We train a Classifier with one training project $numberRun times. Then we evaluate the
1036	* classifier on the rest of the training projects and keep the best classifier. After that we
1037	* have for each training project the best classifier as per the evaluation on the rest of the
1038	* data set. Then we determine the best classifier from these candidates and keep it to be used
1039	* later.
1040	*
1041	* @author sherbold Alexander Trautsch
1042	*/
1043	public class GPVClassifier extends AbstractClassifier {
1044
1045	private List<Classifier> classifiers = null;
1046	private Classifier best = null;
1047
1048	private static final long serialVersionUID = 3708714057579101522L;
1049
1050	/**
1051	* size of the population of the genetic program
1052	*/
1053	protected int populationSize;
1054
1055	/**
1056	* minimal depth of the S-expression tree at the start of the training
1057	*/
1058	protected int initMinDepth;
1059
1060	/**
1061	* maximal depth of the S-expression tree at the start of the training
1062	*/
1063	protected int initMaxDepth;
1064
1065	/**
1066	* size of the tournaments used for selection
1067	*/
1068	protected int tournamentSize;
1069
1070	/**
1071	* number of genetic generations considered (i.e., number of iterations
1072	*/
1073	protected int maxGenerations;
1074
1075	/**
1076	* weight factor for the prediction errors for cost estimation
1077	*/
1078	protected double errorType2Weight;
1079
1080	/**
1081	* number of internal replications from which the best result is picked
1082	*/
1083	protected int numberRuns = 20;
1084
1085	/**
1086	* maximal depth of the S-expression tree
1087	*/
1088	protected int maxDepth;
1089
1090	/**
1091	* maximal number of nodes of the S-expression tree
1092	*/
1093	protected int maxNodes;
1094
1095	/**
1096	*
1097	* <p>
1098	* Configure the GP Params and number of Runs
1099	* </p>
1100	*
1101	* @param populationSize
1102	* the population size
1103	* @param initMinDepth
1104	* the initial minimal depth of the S-expression tree
1105	* @param initMaxDepth
1106	* the initial maximal depth of the S-expression tree
1107	* @param tournamentSize
1108	* the tournament size for selection
1109	* @param maxGenerations
1110	* the number of generations created
1111	* @param errorType2Weight
1112	* weigth factor for the prediction errors
1113	* @param numberRuns
1114	* number of internal replications from which the best result is picked
1115	* @param maxDepth
1116	* maximal depth of the S-expression tree
1117	* @param maxNodes
1118	* maximal number of nodes of the S-expression tree
1119	*/
1120	public void configure(int populationSize,
1121	int initMinDepth,
1122	int initMaxDepth,
1123	int tournamentSize,
1124	int maxGenerations,
1125	double errorType2Weight,
1126	int numberRuns,
1127	int maxDepth,
1128	int maxNodes)
1129	{
1130	this.populationSize = populationSize;
1131	this.initMinDepth = initMinDepth;
1132	this.initMaxDepth = initMaxDepth;
1133	this.tournamentSize = tournamentSize;
1134	this.maxGenerations = maxGenerations;
1135	this.errorType2Weight = errorType2Weight;
1136	this.numberRuns = numberRuns;
1137	this.maxDepth = maxDepth;
1138	this.maxNodes = maxNodes;
1139	}
1140
1141	/**
1142	* Build the GP Multiple Data Sets Validation Classifier
1143	*
1144	* This is according to Section 6 of the Paper by Liu et al. except for the selection of the
1145	* best model. Section 4 describes a slightly different approach.
1146	*
1147	* @param traindataSet
1148	* the training data
1149	* @throws Exception
1150	* thrown in case of a problem with the training
1151	*/
1152	public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception {
1153
1154	// each classifier is trained with one project from the set
1155	// then is evaluated on the rest
1156	for (int i = 0; i < traindataSet.size(); i++) {
1157
1158	// candidates we get out of evaluation
1159	LinkedList<Classifier> candidates = new LinkedList<>();
1160
1161	// numberRuns full GPRuns, we generate numberRuns models for each traindata
1162	for (int k = 0; k < this.numberRuns; k++) {
1163	Classifier classifier = new GPRun();
1164	((GPRun) classifier).configure(this.populationSize, this.initMinDepth,
1165	this.initMaxDepth, this.tournamentSize,
1166	this.maxGenerations, this.errorType2Weight,
1167	this.maxDepth, this.maxNodes);
1168
1169	classifier.buildClassifier(traindataSet.get(i));
1170
1171	double[] errors;
1172
1173	// rest of the set is evaluation data, we evaluate now
1174	for (int j = 0; j < traindataSet.size(); j++) {
1175	if (j != i) {
1176	// if type1 and type2 errors are < 0.5 we allow the model in the
1177	// candidate list
1178	errors = this.evaluate((GPRun) classifier, traindataSet.get(j));
1179	if ((errors[0] < 0.5) && (errors[1] < 0.5)) {
1180	candidates.add(classifier);
1181	}
1182	}
1183	}
1184	}
1185
1186	// now after the evaluation we do a model selection where only one model remains for
1187	// the given training data
1188	// we select the model which is best on all evaluation data
1189	double smallest_error_count = Double.MAX_VALUE;
1190	double[] errors;
1191	Classifier best = null;
1192	for (int ii = 0; ii < candidates.size(); ii++) {
1193	double[] errors_eval =
1194	{ 0.0, 0.0 };
1195
1196	// we add the errors the candidate makes over the evaldata
1197	for (int j = 0; j < traindataSet.size(); j++) {
1198	if (j != i) {
1199	errors = this.evaluate((GPRun) candidates.get(ii), traindataSet.get(j));
1200	errors_eval[0] += errors[0];
1201	errors_eval[1] += errors[1];
1202	}
1203	}
1204
1205	// if the candidate made fewer errors it is now the best
1206	if (errors_eval[0] + errors_eval[1] < smallest_error_count) {
1207	best = candidates.get(ii);
1208	smallest_error_count = errors_eval[0] + errors_eval[1];
1209	}
1210	}
1211
1212	// now we have the best classifier for this training data
1213	classifiers.add(best);
1214
1215	} /* endfor trainData */
1216
1217	// now we have one best classifier for each trainData
1218	// we evaluate again to find the best classifier of all time
1219	// this selection is now according to section 4 of the paper and not 6 where an average
1220	// of the 6 models is build
1221	double smallest_error_count = Double.MAX_VALUE;
1222	double error_count;
1223	double errors[];
1224	for (int j = 0; j < classifiers.size(); j++) {
1225	error_count = 0;
1226	Classifier current = classifiers.get(j);
1227	for (int i = 0; i < traindataSet.size(); i++) {
1228	errors = this.evaluate((GPRun) current, traindataSet.get(i));
1229	error_count = errors[0] + errors[1];
1230	}
1231
1232	if (error_count < smallest_error_count) {
1233	best = current;
1234	}
1235	}
1236	}
1237
1238	/*
1239	* (non-Javadoc)
1240	*
1241	* @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances)
1242	*/
1243	@Override
1244	public void buildClassifier(Instances traindata) throws Exception {
1245	final Classifier classifier = new GPRun();
1246	((GPRun) classifier).configure(populationSize, initMinDepth, initMaxDepth,
1247	tournamentSize, maxGenerations, errorType2Weight,
1248	this.maxDepth, this.maxNodes);
1249	classifier.buildClassifier(traindata);
1250	classifiers.add(classifier);
1251	}
1252
1253	/**
1254	* <p>
1255	* Evaluation of the Classifier.
1256	* </p>
1257	* <p>
1258	* We evaluate the classifier with the Instances of the evalData. It basically assigns the
1259	* instance attribute values to the variables of the s-expression-tree and then counts the
1260	* missclassifications.
1261	* </p>
1262	*
1263	* @param classifier
1264	* the classifier that is evaluated
1265	* @param evalData
1266	* the validation data
1267	* @return the type I and type II error rates
1268	*/
1269	public double[] evaluate(GPRun classifier, Instances evalData) {
1270	GPGenotype gp = classifier.getGp();
1271	Variable[] vars = classifier.getVariables();
1272
1273	IGPProgram fitest = gp.getAllTimeBest(); // selects the fitest of all not just the last
1274	// generation
1275
1276	double classification;
1277	int error_type1 = 0;
1278	int error_type2 = 0;
1279	int positive = 0;
1280	int negative = 0;
1281
1282	for (Instance instance : evalData) {
1283
1284	// assign instance attribute values to the variables of the s-expression-tree
1285	double[] tmp = WekaUtils.instanceValues(instance);
1286	for (int i = 0; i < tmp.length; i++) {
1287	vars[i].set(tmp[i]);
1288	}
1289
1290	classification = fitest.execute_double(0, vars);
1291
1292	// we need to count the absolutes of positives for percentage
1293	if (instance.classValue() == 1.0) {
1294	positive += 1;
1295	}
1296	else {
1297	negative += 1;
1298	}
1299
1300	// classification < 0.5 we say defective
1301	if (classification < 0.5) {
1302	if (instance.classValue() != 1.0) {
1303	error_type1 += 1;
1304	}
1305	}
1306	else {
1307	if (instance.classValue() == 1.0) {
1308	error_type2 += 1;
1309	}
1310	}
1311	}
1312
1313	// return error types percentages for the types
1314	double et1_per = error_type1 / negative;
1315	double et2_per = error_type2 / positive;
1316	return new double[]
1317	{ et1_per, et2_per };
1318	}
1319
1320	/**
1321	* Use only the best classifier from our evaluation phase
1322	*
1323	* @param instance
1324	* instance that is classified
1325	*
1326	* @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance)
1327	*/
1328	@Override
1329	public double classifyInstance(Instance instance) {
1330	GPGenotype gp = ((GPRun) best).getGp();
1331	Variable[] vars = ((GPRun) best).getVariables();
1332
1333	IGPProgram fitest = gp.getAllTimeBest(); // all time fitest
1334	for (int i = 0; i < instance.numAttributes() - 1; i++) {
1335	vars[i].set(instance.value(i));
1336	}
1337
1338	double classification = fitest.execute_double(0, vars);
1339
1340	if (classification < 0.5) {
1341	return 1.0;
1342	}
1343	else {
1344	return 0.0;
1345	}
1346	}
1347	}
1348	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/CrossPare/src/de/ugoe/cs/cpdp/training/GPTraining.java @ 139

Download in other formats: