Changeset 135 for trunk/CrossPare
- Timestamp:
- 07/18/16 12:26:03 (9 years ago)
- Location:
- trunk/CrossPare
- Files:
-
- 76 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/ExperimentConfiguration.java
r98 r135 161 161 */ 162 162 private Boolean saveClassifier = null; 163 163 164 164 /** 165 165 * number of repetitions of an experiment (to account for randomness) … … 426 426 return saveClassifier; 427 427 } 428 428 429 429 /** 430 430 * number of repetitions of an experiment … … 579 579 saveClassifier = true; 580 580 } 581 else if (qName.equals("repetitions")) {581 else if (qName.equals("repetitions")) { 582 582 repetitions = Integer.parseInt(attributes.getValue("number")); 583 583 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/Runner.java
r100 r135 53 53 createConfig(threadPool, file.getAbsolutePath()); 54 54 } 55 else if (file.isDirectory() && file.listFiles() !=null) {55 else if (file.isDirectory() && file.listFiles() != null) { 56 56 for (File subfile : file.listFiles()) { 57 57 if (subfile.isFile()) { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/CLAMIProcessor.java
r86 r135 51 51 @Override 52 52 public void setParameter(String parameters) { 53 // TODO Auto-generated method stub 54 53 // dummy, parameters not used 55 54 } 56 55 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/CLAProcessor.java
r86 r135 44 44 @Override 45 45 public void setParameter(String parameters) { 46 // TODO Auto-generated method stub 47 46 // dummy, parameters not used 48 47 } 49 48 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/LogarithmTransform.java
r86 r135 112 112 Instance instance = traindata.instance(i); 113 113 for (int j = 0; j < testdata.numAttributes(); j++) { 114 if (traindata.attribute(j) != classAttribute && traindata.attribute(j).isNumeric()) 114 if (traindata.attribute(j) != classAttribute && 115 traindata.attribute(j).isNumeric()) 115 116 { 116 117 if (instance.value(j) < 0) { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/MORPH.java
r120 r135 25 25 26 26 /** 27 * Implements the MORPH data privatization. 27 * Implements the MORPH data privatization. 28 28 * 29 29 * … … 36 36 */ 37 37 Random rand = new Random(); 38 38 39 39 /** 40 40 * parameter alpha for MORPH, default is 0.15 41 41 */ 42 42 double alpha = 0.15; 43 43 44 44 /** 45 45 * parameter beta for MORPH, default is 0.35 46 46 */ 47 47 double beta = 0.35; 48 48 49 49 /** 50 50 * Does not have parameters. String is ignored. … … 57 57 if (parameters != null && !parameters.equals("")) { 58 58 String[] values = parameters.split(" "); 59 if ( values.length!=2) {59 if (values.length != 2) { 60 60 throw new InvalidParameterException("MORPH requires two doubles as parameter or no parameters to use default values"); 61 61 } … … 63 63 alpha = Double.parseDouble(values[0]); 64 64 beta = Double.parseDouble(values[1]); 65 } catch(NumberFormatException e) { 65 } 66 catch (NumberFormatException e) { 66 67 throw new InvalidParameterException("MORPH requires two doubles as parameter or no parameters to use default values"); 67 68 } … … 75 76 @Override 76 77 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 77 for ( Instances traindata : traindataSet) {78 for (Instances traindata : traindataSet) { 78 79 applyMORPH(traindata); 79 80 } … … 88 89 applyMORPH(traindata); 89 90 } 90 91 91 92 /** 92 93 * … … 95 96 * </p> 96 97 * 97 * @param data data to which the processor is applied 98 * @param data 99 * data to which the processor is applied 98 100 */ 99 101 public void applyMORPH(Instances data) { 100 for (int i =0; i<data.numInstances(); i++) {102 for (int i = 0; i < data.numInstances(); i++) { 101 103 morphInstance(data.get(i), data); 102 104 } 103 105 } 104 106 105 107 /** 106 108 * <p> … … 108 110 * </p> 109 111 * 110 * @param instance instance that is morphed 111 * @param data data based on which the instance is morphed 112 * @param instance 113 * instance that is morphed 114 * @param data 115 * data based on which the 
instance is morphed 112 116 */ 113 117 public void morphInstance(Instance instance, Instances data) { 114 118 Instance nearestUnlikeNeighbor = getNearestUnlikeNeighbor(instance, data); 115 if( nearestUnlikeNeighbor==null ) { 116 throw new RuntimeException("could not find nearest unlike neighbor within the data: " + data.relationName()); 119 if (nearestUnlikeNeighbor == null) { 120 throw new RuntimeException("could not find nearest unlike neighbor within the data: " + 121 data.relationName()); 117 122 } 118 for( int j=0; j<data.numAttributes() ; j++ ) { 119 if( data.attribute(j)!=data.classAttribute() && data.attribute(j).isNumeric()) { 120 double randVal = rand.nextDouble()*(beta-alpha)+alpha; 121 instance.setValue(j, instance.value(j) + randVal*(instance.value(j)-nearestUnlikeNeighbor.value(j)) ); 123 for (int j = 0; j < data.numAttributes(); j++) { 124 if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) { 125 double randVal = rand.nextDouble() * (beta - alpha) + alpha; 126 instance.setValue(j, instance.value(j) + 127 randVal * (instance.value(j) - nearestUnlikeNeighbor.value(j))); 122 128 } 123 129 } 124 130 } 125 131 126 132 /** 127 133 * <p> 128 * Determines the nearest unlike neighbor of an instance. 134 * Determines the nearest unlike neighbor of an instance. 
129 135 * </p> 130 136 * 131 * @param instance instance to which the nearest unlike neighbor is determined 132 * @param data data where the nearest unlike neighbor is determined from 137 * @param instance 138 * instance to which the nearest unlike neighbor is determined 139 * @param data 140 * data where the nearest unlike neighbor is determined from 133 141 * @return nearest unlike instance 134 142 */ 135 143 public Instance getNearestUnlikeNeighbor(Instance instance, Instances data) { 136 144 Instance nearestUnlikeNeighbor = null; 137 138 double[] instanceVector = new double[data.numAttributes() -1];145 146 double[] instanceVector = new double[data.numAttributes() - 1]; 139 147 int tmp = 0; 140 for ( int j=0; j<data.numAttributes(); j++) {141 if ( data.attribute(j)!=data.classAttribute() && data.attribute(j).isNumeric()) {148 for (int j = 0; j < data.numAttributes(); j++) { 149 if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) { 142 150 instanceVector[tmp] = instance.value(j); 143 151 } 144 152 } 145 153 146 154 double minDistance = Double.MAX_VALUE; 147 for ( int i=0 ; i<data.numInstances() ; i++) {148 if ( instance.classValue() != data.instance(i).classValue()) {155 for (int i = 0; i < data.numInstances(); i++) { 156 if (instance.classValue() != data.instance(i).classValue()) { 149 157 double[] otherVector = new double[data.numAttributes() - 1]; 150 158 tmp = 0; 151 159 for (int j = 0; j < data.numAttributes(); j++) { 152 if (data.attribute(j) != data.classAttribute() && data.attribute(j).isNumeric()) { 160 if (data.attribute(j) != data.classAttribute() && 161 data.attribute(j).isNumeric()) 162 { 153 163 otherVector[tmp++] = data.instance(i).value(j); 154 164 } 155 165 } 156 if ( MathArrays.distance(instanceVector, otherVector)<minDistance) {166 if (MathArrays.distance(instanceVector, otherVector) < minDistance) { 157 167 minDistance = MathArrays.distance(instanceVector, otherVector); 158 168 nearestUnlikeNeighbor = 
data.instance(i); -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/MedianAsReference.java
r86 r135 129 129 Instance instance = traindata.instance(i); 130 130 for (int j = 0; j < traindata.numAttributes(); j++) { 131 if (traindata.attribute(j) != classAttribute && traindata.attribute(j).isNumeric()) 131 if (traindata.attribute(j) != classAttribute && 132 traindata.attribute(j).isNumeric()) 132 133 { 133 134 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/NominalAttributeFilter.java
r86 r135 95 95 96 96 // delete all instances where nominal attribute has the value of one of the parameter 97 if (indexOfnominalAttributeValues .contains(wekaInstance98 . value(indexOfConfidenceAttribute)))97 if (indexOfnominalAttributeValues 98 .contains(wekaInstance.value(indexOfConfidenceAttribute))) 99 99 { 100 100 traindata.delete(j); -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Oversampling.java
r86 r135 80 80 81 81 Resample resample = new Resample(); 82 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01);83 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative84 // weniger zurückgegeben85 82 resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]); 86 83 try { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/SynonymAttributePruning.java
r86 r135 59 59 double distance; 60 60 for (int j = traindata.numAttributes() - 1; j >= 0; j--) { 61 if ( j!=traindata.classIndex()) {61 if (j != traindata.classIndex()) { 62 62 boolean hasClosest = false; 63 63 for (int i1 = 0; !hasClosest && i1 < traindata.size(); i1++) { … … 67 67 double distanceJ = Double.MAX_VALUE; 68 68 for (int k = 0; k < traindata.numAttributes(); k++) { 69 distance = Math.abs(traindata.get(i1).value(k) - traindata.get(i2).value(k)); 69 distance = Math 70 .abs(traindata.get(i1).value(k) - traindata.get(i2).value(k)); 70 71 if (distance < minVal) { 71 72 minVal = distance; -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/TCAPlusNormalization.java
r86 r135 19 19 import weka.core.Instances; 20 20 21 // normalization selected according to TCA+ rules (TCA has to be applied separately 21 /** 22 * <p> 23 * Normalization selected according to the TCA+ rules after Nam et al. (Transfer Defect Learning). 24 * </p> 25 * 26 * @author Steffen Herbold 27 */ 22 28 public class TCAPlusNormalization implements IProcessesingStrategy { 23 29 … … 30 36 @Override 31 37 public void setParameter(String parameters) { 32 // TODO Auto-generated method stub 33 38 // dummy, paramters not used 34 39 } 35 40 41 /* 42 * (non-Javadoc) 43 * 44 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 45 * weka.core.Instances) 46 */ 36 47 @Override 37 48 public void apply(Instances testdata, Instances traindata) { 38 49 applyTCAPlus(testdata, traindata); 39 50 } 40 51 41 52 private void applyTCAPlus(Instances testdata, Instances traindata) { 42 53 DistChar dcTest = WekaUtils.datasetDistance(testdata); 43 54 DistChar dcTrain = WekaUtils.datasetDistance(traindata); 44 55 45 56 // RULE 1: 46 if( 0.9*dcTrain.mean<=dcTest.mean && 1.1*dcTrain.mean>=dcTest.mean && 47 0.9*dcTrain.std<=dcTest.std && 1.1*dcTrain.std>=dcTest.std) { 57 if (0.9 * dcTrain.mean <= dcTest.mean && 1.1 * dcTrain.mean >= dcTest.mean && 58 0.9 * dcTrain.std <= dcTest.std && 1.1 * dcTrain.std >= dcTest.std) 59 { 48 60 // do nothing 49 61 } 50 62 // RULE 2: 51 else if((0.4*dcTrain.min>dcTest.min || 1.6*dcTrain.min<dcTest.min) && 52 (0.4*dcTrain.max>dcTest.max || 1.6*dcTrain.min<dcTest.max) && 53 (0.4*dcTrain.min>dcTest.num || 1.6*dcTrain.min<dcTest.num)) { 63 else if ((0.4 * dcTrain.min > dcTest.min || 1.6 * dcTrain.min < dcTest.min) && 64 (0.4 * dcTrain.max > dcTest.max || 1.6 * dcTrain.min < dcTest.max) && 65 (0.4 * dcTrain.min > dcTest.num || 1.6 * dcTrain.min < dcTest.num)) 66 { 54 67 NormalizationUtil.minMax(testdata); 55 68 NormalizationUtil.minMax(traindata); 56 69 } 57 70 // RULE 3: 58 else if((0.4*dcTrain.std>dcTest.std && 
dcTrain.num<dcTest.num) || 59 (1.6*dcTrain.std<dcTest.std)&& dcTrain.num>dcTest.num) { 71 else if ((0.4 * dcTrain.std > dcTest.std && dcTrain.num < dcTest.num) || 72 (1.6 * dcTrain.std < dcTest.std) && dcTrain.num > dcTest.num) 73 { 60 74 NormalizationUtil.zScoreTraining(testdata, traindata); 61 75 } 62 76 // RULE 4: 63 else if((0.4*dcTrain.std>dcTest.std && dcTrain.num>dcTest.num) || 64 (1.6*dcTrain.std<dcTest.std)&& dcTrain.num<dcTest.num) { 77 else if ((0.4 * dcTrain.std > dcTest.std && dcTrain.num > dcTest.num) || 78 (1.6 * dcTrain.std < dcTest.std) && dcTrain.num < dcTest.num) 79 { 65 80 NormalizationUtil.zScoreTarget(testdata, traindata); 66 81 } 67 // RULE 5:82 // RULE 5: 68 83 else { 69 84 NormalizationUtil.zScore(testdata); -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/TopMetricFilter.java
r129 r135 52 52 */ 53 53 double correlationThreshold = 0.5; 54 54 55 55 /* 56 56 * (non-Javadoc) … … 60 60 @Override 61 61 public void setParameter(String parameters) { 62 if ( parameters!=null && !parameters.equals("")) {62 if (parameters != null && !parameters.equals("")) { 63 63 correlationThreshold = Double.parseDouble(parameters); 64 64 } … … 76 76 } 77 77 78 private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet) throws Exception { 79 Integer[] counts = new Integer[traindataSet.get(0).numAttributes()-1]; 80 IntStream.range(0,counts.length).forEach(val -> counts[val] = 0); 81 for( Instances traindata : traindataSet ) { 78 private void determineTopKAttributes(Instances testdata, SetUniqueList<Instances> traindataSet) 79 throws Exception 80 { 81 Integer[] counts = new Integer[traindataSet.get(0).numAttributes() - 1]; 82 IntStream.range(0, counts.length).forEach(val -> counts[val] = 0); 83 for (Instances traindata : traindataSet) { 82 84 J48 decisionTree = new J48(); 83 85 decisionTree.buildClassifier(traindata); 84 int k =0;85 for ( int j=0; j<traindata.numAttributes(); j++) {86 if (j!=traindata.classIndex()){87 if ( decisionTree.toString().contains(traindata.attribute(j).name())) {88 counts[k] = counts[k] +1;86 int k = 0; 87 for (int j = 0; j < traindata.numAttributes(); j++) { 88 if (j != traindata.classIndex()) { 89 if (decisionTree.toString().contains(traindata.attribute(j).name())) { 90 counts[k] = counts[k] + 1; 89 91 } 90 92 k++; … … 93 95 } 94 96 int[] topkIndex = new int[counts.length]; 95 IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val);97 IntStream.range(0, counts.length).forEach(val -> topkIndex[val] = val); 96 98 SortUtils.quicksort(counts, topkIndex, true); 97 99 98 100 // get CFSs for each training set 99 101 List<Set<Integer>> cfsSets = new LinkedList<>(); 100 for ( Instances traindata : traindataSet) {102 for (Instances traindata : traindataSet) { 101 103 boolean selectionSuccessful = 
false; 102 104 boolean secondAttempt = false; … … 113 115 attsel.SelectAttributes(traindataCopy); 114 116 Set<Integer> cfsSet = new HashSet<>(); 115 for ( int attr : attsel.selectedAttributes()) {117 for (int attr : attsel.selectedAttributes()) { 116 118 cfsSet.add(attr); 117 119 } … … 128 130 attsel.SelectAttributes(traindata); 129 131 Set<Integer> cfsSet = new HashSet<>(); 130 for ( int attr : attsel.selectedAttributes()) {132 for (int attr : attsel.selectedAttributes()) { 131 133 cfsSet.add(attr); 132 134 } … … 160 162 while (!selectionSuccessful); // dummy loop for internal continue 161 163 } 162 164 163 165 double[] coverages = new double[topkIndex.length]; 164 for ( Set<Integer> cfsSet : cfsSets) {166 for (Set<Integer> cfsSet : cfsSets) { 165 167 Set<Integer> topkSet = new HashSet<>(); 166 for ( int k=0; k<topkIndex.length ; k++) {168 for (int k = 0; k < topkIndex.length; k++) { 167 169 topkSet.add(topkIndex[k]); 168 coverages[k] += (coverage(topkSet, cfsSet) /traindataSet.size());170 coverages[k] += (coverage(topkSet, cfsSet) / traindataSet.size()); 169 171 } 170 172 } 171 173 double bestCoverageValue = Double.MIN_VALUE; 172 174 int bestCoverageIndex = 0; 173 for ( int i=0; i<coverages.length; i++) {174 if ( coverages[i]>bestCoverageValue) {175 for (int i = 0; i < coverages.length; i++) { 176 if (coverages[i] > bestCoverageValue) { 175 177 bestCoverageValue = coverages[i]; 176 178 bestCoverageIndex = i; … … 180 182 SpearmansCorrelation corr = new SpearmansCorrelation(); 181 183 double[][] correlationMatrix = new double[bestCoverageIndex][bestCoverageIndex]; 182 for ( Instances traindata : traindataSet) {184 for (Instances traindata : traindataSet) { 183 185 double[][] vectors = new double[bestCoverageIndex][traindata.size()]; 184 for ( int i=0; i<traindata.size(); i++) {185 for ( int j=0; j<bestCoverageIndex; j++) {186 for (int i = 0; i < traindata.size(); i++) { 187 for (int j = 0; j < bestCoverageIndex; j++) { 186 188 vectors[j][i] = 
traindata.get(i).value(topkIndex[j]); 187 189 } 188 190 } 189 for ( int j=0; j<bestCoverageIndex; j++) {190 for ( int k=j+1; k<bestCoverageIndex; k++) {191 for (int j = 0; j < bestCoverageIndex; j++) { 192 for (int k = j + 1; k < bestCoverageIndex; k++) { 191 193 correlationMatrix[j][k] = Math.abs(corr.correlation(vectors[j], vectors[k])); 192 194 } … … 194 196 } 195 197 Set<Integer> topkSetIndexSet = new TreeSet<>(); 196 // j<30 ensures that the computational time does not explode since the powerset is 2^n in complexity 197 for( int j=0; j<bestCoverageIndex && j<30 ; j++ ) { 198 // j<30 ensures that the computational time does not explode since the powerset is 2^n in 199 // complexity 200 for (int j = 0; j < bestCoverageIndex && j < 30; j++) { 198 201 topkSetIndexSet.add(j); 199 202 } … … 201 204 double bestOptCoverage = Double.MIN_VALUE; 202 205 Set<Integer> opttopkSetIndexSet = null; 203 for ( Set<Integer> combination : allCombinations) {204 if ( isUncorrelated(correlationMatrix, combination)) {206 for (Set<Integer> combination : allCombinations) { 207 if (isUncorrelated(correlationMatrix, combination)) { 205 208 double currentCoverage = 0.0; 206 209 Set<Integer> topkCombination = new TreeSet<>(); 207 for ( Integer index : combination) {210 for (Integer index : combination) { 208 211 topkCombination.add(topkIndex[index]); 209 212 } 210 for ( Set<Integer> cfsSet : cfsSets) {211 currentCoverage += (coverage(topkCombination, cfsSet) /traindataSet.size());212 } 213 if ( currentCoverage > bestOptCoverage) {213 for (Set<Integer> cfsSet : cfsSets) { 214 currentCoverage += (coverage(topkCombination, cfsSet) / traindataSet.size()); 215 } 216 if (currentCoverage > bestOptCoverage) { 214 217 bestOptCoverage = currentCoverage; 215 218 opttopkSetIndexSet = combination; … … 218 221 } 219 222 Set<Integer> opttopkIndex = new TreeSet<>(); 220 for (Integer index : opttopkSetIndexSet) {223 for (Integer index : opttopkSetIndexSet) { 221 224 opttopkIndex.add(topkIndex[index]); 222 
225 } 223 226 Console.traceln(Level.FINE, "selected the following metrics:"); 224 for (Integer index : opttopkIndex) {227 for (Integer index : opttopkIndex) { 225 228 Console.traceln(Level.FINE, traindataSet.get(0).attribute(index).name()); 226 229 } 227 230 // finally remove attributes 228 for ( int j=testdata.numAttributes()-1; j>=0; j--) {229 if ( j!=testdata.classIndex() && !opttopkIndex.contains(j)) {231 for (int j = testdata.numAttributes() - 1; j >= 0; j--) { 232 if (j != testdata.classIndex() && !opttopkIndex.contains(j)) { 230 233 testdata.deleteAttributeAt(j); 231 for ( Instances traindata : traindataSet) {234 for (Instances traindata : traindataSet) { 232 235 traindata.deleteAttributeAt(j); 233 236 } … … 235 238 } 236 239 } 237 240 238 241 private boolean isUncorrelated(double[][] correlationMatrix, Set<Integer> combination) { 239 242 Integer[] intCombination = combination.toArray(new Integer[0]); 240 243 boolean areUncorrelated = true; 241 for( int i=0 ; areUncorrelated && i<intCombination.length ; i++ ) { 242 for( int j=i+1; areUncorrelated && j<intCombination.length ; j++ ) { 243 areUncorrelated &= correlationMatrix[intCombination[i]][intCombination[j]]>correlationThreshold; 244 for (int i = 0; areUncorrelated && i < intCombination.length; i++) { 245 for (int j = i + 1; areUncorrelated && j < intCombination.length; j++) { 246 areUncorrelated &= 247 correlationMatrix[intCombination[i]][intCombination[j]] > correlationThreshold; 244 248 } 245 249 } 246 250 return areUncorrelated; 247 251 } 248 252 249 253 private double coverage(Set<Integer> topkSet, Set<Integer> cfsSet) { 250 254 Set<Integer> topkSetCopy1 = new HashSet<>(topkSet); … … 252 256 Set<Integer> topkSetCopy2 = new HashSet<>(topkSet); 253 257 topkSetCopy2.addAll(cfsSet); 254 return ((double) topkSetCopy1.size()) /topkSetCopy2.size();258 return ((double) topkSetCopy1.size()) / topkSetCopy2.size(); 255 259 } 256 260 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/TransferComponentAnalysis.java
r86 r135 37 37 * </p> 38 38 * 39 * TODO comment class40 39 * @author Steffen Herbold 41 40 */ 42 41 public class TransferComponentAnalysis implements IProcessesingStrategy { 43 42 43 /** 44 * Dimension of the reduced data. 45 */ 44 46 int reducedDimension = 5; 45 47 48 /* 49 * (non-Javadoc) 50 * 51 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 52 */ 46 53 @Override 47 54 public void setParameter(String parameters) { 48 49 } 50 55 // dummy, paramters ignored 56 } 57 58 /* 59 * (non-Javadoc) 60 * 61 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 62 * weka.core.Instances) 63 */ 51 64 @Override 52 65 public void apply(Instances testdata, Instances traindata) { … … 54 67 } 55 68 69 /** 70 * <p> 71 * calculates the linear kernel function between two instances 72 * </p> 73 * 74 * @param x1 75 * first instance 76 * @param x2 77 * second instance 78 * @return kernel value 79 */ 56 80 private double linearKernel(Instance x1, Instance x2) { 57 81 double value = 0.0d; … … 64 88 } 65 89 90 /** 91 * <p> 92 * Applies TCA to the test and training data. 93 * </p> 94 * 95 * @param testdata 96 * the test data 97 * @param traindata 98 * the training data 99 */ 66 100 private void applyTCA(Instances testdata, Instances traindata) { 67 101 final int sizeTest = testdata.numInstances(); … … 125 159 } 126 160 161 /** 162 * <p> 163 * Creates the kernel matrix of the test and training data 164 * </p> 165 * 166 * @param testdata 167 * the test data 168 * @param traindata 169 * the training data 170 * @return kernel matrix 171 */ 127 172 private PrimitiveMatrix buildKernel(Instances testdata, Instances traindata) { 128 173 final int kernelDim = traindata.numInstances() + testdata.numInstances(); … … 162 207 } 163 208 209 /** 210 * <p> 211 * Calculates the kernel norm matrix, i.e., the matrix which is used for matrix multiplication 212 * to calculate the kernel norm. 
213 * </p> 214 * 215 * @param dimTest 216 * dimension of the test data 217 * @param sizeTrain 218 * number of instances of the training data 219 * @return kernel norm matrix 220 */ 164 221 private PrimitiveMatrix buildKernelNormMatrix(final int dimTest, final int sizeTrain) { 165 222 final double trainSquared = 1.0 / (sizeTrain * (double) sizeTrain); … … 199 256 } 200 257 258 /** 259 * <p> 260 * Creates the center matrix 261 * </p> 262 * 263 * @param sizeTest 264 * number of instances of the test data 265 * @param sizeTrain 266 * number of instances of the training data 267 * @return center matrix 268 */ 201 269 private PrimitiveMatrix buildCenterMatrix(final int sizeTest, final int sizeTrain) { 202 270 Builder<PrimitiveMatrix> centerMatrix = … … 208 276 } 209 277 278 /** 279 * <p> 280 * Builds the mu-Matrix for offsetting values. 281 * </p> 282 * 283 * @param sizeTest 284 * number of instances of the test data 285 * @param sizeTrain 286 * number of instances of the training data 287 * @param mu 288 * mu parameter 289 * @return mu-Matrix 290 */ 210 291 private PrimitiveMatrix buildMuMatrix(final int sizeTest, 211 292 final int sizeTrain, -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Undersampling.java
r86 r135 80 80 81 81 Resample resample = new Resample(); 82 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01);83 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger84 // zurückgegeben85 82 resample.setSampleSizePercent((100.0 * counts[1]) / counts[0]); 86 83 try { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ZScoreTargetNormalization.java
r86 r135 24 24 * @author Steffen Herbold 25 25 */ 26 public class ZScoreTargetNormalization implements ISetWiseProcessingStrategy, IProcessesingStrategy 26 public class ZScoreTargetNormalization 27 implements ISetWiseProcessingStrategy, IProcessesingStrategy 27 28 { 28 29 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/AbstractCharacteristicSelection.java
r86 r135 104 104 } 105 105 else if ("median".equals(characteristics[j])) { 106 instanceValues[i * characteristics.length + j] = Utils.kthSmallestValue(testdata.attributeToDoubleArray(i), testdata.size()/2); 106 instanceValues[i * characteristics.length + j] = 107 Utils.kthSmallestValue(testdata.attributeToDoubleArray(i), 108 testdata.size() / 2); 107 109 } 108 110 else { … … 138 140 } 139 141 else if ("median".equals(characteristics[j])) { 140 instanceValues[i * characteristics.length + j] = Utils.kthSmallestValue(traindata.attributeToDoubleArray(i), traindata.size()/2); 142 instanceValues[i * characteristics.length + j] = 143 Utils.kthSmallestValue(traindata.attributeToDoubleArray(i), 144 traindata.size() / 2); 141 145 } 142 146 else { … … 173 177 } 174 178 catch (Exception e) { 175 throw new RuntimeException( 176 "Unexpected exception during normalization of distributional characteristics.", 179 throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", 177 180 e); 178 181 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/CLIFF.java
r120 r135 28 28 public class CLIFF implements IPointWiseDataselectionStrategy, ISetWiseDataselectionStrategy { 29 29 30 /** 31 * percentage of data selected 32 */ 30 33 private double percentage = 0.10; 31 34 35 /** 36 * number of ranges considered 37 */ 32 38 private final int numRanges = 10; 33 39 … … 40 46 @Override 41 47 public void setParameter(String parameters) { 42 if ( parameters!=null) {48 if (parameters != null) { 43 49 percentage = Double.parseDouble(parameters); 44 50 } 45 51 } 46 47 /* *52 53 /* 48 54 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 49 * 55 * org.apache.commons.collections4.list.SetUniqueList) 50 56 */ 51 57 @Override 52 58 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 53 for ( Instances traindata : traindataSet) {59 for (Instances traindata : traindataSet) { 54 60 applyCLIFF(traindata); 55 61 } 56 62 } 57 63 58 /* *64 /* 59 65 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, 60 * 66 * weka.core.Instances) 61 67 */ 62 68 @Override … … 65 71 } 66 72 73 /** 74 * <p> 75 * Applies the CLIFF relevancy filter to the data. 
76 * </p> 77 * 78 * @param data 79 * the data 80 * @return CLIFF-filtered data 81 */ 67 82 protected Instances applyCLIFF(Instances data) { 68 83 final double[][] powerAttributes = new double[data.size()][data.numAttributes()]; 69 84 final double[] powerEntity = new double[data.size()]; 70 85 71 86 final int[] counts = data.attributeStats(data.classIndex()).nominalCounts; 72 87 final double probDefect = data.numInstances() / (double) counts[1]; 73 74 for ( int j=0; j<data.numAttributes(); j++) {75 if ( data.attribute(j)!=data.classAttribute()) {88 89 for (int j = 0; j < data.numAttributes(); j++) { 90 if (data.attribute(j) != data.classAttribute()) { 76 91 final double[] ranges = getRanges(data, j); 77 92 final double[] probDefectRange = getRangeProbabilities(data, j, ranges); 78 79 for ( int i=0 ; i<data.numInstances() ; i++) {93 94 for (int i = 0; i < data.numInstances(); i++) { 80 95 final double value = data.instance(i).value(j); 81 96 final int range = determineRange(ranges, value); 82 97 double probClass, probNotClass, probRangeClass, probRangeNotClass; 83 if ( data.instance(i).classValue()==1) {98 if (data.instance(i).classValue() == 1) { 84 99 probClass = probDefect; 85 probNotClass = 1.0 -probDefect;100 probNotClass = 1.0 - probDefect; 86 101 probRangeClass = probDefectRange[range]; 87 probRangeNotClass = 1.0-probDefectRange[range]; 88 } else { 89 probClass = 1.0-probDefect; 102 probRangeNotClass = 1.0 - probDefectRange[range]; 103 } 104 else { 105 probClass = 1.0 - probDefect; 90 106 probNotClass = probDefect; 91 probRangeClass = 1.0 -probDefectRange[range];107 probRangeClass = 1.0 - probDefectRange[range]; 92 108 probRangeNotClass = probDefectRange[range]; 93 109 } 94 powerAttributes[i][j] = Math.pow(probRangeClass, 2.0)/(probRangeClass*probClass+probRangeNotClass*probNotClass); 110 powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) / 111 (probRangeClass * probClass + probRangeNotClass * probNotClass); 95 112 } 96 113 } 97 114 } 98 99 for ( int i=0; 
i<data.numInstances(); i++) {115 116 for (int i = 0; i < data.numInstances(); i++) { 100 117 powerEntity[i] = 1.0; 101 for (int j =0; j<data.numAttributes() ; j++) {118 for (int j = 0; j < data.numAttributes(); j++) { 102 119 powerEntity[i] *= powerAttributes[i][j]; 103 120 } … … 105 122 double[] sortedPower = powerEntity.clone(); 106 123 Arrays.sort(sortedPower); 107 double cutOff = sortedPower[(int) (data.numInstances() *(1-percentage))];124 double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))]; 108 125 109 126 final Instances selected = new Instances(data); 110 127 selected.delete(); 111 for (int i =0; i<data.numInstances(); i++) {112 if ( powerEntity[i]>=cutOff) {128 for (int i = 0; i < data.numInstances(); i++) { 129 if (powerEntity[i] >= cutOff) { 113 130 selected.add(data.instance(i)); 114 131 } … … 116 133 return selected; 117 134 } 118 135 136 /** 137 * <p> 138 * Gets an array with the ranges from the data for a given attribute 139 * </p> 140 * 141 * @param data 142 * the data 143 * @param j 144 * index of the attribute 145 * @return the ranges for the attribute 146 */ 119 147 private double[] getRanges(Instances data, int j) { 120 double[] values = new double[numRanges +1];121 for ( int k=0; k<numRanges; k++) {122 values[k] = data.kthSmallestValue(j, (int) (data.size() *(k+1.0)/numRanges));148 double[] values = new double[numRanges + 1]; 149 for (int k = 0; k < numRanges; k++) { 150 values[k] = data.kthSmallestValue(j, (int) (data.size() * (k + 1.0) / numRanges)); 123 151 } 124 152 values[numRanges] = data.attributeStats(j).numericStats.max; 125 153 return values; 126 154 } 127 155 156 /** 157 * <p> 158 * Gets the probabilities of a positive prediction for each range for a given attribute 159 * </p> 160 * 161 * @param data 162 * the data 163 * @param j 164 * index of the attribute 165 * @param ranges 166 * the ranges 167 * @return probabilities for each range 168 */ 128 169 private double[] getRangeProbabilities(Instances data, int 
j, double[] ranges) { 129 170 double[] probDefectRange = new double[numRanges]; 130 171 int[] countRange = new int[numRanges]; 131 172 int[] countDefect = new int[numRanges]; 132 for ( int i=0; i<data.numInstances() ; i++) {133 int range = determineRange(ranges, data.instance(i).value(j)); 173 for (int i = 0; i < data.numInstances(); i++) { 174 int range = determineRange(ranges, data.instance(i).value(j)); 134 175 countRange[range]++; 135 if ( data.instance(i).classValue()== 1) {176 if (data.instance(i).classValue() == 1) { 136 177 countDefect[range]++; 137 178 } 138 179 139 180 } 140 for ( int k=0; k<numRanges; k++) {181 for (int k = 0; k < numRanges; k++) { 141 182 probDefectRange[k] = ((double) countDefect[k]) / countRange[k]; 142 183 } 143 184 return probDefectRange; 144 185 } 145 186 187 /** 188 * <p> 189 * Determines the range of a give value 190 * </p> 191 * 192 * @param ranges 193 * the possible ranges 194 * @param value 195 * the value 196 * @return index of the range 197 */ 146 198 private int determineRange(double[] ranges, double value) { 147 for ( int k=0; k<numRanges; k++) {148 if ( value<=ranges[k+1]) {199 for (int k = 0; k < numRanges; k++) { 200 if (value <= ranges[k + 1]) { 149 201 return k; 150 202 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/DBSCANFilter.java
r92 r135 99 99 .valid(); clusterIter.advance()) 100 100 { 101 int internalIndex = clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex; 101 int internalIndex = 102 clusterIter.internalGetIndex() - testdata.size() - firstInternalIndex; 102 103 if (internalIndex >= 0) { 103 104 // index belongs to a training instance -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/DecisionTreeSelection.java
r116 r135 84 84 } 85 85 REPTree repTree = new REPTree(); 86 if ( repTree.getNumFolds()>similarityData.size()) {86 if (repTree.getNumFolds() > similarityData.size()) { 87 87 repTree.setNumFolds(similarityData.size()); 88 88 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/LACE2.java
r120 r135 12 12 // See the License for the specific language governing permissions and 13 13 // limitations under the License. 14 15 14 16 15 package de.ugoe.cs.cpdp.dataselection; … … 39 38 public class LACE2 implements ISetWiseDataselectionStrategy { 40 39 40 /** 41 * percentage of data selected by the internal CLIFF. 42 */ 41 43 private double percentage = 0.10; 42 44 45 /* 46 * (non-Javadoc) 47 * 48 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 49 */ 43 50 @Override 44 51 public void setParameter(String parameters) { 45 if ( parameters!=null && !parameters.isEmpty()) {52 if (parameters != null && !parameters.isEmpty()) { 46 53 percentage = Double.parseDouble(parameters); 47 54 } 48 55 } 49 56 57 /* 58 * (non-Javadoc) 59 * 60 * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, 61 * org.apache.commons.collections4.list.SetUniqueList) 62 */ 50 63 @Override 51 64 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 52 65 Instances selectedData = new Instances(testdata); 53 66 selectedData.clear(); 54 67 55 68 LinkedList<Instances> traindataCopy = new LinkedList<>(traindataSet); 56 69 Collections.shuffle(traindataCopy); 57 70 58 71 CLIFF cliff = new CLIFF(); 59 72 cliff.setParameter(Double.toString(percentage)); … … 61 74 Median median = new Median(); 62 75 double minDist = Double.MIN_VALUE; 63 64 for ( Instances traindata : traindataCopy) {76 77 for (Instances traindata : traindataCopy) { 65 78 Instances cliffedData = cliff.applyCLIFF(traindata); 66 if ( minDist==Double.MIN_VALUE) {79 if (minDist == Double.MIN_VALUE) { 67 80 // determine distance for leader-follower algorithm 68 81 Instances sample; 69 if ( traindata.size()>100) {82 if (traindata.size() > 100) { 70 83 Resample resample = new Resample(); 71 resample.setSampleSizePercent(100.0 /traindata.size()*100.0);84 resample.setSampleSizePercent(100.0 / traindata.size() * 100.0); 72 85 
resample.setBiasToUniformClass(0.0); 73 86 resample.setNoReplacement(true); … … 79 92 throw new RuntimeException(e); 80 93 } 81 } else { 94 } 95 else { 82 96 sample = new Instances(traindata); 83 97 } 84 98 double[] distances = new double[sample.size()]; 85 for ( int i=0; i<sample.size(); i++) {99 for (int i = 0; i < sample.size(); i++) { 86 100 Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(sample.get(i), sample); 87 distances[i] = MathArrays.distance(WekaUtils.instanceValues(sample.get(i)), WekaUtils.instanceValues(unlikeNeighbor)); 101 distances[i] = MathArrays.distance(WekaUtils.instanceValues(sample.get(i)), 102 WekaUtils.instanceValues(unlikeNeighbor)); 88 103 } 89 104 minDist = median.evaluate(distances); 90 105 } 91 for( int i=0; i<cliffedData.size(); i++ ) { 92 Instance unlikeNeighbor = morph.getNearestUnlikeNeighbor(cliffedData.get(i), selectedData); 93 if( unlikeNeighbor==null ) { 106 for (int i = 0; i < cliffedData.size(); i++) { 107 Instance unlikeNeighbor = 108 morph.getNearestUnlikeNeighbor(cliffedData.get(i), selectedData); 109 if (unlikeNeighbor == null) { 94 110 selectedData.add(cliffedData.get(i)); 95 } else { 96 double distance = MathArrays.distance(WekaUtils.instanceValues(cliffedData.get(i)), WekaUtils.instanceValues(unlikeNeighbor)); 97 if( distance>minDist ) { 111 } 112 else { 113 double distance = 114 MathArrays.distance(WekaUtils.instanceValues(cliffedData.get(i)), 115 WekaUtils.instanceValues(unlikeNeighbor)); 116 if (distance > minDist) { 98 117 morph.morphInstance(cliffedData.get(i), cliffedData); 99 118 selectedData.add(cliffedData.get(i)); … … 103 122 } 104 123 } 105 124 106 125 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/MahalanobisOutlierRemoval.java
r117 r135 97 97 RealMatrix inverseCovariance; 98 98 try { 99 inverseCovariance = 100 new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver() 101 .getInverse(); 102 } catch(SingularMatrixException e) { 103 Console.traceln(Level.WARNING, "could not perform Mahalanobis outlier removal due to singular covariance matrix"); 99 inverseCovariance = new LUDecomposition(new Covariance(values).getCovarianceMatrix()) 100 .getSolver().getInverse(); 101 } 102 catch (SingularMatrixException e) { 103 Console 104 .traceln(Level.WARNING, 105 "could not perform Mahalanobis outlier removal due to singular covariance matrix"); 104 106 return; 105 107 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/NeighborhoodFilter.java
r86 r135 36 36 @Override 37 37 public void setParameter(String parameters) { 38 // TODO Auto-generated method stub 39 38 // dummy, parameters not used 40 39 } 41 40 … … 56 55 * </p> 57 56 * 58 * @param testdata test data 59 * @param traindata training data 57 * @param testdata 58 * test data 59 * @param traindata 60 * training data 60 61 * @return filtered training data 61 62 */ -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PetersFilter.java
r86 r135 27 27 28 28 /** 29 * Filter according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction <br> 29 * Filter according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction 30 * <br> 30 31 * <br> 31 32 * This filter does not work, the paper has been withdrawn. … … 36 37 public class PetersFilter implements IPointWiseDataselectionStrategy { 37 38 38 /* *39 /* 39 40 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 40 41 */ … … 44 45 } 45 46 46 /* *47 /* 47 48 * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances, 48 * 49 * weka.core.Instances) 49 50 */ 50 51 @Override -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PointWiseEMClusterSelection.java
r86 r135 31 31 * Use in Config: 32 32 * 33 * Specify number of clusters -N = Num Clusters <pointwiseselector34 * name="PointWiseEMClusterSelection" param="-N 10"/>33 * Specify number of clusters -N = Num Clusters 34 * <pointwiseselector name="PointWiseEMClusterSelection" param="-N 10"/> 35 35 * 36 36 * Try to determine the number of clusters: -I 10 = max iterations -X 5 = 5 folds for cross 37 * evaluation -max = max number of clusters <pointwiseselector name="PointWiseEMClusterSelection"38 * param="-I 10 -X 5 -max 300"/>37 * evaluation -max = max number of clusters 38 * <pointwiseselector name="PointWiseEMClusterSelection" param="-I 10 -X 5 -max 300"/> 39 39 * 40 40 * Don't forget to add: <preprocessor name="Normalization" param=""/> … … 42 42 public class PointWiseEMClusterSelection implements IPointWiseDataselectionStrategy { 43 43 44 /** 45 * parameters passed to the selection 46 */ 44 47 private String[] params; 45 48 49 /* 50 * (non-Javadoc) 51 * 52 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 53 */ 46 54 @Override 47 55 public void setParameter(String parameters) { … … 108 116 } 109 117 110 Console.traceln(Level.INFO, 111 String.format("our testdata is in: " + selectedCluster.size() + 112 " different clusters")); 118 Console.traceln(Level.INFO, String 119 .format("our testdata is in: " + selectedCluster.size() + " different clusters")); 113 120 114 121 // 5. 
get cluster membership of our traindata … … 127 134 for (int j = 0; j < ctrain.numInstances(); j++) { 128 135 // get the cluster number from the attributes 129 cnumber = 130 Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes() - 1) 131 .replace("cluster", "")); 136 cnumber = Integer.parseInt(ctrain.get(j) 137 .stringValue(ctrain.get(j).numAttributes() - 1).replace("cluster", "")); 132 138 133 139 // Console.traceln(Level.INFO, … … 145 151 } 146 152 147 Console.traceln(Level.INFO, 148 String.format("that leaves us with: " + selected.numInstances() + 149 " traindata instances from " + traindata.numInstances())); 153 Console.traceln(Level.INFO, String.format("that leaves us with: " + 154 selected.numInstances() + " traindata instances from " + traindata.numInstances())); 150 155 } 151 156 catch (Exception e) { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SeparatabilitySelection.java
r86 r135 86 86 inst.setClassValue(1.0); 87 87 sample.add(inst); 88 inst = 89 new DenseInstance( 90 traindata.instance(rand.nextInt(traindata.numInstances()))); 88 inst = new DenseInstance(traindata 89 .instance(rand.nextInt(traindata.numInstances()))); 91 90 inst.setDataset(sample); 92 91 inst.setClassValue(0.0); … … 101 100 } 102 101 catch (Exception e) { 103 throw new RuntimeException( 104 "cross-validation during calculation of separatability failed", 102 throw new RuntimeException("cross-validation during calculation of separatability failed", 105 103 e); 106 104 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMClusterSelection.java
r86 r135 74 74 } 75 75 catch (Exception e) { 76 throw new RuntimeException( 77 "error applying setwise EM clustering training data selection", 76 throw new RuntimeException("error applying setwise EM clustering training data selection", 78 77 e); 79 78 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java
r86 r135 41 41 public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy { 42 42 43 /** 44 * context factors 45 */ 43 46 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 44 47 48 /* 49 * (non-Javadoc) 50 * 51 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 52 */ 45 53 @Override 46 54 public void setParameter(String parameters) { … … 103 111 } 104 112 catch (Exception e) { 105 throw new RuntimeException( 106 "error applying setwise EM clustering training data selection", 113 throw new RuntimeException("error applying setwise EM clustering training data selection", 107 114 e); 108 115 } 109 116 } 110 117 118 /* 119 * (non-Javadoc) 120 * 121 * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, 122 * org.apache.commons.collections4.list.SetUniqueList) 123 */ 111 124 @Override 112 125 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { … … 131 144 * @return 132 145 */ 133 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) 146 protected Instances getContextFactors(Instances testdata, 147 SetUniqueList<Instances> traindataSet) 134 148 { 135 149 // setup weka Instances for clustering … … 190 204 remove.add(traindata); 191 205 // Console.traceln(Level.WARNING, 192 // "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 206 // "rmove attribute "+attribute+" test: 207 // "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: 208 // "+traindata.firstInstance().value(traindata.attribute(attribute))); 193 209 } 194 210 } … … 218 234 } 219 235 catch (Exception e) { 220 throw new RuntimeException( 221 "Unexpected exception during normalization of distributional characteristics.", 236 throw new RuntimeException("Unexpected exception during 
normalization of distributional characteristics.", 222 237 e); 223 238 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseKNNSelection.java
r86 r135 71 71 int closestIndex = 1; 72 72 for (int i = 1; i < data.numInstances(); i++) { 73 double distance = 74 MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i) 75 .toDoubleArray()); 73 double distance = MathArrays.distance(data.instance(0).toDoubleArray(), 74 data.instance(i).toDoubleArray()); 76 75 if (distance < closestDistance) { 77 76 closestDistance = distance; -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SynonymOutlierRemoval.java
r86 r135 19 19 /** 20 20 * <p> 21 * Synonym outlier removal after Amasaki et al. (2015). 21 * Synonym outlier removal after Amasaki et al. (2015). 22 22 * </p> 23 23 * … … 26 26 public class SynonymOutlierRemoval implements IPointWiseDataselectionStrategy { 27 27 28 /* (non-Javadoc) 28 /* 29 * (non-Javadoc) 30 * 29 31 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 30 32 */ … … 34 36 } 35 37 36 /* (non-Javadoc) 37 * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances, weka.core.Instances) 38 /* 39 * (non-Javadoc) 40 * 41 * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances, 42 * weka.core.Instances) 38 43 */ 39 44 @Override … … 48 53 * </p> 49 54 * 50 * @param traindata data from which the outliers are removed. 55 * @param traindata 56 * data from which the outliers are removed. 51 57 */ 52 58 public void applySynonymRemoval(Instances traindata) { 53 double minDistance[][] = new double[traindata.size()][traindata.numAttributes() -1];54 double minDistanceAttribute[] = new double[traindata.numAttributes() -1];59 double minDistance[][] = new double[traindata.size()][traindata.numAttributes() - 1]; 60 double minDistanceAttribute[] = new double[traindata.numAttributes() - 1]; 55 61 double distance; 56 for ( int j=0; j<minDistanceAttribute.length; j++) {62 for (int j = 0; j < minDistanceAttribute.length; j++) { 57 63 minDistanceAttribute[j] = Double.MAX_VALUE; 58 64 } 59 for (int i1 = traindata.size() -1; i1 < traindata.size(); i1++) {60 int k =0;65 for (int i1 = traindata.size() - 1; i1 < traindata.size(); i1++) { 66 int k = 0; 61 67 for (int j = 0; j < traindata.numAttributes(); j++) { 62 if ( j!=traindata.classIndex()) {68 if (j != traindata.classIndex()) { 63 69 minDistance[i1][k] = Double.MAX_VALUE; 64 70 for (int i2 = 0; i2 < traindata.size(); i2++) { 65 71 if (i1 != i2) { 66 distance = Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j)); 72 
distance = 73 Math.abs(traindata.get(i1).value(j) - traindata.get(i2).value(j)); 67 74 if (distance < minDistance[i1][k]) { 68 75 minDistance[i1][k] = distance; 69 76 } 70 if ( distance < minDistanceAttribute[k]) {77 if (distance < minDistanceAttribute[k]) { 71 78 minDistanceAttribute[k] = distance; 72 79 } … … 77 84 } 78 85 } 79 for ( int i=traindata.size()-1; i>=0; i--) {86 for (int i = traindata.size() - 1; i >= 0; i--) { 80 87 boolean hasClosest = false; 81 for ( int j=0; !hasClosest && j<traindata.numAttributes(); j++) {82 hasClosest = minDistance[i][j] <=minDistanceAttribute[j];88 for (int j = 0; !hasClosest && j < traindata.numAttributes(); j++) { 89 hasClosest = minDistance[i][j] <= minDistanceAttribute[j]; 83 90 } 84 if ( !hasClosest) {91 if (!hasClosest) { 85 92 traindata.delete(i); 86 93 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/AbstractWekaEvaluation.java
r132 r135 36 36 * <ul> 37 37 * <li>succHe: Success with recall>0.7, precision>0.5</li> 38 * <li>succZi: Success with recall> 0.7, precision>0.7</li>38 * <li>succZi: Success with recall>=0.75, precision>=0.75, and error<=0.25</li> 39 39 * <li>succG75: Success with gscore>0.75</li> 40 40 * <li>succG60: Success with gscore>0.6</li> … … 66 66 private PrintWriter output = new PrintWriter(System.out); 67 67 68 /** 69 * flag that defines if the output is the system out 70 */ 68 71 private boolean outputIsSystemOut = true; 69 72 73 /** 74 * name of the configuration 75 */ 70 76 private String configurationName = "default"; 71 77 … … 92 98 Instances traindata, 93 99 List<ITrainer> trainers, 94 List<Double> efforts, 100 List<Double> efforts, 95 101 boolean writeHeader, 96 102 List<IResultStorage> storages) … … 99 105 final List<ExperimentResult> experimentResults = new LinkedList<>(); 100 106 String productName = testdata.relationName(); 101 107 102 108 for (ITrainer trainer : trainers) { 103 109 if (trainer instanceof IWekaCompatibleTrainer) { 104 110 classifiers.add(((IWekaCompatibleTrainer) trainer).getClassifier()); 105 experimentResults.add(new ExperimentResult(configurationName, productName, ((IWekaCompatibleTrainer) trainer).getName())); 111 experimentResults 112 .add(new ExperimentResult(configurationName, productName, 113 ((IWekaCompatibleTrainer) trainer).getName())); 106 114 } 107 115 else { … … 153 161 double aucec = calculateReviewEffort(testdata, classifier, efforts); 154 162 double succHe = eval.recall(1) >= 0.7 && eval.precision(1) >= 0.5 ? 1.0 : 0.0; 155 double succZi = eval.recall(1) >= 0.7 && eval.precision(1) >= 0.7? 1.0 : 0.0;163 double succZi = eval.recall(1) >= 0.75 && eval.precision(1) >= 0.75 && eval.errorRate()<=0.25 ? 1.0 : 0.0; 156 164 double succG75 = gmeasure > 0.75 ? 1.0 : 0.0; 157 165 double succG60 = gmeasure > 0.6 ? 
1.0 : 0.0; 158 166 159 167 output.append("," + succHe); 160 168 output.append("," + succZi); 161 169 output.append("," + succG75); 162 output.append("," + succG60); 170 output.append("," + succG60); 163 171 output.append("," + eval.errorRate()); 164 172 output.append("," + eval.recall(1)); … … 177 185 output.append("," + eval.numTrueNegatives(1)); 178 186 output.append("," + eval.numFalsePositives(1)); 179 187 180 188 ExperimentResult result = resultIter.next(); 181 189 result.setSizeTestData(testdata.numInstances()); 182 190 result.setSizeTrainingData(traindata.numInstances()); 183 result.setSuccHe(succHe);184 result.setSuccZi(succZi);185 result.setSuccG75(succG75);186 result.setSuccG60(succG60);187 191 result.setError(eval.errorRate()); 188 192 result.setRecall(eval.recall(1)); … … 201 205 result.setTn(eval.numTrueNegatives(1)); 202 206 result.setFp(eval.numFalsePositives(1)); 203 for ( IResultStorage storage : storages) {207 for (IResultStorage storage : storages) { 204 208 storage.addResult(result); 205 209 } … … 209 213 output.flush(); 210 214 } 211 212 private double calculateReviewEffort(Instances testdata, Classifier classifier, List<Double> efforts) { 213 if( efforts==null ) { 215 216 /** 217 * <p> 218 * Calculates the effort. TODO: IMPLEMENTATION BUGGY! MUST BE FIXED! 
219 * </p> 220 * 221 * @param testdata 222 * the test data 223 * @param classifier 224 * the classifier 225 * @param efforts 226 * the effort information for each instance in the test data 227 * @return 228 */ 229 private double calculateReviewEffort(Instances testdata, 230 Classifier classifier, 231 List<Double> efforts) 232 { 233 if (efforts == null) { 214 234 return 0; 215 235 } 216 236 217 237 final List<Integer> bugPredicted = new ArrayList<>(); 218 238 final List<Integer> nobugPredicted = new ArrayList<>(); … … 229 249 } 230 250 catch (Exception e) { 231 throw new RuntimeException( 232 "unexpected error during the evaluation of the review effort", 251 throw new RuntimeException("unexpected error during the evaluation of the review effort", 233 252 e); 234 253 } … … 297 316 } 298 317 318 /** 319 * <p> 320 * Calculates effort. Deprecated. Do not use! 321 * </p> 322 * 323 * @param testdata 324 * the test data 325 * @param classifier 326 * the classifier 327 * @return 328 */ 299 329 @SuppressWarnings("unused") 300 330 @Deprecated … … 315 345 loc = testdata.attribute("CountLineCodeExe"); 316 346 } 317 if ( loc == null) {347 if (loc == null) { 318 348 return 0.0; 319 349 } … … 333 363 } 334 364 catch (Exception e) { 335 throw new RuntimeException( 336 "unexpected error during the evaluation of the review effort", 365 throw new RuntimeException("unexpected error during the evaluation of the review effort", 337 366 e); 338 367 } … … 419 448 output = new PrintWriter(new FileOutputStream(parameters)); 420 449 outputIsSystemOut = false; 421 int filenameStart = parameters.lastIndexOf('/') +1;450 int filenameStart = parameters.lastIndexOf('/') + 1; 422 451 int filenameEnd = parameters.lastIndexOf('.'); 423 452 configurationName = parameters.substring(filenameStart, filenameEnd); -
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/CVWekaEvaluation.java
r86 r135 31 31 public class CVWekaEvaluation extends AbstractWekaEvaluation { 32 32 33 /* *33 /* 34 34 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, 35 35 * weka.classifiers.Classifier) -
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/ExperimentResult.java
r68 r135 1 1 2 package de.ugoe.cs.cpdp.eval; 2 3 4 /** 5 * <p> 6 * Data class to store experiment results 7 * </p> 8 * 9 * @author Steffen Herbold 10 */ 3 11 public class ExperimentResult { 4 12 13 /** 14 * configuration name of the experiment 15 */ 5 16 private final String configurationName; 17 18 /** 19 * name of the target product 20 */ 6 21 private final String productName; 22 23 /** 24 * name of the classifier used 25 */ 7 26 private final String classifier; 8 27 28 /** 29 * number of instances of the target product 30 */ 31 int sizeTestData; 32 33 /** 34 * number of instances of the training data 35 */ 36 int sizeTrainingData; 37 38 /** 39 * error of the prediction 40 */ 41 double error = Double.NaN; 42 43 /** 44 * recall of the prediction 45 */ 46 double recall = Double.NaN; 47 48 /** 49 * precision of the prediction 50 */ 51 double precision = Double.NaN; 52 53 /** 54 * F1 score of the prediction 55 */ 56 double fscore = Double.NaN; 57 58 /** 59 * G score of the prediction 60 */ 61 double gscore = Double.NaN; 62 63 /** 64 * Matthews correlation coefficient of the prediction 65 */ 66 double mcc = Double.NaN; 67 68 /** 69 * Area under the curve of the prediction 70 */ 71 double auc = Double.NaN; 72 73 /** 74 * Effort of the prediction 75 */ 76 double aucec = Double.NaN; 77 78 /** 79 * True positive rate of the prediction 80 */ 81 double tpr = Double.NaN; 82 83 /** 84 * True negative rate of the prediction 85 */ 86 double tnr = Double.NaN; 87 88 /** 89 * false positive rate of the prediction 90 */ 91 double fpr = Double.NaN; 92 93 /** 94 * false negative rate of the prediction 95 */ 96 double fnr = Double.NaN; 97 98 /** 99 * number of true positives 100 */ 101 double tp = Double.NaN; 102 103 /** 104 * number of false negatives 105 */ 106 double fn = Double.NaN; 107 108 /** 109 * number of true negatives 110 */ 111 double tn = Double.NaN; 112 113 /** 114 * number of false positives 115 */ 116 double fp = Double.NaN; 117 118 /** 119 * <p> 120 * Constructor. 
Creates a new ExperimentResult. 121 * </p> 122 * 123 * @param configurationName 124 * the configuration name 125 * @param productName 126 * the product name 127 * @param classifier 128 * the classifier name 129 */ 9 130 public ExperimentResult(String configurationName, String productName, String classifier) { 10 131 this.configurationName = configurationName; … … 12 133 this.classifier = classifier; 13 134 } 14 15 int sizeTestData; 16 int sizeTrainingData; 17 double succHe = Double.NaN; 18 double succZi = Double.NaN; 19 double succG75 = Double.NaN; 20 double succG60 = Double.NaN; 21 double error = Double.NaN; 22 double recall = Double.NaN; 23 double precision = Double.NaN; 24 double fscore = Double.NaN; 25 double gscore = Double.NaN; 26 double mcc = Double.NaN; 27 double auc = Double.NaN; 28 double aucec = Double.NaN; 29 double tpr = Double.NaN; 30 double tnr = Double.NaN; 31 double fpr = Double.NaN; 32 double fnr = Double.NaN; 33 double tp = Double.NaN; 34 double fn = Double.NaN; 35 double tn = Double.NaN; 36 double fp = Double.NaN; 37 135 136 /** 137 * <p> 138 * returns the configuration name 139 * </p> 140 * 141 * @return the configuration name 142 */ 38 143 public String getConfigurationName() { 39 144 return configurationName; 40 145 } 146 147 /** 148 * <p> 149 * returns the product name 150 * </p> 151 * 152 * @return the product name 153 */ 41 154 public String getProductName() { 42 155 return productName; 43 156 } 157 158 /** 159 * <p> 160 * returns the classifier name 161 * </p> 162 * 163 * @return the classifier name 164 */ 44 165 public String getClassifier() { 45 166 return classifier; 46 167 } 168 169 /** 170 * <p> 171 * returns the number of instances of the target product 172 * </p> 173 * 174 * @return number of instances 175 */ 47 176 public int getSizeTestData() { 48 177 return sizeTestData; 49 178 } 179 180 /** 181 * <p> 182 * sets the number of instances of the target product 183 * </p> 184 * 185 * @param sizeTestData 186 * number of instances 187 
*/ 50 188 public void setSizeTestData(int sizeTestData) { 51 189 this.sizeTestData = sizeTestData; 52 190 } 191 192 /** 193 * <p> 194 * returns the number of instances of the training data 195 * </p> 196 * 197 * @return number of instances 198 */ 53 199 public int getSizeTrainingData() { 54 200 return sizeTrainingData; 55 201 } 202 203 /** 204 * <p> 205 * sets the number of instances of the training data 206 * </p> 207 * 208 * @param sizeTrainingData 209 * number of instances 210 */ 56 211 public void setSizeTrainingData(int sizeTrainingData) { 57 212 this.sizeTrainingData = sizeTrainingData; 58 213 } 59 public double getSuccHe() { 60 return succHe; 61 } 62 public void setSuccHe(double succHe) { 63 this.succHe = succHe; 64 } 65 public double getSuccZi() { 66 return succZi; 67 } 68 public void setSuccZi(double succZi) { 69 this.succZi = succZi; 70 } 71 public double getSuccG75() { 72 return succG75; 73 } 74 public void setSuccG75(double succG75) { 75 this.succG75 = succG75; 76 } 77 public double getSuccG60() { 78 return succG60; 79 } 80 public void setSuccG60(double succG60) { 81 this.succG60 = succG60; 82 } 214 215 /** 216 * <p> 217 * returns the error 218 * </p> 219 * 220 * @return the error 221 */ 83 222 public double getError() { 84 223 return error; 85 224 } 225 226 /** 227 * <p> 228 * sets the error 229 * </p> 230 * 231 * @param error 232 * the error 233 */ 86 234 public void setError(double error) { 87 235 this.error = error; 88 236 } 237 238 /** 239 * <p> 240 * returns the recall 241 * </p> 242 * 243 * @return the recall 244 */ 89 245 public double getRecall() { 90 246 return recall; 91 247 } 248 249 /** 250 * <p> 251 * sets the recall 252 * </p> 253 * 254 * @param recall 255 * the recall 256 */ 92 257 public void setRecall(double recall) { 93 258 this.recall = recall; 94 259 } 260 261 /** 262 * <p> 263 * returns the precision 264 * </p> 265 * 266 * @return the precision 267 */ 95 268 public double getPrecision() { 96 269 return precision; 97 270 } 271 272 
/** 273 * <p> 274 * sets the precision 275 * </p> 276 * 277 * @param precision 278 * the precision 279 */ 98 280 public void setPrecision(double precision) { 99 281 this.precision = precision; 100 282 } 283 284 /** 285 * <p> 286 * returns the F1 score 287 * </p> 288 * 289 * @return the F1 score 290 */ 101 291 public double getFscore() { 102 292 return fscore; 103 293 } 294 295 /** 296 * <p> 297 * sets the F1 score 298 * </p> 299 * 300 * @param fscore 301 * the F1 score 302 */ 104 303 public void setFscore(double fscore) { 105 304 this.fscore = fscore; 106 305 } 306 307 /** 308 * <p> 309 * returns the G score 310 * </p> 311 * 312 * @return the G score 313 */ 107 314 public double getGscore() { 108 315 return gscore; 109 316 } 317 318 /** 319 * <p> 320 * sets the G score 321 * </p> 322 * 323 * @param gscore 324 * the G score 325 */ 110 326 public void setGscore(double gscore) { 111 327 this.gscore = gscore; 112 328 } 329 330 /** 331 * <p> 332 * returns the MCC 333 * </p> 334 * 335 * @return the MCC 336 */ 113 337 public double getMcc() { 114 338 return mcc; 115 339 } 340 341 /** 342 * <p> 343 * sets the MCC 344 * </p> 345 * 346 * @param mcc 347 * the MCC 348 */ 116 349 public void setMcc(double mcc) { 117 350 this.mcc = mcc; 118 351 } 352 353 /** 354 * <p> 355 * returns the AUC 356 * </p> 357 * 358 * @return the AUC 359 */ 119 360 public double getAuc() { 120 361 return auc; 121 362 } 363 364 /** 365 * <p> 366 * sets the AUC 367 * </p> 368 * 369 * @param auc 370 * the AUC 371 */ 122 372 public void setAuc(double auc) { 123 373 this.auc = auc; 124 374 } 375 376 /** 377 * <p> 378 * returns the effort as AUCEC 379 * </p> 380 * 381 * @return the effort 382 */ 125 383 public double getAucec() { 126 384 return aucec; 127 385 } 386 387 /** 388 * <p> 389 * sets the effort as AUCEC 390 * </p> 391 * 392 * @param aucec 393 * the effort 394 */ 128 395 public void setAucec(double aucec) { 129 396 this.aucec = aucec; 130 397 } 398 399 /** 400 * <p> 401 * returns the TPR 402 * </p> 
403 * 404 * @return the TPR 405 */ 131 406 public double getTpr() { 132 407 return tpr; 133 408 } 409 410 /** 411 * <p> 412 * sets the TPR 413 * </p> 414 * 415 * @param tpr 416 * the TPR 417 */ 134 418 public void setTpr(double tpr) { 135 419 this.tpr = tpr; 136 420 } 421 422 /** 423 * <p> 424 * returns the TNR 425 * </p> 426 * 427 * @return the TNR 428 */ 137 429 public double getTnr() { 138 430 return tnr; 139 431 } 432 433 /** 434 * <p> 435 * sets the TNR 436 * </p> 437 * 438 * @param tnr 439 * the TNR 440 */ 140 441 public void setTnr(double tnr) { 141 442 this.tnr = tnr; 142 443 } 444 445 /** 446 * <p> 447 * returns the FPR 448 * </p> 449 * 450 * @return the FPR 451 */ 143 452 public double getFpr() { 144 453 return fpr; 145 454 } 455 456 /** 457 * <p> 458 * sets the FPR 459 * </p> 460 * 461 * @param fpr 462 * the FPR 463 */ 146 464 public void setFpr(double fpr) { 147 465 this.fpr = fpr; 148 466 } 467 468 /** 469 * <p> 470 * returns the FNR 471 * </p> 472 * 473 * @return the FNR 474 */ 149 475 public double getFnr() { 150 476 return fnr; 151 477 } 478 479 /** 480 * <p> 481 * sets the FNR 482 * </p> 483 * 484 * @param fnr 485 * the FNR 486 */ 152 487 public void setFnr(double fnr) { 153 488 this.fnr = fnr; 154 489 } 490 491 /** 492 * <p> 493 * returns the TPs 494 * </p> 495 * 496 * @return the TPs 497 */ 155 498 public double getTp() { 156 499 return tp; 157 500 } 501 502 /** 503 * <p> 504 * sets the TPs 505 * </p> 506 * 507 * @param tp 508 * the TPs 509 */ 158 510 public void setTp(double tp) { 159 511 this.tp = tp; 160 512 } 513 514 /** 515 * <p> 516 * returns the FNs 517 * </p> 518 * 519 * @return the FNs 520 */ 161 521 public double getFn() { 162 522 return fn; 163 523 } 524 525 /** 526 * <p> 527 * sets the FNs 528 * </p> 529 * 530 * @param fn 531 */ 164 532 public void setFn(double fn) { 165 533 this.fn = fn; 166 534 } 535 536 /** 537 * <p> 538 * returns the TNs 539 * </p> 540 * 541 * @return the TNs 542 */ 167 543 public double getTn() { 168 544 return tn; 
169 545 } 546 547 /** 548 * <p> 549 * sets the TNs 550 * </p> 551 * 552 * @param tn 553 * the TNs 554 */ 170 555 public void setTn(double tn) { 171 556 this.tn = tn; 172 557 } 558 559 /** 560 * <p> 561 * returns the FPs 562 * </p> 563 * 564 * @return the FPs 565 */ 173 566 public double getFp() { 174 567 return fp; 175 568 } 569 570 /** 571 * <p> 572 * sets the FPs 573 * </p> 574 * 575 * @param fp 576 * the FPs 577 */ 176 578 public void setFp(double fp) { 177 579 this.fp = fp; -
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/MySQLResultStorage.java
r121 r135 22 22 import java.util.Properties; 23 23 24 25 24 import com.mysql.jdbc.jdbc2.optional.MysqlDataSource; 26 25 … … 37 36 public class MySQLResultStorage implements IResultStorage { 38 37 39 /**40 * Connection to the database41 */42 //private Connection con = null;43 44 38 /** 45 39 * Connection pool for the data base. … … 121 115 sql.append(result.getSizeTestData() + ","); 122 116 sql.append(result.getSizeTrainingData() + ","); 123 sql.append(result.getSuccHe() + ",");124 sql.append(result.getSuccZi() + ",");125 sql.append(result.getSuccG75() + ",");126 sql.append(result.getSuccG60() + ",");127 117 sql.append(result.getError() + ","); 128 118 sql.append(result.getRecall() + ","); … … 164 154 public int containsResult(String experimentName, String productName, String classifierName) { 165 155 String sql = "SELECT COUNT(*) as cnt FROM crosspare.results WHERE configurationName=\'" + 166 experimentName + "\' AND productName=\'" + productName + "\' AND classifier=\'" + classifierName + "\';"; 156 experimentName + "\' AND productName=\'" + productName + "\' AND classifier=\'" + 157 classifierName + "\';"; 167 158 Statement stmt; 168 159 try { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/NormalWekaEvaluation.java
r86 r135 27 27 public class NormalWekaEvaluation extends AbstractWekaEvaluation { 28 28 29 /* *29 /* 30 30 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, 31 * 31 * weka.classifiers.Classifier) 32 32 */ 33 33 @Override -
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/AbstractCrossProjectExperiment.java
r132 r135 155 155 } 156 156 } 157 157 158 158 // sort versions 159 159 Collections.sort(versions); … … 342 342 } 343 343 344 /** 345 * <p> 346 * helper function that checks if the results are already in the data store 347 * </p> 348 * 349 * @param version 350 * version for which the results are checked 351 * @return 352 */ 344 353 private int resultsAvailable(SoftwareVersion version) { 345 354 if (config.getResultStorages().isEmpty()) { 346 355 return 0; 347 356 } 348 357 349 358 List<ITrainer> allTrainers = new LinkedList<>(); 350 359 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { … … 362 371 allTrainers.add(trainer); 363 372 } 364 373 365 374 int available = Integer.MAX_VALUE; 366 375 for (IResultStorage storage : config.getResultStorages()) { 367 376 String classifierName = ((IWekaCompatibleTrainer) allTrainers.get(0)).getName(); 368 int curAvailable = storage.containsResult(config.getExperimentName(), version.getVersion(), classifierName); 369 if( curAvailable<available ) { 377 int curAvailable = storage.containsResult(config.getExperimentName(), 378 version.getVersion(), classifierName); 379 if (curAvailable < available) { 370 380 available = curAvailable; 371 381 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/ClassifierCreationExperiment.java
r132 r135 107 107 108 108 for (IProcessesingStrategy processor : config.getPreProcessors()) { 109 Console.traceln(Level.FINE, String 110 .format("[%s] [%02d/%02d] %s: applying preprocessor %s", 111 config.getExperimentName(), versionCount, versions.size(), 112 testVersion.getProject(), processor.getClass().getName())); 109 Console.traceln(Level.FINE, 110 String.format("[%s] [%02d/%02d] %s: applying preprocessor %s", 111 config.getExperimentName(), versionCount, 112 versions.size(), testVersion.getProject(), 113 processor.getClass().getName())); 113 114 processor.apply(testdata, traindata); 114 115 } 115 116 116 117 for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) { 117 Console.traceln(Level.FINE, String 118 .format("[%s] [%02d/%02d] %s: applying pointwise selection %s", 119 config.getExperimentName(), versionCount, versions.size(), 120 testVersion.getProject(), dataselector.getClass().getName())); 118 Console 119 .traceln(Level.FINE, 120 String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s", 121 config.getExperimentName(), versionCount, 122 versions.size(), testVersion.getProject(), 123 dataselector.getClass().getName())); 121 124 traindata = dataselector.apply(testdata, traindata); 122 125 } 123 126 124 127 for (IProcessesingStrategy processor : config.getPostProcessors()) { 125 Console.traceln(Level.FINE, String 126 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 127 config.getExperimentName(), versionCount, versions.size(), 128 testVersion.getProject(), processor.getClass().getName())); 128 Console 129 .traceln(Level.FINE, 130 String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 131 config.getExperimentName(), versionCount, 132 versions.size(), testVersion.getProject(), 133 processor.getClass().getName())); 129 134 processor.apply(testdata, traindata); 130 135 } … … 148 153 try { 149 154 weka.core.SerializationHelper.write(resultsDir.getAbsolutePath() + "/" + 150 
trainer.getName() + "-" + 151 testVersion.getProject(), 155 trainer.getName() + "-" + testVersion.getProject(), 152 156 trainerToSave.getClassifier()); 153 157 } … … 160 164 161 165 for (IEvaluationStrategy evaluator : config.getEvaluators()) { 162 Console.traceln(Level.FINE, String 163 .format("[%s] [%02d/%02d] %s: applying evaluator %s", 164 config.getExperimentName(), versionCount, versions.size(), 165 testVersion.getProject(), evaluator.getClass().getName())); 166 Console.traceln(Level.FINE, 167 String.format("[%s] [%02d/%02d] %s: applying evaluator %s", 168 config.getExperimentName(), versionCount, 169 versions.size(), testVersion.getProject(), 170 evaluator.getClass().getName())); 166 171 167 172 if (writeHeader) { … … 169 174 config.getExperimentName() + ".csv"); 170 175 } 171 evaluator.apply(testdata, traindata, allTrainers, efforts, writeHeader, config.getResultStorages()); 176 evaluator.apply(testdata, traindata, allTrainers, efforts, writeHeader, 177 config.getResultStorages()); 172 178 writeHeader = false; 173 179 } … … 175 181 versionCount++; 176 182 177 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", 178 config.getExperimentName(), versionCount, 179 versions.size(), testVersion.getProject())); 183 Console.traceln(Level.INFO, 184 String.format("[%s] [%02d/%02d] %s: finished", 185 config.getExperimentName(), versionCount, versions.size(), 186 testVersion.getProject())); 180 187 181 188 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/CrossValidationExperiment.java
r132 r135 138 138 } 139 139 } 140 140 141 141 numTrainers += config.getSetWiseTrainers().size(); 142 142 numTrainers += config.getSetWiseTestdataAwareTrainers().size(); … … 154 154 testVersionCount, testVersion.getVersion())); 155 155 int numResultsAvailable = resultsAvailable(testVersion); 156 if (numResultsAvailable >= numTrainers *config.getRepetitions()) {156 if (numResultsAvailable >= numTrainers * config.getRepetitions()) { 157 157 Console.traceln(Level.INFO, 158 158 String.format( … … 167 167 Instances testdata = testVersion.getInstances(); 168 168 List<Double> efforts = testVersion.getEfforts(); 169 169 170 170 for (ITrainingStrategy trainer : config.getTrainers()) { 171 171 Console.traceln(Level.FINE, … … 176 176 trainer.apply(testdata); 177 177 } 178 178 179 179 File resultsDir = new File(config.getResultsPath()); 180 180 if (!resultsDir.exists()) { … … 236 236 } 237 237 238 /** 239 * <p> 240 * helper function that checks if the results are already in the data store 241 * </p> 242 * 243 * @param version 244 * version for which the results are checked 245 * @return 246 */ 238 247 private int resultsAvailable(SoftwareVersion version) { 239 248 if (config.getResultStorages().isEmpty()) { 240 249 return 0; 241 250 } 242 251 243 252 List<ITrainer> allTrainers = new LinkedList<>(); 244 253 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { … … 256 265 allTrainers.add(trainer); 257 266 } 258 267 259 268 int available = Integer.MAX_VALUE; 260 269 for (IResultStorage storage : config.getResultStorages()) { 261 270 String classifierName = ((IWekaCompatibleTrainer) allTrainers.get(0)).getName(); 262 int curAvailable = storage.containsResult(config.getExperimentName(), version.getVersion(), classifierName); 263 if( curAvailable<available ) { 271 int curAvailable = storage.containsResult(config.getExperimentName(), 272 version.getVersion(), classifierName); 273 if (curAvailable < available) { 264 274 available = curAvailable; 265 275 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIChangeFolderLoader.java
r86 r135 15 15 package de.ugoe.cs.cpdp.loader; 16 16 17 /** 18 * <p> 19 * Loads data from the automotive defect data set from Audi Electronic Ventures donated by Altinger 20 * et al. at the MSR 2015. This loader contains the changes per commit, i.e., it is for JIT defect 21 * prediction. 22 * </p> 23 * 24 * @author Steffen Herbold 25 */ 17 26 public class AUDIChangeFolderLoader extends AbstractFolderLoader { 18 27 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIChangeLoader.java
r86 r135 28 28 29 29 /** 30 * TODO 30 * <p> 31 * Loads data from the automative defect data set from Audi Electronic Ventures donated by Altinger 32 * et al. at the MSR 2015. This loader contains the changes per commit, i.e., it is for JIT defect 33 * prediction. 34 * </p> 31 35 * 32 * @author sherbold 33 * 36 * @author Steffen Herbold 34 37 */ 35 38 class AUDIChangeLoader implements SingleVersionLoader { 36 39 40 /** 41 * <p> 42 * Internal helper class. 43 * </p> 44 * 45 * @author Steffen Herbold 46 */ 37 47 private class EntityRevisionPair implements Comparable<EntityRevisionPair> { 48 49 /** 50 * string that defines an entity 51 */ 38 52 private final String entity; 53 54 /** 55 * revision number of the entity 56 */ 39 57 private final int revision; 40 58 59 /** 60 * <p> 61 * Constructor. Creates a new EntityRevisionPair. 62 * </p> 63 * 64 * @param entity 65 * the entity 66 * @param revision 67 * the revision 68 */ 41 69 public EntityRevisionPair(String entity, int revision) { 42 70 this.entity = entity; … … 44 72 } 45 73 74 /* 75 * (non-Javadoc) 76 * 77 * @see java.lang.Object#equals(java.lang.Object) 78 */ 46 79 @Override 47 80 public boolean equals(Object other) { … … 54 87 } 55 88 89 /* 90 * (non-Javadoc) 91 * 92 * @see java.lang.Object#hashCode() 93 */ 56 94 @Override 57 95 public int hashCode() { … … 59 97 } 60 98 99 /* 100 * (non-Javadoc) 101 * 102 * @see java.lang.Comparable#compareTo(java.lang.Object) 103 */ 61 104 @Override 62 105 public int compareTo(EntityRevisionPair other) { … … 68 111 } 69 112 113 /* 114 * (non-Javadoc) 115 * 116 * @see java.lang.Object#toString() 117 */ 70 118 @Override 71 119 public String toString() { … … 74 122 } 75 123 124 /* 125 * (non-Javadoc) 126 * 127 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 128 */ 76 129 @Override 77 130 public Instances load(File file) { … … 139 192 for (int i = 1; i < linesBug.length; i++) { 140 193 lineSplitBug = linesBug[i].split(";"); 141 entityRevisionPairs.put(new 
EntityRevisionPair(lineSplitBug[0], Integer 142 .parseInt(lineSplitBug[revisionIndex])), i); 194 entityRevisionPairs.put( 195 new EntityRevisionPair(lineSplitBug[0], 196 Integer 197 .parseInt(lineSplitBug[revisionIndex])), 198 i); 143 199 } 144 200 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIDataLoader.java
r86 r135 25 25 26 26 /** 27 * TODO 27 * Loads data from the automotive defect data set from Audi Electronic Ventures donated by Altinger 28 * et al. at the MSR 2015. This loader creates overall defect labels, for the final revision. 28 29 * 29 * @author sherbold30 * @author Steffen Herbold 30 31 * 31 32 */ -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIFolderLoader.java
r86 r135 15 15 package de.ugoe.cs.cpdp.loader; 16 16 17 /** 18 * 19 * <p> 20 * Loads data from the automotive defect data set from Audi Electronic Ventures donated by Altinger 21 * et al. at the MSR 2015. This loader creates overall defect labels, for the final revision. 22 * </p> 23 * 24 * @author Steffen Herbold 25 */ 17 26 public class AUDIFolderLoader extends AbstractFolderLoader { 18 27 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AbstractFolderLoader.java
r132 r135 46 46 } 47 47 48 /* *48 /* 49 49 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#load() 50 50 */ … … 68 68 Instances data = instancesLoader.load(versionFile); 69 69 String versionName = data.relationName(); 70 List<Double> efforts = getEfforts(data); 71 versions.add(new SoftwareVersion(projectName, versionName, data, efforts)); 70 List<Double> efforts = getEfforts(data); 71 versions 72 .add(new SoftwareVersion(projectName, versionName, data, efforts)); 72 73 } 73 74 } … … 77 78 return versions; 78 79 } 79 80 81 /** 82 * <p> 83 * Sets the efforts for the instances 84 * </p> 85 * 86 * @param data 87 * the data 88 * @return 89 */ 80 90 private List<Double> getEfforts(Instances data) { 81 91 // attribute in the JURECZKO data and default … … 93 103 effortAtt = data.attribute("CountLineCodeExe"); 94 104 } 95 if ( effortAtt == null) {105 if (effortAtt == null) { 96 106 return null; 97 107 } 98 108 List<Double> efforts = new ArrayList<>(data.size()); 99 for ( int i=0; i<data.size(); i++) {109 for (int i = 0; i < data.size(); i++) { 100 110 efforts.add(data.get(i).value(effortAtt)); 101 111 } … … 106 116 * Returns the concrete {@link SingleVersionLoader} to be used with this folder loader. 107 117 * 108 * @return 118 * @return the version loader 109 119 */ 110 120 abstract protected SingleVersionLoader getSingleLoader(); -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVMockusDataLoader.java
r86 r135 24 24 import de.ugoe.cs.util.FileTools; 25 25 26 /** 27 * <p> 28 * Reads data from the data set provided by Mockus (and Zhang) for universal defect prediction. 29 * </p> 30 * 31 * @author Steffen Herbold 32 */ 26 33 class CSVMockusDataLoader implements SingleVersionLoader { 27 34 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVMockusFolderLoader.java
r86 r135 15 15 package de.ugoe.cs.cpdp.loader; 16 16 17 /** 18 * <p> 19 * Reads data from the data set provided by Mockus (and Zhang) for universal defect prediction. 20 * </p> 21 * 22 * @author Steffen Herbold 23 */ 17 24 public class CSVMockusFolderLoader extends AbstractFolderLoader { 18 25 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/DecentDataLoader.java
r86 r135 237 237 for (String key : EPackage.Registry.INSTANCE.keySet()) { 238 238 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 239 } ;239 } ; 240 240 241 241 for (String key : metaModelCache.keySet()) { 242 242 EPackage.Registry.INSTANCE.remove(key); 243 } ;243 } ; 244 244 245 245 // Workaround to generate a usable URI. Absolute path is not … … 445 445 } 446 446 else { 447 Console.printerrln("Could not determine model type, file should end with either .etl or .eol"); 447 Console 448 .printerrln("Could not determine model type, file should end with either .etl or .eol"); 448 449 return null; 449 450 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/IDecentVersionLoader.java
r86 r135 19 19 import de.ugoe.cs.cpdp.versions.SoftwareVersion; 20 20 21 /** 22 * <p> 23 * Extends the version loader for the loading of DECENT models 24 * </p> 25 * 26 * @author Fabian Trautsch 27 */ 21 28 public interface IDecentVersionLoader extends IVersionLoader { 22 29 30 /** 31 * <p> 32 * loads the versions and defines the DECENT attributes to be used 33 * </p> 34 * 35 * @param decentAttributes the attributes 36 * @return the versions 37 */ 23 38 public List<SoftwareVersion> load(List<String> decentAttributes); 24 39 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/RelinkLoader.java
r119 r135 1 1 2 package de.ugoe.cs.cpdp.loader; 2 3 … … 10 11 import weka.core.Instances; 11 12 13 /** 14 * <p> 15 * Loads data from the RELINK data set. 16 * </p> 17 * 18 * @author Steffen Herbold 19 */ 12 20 public class RelinkLoader implements SingleVersionLoader { 13 21 … … 67 75 attrNames.add("SumEssential"); 68 76 attrNames.add("isDefective"); 69 70 for ( int j=tmpData.numAttributes()-1; j>=0 ; j--) {71 if ( !attrNames.contains(tmpData.attribute(j).name())) {77 78 for (int j = tmpData.numAttributes() - 1; j >= 0; j--) { 79 if (!attrNames.contains(tmpData.attribute(j).name())) { 72 80 tmpData.deleteAttributeAt(j); 73 81 } 74 82 } 75 83 76 84 // setting class attribute 77 85 tmpData.setClassIndex(tmpData.numAttributes() - 1); -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/FixClass.java
r86 r135 29 29 public class FixClass extends AbstractClassifier { 30 30 31 /** 32 * default serial ID 33 */ 31 34 private static final long serialVersionUID = 1L; 32 35 36 /** 37 * default prediction: non-defective 38 */ 33 39 private double fixedClassValue = 0.0d; 34 40 … … 62 68 } 63 69 70 /* 71 * (non-Javadoc) 72 * 73 * @see weka.classifiers.AbstractClassifier#setOptions(java.lang.String[]) 74 */ 64 75 @Override 65 76 public void setOptions(String[] options) throws Exception { … … 67 78 } 68 79 80 /* 81 * (non-Javadoc) 82 * 83 * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) 84 */ 69 85 @Override 70 86 public double classifyInstance(Instance instance) { … … 72 88 } 73 89 90 /* 91 * (non-Javadoc) 92 * 93 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 94 */ 74 95 @Override 75 96 public void buildClassifier(Instances traindata) throws Exception { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/GPTraining.java
r125 r135 1 1 2 package de.ugoe.cs.cpdp.training; 2 3 … … 45 46 * Genetic Programming Trainer 46 47 * 47 * Implementation (mostly) according to Liu et al. Evolutionary Optimization of Software Quality Modeling with Multiple Repositories. 48 * Implementation (mostly) according to Liu et al. Evolutionary Optimization of Software Quality 49 * Modeling with Multiple Repositories. 48 50 * 49 * - GPRun is a Run of a complete Genetic Programm Evolution, we want several complete runs. 50 * - GPVClassifier is the Validation Classifier 51 * - GPVVClassifier is the Validation-Voting Classifier 51 * - GPRun is a Run of a complete Genetic Programm Evolution, we want several complete runs. - 52 * GPVClassifier is the Validation Classifier - GPVVClassifier is the Validation-Voting Classifier 52 53 * 53 * config: <setwisetrainer name="GPTraining" param="populationSize:1000,numberRuns:10" /> 54 * config: <setwisetrainer name="GPTraining" param="populationSize:1000,numberRuns:10" /> 55 * 56 * @author Alexander Trautsch 54 57 */ 55 public class GPTraining implements ISetWiseTrainingStrategy, IWekaCompatibleTrainer { 56 58 public class GPTraining implements ISetWiseTrainingStrategy, IWekaCompatibleTrainer { 59 60 /** 61 * the interal validation-and-voting classifier 62 */ 57 63 private GPVVClassifier classifier = null; 58 59 // default values from the paper 64 65 /** 66 * size of the population of the genetic program; default from the paper is 1000 67 */ 60 68 private int populationSize = 1000; 69 70 /** 71 * minimal depth of the S-expression tree at the start of the training; default from the paper 72 * is 2 73 */ 61 74 private int initMinDepth = 2; 75 76 /** 77 * maximal depth of the S-expression tree at the start of the training; default from the paper 78 * is 6 79 */ 62 80 private int initMaxDepth = 6; 81 82 /** 83 * size of the tournaments used for selection; default from the paper is 7 84 */ 63 85 private int tournamentSize = 7; 86 87 /** 88 * number of genetic generations 
considered (i.e., number of iterations; default from the paper 89 * is 50 90 */ 64 91 private int maxGenerations = 50; 92 93 /** 94 * weight factor for the prediction errors for cost estimation; default from the paper is 15 95 */ 65 96 private double errorType2Weight = 15; 66 private int numberRuns = 20; // im paper 20 per errorType2Weight then additional 20 67 private int maxDepth = 20; // max depth within one program 68 private int maxNodes = 100; // max nodes within one program 69 97 98 /** 99 * number of internal replications from which the best result is picked; default from the paper 100 * is 20 101 */ 102 private int numberRuns = 20; 103 104 /** 105 * maximal depth of the S-expression tree; default from the paper is 20 106 */ 107 private int maxDepth = 20; 108 109 /** 110 * maximal number of nodes of the S-expression tree; default from the paper is 100 111 */ 112 private int maxNodes = 100; 113 114 /* 115 * (non-Javadoc) 116 * 117 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 118 */ 70 119 @Override 71 120 public void setParameter(String parameters) { 72 121 73 122 String[] params = parameters.split(","); 74 123 String[] keyvalue = new String[2]; 75 124 76 for (int i=0; i < params.length; i++) {125 for (int i = 0; i < params.length; i++) { 77 126 keyvalue = params[i].split(":"); 78 79 switch(keyvalue[0]) { 127 128 switch (keyvalue[0]) 129 { 80 130 case "populationSize": 81 131 this.populationSize = Integer.parseInt(keyvalue[1]); 82 break;83 132 break; 133 84 134 case "initMinDepth": 85 135 this.initMinDepth = Integer.parseInt(keyvalue[1]); 86 break;87 136 break; 137 88 138 case "tournamentSize": 89 139 this.tournamentSize = Integer.parseInt(keyvalue[1]); 90 break;91 140 break; 141 92 142 case "maxGenerations": 93 143 this.maxGenerations = Integer.parseInt(keyvalue[1]); 94 break;95 144 break; 145 96 146 case "errorType2Weight": 97 147 this.errorType2Weight = Double.parseDouble(keyvalue[1]); 98 break;99 148 break; 149 100 150 case 
"numberRuns": 101 151 this.numberRuns = Integer.parseInt(keyvalue[1]); 102 break;103 152 break; 153 104 154 case "maxDepth": 105 155 this.maxDepth = Integer.parseInt(keyvalue[1]); 106 break;107 156 break; 157 108 158 case "maxNodes": 109 159 this.maxNodes = Integer.parseInt(keyvalue[1]); 110 break;111 } 112 } 113 160 break; 161 } 162 } 163 114 164 this.classifier = new GPVVClassifier(); 115 ((GPVClassifier)this.classifier).configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, errorType2Weight, numberRuns, maxDepth, maxNodes); 165 ((GPVClassifier) this.classifier) 166 .configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, 167 errorType2Weight, numberRuns, maxDepth, maxNodes); 116 168 } 117 169 170 /* 171 * (non-Javadoc) 172 * 173 * @see 174 * de.ugoe.cs.cpdp.training.ISetWiseTrainingStrategy#apply(org.apache.commons.collections4.list. 175 * SetUniqueList) 176 */ 118 177 @Override 119 178 public void apply(SetUniqueList<Instances> traindataSet) { 120 179 try { 121 180 classifier.buildClassifier(traindataSet); 122 }catch(Exception e) { 181 } 182 catch (Exception e) { 123 183 throw new RuntimeException(e); 124 184 } 125 185 } 126 186 187 /* 188 * (non-Javadoc) 189 * 190 * @see de.ugoe.cs.cpdp.training.ISetWiseTrainingStrategy#getName() 191 */ 127 192 @Override 128 193 public String getName() { … … 130 195 } 131 196 197 /* 198 * (non-Javadoc) 199 * 200 * @see de.ugoe.cs.cpdp.training.IWekaCompatibleTrainer#getClassifier() 201 */ 132 202 @Override 133 203 public Classifier getClassifier() { 134 204 return this.classifier; 135 205 } 136 206 207 /** 208 * <p> 209 * Internal helper class that stores the data in a format that can be used by the genetic 210 * program. 
211 * </p> 212 * 213 * @author Alexander Trautsch 214 */ 137 215 public class InstanceData { 216 217 /** 218 * instances values 219 */ 138 220 private double[][] instances_x; 221 222 /** 223 * class labels 224 */ 139 225 private boolean[] instances_y; 140 226 227 /** 228 * <p> 229 * Constructor. Creates the internal data representation. 230 * </p> 231 * 232 * @param instances 233 */ 141 234 public InstanceData(Instances instances) { 142 this.instances_x = new double[instances.numInstances()][instances.numAttributes() -1];235 this.instances_x = new double[instances.numInstances()][instances.numAttributes() - 1]; 143 236 this.instances_y = new boolean[instances.numInstances()]; 144 237 145 238 Instance current; 146 for (int i=0; i < this.instances_x.length; i++) {239 for (int i = 0; i < this.instances_x.length; i++) { 147 240 current = instances.get(i); 148 241 this.instances_x[i] = WekaUtils.instanceValues(current); … … 150 243 } 151 244 } 152 245 246 /** 247 * <p> 248 * returns the instance values 249 * </p> 250 * 251 * @return 252 */ 153 253 public double[][] getX() { 154 254 return instances_x; 155 255 } 256 257 /** 258 * <p> 259 * returns the instance labels 260 * </p> 261 * 262 * @return 263 */ 156 264 public boolean[] getY() { 157 265 return instances_y; 158 266 } 159 267 } 160 268 161 269 /** 162 270 * One Run executed by a GP Classifier 163 271 */ 164 272 public class GPRun extends AbstractClassifier { 273 274 /** 275 * generated serialization ID 276 */ 165 277 private static final long serialVersionUID = -4250422550107888789L; 166 278 279 /** 280 * size of the population of the genetic program 281 */ 167 282 private int populationSize; 283 284 /** 285 * minimal depth of the S-expression tree at the start of the training 286 */ 168 287 private int initMinDepth; 288 289 /** 290 * maximal depth of the S-expression tree at the start of the training 291 */ 169 292 private int initMaxDepth; 293 294 /** 295 * size of the tournaments used for selection 296 */ 170 
297 private int tournamentSize; 298 299 /** 300 * number of genetic generations considered (i.e., number of iterations 301 */ 171 302 private int maxGenerations; 303 304 /** 305 * weight factor for the prediction errors for cost estimation 306 */ 172 307 private double errorType2Weight; 308 309 /** 310 * maximal depth of the S-expression tree 311 */ 173 312 private int maxDepth; 313 314 /** 315 * maximal number of nodes of the S-expression tree 316 */ 174 317 private int maxNodes; 175 318 319 /** 320 * genetic program 321 */ 176 322 private GPGenotype gp; 323 324 /** 325 * description of the problem to be solved by the genetic program 326 */ 177 327 private GPProblem problem; 178 179 public void configure(int populationSize, int initMinDepth, int initMaxDepth, int tournamentSize, int maxGenerations, double errorType2Weight, int maxDepth, int maxNodes) { 328 329 /** 330 * <p> 331 * Configures the runner 332 * </p> 333 * 334 * @param populationSize 335 * the population size 336 * @param initMinDepth 337 * the initial minimal depth of the S-expression tree 338 * @param initMaxDepth 339 * the initial maximal depth of the S-expression tree 340 * @param tournamentSize 341 * the tournament size for selection 342 * @param maxGenerations 343 * the number of generations created 344 * @param errorType2Weight 345 * weigth factor for the prediction errors 346 * @param maxDepth 347 * maximal depth of the S-expression tree 348 * @param maxNodes 349 * maximal number of nodes of the S-expression tree 350 */ 351 public void configure(int populationSize, 352 int initMinDepth, 353 int initMaxDepth, 354 int tournamentSize, 355 int maxGenerations, 356 double errorType2Weight, 357 int maxDepth, 358 int maxNodes) 359 { 180 360 this.populationSize = populationSize; 181 361 this.initMinDepth = initMinDepth; … … 187 367 this.maxNodes = maxNodes; 188 368 } 189 369 370 /** 371 * <p> 372 * returns the genetic program 373 * </p> 374 * 375 * @return the genetic program 376 */ 190 377 public 
GPGenotype getGp() { 191 378 return this.gp; 192 379 } 193 380 381 /** 382 * <p> 383 * returns the variables of the genetic program 384 * </p> 385 * 386 * @return the variables 387 */ 194 388 public Variable[] getVariables() { 195 return ((CrossPareGP)this.problem).getVariables(); 196 } 197 389 return ((CrossPareGP) this.problem).getVariables(); 390 } 391 392 /* 393 * (non-Javadoc) 394 * 395 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 396 */ 198 397 @Override 199 398 public void buildClassifier(Instances traindata) throws Exception { 200 InstanceData train = new InstanceData(traindata); 201 this.problem = new CrossPareGP(train.getX(), train.getY(), this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.errorType2Weight, this.maxDepth, this.maxNodes); 399 InstanceData train = new InstanceData(traindata); 400 this.problem = 401 new CrossPareGP(train.getX(), train.getY(), this.populationSize, this.initMinDepth, 402 this.initMaxDepth, this.tournamentSize, this.errorType2Weight, 403 this.maxDepth, this.maxNodes); 202 404 this.gp = problem.create(); 203 405 this.gp.evolve(this.maxGenerations); 204 406 } 205 407 206 408 /** 207 409 * GPProblem implementation 410 * 411 * @author Alexander Trautsch 208 412 */ 209 413 class CrossPareGP extends GPProblem { 414 415 /** 416 * Instance values of the training data 417 */ 210 418 private double[][] instances; 419 420 /** 421 * Classifications of the training data 422 */ 211 423 private boolean[] output; 212 424 425 /** 426 * maximal depth of the S-expression tree 427 */ 213 428 private int maxDepth; 429 430 /** 431 * maximal number of nodes of the S-expression tree 432 */ 214 433 private int maxNodes; 215 434 435 /** 436 * variables of the genetic program 437 */ 216 438 private Variable[] x; 217 439 218 public CrossPareGP(double[][] instances, boolean[] output, int populationSize, int minInitDept, int maxInitDepth, int tournamentSize, double errorType2Weight, int maxDepth, 
int maxNodes) throws InvalidConfigurationException { 440 /** 441 * 442 * <p> 443 * Constructor. Creates a new genetic program. 444 * </p> 445 * 446 * @param instances 447 * instance values of the training data 448 * @param output 449 * classifications of the training data 450 * @param populationSize 451 * the population size 452 * @param initMinDepth 453 * the initial minimal depth of the S-expression tree 454 * @param initMaxDepth 455 * the initial maximal depth of the S-expression tree 456 * @param tournamentSize 457 * the tournament size for selection 458 * @param maxGenerations 459 * the number of generations created 460 * @param errorType2Weight 461 * weigth factor for the prediction errors 462 * @param maxDepth 463 * maximal depth of the S-expression tree 464 * @param maxNodes 465 * maximal number of nodes of the S-expression tree 466 * @throws InvalidConfigurationException 467 * thrown in case the problem cannot be created 468 */ 469 public CrossPareGP(double[][] instances, 470 boolean[] output, 471 int populationSize, 472 int minInitDept, 473 int maxInitDepth, 474 int tournamentSize, 475 double errorType2Weight, 476 int maxDepth, 477 int maxNodes) 478 throws InvalidConfigurationException 479 { 219 480 super(new GPConfiguration()); 220 481 221 482 this.instances = instances; 222 483 this.output = output; … … 226 487 Configuration.reset(); 227 488 GPConfiguration config = this.getGPConfiguration(); 228 489 229 490 this.x = new Variable[this.instances[0].length]; 230 231 for(int j=0; j < this.x.length; j++) { 232 this.x[j] = Variable.create(config, "X"+j, CommandGene.DoubleClass); 233 } 234 235 config.setGPFitnessEvaluator(new DeltaGPFitnessEvaluator()); // smaller fitness is better 236 //config.setGPFitnessEvaluator(new DefaultGPFitnessEvaluator()); // bigger fitness is better 491 492 for (int j = 0; j < this.x.length; j++) { 493 this.x[j] = Variable.create(config, "X" + j, CommandGene.DoubleClass); 494 } 495 496 config.setGPFitnessEvaluator(new 
DeltaGPFitnessEvaluator()); // smaller fitness is 497 // better 498 // config.setGPFitnessEvaluator(new DefaultGPFitnessEvaluator()); // bigger fitness 499 // is better 237 500 238 501 config.setMinInitDepth(minInitDept); 239 502 config.setMaxInitDepth(maxInitDepth); 240 241 config.setCrossoverProb((float) 0.60);242 config.setReproductionProb((float) 0.10);243 config.setMutationProb((float) 0.30);503 504 config.setCrossoverProb((float) 0.60); 505 config.setReproductionProb((float) 0.10); 506 config.setMutationProb((float) 0.30); 244 507 245 508 config.setSelectionMethod(new TournamentSelector(tournamentSize)); … … 248 511 249 512 config.setMaxCrossoverDepth(4); 250 config.setFitnessFunction(new CrossPareFitness(this.x, this.instances, this.output, errorType2Weight)); 513 config.setFitnessFunction(new CrossPareFitness(this.x, this.instances, this.output, 514 errorType2Weight)); 251 515 config.setStrictProgramCreation(true); 252 516 } 253 517 254 // used for running the fitness function again for testing 518 /** 519 * <p> 520 * Returns the variables of the problem. Used for running the fitness function again for 521 * testing. 
522 * </p> 523 * 524 * @return the variables 525 */ 255 526 public Variable[] getVariables() { 256 527 return this.x; 257 528 } 258 529 259 530 /** 531 * creates the genetic program 532 */ 533 @SuppressWarnings("rawtypes") 260 534 public GPGenotype create() throws InvalidConfigurationException { 261 535 GPConfiguration config = this.getGPConfiguration(); 262 536 263 537 // return type 264 Class[] types = {CommandGene.DoubleClass}; 538 Class[] types = 539 { CommandGene.DoubleClass }; 265 540 266 541 // Arguments of result-producing chromosome: none 267 Class[][] argTypes = { {} }; 542 Class[][] argTypes = 543 { { } }; 268 544 269 545 // variables + functions, we set the variables with the values of the instances here 270 546 CommandGene[] vars = new CommandGene[this.instances[0].length]; 271 for (int j=0; j < this.instances[0].length; j++) {547 for (int j = 0; j < this.instances[0].length; j++) { 272 548 vars[j] = this.x[j]; 273 549 } 274 CommandGene[] funcs = { 275 new Add(config, CommandGene.DoubleClass), 276 new Subtract(config, CommandGene.DoubleClass), 277 new Multiply(config, CommandGene.DoubleClass), 278 new Divide(config, CommandGene.DoubleClass), 279 new Sine(config, CommandGene.DoubleClass), 280 new Cosine(config, CommandGene.DoubleClass), 281 new Exp(config, CommandGene.DoubleClass), 282 new Log(config, CommandGene.DoubleClass), 283 new GT(config, CommandGene.DoubleClass), 284 new Max(config, CommandGene.DoubleClass), 285 new Terminal(config, CommandGene.DoubleClass, -100.0, 100.0, true), // min, max, whole numbers 286 }; 287 288 CommandGene[] comb = (CommandGene[])ArrayUtils.addAll(vars, funcs); 289 CommandGene[][] nodeSets = { 290 comb, 291 }; 292 550 CommandGene[] funcs = 551 { new Add(config, CommandGene.DoubleClass), 552 new Subtract(config, CommandGene.DoubleClass), 553 new Multiply(config, CommandGene.DoubleClass), 554 new Divide(config, CommandGene.DoubleClass), 555 new Sine(config, CommandGene.DoubleClass), 556 new Cosine(config, 
CommandGene.DoubleClass), 557 new Exp(config, CommandGene.DoubleClass), 558 new Log(config, CommandGene.DoubleClass), 559 new GT(config, CommandGene.DoubleClass), 560 new Max(config, CommandGene.DoubleClass), 561 new Terminal(config, CommandGene.DoubleClass, -100.0, 100.0, true), // min, 562 // max, 563 // whole 564 // numbers 565 }; 566 567 CommandGene[] comb = (CommandGene[]) ArrayUtils.addAll(vars, funcs); 568 CommandGene[][] nodeSets = 569 { comb, }; 570 293 571 // we only have one chromosome so this suffices 294 int minDepths[] = {config.getMinInitDepth()}; 295 int maxDepths[] = {this.maxDepth}; 296 GPGenotype result = GPGenotype.randomInitialGenotype(config, types, argTypes, nodeSets, minDepths, maxDepths, this.maxNodes, false); // 40 = maxNodes, true = verbose output 572 int minDepths[] = 573 { config.getMinInitDepth() }; 574 int maxDepths[] = 575 { this.maxDepth }; 576 GPGenotype result = 577 GPGenotype.randomInitialGenotype(config, types, argTypes, nodeSets, minDepths, 578 maxDepths, this.maxNodes, false); // 40 = 579 // maxNodes, 580 // true = 581 // verbose 582 // output 297 583 298 584 return result; … … 300 586 } 301 587 302 303 /** 304 * Fitness function 588 /** 589 * Internal helper class for the fitness function. 
590 * 591 * @author Alexander Trautsch 305 592 */ 306 593 class CrossPareFitness extends GPFitnessFunction { 307 594 595 /** 596 * generated serialization ID 597 */ 308 598 private static final long serialVersionUID = 75234832484387L; 309 599 600 /** 601 * variables of the genetic program 602 */ 310 603 private Variable[] x; 311 604 605 /** 606 * instance values of the training data 607 */ 312 608 private double[][] instances; 609 610 /** 611 * classifications of the training data 612 */ 313 613 private boolean[] output; 314 614 615 /** 616 * weight of the error costs 617 */ 315 618 private double errorType2Weight = 1.0; 316 619 317 620 // needed in evaluate 318 //private Object[] NO_ARGS = new Object[0]; 319 621 // private Object[] NO_ARGS = new Object[0]; 622 623 /** 624 * fitness value 625 */ 320 626 private double sfitness = 0.0f; 627 628 /** 629 * type I error 630 */ 321 631 private int errorType1 = 0; 632 633 /** 634 * type II error 635 */ 322 636 private int errorType2 = 0; 323 637 324 public CrossPareFitness(Variable[] x, double[][] instances, boolean[] output, double errorType2Weight) { 638 /** 639 * <p> 640 * Constructor. Creates a new fitness function. 
641 * </p> 642 * 643 * @param x 644 * variables of the genetic program 645 * @param instances 646 * instance values of the training data 647 * @param output 648 * classification of the training data 649 * @param errorType2Weight 650 * weight of the error costs 651 */ 652 public CrossPareFitness(Variable[] x, 653 double[][] instances, 654 boolean[] output, 655 double errorType2Weight) 656 { 325 657 this.x = x; 326 658 this.instances = instances; … … 329 661 } 330 662 663 /** 664 * <p> 665 * returns the type I error 666 * </p> 667 * 668 * @return type I error 669 */ 331 670 public int getErrorType1() { 332 671 return this.errorType1; 333 672 } 334 673 674 /** 675 * <p> 676 * returns the type II error 677 * </p> 678 * 679 * @return type II error 680 */ 335 681 public int getErrorType2() { 336 682 return this.errorType2; 337 683 } 338 684 685 /** 686 * <p> 687 * returns the value of the secondary fitness function 688 * </p> 689 * 690 * @return secondary fitness 691 */ 339 692 public double getSecondFitness() { 340 693 return this.sfitness; 341 694 } 342 695 696 /** 697 * <p> 698 * returns the number of training instances 699 * </p> 700 * 701 * @return number of instances 702 */ 343 703 public int getNumInstances() { 344 704 return this.instances.length; … … 346 706 347 707 /** 348 * This is the fitness function 708 * <p> 709 * The fitness function. Our fitness is best if we have the less wrong classifications, 710 * this includes a weight for type2 errors. 711 * </p> 349 712 * 350 * Our fitness is best if we have the less wrong classifications, this includes a weight for type2 errors 713 * @param program 714 * the genetic program whose fitness is evaluated. 
715 * 716 * @see org.jgap.gp.GPFitnessFunction#evaluate(org.jgap.gp.IGPProgram) 351 717 */ 352 718 @Override … … 360 726 this.errorType2 = 0; 361 727 362 for(int i=0; i < this.instances.length; i++) { 363 364 // requires that we have a variable for each column of our dataset (attribute of instance) 365 for(int j=0; j < this.x.length; j++) { 728 for (int i = 0; i < this.instances.length; i++) { 729 730 // requires that we have a variable for each column of our dataset (attribute of 731 // instance) 732 for (int j = 0; j < this.x.length; j++) { 366 733 this.x[j].set(this.instances[i][j]); 367 734 } … … 370 737 value = program.execute_double(0, this.x); 371 738 372 if (value < 0.5) {373 if (this.output[i] != true) {739 if (value < 0.5) { 740 if (this.output[i] != true) { 374 741 this.errorType1 += 1; 375 742 } 376 }else { 377 if(this.output[i] == true) { 743 } 744 else { 745 if (this.output[i] == true) { 378 746 this.errorType2 += 1; 379 747 } … … 382 750 383 751 // now calc pfitness 384 pfitness = (this.errorType1 + this.errorType2Weight * this.errorType2) / this.instances.length; 752 pfitness = (this.errorType1 + this.errorType2Weight * this.errorType2) / 753 this.instances.length; 385 754 386 755 // number of nodes in the programm, if lower then 10 we assign sFitness of 10 387 756 // we can set metadata with setProgramData to save this 388 if (program.getChromosome(0).getSize(0) < 10) {757 if (program.getChromosome(0).getSize(0) < 10) { 389 758 program.setApplicationData(10.0f); 390 759 } … … 393 762 } 394 763 } 395 764 396 765 /** 397 766 * Custom GT implementation used in the GP Algorithm. 
398 */ 399 public class GT extends MathCommand implements ICloneable { 400 401 private static final long serialVersionUID = 113454184817L; 402 403 public GT(final GPConfiguration a_conf, java.lang.Class a_returnType) throws InvalidConfigurationException { 404 super(a_conf, 2, a_returnType); 405 } 406 407 public String toString() { 408 return "GT(&1, &2)"; 409 } 410 411 public String getName() { 412 return "GT"; 413 } 414 415 public float execute_float(ProgramChromosome c, int n, Object[] args) { 416 float f1 = c.execute_float(n, 0, args); 417 float f2 = c.execute_float(n, 1, args); 418 419 float ret = 1.0f; 420 if(f1 > f2) { 421 ret = 0.0f; 422 } 423 424 return ret; 425 } 426 427 public double execute_double(ProgramChromosome c, int n, Object[] args) { 428 double f1 = c.execute_double(n, 0, args); 429 double f2 = c.execute_double(n, 1, args); 430 431 double ret = 1; 432 if(f1 > f2) { 433 ret = 0; 434 } 435 return ret; 436 } 437 438 public Object clone() { 439 try { 440 GT result = new GT(getGPConfiguration(), getReturnType()); 441 return result; 442 }catch(Exception ex) { 443 throw new CloneException(ex); 444 } 445 } 446 } 767 * 768 * @author Alexander Trautsch 769 */ 770 public class GT extends MathCommand implements ICloneable { 771 772 /** 773 * generated serialization ID. 774 */ 775 private static final long serialVersionUID = 113454184817L; 776 777 /** 778 * <p> 779 * Constructor. Creates a new GT. 
780 * </p> 781 * 782 * @param a_conf 783 * the configuration of the genetic program 784 * @param a_returnType 785 * the return type 786 * @throws InvalidConfigurationException 787 * thrown is there is a problem during the initialization of the super class 788 * 789 * @see MathCommand 790 */ 791 public GT(final GPConfiguration a_conf, @SuppressWarnings("rawtypes") java.lang.Class a_returnType) 792 throws InvalidConfigurationException 793 { 794 super(a_conf, 2, a_returnType); 795 } 796 797 /* 798 * (non-Javadoc) 799 * 800 * @see org.jgap.gp.CommandGene#toString() 801 */ 802 @Override 803 public String toString() { 804 return "GT(&1, &2)"; 805 } 806 807 /* 808 * (non-Javadoc) 809 * 810 * @see org.jgap.gp.CommandGene#getName() 811 */ 812 @Override 813 public String getName() { 814 return "GT"; 815 } 816 817 /* 818 * (non-Javadoc) 819 * 820 * @see org.jgap.gp.CommandGene#execute_float(org.jgap.gp.impl.ProgramChromosome, int, 821 * java.lang.Object[]) 822 */ 823 @Override 824 public float execute_float(ProgramChromosome c, int n, Object[] args) { 825 float f1 = c.execute_float(n, 0, args); 826 float f2 = c.execute_float(n, 1, args); 827 828 float ret = 1.0f; 829 if (f1 > f2) { 830 ret = 0.0f; 831 } 832 833 return ret; 834 } 835 836 /* 837 * (non-Javadoc) 838 * 839 * @see org.jgap.gp.CommandGene#execute_double(org.jgap.gp.impl.ProgramChromosome, int, 840 * java.lang.Object[]) 841 */ 842 @Override 843 public double execute_double(ProgramChromosome c, int n, Object[] args) { 844 double f1 = c.execute_double(n, 0, args); 845 double f2 = c.execute_double(n, 1, args); 846 847 double ret = 1; 848 if (f1 > f2) { 849 ret = 0; 850 } 851 return ret; 852 } 853 854 /* 855 * (non-Javadoc) 856 * 857 * @see java.lang.Object#clone() 858 */ 859 @Override 860 public Object clone() { 861 try { 862 GT result = new GT(getGPConfiguration(), getReturnType()); 863 return result; 864 } 865 catch (Exception ex) { 866 throw new CloneException(ex); 867 } 868 } 869 } 447 870 } 448 871 449 872 /** 450 
873 * GP Multiple Data Sets Validation-Voting Classifier 451 874 * 452 * Basically the same as the GP Multiple Data Sets Validation Classifier. 453 * But here we do keep amodel candidate for each training set which may later vote875 * Basically the same as the GP Multiple Data Sets Validation Classifier. But here we do keep a 876 * model candidate for each training set which may later vote 454 877 * 455 878 */ 456 879 public class GPVVClassifier extends GPVClassifier { 457 880 881 /** 882 * generated serialization ID 883 */ 458 884 private static final long serialVersionUID = -654710583852839901L; 885 886 /** 887 * classifiers for each validation set 888 */ 459 889 private List<Classifier> classifiers = null; 460 890 891 /* 892 * (non-Javadoc) 893 * 894 * @see 895 * de.ugoe.cs.cpdp.training.GPTraining.GPVClassifier#buildClassifier(weka.core.Instances) 896 */ 461 897 @Override 462 898 public void buildClassifier(Instances arg0) throws Exception { 463 899 // TODO Auto-generated method stub 464 465 } 466 467 /** Build the GP Multiple Data Sets Validation-Voting Classifier468 * 469 * This is according to Section 6 of the Paper by Liu et al. 470 * It is basically the MultipleData Sets Validation Classifier but here we keep the best models an let them vote.900 } 901 902 /** 903 * Build the GP Multiple Data Sets Validation-Voting Classifier 904 * 905 * This is according to Section 6 of the Paper by Liu et al. It is basically the Multiple 906 * Data Sets Validation Classifier but here we keep the best models an let them vote. 
471 907 * 472 908 * @param traindataSet 909 * the training data 473 910 * @throws Exception 911 * thrown in case of a problem with the training 474 912 */ 475 913 public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { … … 478 916 // then is evaluated on the rest 479 917 classifiers = new LinkedList<>(); 480 for (int i=0; i < traindataSet.size(); i++) {918 for (int i = 0; i < traindataSet.size(); i++) { 481 919 482 920 // candidates we get out of evaluation 483 921 LinkedList<Classifier> candidates = new LinkedList<>(); 484 922 485 923 // number of runs, yields the best of these 486 double smallest_error_count_train = Double.MAX_VALUE; 924 double smallest_error_count_train = Double.MAX_VALUE; 487 925 Classifier bestTrain = null; 488 for(int k=0; k < this.numberRuns; k++) { 489 double[] errors_eval = {0.0, 0.0}; 926 for (int k = 0; k < this.numberRuns; k++) { 927 double[] errors_eval = 928 { 0.0, 0.0 }; 490 929 Classifier classifier = new GPRun(); 491 ((GPRun)classifier).configure(this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.maxGenerations, this.errorType2Weight, this.maxDepth, this.maxNodes); 492 930 ((GPRun) classifier).configure(this.populationSize, this.initMinDepth, 931 this.initMaxDepth, this.tournamentSize, 932 this.maxGenerations, this.errorType2Weight, 933 this.maxDepth, this.maxNodes); 934 493 935 // one project is training data 494 936 classifier.buildClassifier(traindataSet.get(i)); 495 937 496 938 double[] errors; 497 939 // rest of the set is evaluation data, we evaluate now 498 for(int j=0; j < traindataSet.size(); j++) { 499 if(j != i) { 500 // if type1 and type2 errors are < 0.5 we allow the model in the candidates 501 errors = this.evaluate((GPRun)classifier, traindataSet.get(j)); 940 for (int j = 0; j < traindataSet.size(); j++) { 941 if (j != i) { 942 // if type1 and type2 errors are < 0.5 we allow the model in the 943 // candidates 944 errors = this.evaluate((GPRun) 
classifier, traindataSet.get(j)); 502 945 errors_eval[0] += errors[0]; 503 946 errors_eval[1] += errors[1]; 504 if ((errors[0] < 0.5) && (errors[1] < 0.5)) {947 if ((errors[0] < 0.5) && (errors[1] < 0.5)) { 505 948 candidates.add(classifier); 506 949 } 507 950 } 508 951 } 509 952 510 953 // if the candidate made fewer errors it is now the best 511 if (errors_eval[0] + errors_eval[1] < smallest_error_count_train) {954 if (errors_eval[0] + errors_eval[1] < smallest_error_count_train) { 512 955 bestTrain = classifier; 513 956 smallest_error_count_train = errors_eval[0] + errors_eval[1]; … … 515 958 } 516 959 517 // now after the evaluation we do a model selection where only one model remains for the given training data 960 // now after the evaluation we do a model selection where only one model remains for 961 // the given training data 518 962 // we select the model which is best on all evaluation data 519 963 double smallest_error_count = Double.MAX_VALUE; 520 964 double[] errors; 521 965 Classifier best = null; 522 for(int ii=0; ii < candidates.size(); ii++) { 523 double[] errors_eval = {0.0, 0.0}; 524 966 for (int ii = 0; ii < candidates.size(); ii++) { 967 double[] errors_eval = 968 { 0.0, 0.0 }; 969 525 970 // we add the errors the candidate makes over the evaldata 526 for (int j=0; j < traindataSet.size(); j++) {527 if (j != i) {528 errors = this.evaluate((GPRun) candidates.get(ii), traindataSet.get(j));971 for (int j = 0; j < traindataSet.size(); j++) { 972 if (j != i) { 973 errors = this.evaluate((GPRun) candidates.get(ii), traindataSet.get(j)); 529 974 errors_eval[0] += errors[0]; 530 975 errors_eval[1] += errors[1]; 531 976 } 532 977 } 533 978 534 979 // if the candidate made fewer errors it is now the best 535 if (errors_eval[0] + errors_eval[1] < smallest_error_count) {980 if (errors_eval[0] + errors_eval[1] < smallest_error_count) { 536 981 best = candidates.get(ii); 537 982 smallest_error_count = errors_eval[0] + errors_eval[1]; 538 983 } 539 984 } 540 
541 if ( best==null) {985 986 if (best == null) { 542 987 best = bestTrain; 543 988 } … … 546 991 } 547 992 } 548 993 549 994 /** 550 995 * Use the best classifiers for each training data in a majority voting 996 * 997 * @param instance 998 * instance that is classified 999 * 1000 * @see de.ugoe.cs.cpdp.training.GPTraining.GPVClassifier#classifyInstance(weka.core.Instance) 551 1001 */ 552 1002 @Override 553 1003 public double classifyInstance(Instance instance) { 554 1004 555 1005 int vote_positive = 0; 556 1006 557 1007 for (int i = 0; i < classifiers.size(); i++) { 558 1008 Classifier classifier = classifiers.get(i); 559 560 GPGenotype gp = ((GPRun) classifier).getGp();561 Variable[] vars = ((GPRun) classifier).getVariables();562 563 IGPProgram fitest = gp.getAllTimeBest(); 564 for (int j = 0; j < instance.numAttributes()-1; j++) {565 vars[j].set(instance.value(j));566 } 567 568 if (fitest.execute_double(0, vars) < 0.5) {1009 1010 GPGenotype gp = ((GPRun) classifier).getGp(); 1011 Variable[] vars = ((GPRun) classifier).getVariables(); 1012 1013 IGPProgram fitest = gp.getAllTimeBest(); // all time fitest 1014 for (int j = 0; j < instance.numAttributes() - 1; j++) { 1015 vars[j].set(instance.value(j)); 1016 } 1017 1018 if (fitest.execute_double(0, vars) < 0.5) { 569 1019 vote_positive += 1; 570 1020 } 571 1021 } 572 573 if (vote_positive >= (classifiers.size()/2)) {1022 1023 if (vote_positive >= (classifiers.size() / 2)) { 574 1024 return 1.0; 575 }else { 1025 } 1026 else { 576 1027 return 0.0; 577 1028 } 578 1029 } 579 1030 } 580 1031 581 1032 /** 582 1033 * GP Multiple Data Sets Validation Classifier 583 1034 * 584 * We train a Classifier with one training project $numberRun times. 585 * Then we evaluate the classifier on the rest of the training projects and keep the best classifier. 586 * After that we have for each training project the best classifier as per the evaluation on the rest of the data set. 
587 * Then we determine the best classifier from these candidates and keep it to be used later. 1035 * We train a Classifier with one training project $numberRun times. Then we evaluate the 1036 * classifier on the rest of the training projects and keep the best classifier. After that we 1037 * have for each training project the best classifier as per the evaluation on the rest of the 1038 * data set. Then we determine the best classifier from these candidates and keep it to be used 1039 * later. 1040 * 1041 * @author sherbold Alexander Trautsch 588 1042 */ 589 1043 public class GPVClassifier extends AbstractClassifier { 590 1044 591 1045 private List<Classifier> classifiers = null; 592 1046 private Classifier best = null; … … 594 1048 private static final long serialVersionUID = 3708714057579101522L; 595 1049 1050 /** 1051 * size of the population of the genetic program 1052 */ 596 1053 protected int populationSize; 1054 1055 /** 1056 * minimal depth of the S-expression tree at the start of the training 1057 */ 597 1058 protected int initMinDepth; 1059 1060 /** 1061 * maximal depth of the S-expression tree at the start of the training 1062 */ 598 1063 protected int initMaxDepth; 1064 1065 /** 1066 * size of the tournaments used for selection 1067 */ 599 1068 protected int tournamentSize; 1069 1070 /** 1071 * number of genetic generations considered (i.e., number of iterations 1072 */ 600 1073 protected int maxGenerations; 1074 1075 /** 1076 * weight factor for the prediction errors for cost estimation 1077 */ 601 1078 protected double errorType2Weight; 602 protected int numberRuns; 1079 1080 /** 1081 * number of internal replications from which the best result is picked 1082 */ 1083 protected int numberRuns = 20; 1084 1085 /** 1086 * maximal depth of the S-expression tree 1087 */ 603 1088 protected int maxDepth; 1089 1090 /** 1091 * maximal number of nodes of the S-expression tree 1092 */ 604 1093 protected int maxNodes; 605 1094 606 1095 /** 1096 * 1097 * <p> 607 
1098 * Configure the GP Params and number of Runs 1099 * </p> 608 1100 * 609 1101 * @param populationSize 1102 * the population size 610 1103 * @param initMinDepth 1104 * the initial minimal depth of the S-expression tree 611 1105 * @param initMaxDepth 1106 * the initial maximal depth of the S-expression tree 612 1107 * @param tournamentSize 1108 * the tournament size for selection 613 1109 * @param maxGenerations 1110 * the number of generations created 614 1111 * @param errorType2Weight 615 */ 616 public void configure(int populationSize, int initMinDepth, int initMaxDepth, int tournamentSize, int maxGenerations, double errorType2Weight, int numberRuns, int maxDepth, int maxNodes) { 1112 * weigth factor for the prediction errors 1113 * @param numberRuns 1114 * number of internal replications from which the best result is picked 1115 * @param maxDepth 1116 * maximal depth of the S-expression tree 1117 * @param maxNodes 1118 * maximal number of nodes of the S-expression tree 1119 */ 1120 public void configure(int populationSize, 1121 int initMinDepth, 1122 int initMaxDepth, 1123 int tournamentSize, 1124 int maxGenerations, 1125 double errorType2Weight, 1126 int numberRuns, 1127 int maxDepth, 1128 int maxNodes) 1129 { 617 1130 this.populationSize = populationSize; 618 1131 this.initMinDepth = initMinDepth; … … 625 1138 this.maxNodes = maxNodes; 626 1139 } 627 628 /** Build the GP Multiple Data Sets Validation Classifier 629 * 630 * This is according to Section 6 of the Paper by Liu et al. except for the selection of the best model. 631 * Section 4 describes a slightly different approach. 1140 1141 /** 1142 * Build the GP Multiple Data Sets Validation Classifier 1143 * 1144 * This is according to Section 6 of the Paper by Liu et al. except for the selection of the 1145 * best model. Section 4 describes a slightly different approach. 
632 1146 * 633 1147 * @param traindataSet 1148 * the training data 634 1149 * @throws Exception 1150 * thrown in case of a problem with the training 635 1151 */ 636 1152 public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { … … 638 1154 // each classifier is trained with one project from the set 639 1155 // then is evaluated on the rest 640 for (int i=0; i < traindataSet.size(); i++) {641 1156 for (int i = 0; i < traindataSet.size(); i++) { 1157 642 1158 // candidates we get out of evaluation 643 1159 LinkedList<Classifier> candidates = new LinkedList<>(); 644 1160 645 1161 // numberRuns full GPRuns, we generate numberRuns models for each traindata 646 for (int k=0; k < this.numberRuns; k++) {1162 for (int k = 0; k < this.numberRuns; k++) { 647 1163 Classifier classifier = new GPRun(); 648 ((GPRun)classifier).configure(this.populationSize, this.initMinDepth, this.initMaxDepth, this.tournamentSize, this.maxGenerations, this.errorType2Weight, this.maxDepth, this.maxNodes); 649 1164 ((GPRun) classifier).configure(this.populationSize, this.initMinDepth, 1165 this.initMaxDepth, this.tournamentSize, 1166 this.maxGenerations, this.errorType2Weight, 1167 this.maxDepth, this.maxNodes); 1168 650 1169 classifier.buildClassifier(traindataSet.get(i)); 651 1170 652 1171 double[] errors; 653 1172 654 1173 // rest of the set is evaluation data, we evaluate now 655 for(int j=0; j < traindataSet.size(); j++) { 656 if(j != i) { 657 // if type1 and type2 errors are < 0.5 we allow the model in the candidate list 658 errors = this.evaluate((GPRun)classifier, traindataSet.get(j)); 659 if((errors[0] < 0.5) && (errors[1] < 0.5)) { 1174 for (int j = 0; j < traindataSet.size(); j++) { 1175 if (j != i) { 1176 // if type1 and type2 errors are < 0.5 we allow the model in the 1177 // candidate list 1178 errors = this.evaluate((GPRun) classifier, traindataSet.get(j)); 1179 if ((errors[0] < 0.5) && (errors[1] < 0.5)) { 660 1180 candidates.add(classifier); 661 1181 } 
… … 663 1183 } 664 1184 } 665 666 // now after the evaluation we do a model selection where only one model remains for the given training data 1185 1186 // now after the evaluation we do a model selection where only one model remains for 1187 // the given training data 667 1188 // we select the model which is best on all evaluation data 668 1189 double smallest_error_count = Double.MAX_VALUE; 669 1190 double[] errors; 670 1191 Classifier best = null; 671 for(int ii=0; ii < candidates.size(); ii++) { 672 double[] errors_eval = {0.0, 0.0}; 673 1192 for (int ii = 0; ii < candidates.size(); ii++) { 1193 double[] errors_eval = 1194 { 0.0, 0.0 }; 1195 674 1196 // we add the errors the candidate makes over the evaldata 675 for (int j=0; j < traindataSet.size(); j++) {676 if (j != i) {677 errors = this.evaluate((GPRun) candidates.get(ii), traindataSet.get(j));1197 for (int j = 0; j < traindataSet.size(); j++) { 1198 if (j != i) { 1199 errors = this.evaluate((GPRun) candidates.get(ii), traindataSet.get(j)); 678 1200 errors_eval[0] += errors[0]; 679 1201 errors_eval[1] += errors[1]; 680 1202 } 681 1203 } 682 1204 683 1205 // if the candidate made fewer errors it is now the best 684 if (errors_eval[0] + errors_eval[1] < smallest_error_count) {1206 if (errors_eval[0] + errors_eval[1] < smallest_error_count) { 685 1207 best = candidates.get(ii); 686 1208 smallest_error_count = errors_eval[0] + errors_eval[1]; 687 1209 } 688 1210 } 689 690 1211 691 1212 // now we have the best classifier for this training data 692 1213 classifiers.add(best); 693 1214 694 1215 } /* endfor trainData */ 695 696 // now we have one best classifier for each trainData 1216 1217 // now we have one best classifier for each trainData 697 1218 // we evaluate again to find the best classifier of all time 698 // this selection is now according to section 4 of the paper and not 6 where an average of the 6 models is build 1219 // this selection is now according to section 4 of the paper and not 6 where an 
average 1220 // of the 6 models is build 699 1221 double smallest_error_count = Double.MAX_VALUE; 700 1222 double error_count; 701 1223 double errors[]; 702 for (int j=0; j < classifiers.size(); j++) {1224 for (int j = 0; j < classifiers.size(); j++) { 703 1225 error_count = 0; 704 1226 Classifier current = classifiers.get(j); 705 for (int i=0; i < traindataSet.size(); i++) {706 errors = this.evaluate((GPRun) current, traindataSet.get(i));1227 for (int i = 0; i < traindataSet.size(); i++) { 1228 errors = this.evaluate((GPRun) current, traindataSet.get(i)); 707 1229 error_count = errors[0] + errors[1]; 708 1230 } 709 710 if (error_count < smallest_error_count) {1231 1232 if (error_count < smallest_error_count) { 711 1233 best = current; 712 1234 } 713 1235 } 714 1236 } 715 1237 1238 /* 1239 * (non-Javadoc) 1240 * 1241 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 1242 */ 716 1243 @Override 717 1244 public void buildClassifier(Instances traindata) throws Exception { 718 1245 final Classifier classifier = new GPRun(); 719 ((GPRun)classifier).configure(populationSize, initMinDepth, initMaxDepth, tournamentSize, maxGenerations, errorType2Weight, this.maxDepth, this.maxNodes); 1246 ((GPRun) classifier).configure(populationSize, initMinDepth, initMaxDepth, 1247 tournamentSize, maxGenerations, errorType2Weight, 1248 this.maxDepth, this.maxNodes); 720 1249 classifier.buildClassifier(traindata); 721 1250 classifiers.add(classifier); 722 1251 } 723 724 /** 725 * Evaluation of the Classifier 726 * 727 * We evaluate the classifier with the Instances of the evalData. 728 * It basically assigns the instance attribute values to the variables of the s-expression-tree and 729 * then counts the missclassifications. 1252 1253 /** 1254 * <p> 1255 * Evaluation of the Classifier. 1256 * </p> 1257 * <p> 1258 * We evaluate the classifier with the Instances of the evalData. 
It basically assigns the 1259 * instance attribute values to the variables of the s-expression-tree and then counts the 1260 * missclassifications. 1261 * </p> 730 1262 * 731 1263 * @param classifier 1264 * the classifier that is evaluated 732 1265 * @param evalData 1266 * the validation data 733 1267 * @return 734 1268 */ … … 736 1270 GPGenotype gp = classifier.getGp(); 737 1271 Variable[] vars = classifier.getVariables(); 738 739 IGPProgram fitest = gp.getAllTimeBest(); // selects the fitest of all not just the last generation 740 1272 1273 IGPProgram fitest = gp.getAllTimeBest(); // selects the fitest of all not just the last 1274 // generation 1275 741 1276 double classification; 742 1277 int error_type1 = 0; … … 744 1279 int positive = 0; 745 1280 int negative = 0; 746 747 for (Instance instance: evalData) {748 1281 1282 for (Instance instance : evalData) { 1283 749 1284 // assign instance attribute values to the variables of the s-expression-tree 750 1285 double[] tmp = WekaUtils.instanceValues(instance); 751 for (int i = 0; i < tmp.length; i++) {1286 for (int i = 0; i < tmp.length; i++) { 752 1287 vars[i].set(tmp[i]); 753 1288 } 754 1289 755 1290 classification = fitest.execute_double(0, vars); 756 1291 757 1292 // we need to count the absolutes of positives for percentage 758 if(instance.classValue() == 1.0) { 759 positive +=1; 760 }else { 761 negative +=1; 762 } 763 1293 if (instance.classValue() == 1.0) { 1294 positive += 1; 1295 } 1296 else { 1297 negative += 1; 1298 } 1299 764 1300 // classification < 0.5 we say defective 765 if (classification < 0.5) {766 if (instance.classValue() != 1.0) {1301 if (classification < 0.5) { 1302 if (instance.classValue() != 1.0) { 767 1303 error_type1 += 1; 768 1304 } 769 }else { 770 if(instance.classValue() == 1.0) { 1305 } 1306 else { 1307 if (instance.classValue() == 1.0) { 771 1308 error_type2 += 1; 772 1309 } 773 1310 } 774 1311 } 775 776 // return error types percentages for the types 1312 1313 // return error 
types percentages for the types 777 1314 double et1_per = error_type1 / negative; 778 double et2_per = error_type2 / positive; 779 return new double[]{et1_per, et2_per}; 780 } 781 1315 double et2_per = error_type2 / positive; 1316 return new double[] 1317 { et1_per, et2_per }; 1318 } 1319 782 1320 /** 783 1321 * Use only the best classifier from our evaluation phase 1322 * 1323 * @param instance 1324 * instance that is classified 1325 * 1326 * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) 784 1327 */ 785 1328 @Override 786 1329 public double classifyInstance(Instance instance) { 787 GPGenotype gp = ((GPRun) best).getGp();788 Variable[] vars = ((GPRun) best).getVariables();789 790 IGPProgram fitest = gp.getAllTimeBest(); 791 for (int i = 0; i < instance.numAttributes()-1; i++) {792 vars[i].set(instance.value(i));793 } 794 1330 GPGenotype gp = ((GPRun) best).getGp(); 1331 Variable[] vars = ((GPRun) best).getVariables(); 1332 1333 IGPProgram fitest = gp.getAllTimeBest(); // all time fitest 1334 for (int i = 0; i < instance.numAttributes() - 1; i++) { 1335 vars[i].set(instance.value(i)); 1336 } 1337 795 1338 double classification = fitest.execute_double(0, vars); 796 797 if (classification < 0.5) {1339 1340 if (classification < 0.5) { 798 1341 return 1.0; 799 }else { 1342 } 1343 else { 800 1344 return 0.0; 801 1345 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ISetWiseTestdataAwareTrainingStrategy.java
r45 r135 1 1 2 package de.ugoe.cs.cpdp.training; 2 3 … … 4 5 import weka.core.Instances; 5 6 7 /** 8 * <p> 9 * Training strategy for training with access to the target data and one data set per input product. 10 * </p> 11 * 12 * @author Steffen Herbold 13 */ 6 14 public interface ISetWiseTestdataAwareTrainingStrategy extends ITrainer { 7 15 16 /** 17 * <p> 18 * Applies the training strategy. 19 * </p> 20 * 21 * @param traindataSet 22 * the training data per product 23 * @param testdata 24 * the test data from the target product 25 */ 8 26 void apply(SetUniqueList<Instances> traindataSet, Instances testdata); 9 27 28 /** 29 * <p> 30 * returns the name of the training strategy 31 * </p> 32 * 33 * @return the name 34 */ 10 35 String getName(); 11 36 37 // TODO: these two methods look like they should be removed and instead be handled using the parameters 12 38 void setMethod(String method); 39 13 40 void setThreshold(String threshold); 14 41 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ISetWiseTrainingStrategy.java
r86 r135 19 19 import weka.core.Instances; 20 20 21 // Bagging Strategy: separate models for each training data set 21 /** 22 * <p> 23 * Training strategy for training with one data set per input product. 24 * </p> 25 * 26 * @author Steffen Herbold 27 */ 22 28 public interface ISetWiseTrainingStrategy extends ITrainer { 23 29 30 /** 31 * <p> 32 * Applies the training strategy. 33 * </p> 34 * 35 * @param traindataSet 36 * the training data per product 37 */ 24 38 void apply(SetUniqueList<Instances> traindataSet); 25 39 40 /** 41 * <p> 42 * returns the name of the training strategy 43 * </p> 44 * 45 * @return the name 46 */ 26 47 String getName(); 27 48 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ITestAwareTrainingStrategy.java
r65 r135 3 3 import weka.core.Instances; 4 4 5 /** 6 * <p> 7 * Training strategy for training with access to the target data and the training data as a single data set. 8 * </p> 9 * 10 * @author Steffen Herbold 11 */ 5 12 public interface ITestAwareTrainingStrategy extends ITrainer { 6 13 14 /** 15 * <p> 16 * Applies the training strategy. 17 * </p> 18 * 19 * @param traindata 20 * the training data for all products 21 * @param testdata 22 * the test data from the target product 23 */ 7 24 void apply(Instances testdata, Instances traindata); 8 25 26 /** 27 * <p> 28 * returns the name of the training strategy 29 * </p> 30 * 31 * @return the name 32 */ 9 33 String getName(); 10 34 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ITrainer.java
r86 r135 17 17 import de.ugoe.cs.cpdp.IParameterizable; 18 18 19 /** 20 * <p> 21 * Marker interface for all CrossPare trainers. 22 * </p> 23 * 24 * @author Steffen Herbold 25 */ 19 26 public interface ITrainer extends IParameterizable { 20 27 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ITrainingStrategy.java
r86 r135 17 17 import weka.core.Instances; 18 18 19 /** 20 * <p> 21 * Training strategy for training with the training data as a single data set. 22 * </p> 23 * 24 * @author Steffen Herbold 25 */ 19 26 public interface ITrainingStrategy extends ITrainer { 20 27 28 /** 29 * <p> 30 * Applies the training strategy. 31 * </p> 32 * 33 * @param traindata 34 * the training data for all target products 35 */ 21 36 void apply(Instances traindata); 22 37 38 /** 39 * <p> 40 * returns the name of the training strategy 41 * </p> 42 * 43 * @return the name 44 */ 23 45 String getName(); 24 46 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/IWekaCompatibleTrainer.java
r86 r135 17 17 import weka.classifiers.Classifier; 18 18 19 /** 20 * <p> 21 * Common interface for all training strategies that internally use the {@link Classifier} from WEKA. 22 * </p> 23 * 24 * @author Steffen Herbold 25 */ 19 26 public interface IWekaCompatibleTrainer extends ITrainer { 20 27 28 /** 29 * <p> 30 * returns the WEKA classifier 31 * </p> 32 * 33 * @return the classifier 34 */ 21 35 Classifier getClassifier(); 22 36 37 /** 38 * <p> 39 * returns the name of the training strategy 40 * </p> 41 * 42 * @return the name 43 */ 23 44 String getName(); 24 45 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/QuadTree.java
r86 r135 24 24 25 25 /** 26 * QuadTree implementation 26 * <p> 27 * QuadTree implementation. 28 * </p> 29 * <p> 30 * QuadTree gets a list of instances and then recursively split them into 4 children For this it 31 * uses the median of the 2 values x,y. 32 * </p> 27 33 * 28 * QuadTree gets a list of instances and then recursively split them into 4 childs For this it uses 29 * the median of the 2 values x,y 34 * @author Alexander Trautsch 30 35 */ 31 36 public class QuadTree { 32 37 33 /* 1 parent or null */ 38 /** 39 * 1 parent or null 40 */ 34 41 private QuadTree parent = null; 35 42 36 /* 4 childs, 1 per quadrant */ 43 /** 44 * north-west quadrant 45 */ 37 46 private QuadTree child_nw; 47 48 /** 49 * north-east quadrant 50 */ 38 51 private QuadTree child_ne; 52 53 /** 54 * south-east quadrant 55 */ 39 56 private QuadTree child_se; 57 58 /** 59 * south-west quadrant 60 */ 40 61 private QuadTree child_sw; 41 62 42 /* list (only helps with generation of list of childs!) */ 63 /** 64 * helper list for child quadrant generation 65 */ 43 66 private ArrayList<QuadTree> l = new ArrayList<QuadTree>(); 44 67 45 /* level only used for debugging */ 68 /** 69 * debugging attribute 70 */ 46 71 public int level = 0; 47 72 48 /* size of the quadrant */ 73 /** 74 * size of the quadrant in x-dimension 75 */ 49 76 private double[] x; 77 78 /** 79 * size of the quadrant in y-dimension 80 */ 50 81 private double[] y; 51 82 83 /** 84 * debugging parameter 85 */ 52 86 public static boolean verbose = false; 87 88 /** 89 * global size of the QuadTree. 
90 */ 53 91 public static int size = 0; 92 93 /** 94 * recursion parameter alpha 95 */ 54 96 public static double alpha = 0; 55 97 56 /* cluster payloads */ 98 /** 99 * data for each cluster 100 */ 57 101 public static ArrayList<ArrayList<QuadTreePayload<Instance>>> ccluster = 58 102 new ArrayList<ArrayList<QuadTreePayload<Instance>>>(); 59 103 60 /* cluster sizes (index is cluster number, arraylist is list of boxes (x0,y0,x1,y1) */ 104 /** 105 * cluster sizes (index is cluster number, {@link ArrayList} is list of boxes (x0,y0,x1,y1 106 */ 61 107 public static HashMap<Integer, ArrayList<Double[][]>> csize = 62 108 new HashMap<Integer, ArrayList<Double[][]>>(); 63 109 64 /* payload of this instance */ 110 /** 111 * data within this quadrant 112 */ 65 113 private ArrayList<QuadTreePayload<Instance>> payload; 66 114 115 /** 116 * <p> 117 * Constructor. Creates a new QuadTree. 118 * </p> 119 * 120 * @param parent 121 * parent of this tree 122 * @param payload 123 * data within the quadrant 124 */ 67 125 public QuadTree(QuadTree parent, ArrayList<QuadTreePayload<Instance>> payload) { 68 126 this.parent = parent; … … 70 128 } 71 129 130 /* 131 * (non-Javadoc) 132 * 133 * @see java.lang.Object#toString() 134 */ 135 @Override 72 136 public String toString() { 73 137 String n = ""; … … 81 145 82 146 /** 147 * <p> 83 148 * Returns the payload, used for clustering in the clustering list we only have children with 84 * paylod 85 * 86 * @return payload 149 * payload 150 * </p> 151 * 152 * @return payload the payload 87 153 */ 88 154 public ArrayList<QuadTreePayload<Instance>> getPayload() { … … 91 157 92 158 /** 93 * Calculate the density of this quadrant 94 * 95 * density = number of instances / global size (all instances) 96 * 97 * @return density 159 * <p> 160 * Calculate the density of this quadrant as 161 * <ul> 162 * <li>density = number of instances / global size (all instances)</li> 163 * </ul> 164 * 165 * @return density the density 98 166 */ 99 167 public double 
getDensity() { … … 103 171 } 104 172 173 /** 174 * <p> 175 * sets the size coordinates of the quadrant 176 * </p> 177 * 178 * @param x 179 * x-dimension 180 * @param y 181 * y-dimension 182 */ 105 183 public void setSize(double[] x, double[] y) { 106 184 this.x = x; … … 108 186 } 109 187 188 /** 189 * <p> 190 * returns the size of the quadrant 191 * </p> 192 * 193 * @return size of the current quadrant 194 */ 110 195 public double[][] getSize() { 111 196 return new double[][] … … 113 198 } 114 199 200 /** 201 * <p> 202 * returns the size of the quadrant 203 * </p> 204 * 205 * @return size of the current quadrant 206 */ 115 207 public Double[][] getSizeDouble() { 116 208 Double[] tmpX = new Double[2]; … … 128 220 129 221 /** 130 * TODO: DRY, median ist immer dasselbe 222 * <p> 223 * calculates the median for the x axis 224 * </p> 131 225 * 132 226 * @return median for x … … 161 255 } 162 256 257 /** 258 * <p> 259 * calculates the median for the y axis 260 * </p> 261 * 262 * @return median for y 263 */ 163 264 private double getMedianForY() { 164 265 double med_y = 0; … … 191 292 192 293 /** 193 * Reurns the number of instances in the payload 194 * 195 * @return int number of instances 294 * <p> 295 * Returns the number of instances in the payload 296 * </p> 297 * 298 * @return number of instances 196 299 */ 197 300 public int getNumbers() { … … 204 307 205 308 /** 309 * <p> 206 310 * Calculate median values of payload for x, y and split into 4 sectors 311 * </p> 207 312 * 208 313 * @return Array of QuadTree nodes (4 childs) … … 295 400 296 401 /** 297 * TODO: static method 298 * 402 * <p> 403 * creates the children of a QuadTree and recursively splits them as well 404 * </p> 405 * 299 406 * @param q 300 */ 301 public void recursiveSplit(QuadTree q) { 407 * tree that is split 408 */ 409 public static void recursiveSplit(QuadTree q) { 302 410 if (QuadTree.verbose) { 303 411 System.out.println("splitting: " + q); … … 310 418 try { 311 419 QuadTree[] childs = q.split(); 
312 this.recursiveSplit(childs[0]);313 this.recursiveSplit(childs[1]);314 this.recursiveSplit(childs[2]);315 this.recursiveSplit(childs[3]);420 recursiveSplit(childs[0]); 421 recursiveSplit(childs[1]); 422 recursiveSplit(childs[2]); 423 recursiveSplit(childs[3]); 316 424 } 317 425 catch (Exception e) { … … 322 430 323 431 /** 324 * returns an list of childs sorted by density 432 * <p> 433 * returns an list of children sorted by density 434 * </p> 325 435 * 326 436 * @param q 327 437 * QuadTree 328 * @return list of QuadTrees329 438 */ 330 439 private void generateList(QuadTree q) { … … 350 459 351 460 /** 461 * <p> 352 462 * Checks if passed QuadTree is neighboring to us 463 * </p> 353 464 * 354 465 * @param q … … 396 507 397 508 /** 509 * <p> 398 510 * Perform pruning and clustering of the quadtree 399 * 511 * </p> 512 * <p> 400 513 * Pruning according to: Tim Menzies, Andrew Butcher, David Cok, Andrian Marcus, Lucas Layman, 401 514 * Forrest Shull, Burak Turhan, Thomas Zimmermann, 402 515 * "Local versus Global Lessons for Defect Prediction and Effort Estimation," IEEE Transactions 403 516 * on Software Engineering, vol. 39, no. 6, pp. 
822-834, June, 2013 404 * 405 * 1) get list of leaf quadrants 2) sort by their density 3) set stop_rule to 0.5 * highest 406 * Density in the list 4) merge all nodes with a density > stop_rule to the new cluster and 407 * remove all from list 5) repeat 517 * </p> 518 * <ol> 519 * <li>get list of leaf quadrants</li> 520 * <li>sort by their density</li> 521 * <li>set stop_rule to 0.5*highest Density in the list</li> 522 * <li>merge all nodes with a density > stop_rule to the new cluster and remove all from list 523 * </li> 524 * <li>repeat</li> 525 * </ol> 408 526 * 409 527 * @param q … … 479 597 } 480 598 599 /** 600 * <p> 601 * debugging function that prints information about the QuadTree 602 * </p> 603 * 604 */ 481 605 public void printInfo() { 482 606 System.out.println("we have " + ccluster.size() + " clusters"); … … 488 612 489 613 /** 614 * <p> 490 615 * Helper Method to get a sorted list (by density) for all children 616 * </p> 491 617 * 492 618 * @param q -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaBaggingTraining.java
r99 r135 29 29 30 30 /** 31 * Programmatic WekaBaggingTraining 32 * 33 * first parameter is Trainer Name. second parameter is class name 34 * 35 * all subsequent parameters are configuration params (for example for trees) Cross Validation 36 * params always come last and are prepended with -CVPARAM 37 * 31 * <p> 32 * The first parameter is the trainer name, second parameter is class name. All subsequent 33 * parameters are configuration parameters of the algorithms. Cross validation parameters always 34 * come last and are prepended with -CVPARAM 35 * </p> 36 * <p> 38 37 * XML Configurations for Weka Classifiers: 39 38 * … … 45 44 * } 46 45 * </pre> 46 * </p> 47 47 * 48 * @author Alexander Trautsch 48 49 */ 49 50 public class WekaBaggingTraining extends WekaBaseTraining implements ISetWiseTrainingStrategy { 50 51 52 /** 53 * the classifier 54 */ 51 55 private final TraindatasetBagging classifier = new TraindatasetBagging(); 52 56 57 /* 58 * (non-Javadoc) 59 * 60 * @see de.ugoe.cs.cpdp.training.WekaBaseTraining#getClassifier() 61 */ 53 62 @Override 54 63 public Classifier getClassifier() { … … 56 65 } 57 66 67 /* 68 * (non-Javadoc) 69 * 70 * @see 71 * de.ugoe.cs.cpdp.training.ISetWiseTrainingStrategy#apply(org.apache.commons.collections4.list. 72 * SetUniqueList) 73 */ 58 74 @Override 59 75 public void apply(SetUniqueList<Instances> traindataSet) { … … 66 82 } 67 83 84 /** 85 * <p> 86 * Helper class for bagging classifiers. 87 * </p> 88 * 89 * @author Steffen Herbold 90 */ 68 91 public class TraindatasetBagging extends AbstractClassifier { 69 92 93 /** 94 * default serialization ID. 
95 */ 70 96 private static final long serialVersionUID = 1L; 71 97 98 /** 99 * internal storage of the training data 100 */ 72 101 private List<Instances> trainingData = null; 73 102 103 /** 104 * bagging classifier for each training data set 105 */ 74 106 private List<Classifier> classifiers = null; 75 107 108 /* 109 * (non-Javadoc) 110 * 111 * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) 112 */ 76 113 @Override 77 114 public double classifyInstance(Instance instance) { … … 115 152 } 116 153 154 /** 155 * <p> 156 * trains a new dataset wise bagging classifier 157 * </p> 158 * 159 * @param traindataSet 160 * the training data per product 161 * @throws Exception 162 * thrown if an error occurs during the training of the classifiers for any 163 * product 164 */ 117 165 public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { 118 166 classifiers = new LinkedList<>(); … … 126 174 } 127 175 176 /* 177 * (non-Javadoc) 178 * 179 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 180 */ 128 181 @Override 129 182 public void buildClassifier(Instances traindata) throws Exception { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaBaseTraining.java
r131 r135 27 27 28 28 /** 29 * WekaBaseTraining2 29 * <p> 30 * Allows specification of the Weka classifier and its params in the XML experiment configuration. 31 * </p> 32 * <p> 33 * Important conventions of the XML format: Cross Validation params always come last and are 34 * prepended with -CVPARAM.<br> 35 * Example: 30 36 * 31 * Allows specification of the Weka classifier and its params in the XML experiment configuration. 37 * <pre> 38 * {@code 39 * <trainer name="WekaTraining" param="RandomForestLocal weka.classifiers.trees.RandomForest -CVPARAM I 5 25 5"/> 40 * } 41 * </pre> 32 42 * 33 * Important conventions of the XML format: Cross Validation params always come last and are 34 * prepended with -CVPARAM Example: <trainer name="WekaTraining" 35 * param="RandomForestLocal weka.classifiers.trees.RandomForest -CVPARAM I 5 25 5"/> 43 * @author Alexander Trautsch 36 44 */ 37 45 public abstract class WekaBaseTraining implements IWekaCompatibleTrainer { 38 46 47 /** 48 * reference to the Weka classifier 49 */ 39 50 protected Classifier classifier = null; 51 52 /** 53 * qualified class name of the weka classifier 54 */ 40 55 protected String classifierClassName; 56 57 /** 58 * name of the classifier 59 */ 41 60 protected String classifierName; 61 62 /** 63 * parameters of the training 64 */ 42 65 protected String[] classifierParams; 43 66 67 /* 68 * (non-Javadoc) 69 * 70 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 71 */ 44 72 @Override 45 73 public void setParameter(String parameters) { … … 58 86 classifierParams = Arrays.copyOfRange(params, 2, params.length); 59 87 60 // classifier = setupClassifier();88 // classifier = setupClassifier(); 61 89 } 62 90 91 /* 92 * (non-Javadoc) 93 * 94 * @see de.ugoe.cs.cpdp.training.IWekaCompatibleTrainer#getClassifier() 95 */ 63 96 @Override 64 97 public Classifier getClassifier() { … … 66 99 } 67 100 101 /** 102 * <p> 103 * helper function that sets up the Weka classifier including its parameters 104 * 
</p> 105 * 106 * @return 107 */ 68 108 protected Classifier setupClassifier() { 69 109 Classifier cl = null; … … 95 135 cl = obj; 96 136 97 if ( cl instanceof Vote) {137 if (cl instanceof Vote) { 98 138 Vote votingClassifier = (Vote) cl; 99 for ( Classifier classifier : votingClassifier.getClassifiers()) {100 if ( classifier instanceof BayesNet) {139 for (Classifier classifier : votingClassifier.getClassifiers()) { 140 if (classifier instanceof BayesNet) { 101 141 ((BayesNet) classifier).setUseADTree(false); 102 142 } … … 141 181 } 142 182 183 /* 184 * (non-Javadoc) 185 * 186 * @see de.ugoe.cs.cpdp.training.IWekaCompatibleTrainer#getName() 187 */ 143 188 @Override 144 189 public String getName() { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLASERTraining.java
r91 r135 24 24 import weka.core.Instances; 25 25 26 27 26 /** 28 27 * <p> 29 * TODO comment28 * Implements training following the LASER classification scheme. 30 29 * </p> 31 30 * … … 34 33 public class WekaLASERTraining extends WekaBaseTraining implements ITrainingStrategy { 35 34 35 /** 36 * Internal classifier used for LASER. 37 */ 36 38 private final LASERClassifier internalClassifier = new LASERClassifier(); 37 39 40 /* 41 * (non-Javadoc) 42 * 43 * @see de.ugoe.cs.cpdp.training.WekaBaseTraining#getClassifier() 44 */ 38 45 @Override 39 46 public Classifier getClassifier() { … … 41 48 } 42 49 50 /* 51 * (non-Javadoc) 52 * 53 * @see de.ugoe.cs.cpdp.training.ITrainingStrategy#apply(weka.core.Instances) 54 */ 43 55 @Override 44 56 public void apply(Instances traindata) { … … 51 63 } 52 64 65 /** 66 * <p> 67 * Internal helper class that defines the laser classifier. 68 * </p> 69 * 70 * @author Steffen Herbold 71 */ 53 72 public class LASERClassifier extends AbstractClassifier { 54 73 74 /** 75 * Default serial ID. 76 */ 55 77 private static final long serialVersionUID = 1L; 56 78 79 /** 80 * Internal reference to the classifier. 81 */ 57 82 private Classifier laserClassifier = null; 83 84 /** 85 * Internal storage of the training data required for NN analysis. 
86 */ 58 87 private Instances traindata = null; 59 88 89 /* 90 * (non-Javadoc) 91 * 92 * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) 93 */ 60 94 @Override 61 95 public double classifyInstance(Instance instance) throws Exception { 62 96 List<Integer> closestInstances = new LinkedList<>(); 63 97 double minDistance = Double.MAX_VALUE; 64 for ( int i=0; i<traindata.size(); i++) {98 for (int i = 0; i < traindata.size(); i++) { 65 99 double distance = WekaUtils.hammingDistance(instance, traindata.get(i)); 66 if ( distance<minDistance) {100 if (distance < minDistance) { 67 101 minDistance = distance; 68 102 } 69 103 } 70 for ( int i=0; i<traindata.size(); i++) {104 for (int i = 0; i < traindata.size(); i++) { 71 105 double distance = WekaUtils.hammingDistance(instance, traindata.get(i)); 72 if ( distance<=minDistance) {106 if (distance <= minDistance) { 73 107 closestInstances.add(i); 74 108 } 75 109 } 76 if ( closestInstances.size()==1) {110 if (closestInstances.size() == 1) { 77 111 int closestIndex = closestInstances.get(0); 78 112 Instance closestTrainingInstance = traindata.get(closestIndex); 79 113 List<Integer> closestToTrainingInstance = new LinkedList<>(); 80 114 double minTrainingDistance = Double.MAX_VALUE; 81 for( int i=0; i<traindata.size(); i++ ) { 82 if( closestIndex!=i ) { 83 double distance = WekaUtils.hammingDistance(closestTrainingInstance, traindata.get(i)); 84 if( distance<minTrainingDistance ) { 115 for (int i = 0; i < traindata.size(); i++) { 116 if (closestIndex != i) { 117 double distance = 118 WekaUtils.hammingDistance(closestTrainingInstance, traindata.get(i)); 119 if (distance < minTrainingDistance) { 85 120 minTrainingDistance = distance; 86 121 } 87 122 } 88 123 } 89 for( int i=0; i<traindata.size(); i++ ) { 90 if( closestIndex!=i ) { 91 double distance = WekaUtils.hammingDistance(closestTrainingInstance, traindata.get(i)); 92 if( distance<=minTrainingDistance ) { 124 for (int i = 0; i < traindata.size(); 
i++) { 125 if (closestIndex != i) { 126 double distance = 127 WekaUtils.hammingDistance(closestTrainingInstance, traindata.get(i)); 128 if (distance <= minTrainingDistance) { 93 129 closestToTrainingInstance.add(i); 94 130 } 95 131 } 96 132 } 97 if ( closestToTrainingInstance.size()==1) {133 if (closestToTrainingInstance.size() == 1) { 98 134 return laserClassifier.classifyInstance(instance); 99 135 } … … 101 137 double label = Double.NaN; 102 138 boolean allEqual = true; 103 for ( Integer index : closestToTrainingInstance) {104 if ( Double.isNaN(label)) {139 for (Integer index : closestToTrainingInstance) { 140 if (Double.isNaN(label)) { 105 141 label = traindata.get(index).classValue(); 106 142 } 107 else if ( label!=traindata.get(index).classValue()) {143 else if (label != traindata.get(index).classValue()) { 108 144 allEqual = false; 109 145 break; 110 146 } 111 147 } 112 if ( allEqual) {148 if (allEqual) { 113 149 return label; 114 150 } … … 117 153 } 118 154 } 119 } else { 155 } 156 else { 120 157 double label = Double.NaN; 121 158 boolean allEqual = true; 122 for ( Integer index : closestInstances) {123 if ( Double.isNaN(label)) {159 for (Integer index : closestInstances) { 160 if (Double.isNaN(label)) { 124 161 label = traindata.get(index).classValue(); 125 162 } 126 else if ( label!=traindata.get(index).classValue()) {163 else if (label != traindata.get(index).classValue()) { 127 164 allEqual = false; 128 165 break; 129 166 } 130 167 } 131 if ( allEqual) {168 if (allEqual) { 132 169 return label; 133 170 } … … 138 175 } 139 176 177 /* 178 * (non-Javadoc) 179 * 180 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 181 */ 140 182 @Override 141 183 public void buildClassifier(Instances traindata) throws Exception { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalEMTraining.java
r99 r135 33 33 34 34 /** 35 * WekaLocalEMTraining 35 * <p> 36 * Local Trainer with EM Clustering for data partitioning. Currently supports only EM Clustering. 37 * </p> 38 * <ol> 39 * <li>Cluster training data</li> 40 * <li>for each cluster train a classifier with training data from cluster</li> 41 * <li>match test data instance to a cluster, then classify with classifier from the cluster</li> 42 * </ol> 36 43 * 37 * Local Trainer with EM Clustering for data partitioning. Currently supports only EM Clustering.44 * XML configuration: 38 45 * 39 * 1. Cluster training data 2. for each cluster train a classifier with training data from cluster 40 * 3. match test data instance to a cluster, then classify with classifier from the cluster 41 * 42 * XML configuration: <!-- because of clustering --> <preprocessor name="Normalization" param=""/> 43 * 44 * <!-- cluster trainer --> <trainer name="WekaLocalEMTraining" 45 * param="NaiveBayes weka.classifiers.bayes.NaiveBayes" /> 46 * <pre> 47 * {@code 48 * <trainer name="WekaLocalEMTraining" param="NaiveBayes weka.classifiers.bayes.NaiveBayes" /> 49 * } 50 * </pre> 46 51 */ 47 52 public class WekaLocalEMTraining extends WekaBaseTraining implements ITrainingStrategy { 48 53 54 /** 55 * the classifier 56 */ 49 57 private final TraindatasetCluster classifier = new TraindatasetCluster(); 50 58 59 /* 60 * (non-Javadoc) 61 * 62 * @see de.ugoe.cs.cpdp.training.WekaBaseTraining#getClassifier() 63 */ 51 64 @Override 52 65 public Classifier getClassifier() { … … 54 67 } 55 68 69 /* 70 * (non-Javadoc) 71 * 72 * @see de.ugoe.cs.cpdp.training.ITrainingStrategy#apply(weka.core.Instances) 73 */ 56 74 @Override 57 75 public void apply(Instances traindata) { … … 64 82 } 65 83 84 /** 85 * <p> 86 * Weka classifier for the local model with EM clustering. 
87 * </p> 88 * 89 * @author Alexander Trautsch 90 */ 66 91 public class TraindatasetCluster extends AbstractClassifier { 67 92 93 /** 94 * default serialization ID 95 */ 68 96 private static final long serialVersionUID = 1L; 69 97 98 /** 99 * EM clusterer used 100 */ 70 101 private EM clusterer = null; 71 102 103 /** 104 * classifiers for each cluster 105 */ 72 106 private HashMap<Integer, Classifier> cclassifier; 107 108 /** 109 * training data for each cluster 110 */ 73 111 private HashMap<Integer, Instances> ctraindata; 74 112 … … 107 145 } 108 146 147 /* 148 * (non-Javadoc) 149 * 150 * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) 151 */ 109 152 @Override 110 153 public double classifyInstance(Instance instance) { … … 139 182 } 140 183 184 /* 185 * (non-Javadoc) 186 * 187 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 188 */ 141 189 @Override 142 190 public void buildClassifier(Instances traindata) throws Exception { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalFQTraining.java
r99 r135 35 35 36 36 /** 37 * <p> 37 38 * Trainer with reimplementation of WHERE clustering algorithm from: Tim Menzies, Andrew Butcher, 38 39 * David Cok, Andrian Marcus, Lucas Layman, Forrest Shull, Burak Turhan, Thomas Zimmermann, 39 40 * "Local versus Global Lessons for Defect Prediction and Effort Estimation," IEEE Transactions on 40 41 * Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013 41 * 42 * With WekaLocalFQTraining we do the following: 1) Run the Fastmap algorithm on all training data, 43 * let it calculate the 2 most significant dimensions and projections of each instance to these 44 * dimensions 2) With these 2 dimensions we span a QuadTree which gets recursively split on 45 * median(x) and median(y) values. 3) We cluster the QuadTree nodes together if they have similar 46 * density (50%) 4) We save the clusters and their training data 5) We only use clusters with > 47 * ALPHA instances (currently Math.sqrt(SIZE)), rest is discarded with the training data of this 48 * cluster 6) We train a Weka classifier for each cluster with the clusters training data 7) We 49 * recalculate Fastmap distances for a single instance with the old pivots and then try to find a 50 * cluster containing the coords of the instance. 7.1.) If we can not find a cluster (due to coords 51 * outside of all clusters) we find the nearest cluster. 8) We classify the Instance with the 52 * classifier and traindata from the Cluster we found in 7. 
42 * </p> 43 * <p> 44 * With WekaLocalFQTraining we do the following: 45 * <ol> 46 * <li>Run the Fastmap algorithm on all training data, let it calculate the 2 most significant 47 * dimensions and projections of each instance to these dimensions</li> 48 * <li>With these 2 dimensions we span a QuadTree which gets recursively split on median(x) and 49 * median(y) values.</li> 50 * <li>We cluster the QuadTree nodes together if they have similar density (50%)</li> 51 * <li>We save the clusters and their training data</li> 52 * <li>We only use clusters with > ALPHA instances (currently Math.sqrt(SIZE)), the rest is 53 * discarded with the training data of this cluster</li> 54 * <li>We train a Weka classifier for each cluster with the clusters training data</li> 55 * <li>We recalculate Fastmap distances for a single instance with the old pivots and then try to 56 * find a cluster containing the coords of the instance. If we can not find a cluster (due to coords 57 * outside of all clusters) we find the nearest cluster.</li> 58 * <li>We classify the Instance with the classifier and traindata from the Cluster we found in 7. 
59 * </li> 60 * </p> 53 61 */ 54 62 public class WekaLocalFQTraining extends WekaBaseTraining implements ITrainingStrategy { 55 63 64 /** 65 * the classifier 66 */ 56 67 private final TraindatasetCluster classifier = new TraindatasetCluster(); 57 68 69 /* 70 * (non-Javadoc) 71 * 72 * @see de.ugoe.cs.cpdp.training.WekaBaseTraining#getClassifier() 73 */ 58 74 @Override 59 75 public Classifier getClassifier() { … … 61 77 } 62 78 79 /* 80 * (non-Javadoc) 81 * 82 * @see de.ugoe.cs.cpdp.training.ITrainingStrategy#apply(weka.core.Instances) 83 */ 63 84 @Override 64 85 public void apply(Instances traindata) { … … 71 92 } 72 93 94 /** 95 * <p> 96 * Weka classifier for the local model with WHERE clustering 97 * </p> 98 * 99 * @author Alexander Trautsch 100 */ 73 101 public class TraindatasetCluster extends AbstractClassifier { 74 102 103 /** 104 * default serialization ID 105 */ 75 106 private static final long serialVersionUID = 1L; 76 107 77 /* classifier per cluster */ 108 /** 109 * classifiers for each cluster 110 */ 78 111 private HashMap<Integer, Classifier> cclassifier; 79 112 80 /* instances per cluster */ 113 /** 114 * training data for each cluster 115 */ 81 116 private HashMap<Integer, Instances> ctraindata; 82 117 83 /* 118 /** 84 119 * holds the instances and indices of the pivot objects of the Fastmap calculation in 85 120 * buildClassifier … … 87 122 private HashMap<Integer, Instance> cpivots; 88 123 89 /* holds the indices of the pivot objects for x,y and the dimension [x,y][dimension] */ 124 /** 125 * holds the indices of the pivot objects for x,y and the dimension [x,y][dimension] 126 */ 90 127 private int[][] cpivotindices; 91 128 92 /* holds the sizes of the cluster multiple "boxes" per cluster */ 129 /** 130 * holds the sizes of the cluster multiple "boxes" per cluster 131 */ 93 132 private HashMap<Integer, ArrayList<Double[][]>> csize; 94 133 95 /* debug vars */ 134 /** 135 * debug variable 136 */ 96 137 @SuppressWarnings("unused") 97 138 private 
boolean show_biggest = true; 98 139 140 /** 141 * debug variable 142 */ 99 143 @SuppressWarnings("unused") 100 144 private int CFOUND = 0; 145 146 /** 147 * debug variable 148 */ 101 149 @SuppressWarnings("unused") 102 150 private int CNOTFOUND = 0; 103 151 152 /** 153 * <p> 154 * copies an instance such that it is compatible with the local model 155 * </p> 156 * 157 * @param instances 158 * instance format 159 * @param instance 160 * instance that is copied 161 * @return 162 */ 104 163 private Instance createInstance(Instances instances, Instance instance) { 105 164 // attributes for feeding instance to classifier … … 127 186 128 187 /** 188 * <p> 129 189 * Because Fastmap saves only the image not the values of the attributes it used we can not 130 190 * use the old data directly to classify single instances to clusters. 191 * </p> 192 * <p> 193 * To classify a single instance we do a new Fastmap computation with only the instance and 194 * the old pivot elements. 195 * </p> 196 * <p> 197 * After that we find the cluster with our Fastmap result for x and y. 198 * </p> 131 199 * 132 * To classify a single instance we do a new fastmap computation with only the instance and 133 * the old pivot elements. 134 * 135 * After that we find the cluster with our fastmap result for x and y.
200 * @param instance 201 * instance that is classified 202 * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) 136 203 */ 137 204 @Override … … 169 236 double[][] distmat = new double[2 * FMAP.target_dims + 1][2 * FMAP.target_dims + 1]; 170 237 distmat[0][0] = 0; 171 distmat[0][1] = 172 dist.distance(clusterInstance, 173 this.cpivots.get((Integer) this.cpivotindices[0][0])); 174 distmat[0][2] = 175 dist.distance(clusterInstance, 176 this.cpivots.get((Integer) this.cpivotindices[1][0])); 177 distmat[0][3] = 178 dist.distance(clusterInstance, 179 this.cpivots.get((Integer) this.cpivotindices[0][1])); 180 distmat[0][4] = 181 dist.distance(clusterInstance, 182 this.cpivots.get((Integer) this.cpivotindices[1][1])); 183 184 distmat[1][0] = 185 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 186 clusterInstance); 238 distmat[0][1] = dist.distance(clusterInstance, 239 this.cpivots.get((Integer) this.cpivotindices[0][0])); 240 distmat[0][2] = dist.distance(clusterInstance, 241 this.cpivots.get((Integer) this.cpivotindices[1][0])); 242 distmat[0][3] = dist.distance(clusterInstance, 243 this.cpivots.get((Integer) this.cpivotindices[0][1])); 244 distmat[0][4] = dist.distance(clusterInstance, 245 this.cpivots.get((Integer) this.cpivotindices[1][1])); 246 247 distmat[1][0] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 248 clusterInstance); 187 249 distmat[1][1] = 0; 188 distmat[1][2] = 189 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 190 this.cpivots.get((Integer) this.cpivotindices[1][0])); 191 distmat[1][3] = 192 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 193 this.cpivots.get((Integer) this.cpivotindices[0][1])); 194 distmat[1][4] = 195 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 196 this.cpivots.get((Integer) this.cpivotindices[1][1])); 197 198 distmat[2][0] = 199 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 
200 clusterInstance); 201 distmat[2][1] = 202 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 203 this.cpivots.get((Integer) this.cpivotindices[0][0])); 250 distmat[1][2] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 251 this.cpivots.get((Integer) this.cpivotindices[1][0])); 252 distmat[1][3] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 253 this.cpivots.get((Integer) this.cpivotindices[0][1])); 254 distmat[1][4] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 255 this.cpivots.get((Integer) this.cpivotindices[1][1])); 256 257 distmat[2][0] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 258 clusterInstance); 259 distmat[2][1] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 260 this.cpivots.get((Integer) this.cpivotindices[0][0])); 204 261 distmat[2][2] = 0; 205 distmat[2][3] = 206 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 207 this.cpivots.get((Integer) this.cpivotindices[0][1])); 208 distmat[2][4] = 209 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 210 this.cpivots.get((Integer) this.cpivotindices[1][1])); 211 212 distmat[3][0] = 213 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 214 clusterInstance); 215 distmat[3][1] = 216 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 217 this.cpivots.get((Integer) this.cpivotindices[0][0])); 218 distmat[3][2] = 219 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 220 this.cpivots.get((Integer) this.cpivotindices[1][0])); 262 distmat[2][3] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 263 this.cpivots.get((Integer) this.cpivotindices[0][1])); 264 distmat[2][4] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 265 this.cpivots.get((Integer) this.cpivotindices[1][1])); 266 267 distmat[3][0] = dist.distance(this.cpivots.get((Integer) 
this.cpivotindices[0][1]), 268 clusterInstance); 269 distmat[3][1] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 270 this.cpivots.get((Integer) this.cpivotindices[0][0])); 271 distmat[3][2] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 272 this.cpivots.get((Integer) this.cpivotindices[1][0])); 221 273 distmat[3][3] = 0; 222 distmat[3][4] = 223 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 224 this.cpivots.get((Integer) this.cpivotindices[1][1])); 225 226 distmat[4][0] = 227 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 228 clusterInstance); 229 distmat[4][1] = 230 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 231 this.cpivots.get((Integer) this.cpivotindices[0][0])); 232 distmat[4][2] = 233 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 234 this.cpivots.get((Integer) this.cpivotindices[1][0])); 235 distmat[4][3] = 236 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 237 this.cpivots.get((Integer) this.cpivotindices[0][1])); 274 distmat[3][4] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 275 this.cpivots.get((Integer) this.cpivotindices[1][1])); 276 277 distmat[4][0] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 278 clusterInstance); 279 distmat[4][1] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 280 this.cpivots.get((Integer) this.cpivotindices[0][0])); 281 distmat[4][2] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 282 this.cpivots.get((Integer) this.cpivotindices[1][0])); 283 distmat[4][3] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 284 this.cpivots.get((Integer) this.cpivotindices[0][1])); 238 285 distmat[4][4] = 0; 239 286 … … 243 290 * distmat[0].length; j++) { if(biggest < distmat[i][j]) { biggest = distmat[i][j]; 244 291 * } } } if(this.show_biggest) { Console.traceln(Level.INFO, 245 * 
String.format(""+clusterInstance)); Console.traceln(Level.INFO, 246 * String.format("biggest distances: "+ biggest)); this.show_biggest = false; }292 * String.format(""+clusterInstance)); Console.traceln(Level.INFO, String.format( 293 * "biggest distances: "+ biggest)); this.show_biggest = false; } 247 294 */ 248 295 … … 316 363 cnumber = clusternumber.next(); 317 364 for (int i = 0; i < ctraindata.get(cnumber).size(); i++) { 318 if (dist.distance(instance, ctraindata.get(cnumber).get(i)) <= min_distance) 365 if (dist.distance(instance, 366 ctraindata.get(cnumber).get(i)) <= min_distance) 319 367 { 320 368 found_cnumber = cnumber; … … 347 395 } 348 396 397 /* 398 * (non-Javadoc) 399 * 400 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 401 */ 349 402 @Override 350 403 public void buildClassifier(Instances traindata) throws Exception { … … 421 474 422 475 // Console.traceln(Level.INFO, 423 // String.format("size for cluster ("+small[0]+","+small[1]+") - ("+big[0]+","+big[1]+")")); 476 // String.format("size for cluster ("+small[0]+","+small[1]+") - 477 // ("+big[0]+","+big[1]+")")); 424 478 425 479 // 5. 
generate quadtree … … 439 493 440 494 // recursive split und grid clustering eher static 441 TREE.recursiveSplit(TREE);495 QuadTree.recursiveSplit(TREE); 442 496 443 497 // generate list of nodes sorted by density (childs only) … … 465 519 } 466 520 else { 467 Console.traceln(Level.INFO, 468 String.format("drop cluster, only: " + current.size() + 469 " instances")); 521 Console.traceln(Level.INFO, String 522 .format("drop cluster, only: " + current.size() + " instances")); 470 523 } 471 524 } … … 505 558 // traindata_count += ctraindata.get(cnumber).size(); 506 559 // Console.traceln(Level.INFO, 507 // String.format("building classifier in cluster "+cnumber +" 560 // String.format("building classifier in cluster "+cnumber +" with "+ 508 561 // ctraindata.get(cnumber).size() +" traindata instances")); 509 562 } … … 516 569 517 570 /** 518 * Payload for the QuadTree. x and y are the calculated Fastmap values. T is a weka instance. 571 * <p> 572 * Payload for the QuadTree. x and y are the calculated Fastmap values. T is a Weka instance. 573 * </p> 574 * 575 * @author Alexander Trautsch 519 576 */ 520 577 public class QuadTreePayload<T> { 521 578 522 public double x; 523 public double y; 579 /** 580 * x-value 581 */ 582 public final double x; 583 584 /** 585 * y-value 586 */ 587 public final double y; 588 589 /** 590 * associated instance 591 */ 524 592 private T inst; 525 593 594 /** 595 * <p> 596 * Constructor. Creates the payload. 597 * </p> 598 * 599 * @param x 600 * x-value 601 * @param y 602 * y-value 603 * @param value 604 * associated instace 605 */ 526 606 public QuadTreePayload(double x, double y, T value) { 527 607 this.x = x; … … 530 610 } 531 611 612 /** 613 * <p> 614 * returns the instance 615 * </p> 616 * 617 * @return 618 */ 532 619 public T getInst() { 533 620 return this.inst; … … 536 623 537 624 /** 538 * Fastmap implementation539 * 540 * Faloutsos, C., & Lin, K. I. (1995). 
FastMap: A fast algorithm for indexing, data-mining and625 * <p> 626 * Fastmap implementation after:<br> 627 * * Faloutsos, C., & Lin, K. I. (1995). FastMap: A fast algorithm for indexing, data-mining and 541 628 * visualization of traditional and multimedia datasets (Vol. 24, No. 2, pp. 163-174). ACM. 629 * </p> 542 630 */ 543 631 public class Fastmap { 544 632 545 /* N x k Array, at the end, the i-th row will be the image of the i-th object */ 633 /** 634 * N x k Array, at the end, the i-th row will be the image of the i-th object 635 */ 546 636 private double[][] X; 547 637 548 /* 2 x k pivot Array one pair per recursive call */ 638 /** 639 * 2 x k pivot Array one pair per recursive call 640 */ 549 641 private int[][] PA; 550 642 551 /* Objects we got (distance matrix) */ 643 /** 644 * Objects we got (distance matrix) 645 */ 552 646 private double[][] O; 553 647 554 /* column of X currently updated (also the dimension) */ 648 /** 649 * column of X currently updated (also the dimension) 650 */ 555 651 private int col = 0; 556 652 557 /* number of dimensions we want */ 653 /** 654 * number of dimensions we want 655 */ 558 656 private int target_dims = 0; 559 657 560 // if we already have the pivot elements 658 /** 659 * if we already have the pivot elements 660 */ 561 661 private boolean pivot_set = false; 562 662 663 /** 664 * <p> 665 * Constructor. Creates a new Fastmap object. 666 * </p> 667 * 668 * @param k 669 */ 563 670 public Fastmap(int k) { 564 671 this.target_dims = k; … … 566 673 567 674 /** 568 * Sets the distance matrix and params that depend on this 675 * <p> 676 * Sets the distance matrix and params that depend on this. 677 * </p> 569 678 * 570 679 * @param O 680 * distance matrix 571 681 */ 572 682 public void setDistmat(double[][] O) { … … 578 688 579 689 /** 690 * <p> 580 691 * Set pivot elements, we need that to classify instances after the calculation is complete 581 692 * (because we then want to reuse only the pivot elements). 
693 * </p> 582 694 * 583 695 * @param pi 696 * the pivots 584 697 */ 585 698 public void setPivots(int[][] pi) { … … 589 702 590 703 /** 704 * <p> 591 705 * Return the pivot elements that were chosen during the calculation 706 * </p> 592 707 * 593 * @return 708 * @return the pivots 594 709 */ 595 710 public int[][] getPivots() { … … 598 713 599 714 /** 600 * The distance function for euclidean distance 601 * 602 * Acts according to equation 4 of the fastmap paper 715 * <p> 716 * The distance function for euclidean distance. Acts according to equation 4 of the Fastmap 717 * paper. 718 * </p> 603 719 * 604 720 * @param x … … 606 722 * @param y 607 723 * y index of y image (if k==0 y object) 608 * @param kdimensionality 609 * @return distance 724 * @param k 725 * dimensionality 726 * @return the distance 610 727 */ 611 728 private double dist(int x, int y, int k) { … … 624 741 625 742 /** 626 * Find the object farthest from the given index This method is a helper Method for 627 * findDistandObjects 743 * <p> 744 * Find the object farthest from the given index. This method is a helper Method for 745 * findDistandObjects. 746 * </p> 628 747 * 629 748 * @param index … … 646 765 647 766 /** 648 * Finds the pivot objects 767 * <p> 768 * Finds the pivot objects. This method is basically algorithm 1 of the Fastmap paper. 769 * </p> 649 770 * 650 * This method is basically algorithm 1 of the fastmap paper. 651 * 652 * @return 2 indexes of the choosen pivot objects 771 * @return 2 indexes of the chosen pivot objects 653 772 */ 654 773 private int[] findDistantObjects() { … … 668 787 669 788 /** 670 * Calculates the new k-vector values (projections) 671 * 672 * This is basically algorithm 2 of the fastmap paper. We just added the possibility to 673 * pre-set the pivot elements because we need to classify single instances after the 674 * computation is already done. 
675 * 676 * @param dims 677 * dimensionality 789 * <p> 790 * Calculates the new k-vector values (projections) This is basically algorithm 2 of the 791 * fastmap paper. We just added the possibility to pre-set the pivot elements because we 792 * need to classify single instances after the computation is already done. 793 * </p> 678 794 */ 679 795 public void calculate() { … … 713 829 714 830 /** 831 * <p> 715 832 * returns the result matrix of the projections 833 * </p> 716 834 * 717 835 * @return calculated result -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaTestAwareTraining.java
r99 r135 22 22 import weka.core.Instances; 23 23 24 // TODO comment 24 /** 25 * <p> 26 * Trainer that allows classifiers access to the training data. Classifiers need to make sure that 27 * they do not use the classification. 28 * </p> 29 * 30 * @author Steffen Herbold 31 */ 25 32 public class WekaTestAwareTraining extends WekaBaseTraining implements ITestAwareTrainingStrategy { 26 33 34 /* 35 * (non-Javadoc) 36 * 37 * @see de.ugoe.cs.cpdp.training.ITestAwareTrainingStrategy#apply(weka.core.Instances, 38 * weka.core.Instances) 39 */ 27 40 @Override 28 41 public void apply(Instances testdata, Instances traindata) { 29 42 classifier = setupClassifier(); 30 if ( !(classifier instanceof ITestAwareClassifier)) {43 if (!(classifier instanceof ITestAwareClassifier)) { 31 44 throw new RuntimeException("classifier must implement the ITestAwareClassifier interface in order to be used as TestAwareTrainingStrategy"); 32 45 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaTraining.java
r99 r135 22 22 23 23 /** 24 * Programmatic WekaTraining 25 * 26 * first parameter is Trainer Name. second parameter is class name 27 * 28 * all subsequent parameters are configuration params (for example for trees) Cross Validation 29 * params always come last and are prepended with -CVPARAM 30 * 31 * XML Configurations for Weka Classifiers: 32 * 24 * <p> 25 * The first parameter is the trainer name, second parameter is class name. All subsequent 26 * parameters are configuration parameters of the algorithms. Cross validation parameters always 27 * come last and are prepended with -CVPARAM 28 * </p> 29 * XML Configurations for Weka Classifiers: 33 30 * <pre> 34 31 * {@code -
trunk/CrossPare/src/de/ugoe/cs/cpdp/util/SortUtils.java
r61 r135 1 1 2 package de.ugoe.cs.cpdp.util; 2 3 4 /** 5 * <p> 6 * Utility functions for sorting. 7 * </p> 8 * 9 * @author Steffen Herbold 10 */ 3 11 public class SortUtils { 4 12 13 /** 14 * <p> 15 * Implements a quick sort that sorts an index set together with the array. 16 * </p> 17 * 18 * @param main 19 * the array that is sorted 20 * @param index 21 * the index set for the array 22 */ 5 23 public static <T extends Comparable<T>> void quicksort(T[] main, int[] index) { 6 24 quicksort(main, index, 0, index.length - 1, false); 7 25 } 8 9 public static <T extends Comparable<T>> void quicksort(T[] main, int[] index, boolean descending) { 26 27 /** 28 * <p> 29 * Implements a quick sort that sorts an index set together with the array. 30 * </p> 31 * 32 * @param main 33 * the array that is sorted 34 * @param index 35 * the index set for the array 36 * @param descending 37 * defines the sorting order 38 */ 39 public static <T extends Comparable<T>> void quicksort(T[] main, 40 int[] index, 41 boolean descending) 42 { 10 43 quicksort(main, index, 0, index.length - 1, descending); 11 44 } 12 45 13 // quicksort a[left] to a[right] 14 private static <T extends Comparable<T>> void quicksort(T[] a, int[] index, int left, int right, boolean descending) { 46 /** 47 * <p> 48 * internal quicksort implementation 49 * </p> 50 * 51 * @param main 52 * the array that is sorted 53 * @param index 54 * the index set for the array 55 * @param left 56 * defines the current partition 57 * @param right 58 * defines the current partition 59 * @param descending 60 * defines the sorting order 61 */ 62 private static <T extends Comparable<T>> void quicksort(T[] main, 63 int[] index, 64 int left, 65 int right, 66 boolean descending) 67 { 15 68 if (right <= left) 16 69 return; 17 int i = partition( a, index, left, right, descending);18 quicksort( a, index, left, i - 1, descending);19 quicksort( a, index, i + 1, right, descending);70 int i = partition(main, index, left, right, descending); 71 
quicksort(main, index, left, i - 1, descending); 72 quicksort(main, index, i + 1, right, descending); 20 73 } 21 74 22 // partition a[left] to a[right], assumes left < right 23 private static <T extends Comparable<T>> int partition(T[] a, int[] index, int left, int right, boolean descending) { 75 /** 76 * <p> 77 * internal partitioning of the quicksort implementation 78 * </p> 79 * 80 * @param main 81 * the array that is sorted 82 * @param index 83 * the index set for the array 84 * @param left 85 * defines the current partition 86 * @param right 87 * defines the current partition 88 * @param descending 89 * defines the sorting order 90 */ 91 private static <T extends Comparable<T>> int partition(T[] main, 92 int[] index, 93 int left, 94 int right, 95 boolean descending) 96 { 24 97 int i = left - 1; 25 98 int j = right; 26 99 while (true) { 27 while (compare( a[++i], a[right], descending)) // find item on left to swap100 while (compare(main[++i], main[right], descending)) // find item on left to swap 28 101 ; // a[right] acts as sentinel 29 while (compare( a[right], a[--j], descending)) // find item on right to swap102 while (compare(main[right], main[--j], descending)) // find item on right to swap 30 103 if (j == left) 31 104 break; // don't go out-of-bounds 32 105 if (i >= j) 33 106 break; // check if pointers cross 34 exch(a, index, i, j); // swap two elements into place107 swap(main, index, i, j); // swap two elements into place 35 108 } 36 exch(a, index, i, right); // swap with partition element109 swap(main, index, i, right); // swap with partition element 37 110 return i; 38 111 } 39 112 40 // is x < y ? 
113 /** 114 * <p> 115 * helper function for comparator evaluation 116 * </p> 117 * 118 * @param x 119 * first element that is compared 120 * @param y 121 * second element that is compared 122 * @param descending 123 * defines the sorting order 124 * @return true if x is larger than y and descending is true or y is larger than x and 125 * descending is false 126 */ 41 127 private static <T extends Comparable<T>> boolean compare(T x, T y, boolean descending) { 42 if( descending ) { 43 return x.compareTo(y)>0; 44 } else { 45 return x.compareTo(y)<0; 128 if (descending) { 129 return x.compareTo(y) > 0; 130 } 131 else { 132 return x.compareTo(y) < 0; 46 133 } 47 134 } 48 135 49 // exchange a[i] and a[j] 50 private static <T extends Comparable<T>> void exch(T[] a, int[] index, int i, int j) { 51 T swap = a[i]; 52 a[i] = a[j]; 53 a[j] = swap; 136 /** 137 * <p> 138 * swaps two elements 139 * </p> 140 * 141 * @param main 142 * the array that is sorted 143 * @param index 144 * the index set for the array 145 * @param i 146 * index of the first element 147 * @param j 148 * index of the second element 149 */ 150 private static <T extends Comparable<T>> void swap(T[] main, int[] index, int i, int j) { 151 T tmp = main[i]; 152 main[i] = main[j]; 153 main[j] = tmp; 54 154 int b = index[i]; 55 155 index[i] = index[j]; -
trunk/CrossPare/src/de/ugoe/cs/cpdp/util/WekaUtils.java
r129 r135 15 15 package de.ugoe.cs.cpdp.util; 16 16 17 // TODO comment18 17 import org.apache.commons.math3.ml.distance.EuclideanDistance; 19 18 … … 21 20 import weka.core.Instances; 22 21 22 /** 23 * <p> 24 * Collections of helper functions to work with Weka. 25 * </p> 26 * 27 * @author Steffen Herbold 28 */ 23 29 public class WekaUtils { 24 30 31 /** 32 * <p> 33 * Data class for distance between instances within a data set based on their distributional 34 * characteristics. 35 * </p> 36 * 37 * @author Steffen Herbold 38 */ 25 39 public static class DistChar { 26 40 public final double mean; … … 29 43 public final double max; 30 44 public final int num; 45 31 46 private DistChar(double mean, double std, double min, double max, int num) { 32 47 this.mean = mean; … … 37 52 } 38 53 } 39 54 40 55 /** 41 56 * Scaling value that moves the decimal point by 5 digets. 42 57 */ 43 58 public final static double SCALER = 10000.0d; 44 59 45 60 /** 46 61 * <p> … … 66 81 return distance; 67 82 } 68 83 84 /** 85 * <p> 86 * Returns a double array of the values without the classification. 87 * </p> 88 * 89 * @param instance 90 * the instance 91 * @return double array 92 */ 69 93 public static double[] instanceValues(Instance instance) { 70 double[] values = new double[instance.numAttributes() -1];71 int k =0;72 for ( int j=0; j<instance.numAttributes() ; j++) {73 if ( j!= instance.classIndex()) {94 double[] values = new double[instance.numAttributes() - 1]; 95 int k = 0; 96 for (int j = 0; j < instance.numAttributes(); j++) { 97 if (j != instance.classIndex()) { 74 98 values[k] = instance.value(j); 75 99 k++; … … 78 102 return values; 79 103 } 80 104 105 /** 106 * <p> 107 * Calculates the distributional characteristics of the distances the instances within a data 108 * set have to each other. 
109 * </p> 110 * 111 * @param data 112 * data for which the instances are characterized 113 * @return characteristics 114 */ 81 115 public static DistChar datasetDistance(Instances data) { 82 116 double distance; … … 87 121 int numCmp = 0; 88 122 int l = 0; 89 double[] inst1 = new double[data.numAttributes() -1];90 double[] inst2 = new double[data.numAttributes() -1];123 double[] inst1 = new double[data.numAttributes() - 1]; 124 double[] inst2 = new double[data.numAttributes() - 1]; 91 125 EuclideanDistance euclideanDistance = new EuclideanDistance(); 92 for ( int i=0; i<data.numInstances(); i++) {93 l =0;94 for ( int k=0; k<data.numAttributes(); k++) {95 if ( k!=data.classIndex()) {126 for (int i = 0; i < data.numInstances(); i++) { 127 l = 0; 128 for (int k = 0; k < data.numAttributes(); k++) { 129 if (k != data.classIndex()) { 96 130 inst1[l] = data.instance(i).value(k); 97 131 } 98 132 } 99 for ( int j=0; j<data.numInstances(); j++) {100 if ( j!=i) {101 l =0;102 for ( int k=0; k<data.numAttributes(); k++) {103 if ( k!=data.classIndex()) {133 for (int j = 0; j < data.numInstances(); j++) { 134 if (j != i) { 135 l = 0; 136 for (int k = 0; k < data.numAttributes(); k++) { 137 if (k != data.classIndex()) { 104 138 inst2[l] = data.instance(j).value(k); 105 139 } … … 107 141 distance = euclideanDistance.compute(inst1, inst2); 108 142 sumAll += distance; 109 sumAllQ += distance *distance;143 sumAllQ += distance * distance; 110 144 numCmp++; 111 if ( distance < min) {145 if (distance < min) { 112 146 min = distance; 113 147 } 114 if ( distance > max) {148 if (distance > max) { 115 149 max = distance; 116 150 } … … 119 153 } 120 154 double mean = sumAll / numCmp; 121 double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) * 122 (1.0d / (numCmp - 1))); 155 double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1))); 123 156 return new DistChar(mean, std, min, max, data.numInstances()); 124 157 } 125 126 // like above, but for single attribute 
158 159 /** 160 * <p> 161 * Calculates the distributional characteristics of the distances of a single attribute the 162 * instances within a data set have to each other. 163 * </p> 164 * 165 * @param data 166 * data for which the instances are characterized 167 * @param index 168 * attribute for which the distances are characterized 169 * @return characteristics 170 */ 127 171 public static DistChar attributeDistance(Instances data, int index) { 128 172 double distance; … … 133 177 int numCmp = 0; 134 178 double value1, value2; 135 for ( int i=0; i<data.numInstances(); i++) {179 for (int i = 0; i < data.numInstances(); i++) { 136 180 value1 = data.instance(i).value(index); 137 for ( int j=0; j<data.numInstances(); j++) {138 if ( j!=i) {181 for (int j = 0; j < data.numInstances(); j++) { 182 if (j != i) { 139 183 value2 = data.instance(j).value(index); 140 distance = Math.abs(value1 -value2);184 distance = Math.abs(value1 - value2); 141 185 sumAll += distance; 142 sumAllQ += distance *distance;186 sumAllQ += distance * distance; 143 187 numCmp++; 144 if ( distance < min) {188 if (distance < min) { 145 189 min = distance; 146 190 } 147 if ( distance > max) {191 if (distance > max) { 148 192 max = distance; 149 193 } … … 152 196 } 153 197 double mean = sumAll / numCmp; 154 double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) * 155 (1.0d / (numCmp - 1))); 198 double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1))); 156 199 return new DistChar(mean, std, min, max, data.numInstances()); 157 200 } 158 201 159 202 /** 160 203 * <p> -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/AbstractVersionFilter.java
r86 r135 25 25 public abstract class AbstractVersionFilter implements IVersionFilter { 26 26 27 /* *27 /* 28 28 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(java.util.List) 29 29 */ -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/MinClassNumberFilter.java
r86 r135 30 30 private int minInstances = 0; 31 31 32 /* *32 /* 33 33 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 34 34 */ -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/MinInstanceNumberFilter.java
r86 r135 28 28 private int minInstances = 0; 29 29 30 /* *30 /* 31 31 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 32 32 */ -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/SoftwareVersion.java
r132 r135 40 40 */ 41 41 private final Instances instances; 42 42 43 43 /** 44 * Review effort per instance. 44 * Review effort per instance. 45 45 */ 46 46 private final List<Double> efforts; … … 56 56 * data of the version 57 57 */ 58 public SoftwareVersion(String project, String version, Instances instances, List<Double> efforts) { 58 public SoftwareVersion(String project, 59 String version, 60 Instances instances, 61 List<Double> efforts) 62 { 59 63 this.project = project; 60 64 this.version = version; … … 62 66 this.efforts = efforts; 63 67 } 64 68 65 69 /** 66 70 * returns the project name … … 89 93 return new Instances(instances); 90 94 } 91 95 92 96 /** 93 97 * <p> -
trunk/CrossPare/src/de/ugoe/cs/cpdp/wekaclassifier/BayesNetWrapper.java
r130 r135 40 40 * generated ID 41 41 */ 42 /** */43 42 private static final long serialVersionUID = -4835134612921456157L; 44 43 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/wekaclassifier/FixClass.java
r86 r135 23 23 24 24 /** 25 * Simple classifier that always predicts the same class 25 * Simple classifier that always predicts the same class. 26 26 * 27 27 * @author Steffen Herbold … … 29 29 public class FixClass extends AbstractClassifier { 30 30 31 /** 32 * default serialization ID 33 */ 31 34 private static final long serialVersionUID = 1L; 32 35 36 /** 37 * default prediction: non-defective 38 */ 33 39 private double fixedClassValue = 0.0d; 34 35 public FixClass() {36 // TODO Auto-generated constructor stub37 }38 40 39 41 /** … … 66 68 } 67 69 70 /* 71 * (non-Javadoc) 72 * 73 * @see weka.classifiers.AbstractClassifier#setOptions(java.lang.String[]) 74 */ 68 75 @Override 69 76 public void setOptions(String[] options) throws Exception { … … 71 78 } 72 79 80 /* 81 * (non-Javadoc) 82 * 83 * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) 84 */ 73 85 @Override 74 86 public double classifyInstance(Instance instance) { … … 76 88 } 77 89 90 /* 91 * (non-Javadoc) 92 * 93 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 94 */ 78 95 @Override 79 96 public void buildClassifier(Instances traindata) throws Exception { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/wekaclassifier/ITestAwareClassifier.java
r66 r135 1 1 2 package de.ugoe.cs.cpdp.wekaclassifier; 2 3 3 4 import weka.core.Instances; 4 5 6 /** 7 * <p> 8 * Interface for test data aware classifier implementations 9 * </p> 10 * 11 * @author Steffen Herbold 12 */ 5 13 public interface ITestAwareClassifier { 6 14 15 /** 16 * <p> 17 * passes the test data to the classifier 18 * </p> 19 * 20 * @param testdata 21 * the test data 22 */ 7 23 public void setTestdata(Instances testdata); 8 24 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/wekaclassifier/RandomClass.java
r86 r135 22 22 23 23 /** 24 * <p> 24 25 * Assigns a random class label to the instance it is evaluated on. 25 * 26 * </p> 26 27 * The range of class labels are hardcoded in fixedClassValues. This can later be extended to take 27 28 * values from the XML configuration. 29 * </p> 30 * 31 * @author Alexander Trautsch 28 32 */ 29 33 public class RandomClass extends AbstractClassifier { 30 34 35 /** 36 * default serialization id 37 */ 31 38 private static final long serialVersionUID = 1L; 32 39 40 /** 41 * class values 42 */ 33 43 private double[] fixedClassValues = 34 44 { 0.0d, 1.0d }; 35 45 46 /* 47 * (non-Javadoc) 48 * 49 * @see weka.classifiers.Classifier#buildClassifier(weka.core.Instances) 50 */ 36 51 @Override 37 52 public void buildClassifier(Instances arg0) throws Exception { … … 39 54 } 40 55 56 /* 57 * (non-Javadoc) 58 * 59 * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) 60 */ 41 61 @Override 42 62 public double classifyInstance(Instance instance) { -
trunk/CrossPare/src/de/ugoe/cs/cpdp/wekaclassifier/VCBSVM.java
r105 r135 334 334 */ 335 335 private Instances weightedResample(final Instances data, final int size) { 336 if ( data.isEmpty()) {336 if (data.isEmpty()) { 337 337 return data; 338 338 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/wekaclassifier/WHICH.java
r127 r135 268 268 score = 0; 269 269 } 270 if ( score==0) {270 if (score == 0) { 271 271 score = 0.000000001; // to disallow 0 total score 272 272 } … … 296 296 else { 297 297 String range = ranges.get(k); 298 if ( "'All'".equals(range)) {298 if ("'All'".equals(range)) { 299 299 result = true; 300 } else { 300 } 301 else { 301 302 double instanceValue = instance.value(attrIndex); 302 303 double lowerBound; … … 327 328 else { 328 329 // first value is positive 329 if( splitResult[0].substring(2, splitResult[0].length()).equals("ll'")) { 330 if (splitResult[0].substring(2, splitResult[0].length()) 331 .equals("ll'")) 332 { 330 333 System.out.println("foo"); 331 334 } 332 lowerBound = Double 333 . parseDouble(splitResult[0].substring(2, splitResult[0].length()));335 lowerBound = Double.parseDouble(splitResult[0] 336 .substring(2, splitResult[0].length())); 334 337 if (splitResult[1].startsWith("inf")) { 335 338 upperBound = Double.POSITIVE_INFINITY; … … 346 349 boolean upperBoundMatch = (range.charAt(range.length() - 2) == ')' && 347 350 instanceValue < upperBound) || 348 (range.charAt(range.length() - 2) == ']' && instanceValue <= upperBound); 351 (range.charAt(range.length() - 2) == ']' && 352 instanceValue <= upperBound); 349 353 result = lowerBoundMatch && upperBoundMatch; 350 354 } -
trunk/CrossPare/test/de/ugoe/cs/cpdp/eval/MySQLResultStorageTest.java
r71 r135 12 12 result.setSizeTestData(100); 13 13 result.setSizeTrainingData(200); 14 result.setSuccHe(0.1);15 result.setSuccZi(0.05);16 result.setSuccG75(0.2);17 result.setSuccG60(0.4);18 14 result.setError(0.2); 19 15 result.setRecall(0.8);
Note: See TracChangeset
for help on using the changeset viewer.