Changeset 138 for trunk/CrossPare
- Timestamp:
- 08/17/16 19:21:07 (8 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/MetricMatchingTraining.java
r137 r138 40 40 import weka.core.Attribute; 41 41 import weka.core.DenseInstance; 42 import weka.core.FastVector;43 42 import weka.core.Instance; 44 43 import weka.core.Instances; 45 44 46 45 /** 47 * Implements Heterogenous Defect Prediction after Nam et al. 46 * Implements Heterogenous Defect Prediction after Nam et al. 2015. 47 * 48 * We extend WekaBaseTraining because we have to Wrap the Classifier to use MetricMatching. 49 * This also means we can use any Weka Classifier not just LogisticRegression. 48 50 * 49 51 * TODO: 50 52 * - spacing, coding conventions 51 * - we depend on having exactly one class attribute on multiple locations52 * - 53 * - clean up attribute selection on train data 54 * - percentile test run 53 55 */ 54 56 public class MetricMatchingTraining extends WekaBaseTraining implements ISetWiseTestdataAwareTrainingStrategy { 55 57 56 58 private SetUniqueList<Instances> traindataSet; 57 private MetricMatch mm ;58 private finalClassifier classifier = new MetricMatchingClassifier();59 private MetricMatch mm = null; 60 private Classifier classifier = new MetricMatchingClassifier(); 59 61 60 62 private String method; … … 70 72 } 71 73 72 74 /** 75 * Set similarity measure method. 76 */ 73 77 @Override 74 78 public void setMethod(String method) { … … 76 80 } 77 81 78 82 /** 83 * Set threshold for similarity measure. 84 */ 79 85 @Override 80 86 public void setThreshold(String threshold) { … … 88 94 @Override 89 95 public void apply(SetUniqueList<Instances> traindataSet, Instances testdata) { 90 this.traindataSet = traindataSet; 91 92 double score = 0; // custom ranking score to select the best training data from the set 96 97 double score = 0; // matching score to select the best matching training data from the set 93 98 int num = 0; 94 99 int biggest_num = 0; 95 100 MetricMatch tmp; 96 MetricMatch biggest = null; 97 for (Instances traindata : this.traindataSet) { 101 for (Instances traindata : traindataSet) { 98 102 num++; 99 103 … … 110 114 111 115 // we only select the training data from our set with the most matching attributes 112 if (tmp.getScore() > score ) {116 if (tmp.getScore() > score && tmp.attributes.size() > 0) { 113 117 score = tmp.getScore(); 114 biggest= tmp;118 this.mm = tmp; 115 119 biggest_num = num; 116 120 } 117 121 } 118 122 119 if (biggest == null) { 120 throw new RuntimeException("not enough matching attributes found"); 121 } 122 123 // we use the best match according to our matching score 124 this.mm = biggest; 125 Instances ilist = this.mm.getMatchedTrain(); 126 Console.traceln(Level.INFO, "Chosing the trainingdata set num "+biggest_num +" with " + score + " matching score, " + ilist.size() + " instances, and " + biggest.attributes.size() + " matched attributes out of a possible set of " + traindataSet.size() + " sets"); 127 128 for(int i = 0; i < this.mm.attributes.size(); i++) { 129 Console.traceln(Level.INFO, "Matched Attribute: " + this.mm.train.attribute(i).name() + " with " + this.mm.test.attribute((int)this.mm.attributes.get(i)).name()); 130 } 131 // replace traindataSEt 132 //traindataSet = new SetUniqueList<Instances>(); 133 traindataSet.clear(); 134 traindataSet.add(ilist); 135 136 // we have to build the classifier here: 123 // if we have found a matching instance we use it 124 Instances ilist = null; 125 if (this.mm != null) { 126 ilist = this.mm.getMatchedTrain(); 127 Console.traceln(Level.INFO, "[MATCH FOUND] match: ["+biggest_num +"], score: [" + score + "], instances: [" + ilist.size() + "], attributes: [" + this.mm.attributes.size() + "], ilist attrs: ["+ilist.numAttributes()+"]"); 128 for(Map.Entry<Integer, Integer> attmatch : this.mm.attributes.entrySet()) { 129 Console.traceln(Level.INFO, "[MATCHED ATTRIBUTE] source attribute: [" + this.mm.train.attribute(attmatch.getKey()).name() + "], target attribute: [" + this.mm.test.attribute(attmatch.getValue()).name() + "]"); 130 } 131 132 // replace traindataSet 133 //traindataSet.clear(); 134 //traindataSet.add(ilist); 135 } 136 137 // if we have a match we build the special classifier, if not we fall back to FixClass 137 138 try { 138 139 // 140 if (this.classifier == null) { 141 Console.traceln(Level.SEVERE, "Classifier is null"); 142 } 143 //Console.traceln(Level.INFO, "Building classifier with the matched training data with " + ilist.size() + " instances and "+ ilist.numAttributes() + " attributes"); 144 this.classifier.buildClassifier(ilist); 145 ((MetricMatchingClassifier) this.classifier).setMetricMatching(this.mm); 139 if(this.mm != null) { 140 this.classifier.buildClassifier(ilist); 141 ((MetricMatchingClassifier) this.classifier).setMetricMatching(this.mm); 142 }else { 143 this.classifier = new FixClass(); 144 this.classifier.buildClassifier(ilist); // this is null, but the FixClass Classifier does not use it anyway 145 } 146 146 }catch(Exception e) { 147 147 e.printStackTrace(); … … 166 166 this.classifier.buildClassifier(traindata); 167 167 } 168 168 169 /** 170 * Sets the MetricMatch instance so that we can use matched test data later. 171 * @param mm 172 */ 169 173 public void setMetricMatching(MetricMatch mm) { 170 174 this.mm = mm; … … 172 176 173 177 /** 174 * Here we can not do the metric matching because we only get one instance 178 * Here we can not do the metric matching because we only get one instance. 179 * Therefore we need a MetricMatch instance beforehand to use here. 175 180 */ 176 181 public double classifyInstance(Instance testdata) { 177 // todo: maybe we can pull the instance out of our matched testdata?182 // get a copy of testdata Instance with only the matched attributes 178 183 Instance ntest = this.mm.getMatchedTestInstance(testdata); 179 184 … … 191 196 192 197 /** 193 * Encapsulates MetricMatching on Instances Arrays 198 * Encapsulates one MetricMatching process. 199 * One source (train) matches against one target (test). 194 200 */ 195 201 public class MetricMatch { … … 198 204 199 205 // used to sum up the matching values of all attributes 200 double p_sum = 0;206 protected double p_sum = 0; 201 207 202 208 // attribute matching, train -> test 203 209 HashMap<Integer, Integer> attributes = new HashMap<Integer,Integer>(); 204 //double[][] weights; /* weight matrix, needed to find maximum weighted bipartite matching */ 205 206 ArrayList<double[]> train_values; 207 ArrayList<double[]> test_values; 208 209 // todo: this constructor does not work 210 public MetricMatch() { 211 } 210 211 // used for similarity tests 212 protected ArrayList<double[]> train_values; 213 protected ArrayList<double[]> test_values; 214 212 215 213 216 public MetricMatch(Instances train, Instances test) { 214 // expensive! but we are dropping the attributes so we have to copy all of the data 215 this.train = new Instances(train); 216 this.test = new Instances(test); 217 218 // 1. convert metrics of testdata and traindata to later use in test 217 this.train = train; 218 this.test = test; 219 220 // convert metrics of testdata and traindata to later use in similarity tests 219 221 this.train_values = new ArrayList<double[]>(); 220 for (int i = 0; i < this.train.numAttributes()-1; i++) { 221 this.train_values.add(train.attributeToDoubleArray(i)); 222 for (int i = 0; i < this.train.numAttributes(); i++) { 223 if(this.train.classIndex() != i) { 224 this.train_values.add(this.train.attributeToDoubleArray(i)); 225 } 222 226 } 223 227 224 228 this.test_values = new ArrayList<double[]>(); 225 for (int i=0; i < this.test.numAttributes()-1; i++) { 226 this.test_values.add(this.test.attributeToDoubleArray(i)); 229 for (int i=0; i < this.test.numAttributes(); i++) { 230 if(this.test.classIndex() != i) { 231 this.test_values.add(this.test.attributeToDoubleArray(i)); 232 } 227 233 } 228 234 } … … 235 241 */ 236 242 public double getScore() { 237 int as = this.attributes.size(); 243 int as = this.attributes.size(); // # of attributes that were matched 238 244 239 245 // we use thresholding ranking approach for numInstances to influence the matching score … … 250 256 } 251 257 252 253 254 258 public HashMap<Integer, Integer> getAttributes() { 259 return this.attributes; 260 } 255 261 256 public int getNumInstances() { 257 return this.train_values.get(0).length; 258 } 262 public int getNumInstances() { 263 return this.train_values.get(0).length; 264 } 265 266 /** 267 * This creates a new Instance out of the passed Instance and the previously matched attributes. 268 * We do this because the evaluation phase requires an original Instance with every attribute. 269 * 270 * @param test instance 271 * @return new instance 272 */ 273 public Instance getMatchedTestInstance(Instance test) { 274 //create new instance with our matched number of attributes + 1 (the class attribute) 275 Instances testdata = this.getMatchedTest(); 276 277 Instance ni = new DenseInstance(this.attributes.size()+1); 278 ni.setDataset(testdata); 279 280 for(Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) { 281 ni.setValue(testdata.attribute(attmatch.getKey()), test.value(attmatch.getValue())); 282 } 283 284 ni.setClassValue(test.value(test.classAttribute())); 285 286 return ni; 287 } 288 289 /** 290 * returns a new instances array with the metric matched training data 291 * 292 * @return instances 293 */ 294 public Instances getMatchedTrain() { 295 return this.getMatchedInstances("train", this.train); 296 } 259 297 260 public Instance getMatchedTestInstance(Instance test) { 261 // create new instance with our matched number of attributes + 1 (the class attribute) 262 //Console.traceln(Level.INFO, "getting matched instance"); 263 Instances testdata = this.getMatchedTest(); 264 265 //Instance ni = new DenseInstance(this.attmatch.size()+1); 266 Instance ni = new DenseInstance(this.attributes.size()+1); 267 ni.setDataset(testdata); 268 269 //Console.traceln(Level.INFO, "Attributes to match: " + this.attmatch.size() + ""); 270 271 Iterator it = this.attributes.entrySet().iterator(); 272 int j = 0; 273 while (it.hasNext()) { 274 Map.Entry values = (Map.Entry)it.next(); 275 ni.setValue(testdata.attribute(j), test.value((int)values.getValue())); 276 j++; 277 278 } 279 280 ni.setClassValue(test.value(test.classAttribute())); 281 282 return ni; 283 } 284 285 /** 286 * returns a new instances array with the metric matched training data 287 * 288 * @return instances 289 */ 290 public Instances getMatchedTrain() { 291 return this.getMatchedInstances("train", this.train); 292 } 293 294 /** 295 * returns a new instances array with the metric matched test data 296 * 297 * @return instances 298 */ 299 public Instances getMatchedTest() { 300 return this.getMatchedInstances("test", this.test); 301 } 302 303 // todo: there must be a better way 304 // https://weka.wikispaces.com/Programmatic+Use 305 private Instances getMatchedInstances(String name, Instances data) { 306 //Console.traceln(Level.INFO, "Constructing instances from: " + name); 307 // construct our new attributes 308 Attribute[] attrs = new Attribute[this.attributes.size()+1]; 309 FastVector fwTrain = new FastVector(this.attributes.size()); 310 for (int i=0; i < this.attributes.size(); i++) { 311 attrs[i] = new Attribute(String.valueOf(i)); 312 fwTrain.addElement(attrs[i]); 313 } 314 // add our ClassAttribute (which is not numeric!) 315 ArrayList<String> acl= new ArrayList<String>(); 316 acl.add("0"); 317 acl.add("1"); 318 319 fwTrain.addElement(new Attribute("bug", acl)); 320 Instances newTrain = new Instances(name, fwTrain, data.size()); 321 newTrain.setClassIndex(newTrain.numAttributes()-1); 322 323 //Console.traceln(Level.INFO, "data attributes: " + data.numAttributes() + ", this.attributes: "+this.attributes.size()); 324 325 for (int i=0; i < data.size(); i++) { 326 Instance ni = new DenseInstance(this.attributes.size()+1); 327 328 Iterator it = this.attributes.entrySet().iterator(); 329 int j = 0; 330 while (it.hasNext()) { 331 Map.Entry values = (Map.Entry)it.next(); 332 int value = (int)values.getValue(); 333 334 // key ist traindata 335 if (name.equals("train")) { 336 value = (int)values.getKey(); 337 } 338 //Console.traceln(Level.INFO, "setting attribute " + j + " with data from instance: " + i); 339 ni.setValue(newTrain.attribute(j), data.instance(i).value(value)); 340 j++; 341 } 342 ni.setValue(ni.numAttributes()-1, data.instance(i).value(data.classAttribute())); 343 344 newTrain.add(ni); 345 } 346 347 return newTrain; 348 } 349 298 /** 299 * returns a new instances array with the metric matched test data 300 * 301 * @return instances 302 */ 303 public Instances getMatchedTest() { 304 return this.getMatchedInstances("test", this.test); 305 } 306 307 /** 308 * We could drop unmatched attributes from our instances datasets. 309 * Alas, that would not be nice for the following postprocessing jobs and would not work at all for evaluation. 310 * We keep this as a warning for future generations. 311 * 312 * @param name 313 * @param data 314 */ 315 @SuppressWarnings("unused") 316 private void dropUnmatched(String name, Instances data) { 317 for(int i = 0; i < data.numAttributes(); i++) { 318 if(data.classIndex() == i) { 319 continue; 320 } 321 322 if(name.equals("train") && !this.attributes.containsKey(i)) { 323 data.deleteAttributeAt(i); 324 } 325 326 if(name.equals("test") && !this.attributes.containsValue(i)) { 327 data.deleteAttributeAt(i); 328 } 329 } 330 } 331 332 333 /** 334 * Returns a deep copy of passed Instances data for Train or Test data. 335 * It only keeps attributes that have been matched. 336 * 337 * @param name 338 * @param data 339 * @return matched Instances 340 */ 341 private Instances getMatchedInstances(String name, Instances data) { 342 ArrayList<Attribute> attrs = new ArrayList<Attribute>(); 343 344 // bug attr is a string, really! 345 ArrayList<String> bug = new ArrayList<String>(); 346 bug.add("0"); 347 bug.add("1"); 348 349 // add our matched attributes and last the bug 350 for(Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) { 351 attrs.add(new Attribute(String.valueOf(attmatch.getValue()))); 352 } 353 attrs.add(new Attribute("bug", bug)); 354 355 // create new instances object of the same size (at least for instances) 356 Instances newInst = new Instances(name, attrs, data.size()); 357 358 // set last as class 359 newInst.setClassIndex(newInst.numAttributes()-1); 360 361 // copy data for matched attributes, this depends if we return train or test data 362 for (int i=0; i < data.size(); i++) { 363 Instance ni = new DenseInstance(this.attributes.size()+1); 364 365 int j = 0; // new indices! 366 for(Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) { 367 368 // test attribute match 369 int value = attmatch.getValue(); 370 371 // train attribute match 372 if(name.equals("train")) { 373 value = attmatch.getKey(); 374 } 375 376 ni.setValue(newInst.attribute(j), data.instance(i).value(value)); 377 j++; 378 } 379 ni.setValue(ni.numAttributes()-1, data.instance(i).value(data.classAttribute())); 380 newInst.add(ni); 381 } 382 383 return newInst; 384 } 350 385 351 386 /** … … 364 399 //Console.traceln(Level.INFO, "-----"); 365 400 } 401 366 402 367 403 private void attributeSelection(Instances which) throws Exception { … … 431 467 432 468 433 469 /** 470 * Executes the similarity matching between train and test data. 471 * 472 * After this function is finished we have this.attributes with the correct matching between train and test data attributes. 473 * 474 * @param type 475 * @param cutoff 476 */ 434 477 public void matchAttributes(String type, double cutoff) { 435 478 436 437 479 MWBMatchingAlgorithm mwbm = new MWBMatchingAlgorithm(this.train.numAttributes(), this.test.numAttributes()); 438 480 … … 447 489 } 448 490 449 // resulting maximal match 491 // resulting maximal match gets assigned to this.attributes 450 492 int[] result = mwbm.getMatching(); 451 493 for( int i = 0; i < result.length; i++) { … … 468 510 for( int i = 0; i < this.train.numAttributes(); i++ ) { 469 511 for( int j = 0; j < this.test.numAttributes(); j++ ) { 470 // negative infinity counts as not present, we do this so we don't have to map between attribute index s in weka512 // negative infinity counts as not present, we do this so we don't have to map between attribute indexes in weka 471 513 // and the result of the mwbm computation 472 514 mwbm.setWeight(i, j, Double.NEGATIVE_INFINITY); … … 510 552 } 511 553 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 554 /** 555 * Calculate Spearmans rank correlation coefficient as matching score 556 * The number of instances for the source and target needs to be the same so we randomly sample from the bigger one. 557 * 558 * @param cutoff 559 * @param mwbmatching 560 */ 561 public void spearmansRankCorrelation(double cutoff, MWBMatchingAlgorithm mwbm) { 562 double p = 0; 563 564 SpearmansCorrelation t = new SpearmansCorrelation(); 565 566 // size has to be the same so we randomly sample the number of the smaller sample from the big sample 567 if (this.train.size() > this.test.size()) { 568 this.sample(this.train, this.test, this.train_values); 569 }else if (this.test.size() > this.train.size()) { 570 this.sample(this.test, this.train, this.test_values); 571 } 530 572 531 573 // try out possible attribute combinations … … 622 664 // this may invoke exactP on small sample sizes which will not terminate in all cases 623 665 //p = t.kolmogorovSmirnovTest(this.train_values.get(i), this.test_values.get(j), false); 666 667 // this uses approximateP everytime 624 668 p = t.approximateP(t.kolmogorovSmirnovStatistic(this.train_values.get(i), this.test_values.get(j)), this.train_values.get(i).length, this.test_values.get(j).length); 625 669 if (p > cutoff) {
Note: See TracChangeset
for help on using the changeset viewer.