Changeset 139 for trunk/CrossPare
- Timestamp: 08/18/16 16:56:35 (8 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/MetricMatchingTraining.java
r138 r139 25 25 import java.util.List; 26 26 import java.util.Map; 27 import java.util.Map.Entry; 27 28 import java.util.logging.Level; 28 29 … … 30 31 31 32 import org.apache.commons.collections4.list.SetUniqueList; 32 import org.apache.commons.math3.stat.inference.ChiSquareTest;33 33 import org.apache.commons.math3.stat.correlation.SpearmansCorrelation; 34 34 import org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest; … … 44 44 45 45 /** 46 * Implements Heterogenous Defect Prediction after Nam et al. 46 * Implements Heterogenous Defect Prediction after Nam et al. 2015. 47 47 * 48 48 * We extend WekaBaseTraining because we have to Wrap the Classifier to use MetricMatching. 49 49 * This also means we can use any Weka Classifier not just LogisticRegression. 50 50 * 51 * TODO:52 * - spacing, coding conventions53 * - clean up attribute selection on train data54 * - percentile test run55 51 */ 56 52 public class MetricMatchingTraining extends WekaBaseTraining implements ISetWiseTestdataAwareTrainingStrategy { 57 53 58 private SetUniqueList<Instances> traindataSet;59 54 private MetricMatch mm = null; 60 55 private Classifier classifier = new MetricMatchingClassifier(); … … 64 59 65 60 /** 66 * We wrap the classifier here because of classifyInstance 61 * We wrap the classifier here because of classifyInstance with our MetricMatchingClassfier 67 62 * @return 68 63 */ … … 129 124 Console.traceln(Level.INFO, "[MATCHED ATTRIBUTE] source attribute: [" + this.mm.train.attribute(attmatch.getKey()).name() + "], target attribute: [" + this.mm.test.attribute(attmatch.getValue()).name() + "]"); 130 125 } 131 132 // replace traindataSet 133 //traindataSet.clear(); 134 //traindataSet.add(ilist); 126 }else { 127 Console.traceln(Level.INFO, "[NO MATCH FOUND]"); 135 128 } 136 129 137 // if we have a match we build the special classifier, if not we fall back to FixClass130 // if we have a match we build the MetricMatchingClassifier, if not we fall back to FixClass Classifier 
138 131 try { 139 132 if(this.mm != null) { … … 163 156 @Override 164 157 public void buildClassifier(Instances traindata) throws Exception { 165 this.classifier = setupClassifier(); // parent method from WekaBase158 this.classifier = setupClassifier(); 166 159 this.classifier.buildClassifier(traindata); 167 160 } … … 213 206 protected ArrayList<double[]> test_values; 214 207 215 208 216 209 public MetricMatch(Instances train, Instances test) { 217 this.train = train; 218 this.test = test; 210 // this is expensive but we need to keep the original data intact 211 this.train = this.deepCopy(train); 212 this.test = test; // we do not need a copy here because we do not drop attributes before the matching and after the matching we create a new Instances with only the matched attributes 219 213 220 214 // convert metrics of testdata and traindata to later use in similarity tests … … 330 324 } 331 325 332 326 /** 327 * Deep Copy (well, reasonably deep, not sure about header information of attributes) Weka Instances. 328 * 329 * @param data Instances 330 * @return copy of Instances passed 331 */ 332 private Instances deepCopy(Instances data) { 333 Instances newInst = new Instances(data); 334 335 newInst.clear(); 336 337 for (int i=0; i < data.size(); i++) { 338 Instance ni = new DenseInstance(data.numAttributes()); 339 for(int j = 0; j < data.numAttributes(); j++) { 340 ni.setValue(newInst.attribute(j), data.instance(i).value(data.attribute(j))); 341 } 342 newInst.add(ni); 343 } 344 345 return newInst; 346 } 347 333 348 /** 334 349 * Returns a deep copy of passed Instances data for Train or Test data. 
… … 392 407 */ 393 408 public void attributeSelection() throws Exception { 394 //Console.traceln(Level.INFO, "Attribute Selection on Training Attributes ("+this.train.numAttributes()+")"); 395 this.attributeSelection(this.train); 396 //Console.traceln(Level.INFO, "-----"); 397 //Console.traceln(Level.INFO, "Attribute Selection on Test Attributes ("+this.test.numAttributes()+")"); 398 //this.attributeSelection(this.test); 399 //Console.traceln(Level.INFO, "-----"); 400 } 401 402 403 private void attributeSelection(Instances which) throws Exception { 404 // 1. step we have to categorize the attributes 405 //http://weka.sourceforge.net/doc.packages/probabilisticSignificanceAE/weka/attributeSelection/SignificanceAttributeEval.html 406 409 410 // it is a wrapper, we may decide to implement ChiSquare or other means of selecting attributes 411 this.attributeSelectionBySignificance(this.train); 412 } 413 414 private void attributeSelectionBySignificance(Instances which) throws Exception { 415 // Uses: http://weka.sourceforge.net/doc.packages/probabilisticSignificanceAE/weka/attributeSelection/SignificanceAttributeEval.html 407 416 SignificanceAttributeEval et = new SignificanceAttributeEval(); 408 417 et.buildEvaluator(which); 409 //double tmp[] = new double[this.train.numAttributes()]; 418 419 // evaluate all training attributes 410 420 HashMap<String,Double> saeval = new HashMap<String,Double>(); 411 // evaluate all training attributes412 // select top 15% of metrics413 421 for(int i=0; i < which.numAttributes(); i++) { 414 422 if(which.classIndex() != i) { 415 423 saeval.put(which.attribute(i).name(), et.evaluateAttribute(i)); 416 424 } 417 //Console.traceln(Level.SEVERE, "Significance Attribute Eval: " + tmp);418 425 } 419 426 420 HashMap<String, Double> sorted = sortByValues(saeval); 427 // sort by significance 428 HashMap<String, Double> sorted = (HashMap<String, Double>) sortByValues(saeval); 421 429 422 // die besten 15% wollen wir haben430 // Keep the best 15% 423 
431 double last = ((double)saeval.size() / 100.0) * 15.0; 424 432 int drop_first = saeval.size() - (int)last; 425 433 426 //Console.traceln(Level.INFO, "Dropping "+ drop_first + " of " + sorted.size() + " attributes (we only keep the best 15% "+last+")");427 /*428 Iterator it = sorted.entrySet().iterator();429 while (it.hasNext()) {430 Map.Entry pair = (Map.Entry)it.next();431 Console.traceln(Level.INFO, "key: "+(int)pair.getKey()+", value: " + (double)pair.getValue() + "");432 }*/433 434 434 // drop attributes above last 435 Iterator it = sorted.entrySet().iterator();435 Iterator<Entry<String, Double>> it = sorted.entrySet().iterator(); 436 436 while (drop_first > 0) { 437 Map.Entry pair = (Map.Entry)it.next();437 Map.Entry<String, Double> pair = (Map.Entry<String, Double>)it.next(); 438 438 if(which.attribute((String)pair.getKey()).index() != which.classIndex()) { 439 440 439 which.deleteAttributeAt(which.attribute((String)pair.getKey()).index()); 441 //Console.traceln(Level.INFO, "dropping attribute: "+ (String)pair.getKey());442 440 } 443 441 drop_first-=1; 444 442 } 445 //Console.traceln(Level.INFO, "Now we have " + which.numAttributes() + " attributes left (incl. class attribute!)"); 446 } 447 448 449 private HashMap sortByValues(HashMap map) { 450 List list = new LinkedList(map.entrySet()); 451 452 Collections.sort(list, new Comparator() { 453 public int compare(Object o1, Object o2) { 454 return ((Comparable) ((Map.Entry) (o1)).getValue()) 455 .compareTo(((Map.Entry) (o2)).getValue()); 443 } 444 445 /** 446 * Helper method to sort a hashmap by its values. 
447 * 448 * @param map 449 * @return sorted map 450 */ 451 private HashMap<String, Double> sortByValues(HashMap<String, Double> map) { 452 List<Map.Entry<String, Double>> list = new LinkedList<Map.Entry<String, Double>>(map.entrySet()); 453 454 Collections.sort(list, new Comparator<Map.Entry<String, Double>>() { 455 public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) { 456 return (o1.getValue()).compareTo( o2.getValue() ); 456 457 } 457 458 }); 458 459 459 460 HashMap sortedHashMap = new LinkedHashMap(); 461 for (Iterator it = list.iterator(); it.hasNext();) { 462 Map.Entry entry = (Map.Entry) it.next(); 463 sortedHashMap.put(entry.getKey(), entry.getValue()); 464 } 460 HashMap<String, Double> sortedHashMap = new LinkedHashMap<String, Double>(); 461 for(Map.Entry<String, Double> item : list) { 462 sortedHashMap.put(item.getKey(), item.getValue()); 463 } 465 464 return sortedHashMap; 466 465 } 467 466 468 467 469 468 /** … … 553 552 554 553 /** 555 * Calculate Spearmans rank correlation coefficient as matching score 554 * Calculate Spearmans rank correlation coefficient as matching score. 556 555 * The number of instances for the source and target needs to be the same so we randomly sample from the bigger one. 
557 556 * … … 573 572 // try out possible attribute combinations 574 573 for (int i=0; i < this.train.numAttributes(); i++) { 575 576 574 for (int j=0; j < this.test.numAttributes(); j++) { 577 575 // negative infinity counts as not present, we do this so we don't have to map between attribute indexs in weka … … 591 589 this.p_sum += p; 592 590 mwbm.setWeight(i, j, p); 593 //Console.traceln(Level.INFO, "Found match: p-val: " + p);594 591 } 595 592 } 596 593 } 597 598 //Console.traceln(Level.INFO, "Found " + this.attributes.size() + " matching attributes"); 599 } 600 601 602 public void sample(Instances bigger, Instances smaller, ArrayList<double[]> values) { 594 } 595 596 /** 597 * Helper method to sample instances for the Spearman rank correlation coefficient method. 598 * 599 * @param bigger 600 * @param smaller 601 * @param values 602 */ 603 private void sample(Instances bigger, Instances smaller, ArrayList<double[]> values) { 603 604 // we want to at keep the indices we select the same 604 605 int indices_to_draw = smaller.size(); … … 646 647 647 648 KolmogorovSmirnovTest t = new KolmogorovSmirnovTest(); 648 649 //Console.traceln(Level.INFO, "Starting Kolmogorov-Smirnov test for traindata size: " + this.train.size() + " attributes("+this.train.numAttributes()+") and testdata size: " + this.test.size() + " attributes("+this.test.numAttributes()+")");650 649 for (int i=0; i < this.train.numAttributes(); i++) { 651 650 for ( int j=0; j < this.test.numAttributes(); j++) { … … 673 672 } 674 673 } 675 //Console.traceln(Level.INFO, "Found " + this.attributes.size() + " matching attributes");676 674 } 677 675 }
Note: See TracChangeset for help on using the changeset viewer.