Changeset 141 for trunk/CrossPare/src
- Timestamp:
- 08/31/16 10:42:32 (8 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/MetricMatchingTraining.java
r140 r141 46 46 * Implements Heterogenous Defect Prediction after Nam et al. 2015. 47 47 * 48 * We extend WekaBaseTraining because we have to Wrap the Classifier to use MetricMatching. 49 * Thisalso means we can use any Weka Classifier not just LogisticRegression.48 * We extend WekaBaseTraining because we have to Wrap the Classifier to use MetricMatching. This 49 * also means we can use any Weka Classifier not just LogisticRegression. 50 50 * 51 * Config: <setwisetestdataawaretrainer name="MetricMatchingTraining" param="Logistic weka.classifiers.functions.Logistic" threshold="0.05" method="spearman"/> 52 * Instead of spearman metchod it also takes ks, percentile. 53 * Instead of Logistic every other weka classifier can be chosen. 51 * Config: <setwisetestdataawaretrainer name="MetricMatchingTraining" param= 52 * "Logistic weka.classifiers.functions.Logistic" threshold="0.05" method="spearman"/> Instead of 53 * spearman metchod it also takes ks, percentile. Instead of Logistic every other weka classifier 54 * can be chosen. 54 55 * 55 * Future work: 56 * implement chisquare test in addition to significance for attribute selection 57 * http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/stat/inference/ChiSquareTest.html 58 * use chiSquareTestDataSetsComparison 56 * Future work: implement chisquare test in addition to significance for attribute selection 57 * http://commons.apache.org/proper/commons-math/apidocs/org/apache/commons/math3/stat/inference/ 58 * ChiSquareTest.html use chiSquareTestDataSetsComparison 59 59 */ 60 public class MetricMatchingTraining extends WekaBaseTraining implements ISetWiseTestdataAwareTrainingStrategy { 60 public class MetricMatchingTraining extends WekaBaseTraining 61 implements ISetWiseTestdataAwareTrainingStrategy 62 { 61 63 62 64 private MetricMatch mm = null; 63 65 private Classifier classifier = null; 64 66 65 67 private String method; 66 68 private float threshold; 67 69 68 70 /** 69 71 * We wrap the classifier here because of classifyInstance with our MetricMatchingClassfier 72 * 70 73 * @return 71 74 */ … … 91 94 } 92 95 93 /** 94 * We need the test data instances to do a metric matching, so in this special case we get this data 95 * before evaluation. 96 */ 97 @Override 98 public void apply(SetUniqueList<Instances> traindataSet, Instances testdata) { 99 // reset these for each run 100 this.mm = null; 101 this.classifier = null; 102 103 double score = 0; // matching score to select the best matching training data from the set 104 int num = 0; 105 int biggest_num = 0; 106 MetricMatch tmp; 107 for (Instances traindata : traindataSet) { 108 num++; 109 110 tmp = new MetricMatch(traindata, testdata); 111 112 // metric selection may create error, continue to next training set 113 try { 114 tmp.attributeSelection(); 115 tmp.matchAttributes(this.method, this.threshold); 116 }catch(Exception e) { 117 e.printStackTrace(); 118 throw new RuntimeException(e); 119 } 120 121 // we only select the training data from our set with the most matching attributes 122 if (tmp.getScore() > score && tmp.attributes.size() > 0) { 123 score = tmp.getScore(); 124 this.mm = tmp; 125 biggest_num = num; 126 } 127 } 128 129 // if we have found a matching instance we use it, log information about the match for additional eval later 130 Instances ilist = null; 131 if (this.mm != null) { 132 ilist = this.mm.getMatchedTrain(); 133 Console.traceln(Level.INFO, "[MATCH FOUND] match: ["+biggest_num +"], score: [" + score + "], instances: [" + ilist.size() + "], attributes: [" + this.mm.attributes.size() + "], ilist attrs: ["+ilist.numAttributes()+"]"); 134 for(Map.Entry<Integer, Integer> attmatch : this.mm.attributes.entrySet()) { 135 Console.traceln(Level.INFO, "[MATCHED ATTRIBUTE] source attribute: [" + this.mm.train.attribute(attmatch.getKey()).name() + "], target attribute: [" + this.mm.test.attribute(attmatch.getValue()).name() + "]"); 136 } 137 }else { 138 Console.traceln(Level.INFO, "[NO MATCH FOUND]"); 139 } 140 141 // if we have a match we build the MetricMatchingClassifier, if not we fall back to FixClass Classifier 142 try { 143 if(this.mm != null) { 144 this.classifier = new MetricMatchingClassifier(); 145 this.classifier.buildClassifier(ilist); 146 ((MetricMatchingClassifier) this.classifier).setMetricMatching(this.mm); 147 }else { 148 this.classifier = new FixClass(); 149 this.classifier.buildClassifier(ilist); // this is null, but the FixClass Classifier does not use it anyway 150 } 151 }catch(Exception e) { 152 e.printStackTrace(); 153 throw new RuntimeException(e); 154 } 155 } 156 157 158 /** 159 * Encapsulates the classifier configured with WekaBase within but use metric matching. 160 * This allows us to use any Weka classifier with Heterogenous Defect Prediction. 161 */ 162 public class MetricMatchingClassifier extends AbstractClassifier { 163 164 private static final long serialVersionUID = -1342172153473770935L; 165 private MetricMatch mm; 166 private Classifier classifier; 167 168 @Override 169 public void buildClassifier(Instances traindata) throws Exception { 170 this.classifier = setupClassifier(); 171 this.classifier.buildClassifier(traindata); 172 } 173 174 /** 175 * Sets the MetricMatch instance so that we can use matched test data later. 176 * @param mm 177 */ 178 public void setMetricMatching(MetricMatch mm) { 179 this.mm = mm; 180 } 181 182 /** 183 * Here we can not do the metric matching because we only get one instance. 184 * Therefore we need a MetricMatch instance beforehand to use here. 185 */ 186 public double classifyInstance(Instance testdata) { 187 // get a copy of testdata Instance with only the matched attributes 188 Instance ntest = this.mm.getMatchedTestInstance(testdata); 189 190 double ret = 0.0; 191 try { 192 ret = this.classifier.classifyInstance(ntest); 193 }catch(Exception e) { 194 e.printStackTrace(); 195 throw new RuntimeException(e); 196 } 197 198 return ret; 199 } 200 } 201 202 /** 203 * Encapsulates one MetricMatching process. 204 * One source (train) matches against one target (test). 205 */ 96 /** 97 * We need the test data instances to do a metric matching, so in this special case we get this 98 * data before evaluation. 99 */ 100 @Override 101 public void apply(SetUniqueList<Instances> traindataSet, Instances testdata) { 102 // reset these for each run 103 this.mm = null; 104 this.classifier = null; 105 106 double score = 0; // matching score to select the best matching training data from the set 107 int num = 0; 108 int biggest_num = 0; 109 MetricMatch tmp; 110 for (Instances traindata : traindataSet) { 111 num++; 112 113 tmp = new MetricMatch(traindata, testdata); 114 115 // metric selection may create error, continue to next training set 116 try { 117 tmp.attributeSelection(); 118 tmp.matchAttributes(this.method, this.threshold); 119 } 120 catch (Exception e) { 121 e.printStackTrace(); 122 throw new RuntimeException(e); 123 } 124 125 // we only select the training data from our set with the most matching attributes 126 if (tmp.getScore() > score && tmp.attributes.size() > 0) { 127 score = tmp.getScore(); 128 this.mm = tmp; 129 biggest_num = num; 130 } 131 } 132 133 // if we have found a matching instance we use it, log information about the match for 134 // additional eval later 135 Instances ilist = null; 136 if (this.mm != null) { 137 ilist = this.mm.getMatchedTrain(); 138 Console.traceln(Level.INFO, "[MATCH FOUND] match: [" + biggest_num + "], score: [" + 139 score + "], instances: [" + ilist.size() + "], attributes: [" + 140 this.mm.attributes.size() + "], ilist attrs: [" + ilist.numAttributes() + "]"); 141 for (Map.Entry<Integer, Integer> attmatch : this.mm.attributes.entrySet()) { 142 Console.traceln(Level.INFO, "[MATCHED ATTRIBUTE] source attribute: [" + 143 this.mm.train.attribute(attmatch.getKey()).name() + "], target attribute: [" + 144 this.mm.test.attribute(attmatch.getValue()).name() + "]"); 145 } 146 } 147 else { 148 Console.traceln(Level.INFO, "[NO MATCH FOUND]"); 149 } 150 151 // if we have a match we build the MetricMatchingClassifier, if not we fall back to FixClass 152 // Classifier 153 try { 154 if (this.mm != null) { 155 this.classifier = new MetricMatchingClassifier(); 156 this.classifier.buildClassifier(ilist); 157 ((MetricMatchingClassifier) this.classifier).setMetricMatching(this.mm); 158 } 159 else { 160 this.classifier = new FixClass(); 161 this.classifier.buildClassifier(ilist); // this is null, but the FixClass Classifier 162 // does not use it anyway 163 } 164 } 165 catch (Exception e) { 166 e.printStackTrace(); 167 throw new RuntimeException(e); 168 } 169 } 170 171 /** 172 * Encapsulates the classifier configured with WekaBase within but use metric matching. This 173 * allows us to use any Weka classifier with Heterogenous Defect Prediction. 174 */ 175 public class MetricMatchingClassifier extends AbstractClassifier { 176 177 private static final long serialVersionUID = -1342172153473770935L; 178 private MetricMatch mm; 179 private Classifier classifier; 180 181 @Override 182 public void buildClassifier(Instances traindata) throws Exception { 183 this.classifier = setupClassifier(); 184 this.classifier.buildClassifier(traindata); 185 } 186 187 /** 188 * Sets the MetricMatch instance so that we can use matched test data later. 189 * 190 * @param mm 191 */ 192 public void setMetricMatching(MetricMatch mm) { 193 this.mm = mm; 194 } 195 196 /** 197 * Here we can not do the metric matching because we only get one instance. Therefore we 198 * need a MetricMatch instance beforehand to use here. 199 */ 200 public double classifyInstance(Instance testdata) { 201 // get a copy of testdata Instance with only the matched attributes 202 Instance ntest = this.mm.getMatchedTestInstance(testdata); 203 204 double ret = 0.0; 205 try { 206 ret = this.classifier.classifyInstance(ntest); 207 } 208 catch (Exception e) { 209 e.printStackTrace(); 210 throw new RuntimeException(e); 211 } 212 213 return ret; 214 } 215 } 216 217 /** 218 * Encapsulates one MetricMatching process. One source (train) matches against one target 219 * (test). 220 */ 206 221 public class MetricMatch { 207 208 209 210 211 212 213 214 HashMap<Integer, Integer> attributes = new HashMap<Integer,Integer>();215 216 217 218 219 220 221 public MetricMatch(Instances train, Instances test) { 222 // this is expensive but we need to keep the original data intact 223 this.train = this.deepCopy(train); 224 this.test = test; // we do not need a copy here because we do not drop attributes before the matching and after the matching we create a new Instances with only the matched attributes 225 226 // convert metrics of testdata and traindata to later use in similarity tests 227 this.train_values = new ArrayList<double[]>(); 228 for (int i = 0; i < this.train.numAttributes(); i++) { 229 if(this.train.classIndex() != i) {230 this.train_values.add(this.train.attributeToDoubleArray(i)); 231 } 232 233 234 this.test_values = new ArrayList<double[]>(); 235 for (int i=0; i < this.test.numAttributes(); i++) { 236 if(this.test.classIndex() != i) {237 this.test_values.add(this.test.attributeToDoubleArray(i)); 238 } 239 240 241 242 /** 243 * We have a lot of matching possibilities. 244 *Here we try to determine the best one.245 246 247 248 249 int as = this.attributes.size();// # of attributes that were matched250 251 252 253 254 if(instances > 100) {255 256 257 if(instances > 500) {222 Instances train; 223 Instances test; 224 225 // used to sum up the matching values of all attributes 226 protected double p_sum = 0; 227 228 // attribute matching, train -> test 229 HashMap<Integer, Integer> attributes = new HashMap<Integer, Integer>(); 230 231 // used for similarity tests 232 protected ArrayList<double[]> train_values; 233 protected ArrayList<double[]> test_values; 234 235 public MetricMatch(Instances train, Instances test) { 236 // this is expensive but we need to keep the original data intact 237 this.train = this.deepCopy(train); 238 this.test = test; // we do not need a copy here because we do not drop attributes before 239 // the matching and after the matching we create a new Instances with 240 // only the matched attributes 241 242 // convert metrics of testdata and traindata to later use in similarity tests 243 this.train_values = new ArrayList<double[]>(); 244 for (int i = 0; i < this.train.numAttributes(); i++) { 245 if (this.train.classIndex() != i) { 246 this.train_values.add(this.train.attributeToDoubleArray(i)); 247 } 248 } 249 250 this.test_values = new ArrayList<double[]>(); 251 for (int i = 0; i < this.test.numAttributes(); i++) { 252 if (this.test.classIndex() != i) { 253 this.test_values.add(this.test.attributeToDoubleArray(i)); 254 } 255 } 256 } 257 258 /** 259 * We have a lot of matching possibilities. Here we try to determine the best one. 260 * 261 * @return double matching score 262 */ 263 public double getScore() { 264 int as = this.attributes.size(); // # of attributes that were matched 265 266 // we use thresholding ranking approach for numInstances to influence the matching score 267 int instances = this.train.numInstances(); 268 int inst_rank = 0; 269 if (instances > 100) { 270 inst_rank = 1; 271 } 272 if (instances > 500) { 258 273 inst_rank = 2; 259 274 } 260 261 262 263 264 265 266 267 268 269 270 271 272 273 /** 274 * The test instance must be of the same dataset as the train data, otherwise WekaEvaluation will die. 275 * This means we have to force the dataset of this.train (after matching) and only 276 * set the values for the attributes we matched but with the index of the traindataattributes we matched.277 278 279 280 281 282 Instance ni = new DenseInstance(this.attributes.size() +1);283 275 276 return this.p_sum + as + inst_rank; 277 } 278 279 public HashMap<Integer, Integer> getAttributes() { 280 return this.attributes; 281 } 282 283 public int getNumInstances() { 284 return this.train_values.get(0).length; 285 } 286 287 /** 288 * The test instance must be of the same dataset as the train data, otherwise WekaEvaluation 289 * will die. This means we have to force the dataset of this.train (after matching) and only 290 * set the values for the attributes we matched but with the index of the traindata 291 * attributes we matched. 292 * 293 * @param test 294 * @return 295 */ 296 public Instance getMatchedTestInstance(Instance test) { 297 Instance ni = new DenseInstance(this.attributes.size() + 1); 298 284 299 Instances inst = this.getMatchedTrain(); 285 300 286 301 ni.setDataset(inst); 287 302 288 303 // assign only the matched attributes to new indexes 289 304 double val; 290 305 int k = 0; 291 for (Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) {306 for (Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) { 292 307 // get value from matched attribute 293 308 val = test.value(attmatch.getValue()); 294 309 295 310 // set it to new index, the order of the attributes is the same 296 311 ni.setValue(k, val); … … 300 315 301 316 return ni; 302 } 303 304 317 } 318 305 319 /** 306 320 * returns a new instances array with the metric matched training data … … 308 322 * @return instances 309 323 */ 310 311 312 313 314 /** 315 316 317 318 319 320 321 322 323 324 * We could drop unmatched attributes from our instances datasets. 325 * Alas, that would not be nice for the following postprocessing jobs and would not work at all for evaluation. 326 * Wekeep this as a warning for future generations.327 328 329 330 331 324 public Instances getMatchedTrain() { 325 return this.getMatchedInstances("train", this.train); 326 } 327 328 /** 329 * returns a new instances array with the metric matched test data 330 * 331 * @return instances 332 */ 333 public Instances getMatchedTest() { 334 return this.getMatchedInstances("test", this.test); 335 } 336 337 /** 338 * We could drop unmatched attributes from our instances datasets. Alas, that would not be 339 * nice for the following postprocessing jobs and would not work at all for evaluation. We 340 * keep this as a warning for future generations. 341 * 342 * @param name 343 * @param data 344 */ 345 @SuppressWarnings("unused") 332 346 private void dropUnmatched(String name, Instances data) { 333 for(int i = 0; i < data.numAttributes(); i++) { 334 if(data.classIndex() == i) { 335 continue; 336 } 337 338 if(name.equals("train") && !this.attributes.containsKey(i)) { 339 data.deleteAttributeAt(i); 340 } 341 342 if(name.equals("test") && !this.attributes.containsValue(i)) { 343 data.deleteAttributeAt(i); 344 } 345 } 346 } 347 348 /** 349 * Deep Copy (well, reasonably deep, not sure about header information of attributes) Weka Instances. 350 * 351 * @param data Instances 347 for (int i = 0; i < data.numAttributes(); i++) { 348 if (data.classIndex() == i) { 349 continue; 350 } 351 352 if (name.equals("train") && !this.attributes.containsKey(i)) { 353 data.deleteAttributeAt(i); 354 } 355 356 if (name.equals("test") && !this.attributes.containsValue(i)) { 357 data.deleteAttributeAt(i); 358 } 359 } 360 } 361 362 /** 363 * Deep Copy (well, reasonably deep, not sure about header information of attributes) Weka 364 * Instances. 365 * 366 * @param data 367 * Instances 352 368 * @return copy of Instances passed 353 369 */ 354 355 356 357 358 359 for (int i =0; i < data.size(); i++) {370 private Instances deepCopy(Instances data) { 371 Instances newInst = new Instances(data); 372 373 newInst.clear(); 374 375 for (int i = 0; i < data.size(); i++) { 360 376 Instance ni = new DenseInstance(data.numAttributes()); 361 for (int j = 0; j < data.numAttributes(); j++) {377 for (int j = 0; j < data.numAttributes(); j++) { 362 378 ni.setValue(newInst.attribute(j), data.instance(i).value(data.attribute(j))); 363 379 } 364 380 newInst.add(ni); 365 381 } 366 382 367 383 return newInst; 368 369 370 /** 371 * Returns a deep copy of passed Instances data for Train or Test data. 372 * It only keepsattributes that have been matched.384 } 385 386 /** 387 * Returns a deep copy of passed Instances data for Train or Test data. It only keeps 388 * attributes that have been matched. 373 389 * 374 390 * @param name … … 376 392 * @return matched Instances 377 393 */ 378 379 380 381 382 394 private Instances getMatchedInstances(String name, Instances data) { 395 ArrayList<Attribute> attrs = new ArrayList<Attribute>(); 396 397 // bug attr is a string, really! 398 ArrayList<String> bug = new ArrayList<String>(); 383 399 bug.add("0"); 384 400 bug.add("1"); 385 401 386 402 // add our matched attributes and last the bug 387 for(Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) {388 389 390 391 392 393 394 395 396 newInst.setClassIndex(newInst.numAttributes()-1);397 398 399 for (int i =0; i < data.size(); i++) {400 Instance ni = new DenseInstance(this.attributes.size() +1);401 403 for (Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) { 404 attrs.add(new Attribute(String.valueOf(attmatch.getValue()))); 405 } 406 attrs.add(new Attribute("bug", bug)); 407 408 // create new instances object of the same size (at least for instances) 409 Instances newInst = new Instances(name, attrs, data.size()); 410 411 // set last as class 412 newInst.setClassIndex(newInst.numAttributes() - 1); 413 414 // copy data for matched attributes, this depends if we return train or test data 415 for (int i = 0; i < data.size(); i++) { 416 Instance ni = new DenseInstance(this.attributes.size() + 1); 417 402 418 int j = 0; // new indices! 403 for (Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) {404 419 for (Map.Entry<Integer, Integer> attmatch : this.attributes.entrySet()) { 420 405 421 // test attribute match 406 422 int value = attmatch.getValue(); 407 423 408 424 // train attribute match 409 if (name.equals("train")) {425 if (name.equals("train")) { 410 426 value = attmatch.getKey(); 411 427 } 412 428 413 429 ni.setValue(newInst.attribute(j), data.instance(i).value(value)); 414 430 j++; 415 431 } 416 ni.setValue(ni.numAttributes() -1, data.instance(i).value(data.classAttribute()));432 ni.setValue(ni.numAttributes() - 1, data.instance(i).value(data.classAttribute())); 417 433 newInst.add(ni); 418 434 } 419 435 420 436 return newInst; 421 } 422 423 /** 424 * performs the attribute selection 425 * we perform attribute significance tests and drop attributes 426 * 427 * attribute selection is only performed on the source dataset 428 * we retain the top 15% attributes (if 15% is a float we just use the integer part) 429 */ 430 public void attributeSelection() throws Exception { 431 432 // it is a wrapper, we may decide to implement ChiSquare or other means of selecting attributes 433 this.attributeSelectionBySignificance(this.train); 434 } 435 436 private void attributeSelectionBySignificance(Instances which) throws Exception { 437 // Uses: http://weka.sourceforge.net/doc.packages/probabilisticSignificanceAE/weka/attributeSelection/SignificanceAttributeEval.html 438 SignificanceAttributeEval et = new SignificanceAttributeEval(); 439 et.buildEvaluator(which); 440 441 // evaluate all training attributes 442 HashMap<String,Double> saeval = new HashMap<String,Double>(); 443 for(int i=0; i < which.numAttributes(); i++) { 444 if(which.classIndex() != i) { 445 saeval.put(which.attribute(i).name(), et.evaluateAttribute(i)); 446 } 447 } 448 449 // sort by significance 450 HashMap<String, Double> sorted = (HashMap<String, Double>) sortByValues(saeval); 451 452 // Keep the best 15% 453 double last = ((double)saeval.size() / 100.0) * 15.0; 454 int drop_first = saeval.size() - (int)last; 455 456 // drop attributes above last 457 Iterator<Entry<String, Double>> it = sorted.entrySet().iterator(); 458 while (drop_first > 0) { 459 Map.Entry<String, Double> pair = (Map.Entry<String, Double>)it.next(); 460 if(which.attribute((String)pair.getKey()).index() != which.classIndex()) { 461 which.deleteAttributeAt(which.attribute((String)pair.getKey()).index()); 462 } 463 drop_first-=1; 464 } 465 } 466 467 /** 468 * Helper method to sort a hashmap by its values. 469 * 470 * @param map 471 * @return sorted map 472 */ 473 private HashMap<String, Double> sortByValues(HashMap<String, Double> map) { 474 List<Map.Entry<String, Double>> list = new LinkedList<Map.Entry<String, Double>>(map.entrySet()); 475 476 Collections.sort(list, new Comparator<Map.Entry<String, Double>>() { 477 public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) { 478 return (o1.getValue()).compareTo( o2.getValue() ); 479 } 480 }); 481 482 HashMap<String, Double> sortedHashMap = new LinkedHashMap<String, Double>(); 483 for(Map.Entry<String, Double> item : list) { 484 sortedHashMap.put(item.getKey(), item.getValue()); 485 } 486 return sortedHashMap; 487 } 488 489 437 } 438 439 /** 440 * performs the attribute selection we perform attribute significance tests and drop 441 * attributes 442 * 443 * attribute selection is only performed on the source dataset we retain the top 15% 444 * attributes (if 15% is a float we just use the integer part) 445 */ 446 public void attributeSelection() throws Exception { 447 448 // it is a wrapper, we may decide to implement ChiSquare or other means of selecting 449 // attributes 450 this.attributeSelectionBySignificance(this.train); 451 } 452 453 private void attributeSelectionBySignificance(Instances which) throws Exception { 454 // Uses: 455 // http://weka.sourceforge.net/doc.packages/probabilisticSignificanceAE/weka/attributeSelection/SignificanceAttributeEval.html 456 SignificanceAttributeEval et = new SignificanceAttributeEval(); 457 et.buildEvaluator(which); 458 459 // evaluate all training attributes 460 HashMap<String, Double> saeval = new HashMap<String, Double>(); 461 for (int i = 0; i < which.numAttributes(); i++) { 462 if (which.classIndex() != i) { 463 saeval.put(which.attribute(i).name(), et.evaluateAttribute(i)); 464 } 465 } 466 467 // sort by significance 468 HashMap<String, Double> sorted = (HashMap<String, Double>) sortByValues(saeval); 469 470 // Keep the best 15% 471 double last = ((double) saeval.size() / 100.0) * 15.0; 472 int drop_first = saeval.size() - (int) last; 473 474 // drop attributes above last 475 Iterator<Entry<String, Double>> it = sorted.entrySet().iterator(); 476 while (drop_first > 0) { 477 Map.Entry<String, Double> pair = (Map.Entry<String, Double>) it.next(); 478 if (which.attribute((String) pair.getKey()).index() != which.classIndex()) { 479 which.deleteAttributeAt(which.attribute((String) pair.getKey()).index()); 480 } 481 drop_first -= 1; 482 } 483 } 484 485 /** 486 * Helper method to sort a hashmap by its values. 487 * 488 * @param map 489 * @return sorted map 490 */ 491 private HashMap<String, Double> sortByValues(HashMap<String, Double> map) { 492 List<Map.Entry<String, Double>> list = 493 new LinkedList<Map.Entry<String, Double>>(map.entrySet()); 494 495 Collections.sort(list, new Comparator<Map.Entry<String, Double>>() { 496 public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) { 497 return (o1.getValue()).compareTo(o2.getValue()); 498 } 499 }); 500 501 HashMap<String, Double> sortedHashMap = new LinkedHashMap<String, Double>(); 502 for (Map.Entry<String, Double> item : list) { 503 sortedHashMap.put(item.getKey(), item.getValue()); 504 } 505 return sortedHashMap; 506 } 507 490 508 /** 491 509 * Executes the similarity matching between train and test data. 492 510 * 493 * After this function is finished we have this.attributes with the correct matching between train and test data attributes. 511 * After this function is finished we have this.attributes with the correct matching between 512 * train and test data attributes. 494 513 * 495 514 * @param type 496 515 * @param cutoff 497 516 */ 498 public void matchAttributes(String type, double cutoff) { 499 500 MWBMatchingAlgorithm mwbm = new MWBMatchingAlgorithm(this.train.numAttributes(), this.test.numAttributes()); 501 502 if (type.equals("spearman")) { 503 this.spearmansRankCorrelation(cutoff, mwbm); 504 }else if(type.equals("ks")) { 505 this.kolmogorovSmirnovTest(cutoff, mwbm); 506 }else if(type.equals("percentile")) { 507 this.percentiles(cutoff, mwbm); 508 }else { 509 throw new RuntimeException("unknown matching method"); 510 } 511 512 // resulting maximal match gets assigned to this.attributes 517 public void matchAttributes(String type, double cutoff) { 518 519 MWBMatchingAlgorithm mwbm = 520 new MWBMatchingAlgorithm(this.train.numAttributes(), this.test.numAttributes()); 521 522 if (type.equals("spearman")) { 523 this.spearmansRankCorrelation(cutoff, mwbm); 524 } 525 else if (type.equals("ks")) { 526 this.kolmogorovSmirnovTest(cutoff, mwbm); 527 } 528 else if (type.equals("percentile")) { 529 this.percentiles(cutoff, mwbm); 530 } 531 else { 532 throw new RuntimeException("unknown matching method"); 533 } 534 535 // resulting maximal match gets assigned to this.attributes 513 536 int[] result = mwbm.getMatching(); 514 for (int i = 0; i < result.length; i++) {515 537 for (int i = 0; i < result.length; i++) { 538 516 539 // -1 means that it is not in the set of maximal matching 517 if( i != -1 && result[i] != -1) { 518 this.p_sum += mwbm.weights[i][result[i]]; // we add the weight of the returned matching for scoring the complete match later 540 if (i != -1 && result[i] != -1) { 541 this.p_sum += mwbm.weights[i][result[i]]; // we add the weight of the returned 542 // matching for scoring the complete 543 // match later 519 544 this.attributes.put(i, result[i]); 520 545 } 521 546 } 522 547 } 523 524 525 /** 526 * Calculates the Percentiles of the source and target metrics. 527 * 528 * @param cutoff 529 */ 530 public void percentiles(double cutoff, MWBMatchingAlgorithm mwbm) {531 for( int i = 0; i < this.train.numAttributes(); i++) {532 for( int j = 0; j < this.test.numAttributes(); j++ ) {533 // negative infinity counts as not present, we do this so we don't have to mapbetween attribute indexes in weka548 549 /** 550 * Calculates the Percentiles of the source and target metrics. 551 * 552 * @param cutoff 553 */ 554 public void percentiles(double cutoff, MWBMatchingAlgorithm mwbm) { 555 for (int i = 0; i < this.train.numAttributes(); i++) { 556 for (int j = 0; j < this.test.numAttributes(); j++) { 557 // negative infinity counts as not present, we do this so we don't have to map 558 // between attribute indexes in weka 534 559 // and the result of the mwbm computation 535 560 mwbm.setWeight(i, j, Double.NEGATIVE_INFINITY); 536 537 // class attributes are not relevant 561 562 // class attributes are not relevant 538 563 if (this.test.classIndex() == j) { 539 564 continue; … … 546 571 double train[] = this.train_values.get(i); 547 572 double test[] = this.test_values.get(j); 548 573 549 574 Arrays.sort(train); 550 575 Arrays.sort(test); 551 576 552 577 // percentiles 553 578 double train_p; 554 579 double test_p; 555 580 double score = 0.0; 556 for ( int p=1; p <= 9; p++) {557 train_p = train[(int) Math.ceil(train.length * (p/100))];558 test_p = test[(int) Math.ceil(test.length * (p/100))];559 560 if ( train_p > test_p) {581 for (int p = 1; p <= 9; p++) { 582 train_p = train[(int) Math.ceil(train.length * (p / 100))]; 583 test_p = test[(int) Math.ceil(test.length * (p / 100))]; 584 585 if (train_p > test_p) { 561 586 score += test_p / train_p; 562 }else { 587 } 588 else { 563 589 score += train_p / test_p; 564 590 } 565 591 } 566 567 if ( score > cutoff) {592 593 if (score > cutoff) { 568 594 mwbm.setWeight(i, j, score); 569 595 } 570 596 } 571 597 } 572 } 573 574 /** 575 * Calculate Spearmans rank correlation coefficient as matching score. 576 * The number of instances for the source and target needs to be the same so we randomly sample from the bigger one. 577 * 578 * @param cutoff 579 * @param mwbmatching 580 */ 581 public void spearmansRankCorrelation(double cutoff, MWBMatchingAlgorithm mwbm) { 582 double p = 0; 583 584 SpearmansCorrelation t = new SpearmansCorrelation(); 585 586 // size has to be the same so we randomly sample the number of the smaller sample from the big sample 587 if (this.train.size() > this.test.size()) { 588 this.sample(this.train, this.test, this.train_values); 589 }else if (this.test.size() > this.train.size()) { 590 this.sample(this.test, this.train, this.test_values); 591 } 592 598 } 599 600 /** 601 * Calculate Spearmans rank correlation coefficient as matching score. The number of 602 * instances for the source and target needs to be the same so we randomly sample from the 603 * bigger one. 604 * 605 * @param cutoff 606 * @param mwbmatching 607 */ 608 public void spearmansRankCorrelation(double cutoff, MWBMatchingAlgorithm mwbm) { 609 double p = 0; 610 611 SpearmansCorrelation t = new SpearmansCorrelation(); 612 613 // size has to be the same so we randomly sample the number of the smaller sample from 614 // the big sample 615 if (this.train.size() > this.test.size()) { 616 this.sample(this.train, this.test, this.train_values); 617 } 618 else if (this.test.size() > this.train.size()) { 619 this.sample(this.test, this.train, this.test_values); 620 } 621 593 622 // try out possible attribute combinations 594 for (int i=0; i < this.train.numAttributes(); i++) { 595 for (int j=0; j < this.test.numAttributes(); j++) { 596 // negative infinity counts as not present, we do this so we don't have to map between attribute indexs in weka 623 for (int i = 0; i < this.train.numAttributes(); i++) { 624 for (int j = 0; j < this.test.numAttributes(); j++) { 625 // negative infinity counts as not present, we do this so we don't have to map 626 // between attribute indexs in weka 597 627 // and the result of the mwbm computation 598 628 mwbm.setWeight(i, j, Double.NEGATIVE_INFINITY); 599 600 // class attributes are not relevant 629 630 // class attributes are not relevant 601 631 if (this.test.classIndex() == j) { 602 632 continue; … … 605 635 continue; 606 636 } 607 608 609 637 638 p = t.correlation(this.train_values.get(i), this.test_values.get(j)); 639 if (p > cutoff) { 610 640 mwbm.setWeight(i, j, p); 611 612 613 614 } 615 616 617 618 619 620 621 622 641 } 642 } 643 } 644 } 645 646 /** 647 * Helper method to sample instances for the Spearman rank correlation coefficient method. 648 * 649 * @param bigger 650 * @param smaller 651 * @param values 652 */ 623 653 private void sample(Instances bigger, Instances smaller, ArrayList<double[]> values) { 624 654 // we want to at keep the indices we select the same … … 627 657 Random rand = new Random(); 628 658 while (indices_to_draw > 0) { 629 630 int index = rand.nextInt(bigger.size() -1);631 659 660 int index = rand.nextInt(bigger.size() - 1); 661 632 662 if (!indices.contains(index)) { 633 663 indices.add(index); … … 635 665 } 636 666 } 637 667 638 668 // now reduce our values to the indices we choose above for every attribute 639 for (int att =0; att < bigger.numAttributes()-1; att++) {640 669 for (int att = 0; att < bigger.numAttributes() - 1; att++) { 670 641 671 // get double for the att 642 672 double[] vals = values.get(att); 643 673 double[] new_vals = new double[indices.size()]; 644 674 645 675 int i = 0; 646 676 for (Iterator<Integer> it = indices.iterator(); it.hasNext();) { … … 648 678 i++; 649 679 } 650 680 651 681 values.set(att, new_vals); 652 682 } 653 } 654 655 656 /** 657 * We run the kolmogorov-smirnov test on the data from our test an traindata 658 * if the p value is above the cutoff we include it in the results 659 * p value tends to be 0 when the distributions of the data are significantly different 660 * but we want them to be the same 661 * 662 * @param cutoff 663 * @return p-val 664 */ 665 public void kolmogorovSmirnovTest(double cutoff, MWBMatchingAlgorithm mwbm) { 666 double p = 0; 667 668 KolmogorovSmirnovTest t = new KolmogorovSmirnovTest(); 669 for (int i=0; i < this.train.numAttributes(); i++) { 670 for ( int j=0; j < this.test.numAttributes(); j++) { 671 // negative infinity counts as not present, we do this so we don't have to map between attribute indexs in weka 683 } 684 685 /** 686 * We run the kolmogorov-smirnov test on the data from our test an traindata if the p value 687 * is above the cutoff we include it in the results p value tends to be 0 when the 688 * distributions of the data are significantly different but we want them to be the same 689 * 690 * @param cutoff 691 * @return p-val 692 */ 693 public void kolmogorovSmirnovTest(double cutoff, MWBMatchingAlgorithm mwbm) { 694 double p = 0; 695 696 KolmogorovSmirnovTest t = new KolmogorovSmirnovTest(); 697 for (int i = 0; i < this.train.numAttributes(); i++) { 698 for (int j = 0; j < this.test.numAttributes(); j++) { 699 // negative infinity counts as not present, we do this so we don't have to map 700 // between attribute indexs in weka 672 701 // and the result of the mwbm computation 673 702 mwbm.setWeight(i, j, Double.NEGATIVE_INFINITY); 674 675 // class attributes are not relevant 703 704 // class attributes are not relevant 676 705 if (this.test.classIndex() == j) { 677 706 continue; … … 680 709 continue; 681 710 } 682 683 // this may invoke exactP on small sample sizes which will not terminate in all cases 684 //p = t.kolmogorovSmirnovTest(this.train_values.get(i), this.test_values.get(j), false); 711 712 // this may invoke exactP on small sample sizes which will not terminate in all 713 // cases 714 // p = t.kolmogorovSmirnovTest(this.train_values.get(i), 715 // this.test_values.get(j), false); 685 716 686 717 // this uses approximateP everytime 687 p = t.approximateP(t.kolmogorovSmirnovStatistic(this.train_values.get(i), this.test_values.get(j)), this.train_values.get(i).length, this.test_values.get(j).length); 688 if (p > cutoff) { 718 p = t.approximateP( 719 t.kolmogorovSmirnovStatistic(this.train_values.get(i), 720 this.test_values.get(j)), 721 this.train_values.get(i).length, 722 this.test_values.get(j).length); 723 if (p > cutoff) { 689 724 mwbm.setWeight(i, j, p); 690 691 692 693 725 } 726 } 727 } 728 } 694 729 } 695 730 696 731 /* 697 * Copyright (c) 2007, Massachusetts Institute of Technology 698 * Copyright (c) 2005-2006, Regents of the University of California 699 * All rights reserved. 732 * Copyright (c) 2007, Massachusetts Institute of Technology Copyright (c) 2005-2006, Regents of 733 * the University of California All rights reserved. 700 734 * 701 * Redistribution and use in source and binary forms, with or without 702 * modification, are permitted provided that the following conditions 703 * are met: 735 * Redistribution and use in source and binary forms, with or without modification, are 736 * permitted provided that the following conditions are met: 704 737 * 705 * * Redistributions of source code must retain the above copyright 706 * notice, this list ofconditions and the following disclaimer.738 * * Redistributions of source code must retain the above copyright notice, this list of 739 * conditions and the following disclaimer. 707 740 * 708 * * Redistributions in binary form must reproduce the above copyright 709 * notice, this list of conditions and the following disclaimer in 710 * the documentation and/or other materials provided with the 711 * distribution. 741 * * Redistributions in binary form must reproduce the above copyright notice, this list of 742 * conditions and the following disclaimer in the documentation and/or other materials provided 743 * with the distribution. 712 744 * 713 * * Neither the name of the University of California, Berkeley nor 714 * the names of its contributors may be used to endorse or promote 715 * products derived from this software without specific prior 716 * written permission. 745 * * Neither the name of the University of California, Berkeley nor the names of its 746 * contributors may be used to endorse or promote products derived from this software without 747 * specific prior written permission. 717 748 * 718 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 719 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 720 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 721 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 722 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 723 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 724 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 725 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 726 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 727 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 728 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 749 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS 750 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 751 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 752 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 753 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 754 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 755 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 756 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 729 757 * OF THE POSSIBILITY OF SUCH DAMAGE. 730 758 */ 731 759 732 733 734 760 /** 735 * An engine for finding the maximum-weight matching in a complete 736 * bipartite graph. Suppose we have two sets <i>S</i> and <i>T</i>, 737 * both of size <i>n</i>. For each <i>i</i> in <i>S</i> and <i>j</i> 738 * in <i>T</i>, we have a weight <i>w<sub>ij</sub></i>. A perfect 739 * matching <i>X</i> is a subset of <i>S</i> x <i>T</i> such that each 740 * <i>i</i> in <i>S</i> occurs in exactly one element of <i>X</i>, and 741 * each <i>j</i> in <i>T</i> occurs in exactly one element of 742 * <i>X</i>. Thus, <i>X</i> can be thought of as a one-to-one 743 * function from <i>S</i> to <i>T</i>. The weight of <i>X</i> is the 744 * sum, over (<i>i</i>, <i>j</i>) in <i>X</i>, of 745 * <i>w<sub>ij</sub></i>. A BipartiteMatcher takes the number 746 * <i>n</i> and the weights <i>w<sub>ij</sub></i>, and finds a perfect 747 * matching of maximum weight. 761 * An engine for finding the maximum-weight matching in a complete bipartite graph. Suppose we 762 * have two sets <i>S</i> and <i>T</i>, both of size <i>n</i>. For each <i>i</i> in <i>S</i> and 763 * <i>j</i> in <i>T</i>, we have a weight <i>w<sub>ij</sub></i>. A perfect matching <i>X</i> is 764 * a subset of <i>S</i> x <i>T</i> such that each <i>i</i> in <i>S</i> occurs in exactly one 765 * element of <i>X</i>, and each <i>j</i> in <i>T</i> occurs in exactly one element of <i>X</i>. 766 * Thus, <i>X</i> can be thought of as a one-to-one function from <i>S</i> to <i>T</i>. The 767 * weight of <i>X</i> is the sum, over (<i>i</i>, <i>j</i>) in <i>X</i>, of <i>w 768 * <sub>ij</sub></i>. A BipartiteMatcher takes the number <i>n</i> and the weights <i>w 769 * <sub>ij</sub></i>, and finds a perfect matching of maximum weight. 748 770 * 749 * It uses the Hungarian algorithm of Kuhn (1955), as improved and 750 * presented by E. L. Lawler in his book <cite>Combinatorial 751 * Optimization: Networks and Matroids</cite> (Holt, Rinehart and 752 * Winston, 1976, p. 205-206). The running time is 753 * O(<i>n</i><sup>3</sup>). The weights can be any finite real 754 * numbers; Lawler's algorithm assumes positive weights, so if 755 * necessary we add a constant <i>c</i> to all the weights before 756 * running the algorithm. This increases the weight of every perfect 757 * matching by <i>nc</i>, which doesn't change which perfect matchings 771 * It uses the Hungarian algorithm of Kuhn (1955), as improved and presented by E. L. Lawler in 772 * his book <cite>Combinatorial Optimization: Networks and Matroids</cite> (Holt, Rinehart and 773 * Winston, 1976, p. 205-206). The running time is O(<i>n</i><sup>3</sup>). The weights can be 774 * any finite real numbers; Lawler's algorithm assumes positive weights, so if necessary we add 775 * a constant <i>c</i> to all the weights before running the algorithm. This increases the 776 * weight of every perfect matching by <i>nc</i>, which doesn't change which perfect matchings 758 777 * have maximum weight. 759 778 * 760 * If a weight is set to Double.NEGATIVE_INFINITY, then the algorithm will 761 * behave as if that edge were not in the graph. If all the edges incident on762 * a given node have weight Double.NEGATIVE_INFINITY, then the final result763 * will not be a perfect matching, and an exception will be thrown.779 * If a weight is set to Double.NEGATIVE_INFINITY, then the algorithm will behave as if that 780 * edge were not in the graph. If all the edges incident on a given node have weight 781 * Double.NEGATIVE_INFINITY, then the final result will not be a perfect matching, and an 782 * exception will be thrown. 764 783 */ 765 class MWBMatchingAlgorithm { 766 /** 767 * Creates a BipartiteMatcher without specifying the graph size. Calling 768 * any other method before calling reset will yield an 769 * IllegalStateException. 770 */ 771 772 /** 773 * Tolerance for comparisons to zero, to account for 774 * floating-point imprecision. We consider a positive number to 775 * be essentially zero if it is strictly less than TOL. 784 class MWBMatchingAlgorithm { 785 /** 786 * Creates a BipartiteMatcher without specifying the graph size. Calling any other method 787 * before calling reset will yield an IllegalStateException. 788 */ 789 790 /** 791 * Tolerance for comparisons to zero, to account for floating-point imprecision. We consider 792 * a positive number to be essentially zero if it is strictly less than TOL. 776 793 */ 777 794 private static final double TOL = 1e-10; 778 // Number of left side nodes795 // Number of left side nodes 779 796 int n; 780 797 781 // Number of right side nodes798 // Number of right side nodes 782 799 int m; 783 800 … … 786 803 double maxWeight; 787 804 788 // If (i, j) is in the mapping, then sMatches[i] = j and tMatches[j] = i. 789 // If i is unmatched, then sMatches[i] = -1 (and likewise for tMatches). 805 // If (i, j) is in the mapping, then sMatches[i] = j and tMatches[j] = i. 806 // If i is unmatched, then sMatches[i] = -1 (and likewise for tMatches). 790 807 int[] sMatches; 791 808 int[] tMatches; … … 799 816 double[] u; 800 817 double[] v; 801 818 802 819 double[] pi; 803 820 804 821 List<Integer> eligibleS = new ArrayList<Integer>(); 805 List<Integer> eligibleT = new ArrayList<Integer>(); 806 807 822 List<Integer> eligibleT = new ArrayList<Integer>(); 823 808 824 public MWBMatchingAlgorithm() { 809 n = -1;810 m = -1;811 } 812 813 /** 814 * Creates a BipartiteMatcher and prepares it to run on an n x m graph. 815 * All the weights are initially set to 1.825 n = -1; 826 m = -1; 827 } 828 829 /** 830 * Creates a BipartiteMatcher and prepares it to run on an n x m graph. All the weights are 831 * initially set to 1. 816 832 */ 817 833 public MWBMatchingAlgorithm(int n, int m) { 818 reset(n, m); 819 } 820 821 /** 822 * Resets the BipartiteMatcher to run on an n x m graph. The weights are 823 * all reset to 1. 834 reset(n, m); 835 } 836 837 /** 838 * Resets the BipartiteMatcher to run on an n x m graph. The weights are all reset to 1. 824 839 */ 825 840 private void reset(int n, int m) { … … 833 848 for (int i = 0; i < n; i++) { 834 849 for (int j = 0; j < m; j++) { 835 weights[i][j] = 1;850 weights[i][j] = 1; 836 851 } 837 852 } … … 846 861 v = new double[m]; 847 862 pi = new double[m]; 848 849 } 850 /** 851 * Sets the weight w<sub>ij</sub> to the given value w. 863 864 } 865 866 /** 867 * Sets the weight w<sub>ij</sub> to the given value w. 852 868 * 853 * @throws IllegalArgumentException if i or j is outside the range [0, n). 869 * @throws IllegalArgumentException 870 * if i or j is outside the range [0, n). 854 871 */ 855 872 public void setWeight(int i, int j, double w) { 856 if (n == -1 || m == -1) {857 throw new IllegalStateException("Graph size not specified.");858 }859 if ((i < 0) || (i >= n)) {860 throw new IllegalArgumentException("i-value out of range: " + i);861 }862 if ((j < 0) || (j >= m)) {863 throw new IllegalArgumentException("j-value out of range: " + j);864 }865 if (Double.isNaN(w)) {866 throw new IllegalArgumentException("Illegal weight: " + w);867 }868 869 weights[i][j] = w;870 if ((w > Double.NEGATIVE_INFINITY) && (w < minWeight)) {871 minWeight = w;872 }873 if (w > maxWeight) {874 maxWeight = w;875 }876 } 877 878 /** 879 * Returns a maximum-weight perfect matching relative to the weights 880 * s pecified with setWeight. The matching is represented as an array arr881 * of length n, where arr[i] = j if(i,j) is in the matching.873 if (n == -1 || m == -1) { 874 throw new IllegalStateException("Graph size not specified."); 875 } 876 if ((i < 0) || (i >= n)) { 877 throw new IllegalArgumentException("i-value out of range: " + i); 878 } 879 if ((j < 0) || (j >= m)) { 880 throw new IllegalArgumentException("j-value out of range: " + j); 881 } 882 if (Double.isNaN(w)) { 883 throw new IllegalArgumentException("Illegal weight: " + w); 884 } 885 886 weights[i][j] = w; 887 if ((w > Double.NEGATIVE_INFINITY) && (w < minWeight)) { 888 minWeight = w; 889 } 890 if (w > maxWeight) { 891 maxWeight = w; 892 } 893 } 894 895 /** 896 * Returns a maximum-weight perfect matching relative to the weights specified with 897 * setWeight. The matching is represented as an array arr of length n, where arr[i] = j if 898 * (i,j) is in the matching. 882 899 */ 883 900 public int[] getMatching() { 884 if (n == -1 || m == -1 ) { 885 throw new IllegalStateException("Graph size not specified."); 886 } 887 if (n == 0) { 888 return new int[0]; 889 } 890 ensurePositiveWeights(); 891 892 // Step 0: Initialization 893 eligibleS.clear(); 894 eligibleT.clear(); 895 for (Integer i = 0; i < n; i++) { 896 sMatches[i] = -1; 897 898 u[i] = maxWeight; // ambiguous on p. 205 of Lawler, but see p. 202 899 900 // this is really first run of Step 1.0 901 sLabels[i] = EMPTY_LABEL; 902 eligibleS.add(i); 903 } 904 905 for (int j = 0; j < m; j++) { 906 tMatches[j] = -1; 907 908 v[j] = 0; 909 pi[j] = Double.POSITIVE_INFINITY; 910 911 // this is really first run of Step 1.0 912 tLabels[j] = NO_LABEL; 913 } 914 915 while (true) { 916 // Augment the matching until we can't augment any more given the 917 // current settings of the dual variables. 901 if (n == -1 || m == -1) { 902 throw new IllegalStateException("Graph size not specified."); 903 } 904 if (n == 0) { 905 return new int[0]; 906 } 907 ensurePositiveWeights(); 908 909 // Step 0: Initialization 910 eligibleS.clear(); 911 eligibleT.clear(); 912 for (Integer i = 0; i < n; i++) { 913 sMatches[i] = -1; 914 915 u[i] = maxWeight; // ambiguous on p. 205 of Lawler, but see p. 202 916 917 // this is really first run of Step 1.0 918 sLabels[i] = EMPTY_LABEL; 919 eligibleS.add(i); 920 } 921 922 for (int j = 0; j < m; j++) { 923 tMatches[j] = -1; 924 925 v[j] = 0; 926 pi[j] = Double.POSITIVE_INFINITY; 927 928 // this is really first run of Step 1.0 929 tLabels[j] = NO_LABEL; 930 } 931 918 932 while (true) { 919 // Steps 1.1-1.4: Find an augmenting path 920 int lastNode = findAugmentingPath(); 921 if (lastNode == -1) { 922 break; // no augmenting path 923 } 924 925 // Step 2: Augmentation 926 flipPath(lastNode); 927 for (int i = 0; i < n; i++) 928 sLabels[i] = NO_LABEL; 929 930 for (int j = 0; j < m; j++) { 931 pi[j] = Double.POSITIVE_INFINITY; 932 tLabels[j] = NO_LABEL; 933 } 934 935 936 937 // This is Step 1.0 938 eligibleS.clear(); 933 // Augment the matching until we can't augment any more given the 934 // current settings of the dual variables. 935 while (true) { 936 // Steps 1.1-1.4: Find an augmenting path 937 int lastNode = findAugmentingPath(); 938 if (lastNode == -1) { 939 break; // no augmenting path 940 } 941 942 // Step 2: Augmentation 943 flipPath(lastNode); 944 for (int i = 0; i < n; i++) 945 sLabels[i] = NO_LABEL; 946 947 for (int j = 0; j < m; j++) { 948 pi[j] = Double.POSITIVE_INFINITY; 949 tLabels[j] = NO_LABEL; 950 } 951 952 // This is Step 1.0 953 eligibleS.clear(); 954 for (int i = 0; i < n; i++) { 955 if (sMatches[i] == -1) { 956 sLabels[i] = EMPTY_LABEL; 957 eligibleS.add(new Integer(i)); 958 } 959 } 960 961 eligibleT.clear(); 962 } 963 964 // Step 3: Change the dual variables 965 966 // delta1 = min_i u[i] 967 double delta1 = Double.POSITIVE_INFINITY; 968 for (int i = 0; i < n; i++) { 969 if (u[i] < delta1) { 970 delta1 = u[i]; 971 } 972 } 973 974 // delta2 = min_{j : pi[j] > 0} pi[j] 975 double delta2 = Double.POSITIVE_INFINITY; 976 for (int j = 0; j < m; j++) { 977 if ((pi[j] >= TOL) && (pi[j] < delta2)) { 978 delta2 = pi[j]; 979 } 980 } 981 982 if (delta1 < delta2) { 983 // In order to make another pi[j] equal 0, we'd need to 984 // make some u[i] negative. 985 break; // we have a maximum-weight matching 986 } 987 988 changeDualVars(delta2); 989 } 990 991 int[] matching = new int[n]; 939 992 for (int i = 0; i < n; i++) { 940 if (sMatches[i] == -1) { 941 sLabels[i] = EMPTY_LABEL; 942 eligibleS.add(new Integer(i)); 943 } 944 } 945 946 947 eligibleT.clear(); 948 } 949 950 // Step 3: Change the dual variables 951 952 // delta1 = min_i u[i] 953 double delta1 = Double.POSITIVE_INFINITY; 954 for (int i = 0; i < n; i++) { 955 if (u[i] < delta1) { 956 delta1 = u[i]; 957 } 958 } 959 960 // delta2 = min_{j : pi[j] > 0} pi[j] 961 double delta2 = Double.POSITIVE_INFINITY; 962 for (int j = 0; j < m; j++) { 963 if ((pi[j] >= TOL) && (pi[j] < delta2)) { 964 delta2 = pi[j]; 965 } 966 } 967 968 if (delta1 < delta2) { 969 // In order to make another pi[j] equal 0, we'd need to 970 // make some u[i] negative. 971 break; // we have a maximum-weight matching 972 } 973 974 changeDualVars(delta2); 975 } 976 977 int[] matching = new int[n]; 978 for (int i = 0; i < n; i++) { 979 matching[i] = sMatches[i]; 980 } 981 return matching; 982 } 983 984 /** 985 * Tries to find an augmenting path containing only edges (i,j) for which 986 * u[i] + v[j] = weights[i][j]. If it succeeds, returns the index of the 987 * last node in the path. Otherwise, returns -1. In any case, updates 988 * the labels and pi values. 993 matching[i] = sMatches[i]; 994 } 995 return matching; 996 } 997 998 /** 999 * Tries to find an augmenting path containing only edges (i,j) for which u[i] + v[j] = 1000 * weights[i][j]. If it succeeds, returns the index of the last node in the path. Otherwise, 1001 * returns -1. In any case, updates the labels and pi values. 989 1002 */ 990 1003 int findAugmentingPath() { 991 while ((!eligibleS.isEmpty()) || (!eligibleT.isEmpty())) { 992 if (!eligibleS.isEmpty()) { 993 int i = ((Integer) eligibleS.get(eligibleS.size() - 1)). 994 intValue(); 995 eligibleS.remove(eligibleS.size() - 1); 996 for (int j = 0; j < m; j++) { 997 // If pi[j] has already been decreased essentially 998 // to zero, then j is already labeled, and we 999 // can't decrease pi[j] any more. Omitting the 1000 // pi[j] >= TOL check could lead us to relabel j 1001 // unnecessarily, since the diff we compute on the 1002 // next line may end up being less than pi[j] due 1003 // to floating point imprecision. 1004 if ((tMatches[j] != i) && (pi[j] >= TOL)) { 1005 double diff = u[i] + v[j] - weights[i][j]; 1006 if (diff < pi[j]) { 1007 tLabels[j] = i; 1008 pi[j] = diff; 1009 if (pi[j] < TOL) { 1010 eligibleT.add(new Integer(j)); 1011 } 1012 } 1013 } 1014 } 1015 } else { 1016 int j = ((Integer) eligibleT.get(eligibleT.size() - 1)). 1017 intValue(); 1018 eligibleT.remove(eligibleT.size() - 1); 1019 if (tMatches[j] == -1) { 1020 return j; // we've found an augmenting path 1021 } 1022 1023 int i = tMatches[j]; 1024 sLabels[i] = j; 1025 eligibleS.add(new Integer(i)); // ok to add twice 1026 } 1027 } 1028 1029 return -1; 1030 } 1031 1032 /** 1033 * Given an augmenting path ending at lastNode, "flips" the path. This 1034 * means that an edge on the path is in the matching after the flip if 1035 * and only if it was not in the matching before the flip. An augmenting 1036 * path connects two unmatched nodes, so the result is still a matching. 1037 */ 1004 while ((!eligibleS.isEmpty()) || (!eligibleT.isEmpty())) { 1005 if (!eligibleS.isEmpty()) { 1006 int i = ((Integer) eligibleS.get(eligibleS.size() - 1)).intValue(); 1007 eligibleS.remove(eligibleS.size() - 1); 1008 for (int j = 0; j < m; j++) { 1009 // If pi[j] has already been decreased essentially 1010 // to zero, then j is already labeled, and we 1011 // can't decrease pi[j] any more. Omitting the 1012 // pi[j] >= TOL check could lead us to relabel j 1013 // unnecessarily, since the diff we compute on the 1014 // next line may end up being less than pi[j] due 1015 // to floating point imprecision. 1016 if ((tMatches[j] != i) && (pi[j] >= TOL)) { 1017 double diff = u[i] + v[j] - weights[i][j]; 1018 if (diff < pi[j]) { 1019 tLabels[j] = i; 1020 pi[j] = diff; 1021 if (pi[j] < TOL) { 1022 eligibleT.add(new Integer(j)); 1023 } 1024 } 1025 } 1026 } 1027 } 1028 else { 1029 int j = ((Integer) eligibleT.get(eligibleT.size() - 1)).intValue(); 1030 eligibleT.remove(eligibleT.size() - 1); 1031 if (tMatches[j] == -1) { 1032 return j; // we've found an augmenting path 1033 } 1034 1035 int i = tMatches[j]; 1036 sLabels[i] = j; 1037 eligibleS.add(new Integer(i)); // ok to add twice 1038 } 1039 } 1040 1041 return -1; 1042 } 1043 1044 /** 1045 * Given an augmenting path ending at lastNode, "flips" the path. This means that an edge on 1046 * the path is in the matching after the flip if and only if it was not in the matching 1047 * before the flip. An augmenting path connects two unmatched nodes, so the result is still 1048 * a matching. 1049 */ 1038 1050 void flipPath(int lastNode) { 1039 1051 while (lastNode != EMPTY_LABEL) { 1040 1052 int parent = tLabels[lastNode]; 1041 1042 // Add (parent, lastNode) to matching. We don't need to1043 // explicitly remove any edges from the matching because: 1044 // * We know at this point that there is no i such that1045 // sMatches[i] = lastNode.1046 // 1047 // parent, that j must be sLabels[parent], and will change1048 // tMatches[j] in the next time through this loop.1053 1054 // Add (parent, lastNode) to matching. We don't need to 1055 // explicitly remove any edges from the matching because: 1056 // * We know at this point that there is no i such that 1057 // sMatches[i] = lastNode. 1058 // * Although there might be some j such that tMatches[j] = 1059 // parent, that j must be sLabels[parent], and will change 1060 // tMatches[j] in the next time through this loop. 1049 1061 sMatches[parent] = lastNode; 1050 1062 tMatches[lastNode] = parent; 1051 1063 1052 1064 lastNode = sLabels[parent]; 1053 1065 } … … 1057 1069 for (int i = 0; i < n; i++) { 1058 1070 if (sLabels[i] != NO_LABEL) { 1059 u[i] -= delta;1060 } 1061 } 1062 1071 u[i] -= delta; 1072 } 1073 } 1074 1063 1075 for (int j = 0; j < m; j++) { 1064 1076 if (pi[j] < TOL) { 1065 v[j] += delta; 1066 } else if (tLabels[j] != NO_LABEL) { 1067 pi[j] -= delta; 1068 if (pi[j] < TOL) { 1069 eligibleT.add(new Integer(j)); 1070 } 1071 } 1072 } 1073 } 1074 1075 /** 1076 * Ensures that all weights are either Double.NEGATIVE_INFINITY, 1077 * or strictly greater than zero. 1077 v[j] += delta; 1078 } 1079 else if (tLabels[j] != NO_LABEL) { 1080 pi[j] -= delta; 1081 if (pi[j] < TOL) { 1082 eligibleT.add(new Integer(j)); 1083 } 1084 } 1085 } 1086 } 1087 1088 /** 1089 * Ensures that all weights are either Double.NEGATIVE_INFINITY, or strictly greater than 1090 * zero. 1078 1091 */ 1079 1092 private void ensurePositiveWeights() { … … 1081 1094 if (minWeight < TOL) { 1082 1095 for (int i = 0; i < n; i++) { 1083 for (int j = 0; j < m; j++) {1084 weights[i][j] = weights[i][j] - minWeight + 1;1085 }1086 } 1087 1096 for (int j = 0; j < m; j++) { 1097 weights[i][j] = weights[i][j] - minWeight + 1; 1098 } 1099 } 1100 1088 1101 maxWeight = maxWeight - minWeight + 1; 1089 1102 minWeight = 1; … … 1095 1108 for (int i = 0; i < n; i++) { 1096 1109 for (int j = 0; j < m; j++) { 1097 System.out.print(weights[i][j] + " ");1110 System.out.print(weights[i][j] + " "); 1098 1111 } 1099 1112 System.out.println("");
Note: See TracChangeset
for help on using the changeset viewer.