Changeset 41 for trunk/CrossPare/src/de/ugoe/cs/cpdp
Timestamp: 09/24/15 10:59:05 (9 years ago)
Location: trunk/CrossPare/src/de/ugoe/cs/cpdp
Files: 84 edited
trunk/CrossPare/src/de/ugoe/cs/cpdp/ExperimentConfiguration.java
r32 → r41

The revision adds the Apache License header to the file and reformats the source; there are no functional changes. The header added at the top reads:

    // Copyright 2015 Georg-August-Universität Göttingen, Germany
    //
    // Licensed under the Apache License, Version 2.0 (the "License");
    // you may not use this file except in compliance with the License.
    // You may obtain a copy of the License at
    //
    //     http://www.apache.org/licenses/LICENSE-2.0
    //
    // Unless required by applicable law or agreed to in writing, software
    // distributed under the License is distributed on an "AS IS" BASIS,
    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    // See the License for the specific language governing permissions and
    // limitations under the License.

The same header is added to every file in this changeset. The remaining changes are cosmetic: Javadoc comments are reflowed to the new line width, @param and @throws descriptions are moved to their own lines, indentation is normalized to four spaces, catch clauses are placed on their own lines, and the long reflective Class.forName(...) expressions in startElement are wrapped.

The class itself is unchanged. ExperimentConfiguration extends the SAX DefaultHandler and contains all meta information about an experiment, i.e., its configuration: data loaders, version filters (general, test, and candidate training), setwise pre- and postprocessors, setwise data selection strategies and trainers, pointwise pre- and postprocessors, pointwise selection strategies and normal trainers, evaluators, the results path, the saveClassifier flag, and the execution strategy (default CrossProjectExperiment). The configuration can only be created from an XML file; programmatic creation is currently not possible. The SAX callback startElement instantiates each configured component via reflection from the element name (loader, versionfilter, testVersionfilter, trainVersionfilter, setwisepreprocessor, setwiseselector, setwisepostprocessor, setwisetrainer, preprocessor, pointwiseselector, postprocessor, trainer, eval) and hands the "param" attribute to its setParameter method; saveClassifier and executionStrategy set the corresponding fields, and partialconfig loads another configuration file (with an optional relative path) that is merged in by addConfigurations. Merging requires both configurations to use the same execution strategy; the results path of the other configuration is only taken over while the own path is still the default "results", and saveClassifier is only taken over if it is unset in the main configuration and true in the other.
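As a sketch of what such a configuration file might look like, the following hypothetical XML uses only element and attribute names that startElement recognizes; the concrete component class names, the data location, and the overall document structure (a single config root element) are illustrative assumptions, not something this changeset specifies:

    <config>
        <loader name="SomeDataLoader" datalocation="path/to/data" />   <!-- hypothetical loader class -->
        <resultspath path="results" />
        <versionfilter name="SomeVersionFilter" param="10" />          <!-- hypothetical filter class -->
        <preprocessor name="SomeProcessor" param="" />                 <!-- hypothetical processor class -->
        <trainer name="SomeTrainer" param="someParameter" />           <!-- hypothetical trainer class -->
        <eval name="SomeEvaluator" />                                  <!-- hypothetical evaluator class -->
        <saveClassifier />
        <executionStrategy name="CrossProjectExperiment" />
        <partialconfig path="common.xml" relative="true" />
    </config>

Each name attribute is resolved via Class.forName against the package that belongs to the element (de.ugoe.cs.cpdp.loader, de.ugoe.cs.cpdp.versions, de.ugoe.cs.cpdp.dataprocessing, de.ugoe.cs.cpdp.dataselection, de.ugoe.cs.cpdp.training, de.ugoe.cs.cpdp.eval), so only the simple class name is given in the file.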
trunk/CrossPare/src/de/ugoe/cs/cpdp/ExperimentConfigurationException.java
r2 → r41

The Apache License header (identical to the one shown above) is added and the file is re-indented to four spaces. The class itself is untouched: ExperimentConfigurationException is thrown if there is an error creating an experiment configuration; it carries a standard serialVersionUID and the four usual constructors delegating to Exception(), Exception(String), Exception(String, Throwable), and Exception(Throwable).
trunk/CrossPare/src/de/ugoe/cs/cpdp/IParameterizable.java
r2 → r41

Again only the license header and reformatting. IParameterizable is the interface for entities that accept a parameter string, which can be used to configure parts of an experiment; how (and whether) the parameter is interpreted depends entirely on the entity. Its single method is void setParameter(String parameters).
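As an illustration of the pattern (ExperimentConfiguration.startElement hands the "param" attribute of an element verbatim to setParameter), a minimal hypothetical implementation could look like this; the class name, package, and parameter format are invented for the example:

    package de.ugoe.cs.cpdp.examples; // hypothetical package

    import de.ugoe.cs.cpdp.IParameterizable;

    /**
     * Hypothetical component that reads a single numeric threshold from its parameter string.
     */
    public class ThresholdExample implements IParameterizable {

        /** threshold parsed from the parameter string; defaults to 0.5 if none is given */
        private double threshold = 0.5;

        /**
         * Sets the threshold, e.g. from param="0.7" in the configuration file.
         */
        @Override
        public void setParameter(String parameters) {
            if (parameters != null && !parameters.isEmpty()) {
                threshold = Double.parseDouble(parameters);
            }
        }
    }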
trunk/CrossPare/src/de/ugoe/cs/cpdp/Runner.java
r38 → r41

License header plus reformatting; the logic is unchanged. Runner is the executable used to run experiments. main takes experiment configuration files (or directories containing them) as arguments, creates a fixed thread pool with two threads fewer than the number of logical processors, builds an ExperimentConfiguration for every file, and finally shuts the pool down and waits for all experiments to finish. createConfig instantiates the configured execution strategy from the package de.ugoe.cs.cpdp.execution via reflection, using the constructor that takes an ExperimentConfiguration, and submits the resulting IExecutionStrategy to the thread pool; each possible reflection failure (NoSuchMethodException, SecurityException, IllegalArgumentException, InvocationTargetException, InstantiationException, IllegalAccessException, ClassNotFoundException) is reported with its own error message on the console.
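Because the instantiated strategy is submitted to an ExecutorService, an execution strategy needs a public constructor taking the configuration and has to be runnable. The following sketch shows the shape such a class would need for the reflection call to succeed; the class name is hypothetical, and that IExecutionStrategy extends Runnable is an assumption based on the threadPool.execute call (the interface itself is not part of this changeset):

    package de.ugoe.cs.cpdp.execution; // package name taken from the reflection call in Runner

    import de.ugoe.cs.cpdp.ExperimentConfiguration;

    /**
     * Hypothetical execution strategy; assumes IExecutionStrategy extends Runnable.
     */
    public class ExampleExperiment implements IExecutionStrategy {

        /** configuration of the experiment to execute */
        private final ExperimentConfiguration config;

        /**
         * Constructor with the signature that Runner.createConfig looks up via reflection.
         */
        public ExampleExperiment(ExperimentConfiguration config) {
            this.config = config;
        }

        /**
         * Executed by the thread pool; a real strategy would load data, train, and evaluate here.
         */
        @Override
        public void run() {
            System.out.println("Running experiment: " + config.getExperimentName());
        }
    }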
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AttributeNonRemoval.java
r32 → r41

License header plus reformatting, with no behavioral change. AttributeNonRemoval keeps only the attributes whose names are passed to setParameter as a blank-separated list and deletes every other attribute. It implements both ISetWiseProcessingStrategy and IProcessesingStrategy, so it can be applied to a test set together with a whole set of training data sets or to a single test/training pair; as the Javadoc notes, attribute names containing blanks are not supported.
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AttributeRemoval.java
r2 → r41

The counterpart to the previous processor: AttributeRemoval deletes the attributes whose blank-separated names are given via setParameter from the test data and from all training data; names containing blanks are likewise unsupported. Only the license header and the formatting change in this revision.
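A short usage sketch, assuming two Weka Instances objects called testdata and traindata that both contain an attribute named "loc" (the attribute name and the surrounding class are invented for the example):

    import de.ugoe.cs.cpdp.dataprocessing.AttributeRemoval;
    import weka.core.Instances;

    public class AttributeRemovalExample {

        /**
         * Removes the hypothetical "loc" attribute from both data sets.
         */
        public static void removeLoc(Instances testdata, Instances traindata) {
            AttributeRemoval removal = new AttributeRemoval();
            removal.setParameter("loc");        // blank-separated list of attribute names
            removal.apply(testdata, traindata); // deletes the attribute from test and training data
        }
    }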
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AverageStandardization.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Standardization procedure after Watanabe et al.: Adapting a Fault Prediction Model to Allow Inter Language Reuse. 11 * <br><br> 12 * In comparison to Watanabe et al., we transform training data instead of the test data. Otherwise, this approach would not be feasible with multiple projects. 24 * Standardization procedure after Watanabe et al.: Adapting a Fault Prediction Model to Allow Inter 25 * Language Reuse. <br> 26 * <br> 27 * In comparison to Watanabe et al., we transform training data instead of the test data. Otherwise, 28 * this approach would not be feasible with multiple projects. 29 * 13 30 * @author Steffen Herbold 14 31 */ 15 32 public class AverageStandardization implements ISetWiseProcessingStrategy, IProcessesingStrategy { 16 33 17 /** 18 * Does not have parameters. String is ignored. 19 * @param parameters ignored 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 // dummy 24 } 34 /** 35 * Does not have parameters. String is ignored. 36 * 37 * @param parameters 38 * ignored 39 */ 40 @Override 41 public void setParameter(String parameters) { 42 // dummy 43 } 25 44 26 /** 27 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 28 */ 29 @Override 30 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 31 final Attribute classAttribute = testdata.classAttribute(); 32 33 final double[] meanTest = new double[testdata.numAttributes()]; 34 35 // get means of testdata 36 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 37 if( testdata.attribute(j)!=classAttribute ) { 38 meanTest[j] = testdata.meanOrMode(j); 39 } 40 } 41 42 // preprocess training data 43 for( Instances traindata : traindataSet ) { 44 double[] meanTrain = new double[testdata.numAttributes()]; 45 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 46 if( testdata.attribute(j)!=classAttribute ) { 47 meanTrain[j] = traindata.meanOrMode(j); 48 } 49 } 50 51 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 52 Instance instance = traindata.instance(i); 53 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 54 if( testdata.attribute(j)!=classAttribute ) { 55 instance.setValue(j, instance.value(j)*meanTest[j]/meanTrain[j]); 56 } 57 } 58 } 59 } 60 } 45 /** 46 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 47 * org.apache.commons.collections4.list.SetUniqueList) 48 */ 49 @Override 50 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 51 final Attribute classAttribute = testdata.classAttribute(); 61 52 62 /** 63 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 64 */ 65 @Override 66 public void apply(Instances testdata, Instances traindata) { 67 
final Attribute classAttribute = testdata.classAttribute(); 68 69 final double[] meanTest = new double[testdata.numAttributes()]; 70 71 // get means of testdata 72 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 73 if( testdata.attribute(j)!=classAttribute ) { 74 meanTest[j] = testdata.meanOrMode(j); 75 } 76 } 77 78 // preprocess training data 79 final double[] meanTrain = new double[testdata.numAttributes()]; 80 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 81 if( testdata.attribute(j)!=classAttribute ) { 82 meanTrain[j] = traindata.meanOrMode(j); 83 } 84 } 85 86 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 87 Instance instance = traindata.instance(i); 88 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 89 if( testdata.attribute(j)!=classAttribute ) { 90 instance.setValue(j, instance.value(j)*meanTest[j]/meanTrain[j]); 91 } 92 } 93 } 94 } 53 final double[] meanTest = new double[testdata.numAttributes()]; 54 55 // get means of testdata 56 for (int j = 0; j < testdata.numAttributes(); j++) { 57 if (testdata.attribute(j) != classAttribute) { 58 meanTest[j] = testdata.meanOrMode(j); 59 } 60 } 61 62 // preprocess training data 63 for (Instances traindata : traindataSet) { 64 double[] meanTrain = new double[testdata.numAttributes()]; 65 for (int j = 0; j < testdata.numAttributes(); j++) { 66 if (testdata.attribute(j) != classAttribute) { 67 meanTrain[j] = traindata.meanOrMode(j); 68 } 69 } 70 71 for (int i = 0; i < traindata.numInstances(); i++) { 72 Instance instance = traindata.instance(i); 73 for (int j = 0; j < testdata.numAttributes(); j++) { 74 if (testdata.attribute(j) != classAttribute) { 75 instance.setValue(j, instance.value(j) * meanTest[j] / meanTrain[j]); 76 } 77 } 78 } 79 } 80 } 81 82 /** 83 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 84 * weka.core.Instances) 85 */ 86 @Override 87 public void apply(Instances testdata, Instances traindata) { 88 final Attribute classAttribute = testdata.classAttribute(); 89 90 final double[] meanTest = new double[testdata.numAttributes()]; 91 92 // get means of testdata 93 for (int j = 0; j < testdata.numAttributes(); j++) { 94 if (testdata.attribute(j) != classAttribute) { 95 meanTest[j] = testdata.meanOrMode(j); 96 } 97 } 98 99 // preprocess training data 100 final double[] meanTrain = new double[testdata.numAttributes()]; 101 for (int j = 0; j < testdata.numAttributes(); j++) { 102 if (testdata.attribute(j) != classAttribute) { 103 meanTrain[j] = traindata.meanOrMode(j); 104 } 105 } 106 107 for (int i = 0; i < traindata.numInstances(); i++) { 108 Instance instance = traindata.instance(i); 109 for (int j = 0; j < testdata.numAttributes(); j++) { 110 if (testdata.attribute(j) != classAttribute) { 111 instance.setValue(j, instance.value(j) * meanTest[j] / meanTrain[j]); 112 } 113 } 114 } 115 } 95 116 96 117 } -
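The rescaling itself is a per-attribute operation: every non-class training value x of attribute j is replaced by x * meanTest[j] / meanTrain[j], so the rescaled training attribute has the same mean as in the test data. A small illustration with invented numbers:

    // Illustration of the rescaling with invented values (not taken from any data set).
    public class AverageStandardizationIllustration {
        public static void main(String[] args) {
            double meanTest  = 10.0;  // mean of attribute j in the test data
            double meanTrain = 40.0;  // mean of attribute j in one training product
            double x = 20.0;          // a single training value of attribute j
            double xScaled = x * meanTest / meanTrain;  // 20 * 10 / 40 = 5.0
            System.out.println(xScaled);  // after scaling, the training mean 40 maps onto the test mean 10
        }
    }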
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/BiasedWeights.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 7 21 8 22 /** 9 * Sets the bias of the weights of the training data. By using a bias of 0.5 (default value) the total weight of the positive instances (i.e. 10 * fault-prone) is equal to the total weight of the negative instances (i.e. non-fault-prone). Otherwise the weights between the two will be 11 * distributed according to the bias, where <0.5 means in favor of the negative instances and >0.5 in favor of the positive instances. 12 * equal to the total weight of the test 23 * Sets the bias of the weights of the training data. By using a bias of 0.5 (default value) the 24 * total weight of the positive instances (i.e. fault-prone) is equal to the total weight of the 25 * negative instances (i.e. non-fault-prone). Otherwise the weights between the two will be 26 * distributed according to the bias, where <0.5 means in favor of the negative instances and 27 * >0.5 in favor of the positive instances. equal to the total weight of the test 28 * 13 29 * @author Steffen Herbold 14 30 */ 15 31 public class BiasedWeights implements IProcessesingStrategy, ISetWiseProcessingStrategy { 16 32 17 /** 18 * bias used for the weighting 19 */ 20 private double bias = 0.5; 21 22 23 /** 24 * Sets the bias to be used for weighting. 25 * @param parameters string with the bias 26 */ 27 @Override 28 public void setParameter(String parameters) { 29 bias = Double.parseDouble(parameters); 30 } 33 /** 34 * bias used for the weighting 35 */ 36 private double bias = 0.5; 31 37 32 /** 33 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 34 */ 35 @Override 36 public void apply(Instances testdata, Instances traindata) { 37 //setBiasedWeights(testdata); 38 setBiasedWeights(traindata); 39 } 40 41 /** 42 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 43 */ 44 @Override 45 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 46 for( Instances traindata : traindataSet ) { 47 setBiasedWeights(traindata); 48 } 49 } 50 51 /** 52 * Helper method that sets the weights for a given data set. 
53 * @param data data set whose weights are set 54 */ 55 private void setBiasedWeights(Instances data) { 56 final int classIndex = data.classIndex(); 57 58 final int[] counts = data.attributeStats(classIndex).nominalCounts; 59 60 final double weightNegatives = ((1-bias)*data.numInstances()) / counts[0]; 61 final double weightPositives = (bias*data.numInstances()) / counts[1]; 62 63 64 for( int i=0 ; i<data.numInstances() ; i++ ) { 65 Instance instance = data.instance(i); 66 if( instance.value(classIndex)==0 ) { 67 instance.setWeight(weightNegatives); 68 } 69 if( instance.value(classIndex)==1 ) { 70 instance.setWeight(weightPositives); 71 } 72 } 73 } 38 /** 39 * Sets the bias to be used for weighting. 40 * 41 * @param parameters 42 * string with the bias 43 */ 44 @Override 45 public void setParameter(String parameters) { 46 bias = Double.parseDouble(parameters); 47 } 74 48 75 49 /** 50 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 51 * weka.core.Instances) 52 */ 53 @Override 54 public void apply(Instances testdata, Instances traindata) { 55 // setBiasedWeights(testdata); 56 setBiasedWeights(traindata); 57 } 58 59 /** 60 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 61 * org.apache.commons.collections4.list.SetUniqueList) 62 */ 63 @Override 64 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 65 for (Instances traindata : traindataSet) { 66 setBiasedWeights(traindata); 67 } 68 } 69 70 /** 71 * Helper method that sets the weights for a given data set. 72 * 73 * @param data 74 * data set whose weights are set 75 */ 76 private void setBiasedWeights(Instances data) { 77 final int classIndex = data.classIndex(); 78 79 final int[] counts = data.attributeStats(classIndex).nominalCounts; 80 81 final double weightNegatives = ((1 - bias) * data.numInstances()) / counts[0]; 82 final double weightPositives = (bias * data.numInstances()) / counts[1]; 83 84 for (int i = 0; i < data.numInstances(); i++) { 85 Instance instance = data.instance(i); 86 if (instance.value(classIndex) == 0) { 87 instance.setWeight(weightNegatives); 88 } 89 if (instance.value(classIndex) == 1) { 90 instance.setWeight(weightPositives); 91 } 92 } 93 } 76 94 77 95 } -
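The effect of the weighting is easy to verify by hand. With the default bias of 0.5, each class ends up with half of the total weight; the numbers below are invented for illustration:

    // Checking the weight formula with invented counts.
    public class BiasedWeightsIllustration {
        public static void main(String[] args) {
            double bias = 0.5;       // default bias
            int numInstances = 100;  // invented example
            int numNegative = 80;    // counts[0], non-fault-prone
            int numPositive = 20;    // counts[1], fault-prone
            double weightNegatives = ((1 - bias) * numInstances) / numNegative;  // 0.625
            double weightPositives = (bias * numInstances) / numPositive;        // 2.5
            // both classes end up with a total weight of 50
            System.out.println(numNegative * weightNegatives + " " + numPositive * weightPositives);
        }
    }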
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/DataGravitation.java
r10 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Implements an approach for data weighting suggested after Y. Ma, G. Luo, X. Zeng, and A. Chen: Transfer learning for 11 * cross-company software defect prediction. The instances are weighted higher, the more attributes are within the range they are in the training data. 24 * Implements an approach for data weighting suggested after Y. Ma, G. Luo, X. Zeng, and A. Chen: 25 * Transfer learning for cross-company software defect prediction. The instances are weighted 26 * higher, the more attributes are within the range they are in the training data. 27 * 12 28 * @author Steffen Herbold 13 29 */ 14 30 public class DataGravitation implements IProcessesingStrategy, ISetWiseProcessingStrategy { 15 31 16 /** 17 * Does not have parameters. String is ignored. 18 * @param parameters ignored 19 */ 20 @Override 21 public void setParameter(String parameters) { 22 // dummy 23 } 24 25 /* (non-Javadoc) 26 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 27 */ 28 @Override 29 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 30 for( Instances traindata : traindataSet ) { 31 apply(testdata, traindata); 32 } 33 } 32 /** 33 * Does not have parameters. String is ignored. 
34 * 35 * @param parameters 36 * ignored 37 */ 38 @Override 39 public void setParameter(String parameters) { 40 // dummy 41 } 34 42 35 /* (non-Javadoc) 36 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 37 */ 38 @Override 39 public void apply(Instances testdata, Instances traindata) { 40 Attribute classAtt = testdata.classAttribute(); 41 42 double[] minAttValues = new double[testdata.numAttributes()]; 43 double[] maxAttValues = new double[testdata.numAttributes()]; 44 double[] weights = new double[traindata.numInstances()]; 45 double weightsum = 0.0; 46 47 for( int j=0; j<testdata.numAttributes(); j++) { 48 if( testdata.attribute(j)!=classAtt ) { 49 minAttValues[j] = testdata.attributeStats(j).numericStats.min; 50 maxAttValues[j] = testdata.attributeStats(j).numericStats.max; 51 } 52 } 53 54 for( int i=0; i<traindata.numInstances(); i++ ) { 55 Instance inst = traindata.instance(i); 56 int similar = 0; 57 for( int j=0; j<testdata.numAttributes(); j++ ) { 58 if( testdata.attribute(j)!=classAtt ) { 59 if( inst.value(j)>=minAttValues[j] && inst.value(j)<=maxAttValues[j] ) { 60 similar++; 61 } 62 } 63 } 64 weights[i] = similar/Math.sqrt(testdata.numAttributes()-similar); 65 weightsum += weights[i]; 66 } 67 for( int i=0; i<traindata.numInstances(); i++ ) { 68 traindata.instance(i).setWeight(weights[i]*traindata.numInstances()/weightsum); 69 } 70 } 43 /* 44 * (non-Javadoc) 45 * 46 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, 47 * org.apache.commons.collections4.list.SetUniqueList) 48 */ 49 @Override 50 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 51 for (Instances traindata : traindataSet) { 52 apply(testdata, traindata); 53 } 54 } 55 56 /* 57 * (non-Javadoc) 58 * 59 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 60 * weka.core.Instances) 61 */ 62 @Override 63 public void apply(Instances testdata, Instances traindata) { 64 Attribute classAtt = testdata.classAttribute(); 65 66 double[] minAttValues = new double[testdata.numAttributes()]; 67 double[] maxAttValues = new double[testdata.numAttributes()]; 68 double[] weights = new double[traindata.numInstances()]; 69 double weightsum = 0.0; 70 71 for (int j = 0; j < testdata.numAttributes(); j++) { 72 if (testdata.attribute(j) != classAtt) { 73 minAttValues[j] = testdata.attributeStats(j).numericStats.min; 74 maxAttValues[j] = testdata.attributeStats(j).numericStats.max; 75 } 76 } 77 78 for (int i = 0; i < traindata.numInstances(); i++) { 79 Instance inst = traindata.instance(i); 80 int similar = 0; 81 for (int j = 0; j < testdata.numAttributes(); j++) { 82 if (testdata.attribute(j) != classAtt) { 83 if (inst.value(j) >= minAttValues[j] && inst.value(j) <= maxAttValues[j]) { 84 similar++; 85 } 86 } 87 } 88 weights[i] = similar / Math.sqrt(testdata.numAttributes() - similar); 89 weightsum += weights[i]; 90 } 91 for (int i = 0; i < traindata.numInstances(); i++) { 92 traindata.instance(i).setWeight(weights[i] * traindata.numInstances() / weightsum); 93 } 94 } 71 95 72 96 } -
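Per training instance, the weight is similar / sqrt(numAttributes - similar), where similar counts the attributes whose value lies within the [min, max] range observed in the test data; afterwards all weights are rescaled so that they sum up to the number of training instances. An illustration with invented numbers:

    // Weight of a single training instance, invented counts.
    public class DataGravitationIllustration {
        public static void main(String[] args) {
            int numAttributes = 21;  // attribute count of the data set
            int similar = 16;        // attributes of this instance that lie within the test ranges
            double weight = similar / Math.sqrt(numAttributes - similar);  // 16 / sqrt(5), roughly 7.16
            System.out.println(weight);
        }
    }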
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/IProcessesingStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 5 19 6 20 /** 7 * A data processing strategy that is applied to the test data and a single set of training data. 21 * A data processing strategy that is applied to the test data and a single set of training data. 22 * 8 23 * @author Steffen Herbold 9 24 */ 10 25 public interface IProcessesingStrategy extends IParameterizable { 11 12 /** 13 * Applies the processing strategy. 14 * @param testdata test data 15 * @param traindata training data 16 */ 17 void apply(Instances testdata, Instances traindata); 26 27 /** 28 * Applies the processing strategy. 29 * 30 * @param testdata 31 * test data 32 * @param traindata 33 * training data 34 */ 35 void apply(Instances testdata, Instances traindata); 18 36 } -
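A minimal implementation of the interface could look as follows. The class is hypothetical (not part of CrossPare) and is assumed to live in the de.ugoe.cs.cpdp.dataprocessing package:

    // Hypothetical no-op implementation of IProcessesingStrategy.
    package de.ugoe.cs.cpdp.dataprocessing;

    import weka.core.Instances;

    public class NoOpProcessor implements IProcessesingStrategy {

        // The strategy has no parameters; the string is ignored.
        @Override
        public void setParameter(String parameters) {
            // dummy
        }

        // Leaves both data sets untouched.
        @Override
        public void apply(Instances testdata, Instances traindata) {
            // intentionally empty
        }
    }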
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ISetWiseProcessingStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * A data processing strategy that is applied to the test data and multiple sets of training data. 24 * A data processing strategy that is applied to the test data and multiple sets of training data. 25 * 11 26 * @author Steffen Herbold 12 27 */ 13 28 public interface ISetWiseProcessingStrategy extends IParameterizable { 14 29 15 /** 16 * Applies the processing strategy. 17 * @param testdata test data 18 * @param traindataSet training data sets 19 */ 20 void apply(Instances testdata, SetUniqueList<Instances> traindataSet); 21 30 /** 31 * Applies the processing strategy. 32 * 33 * @param testdata 34 * test data 35 * @param traindataSet 36 * training data sets 37 */ 38 void apply(Instances testdata, SetUniqueList<Instances> traindataSet); 39 22 40 }
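Several processors in this changeset (for example DataGravitation, Oversampling, and Resampling) implement the set-wise variant by delegating to their pair-wise variant. The sketch below shows that pattern; the class name DelegatingProcessor is invented and not part of CrossPare:

    // Common delegation pattern: the set-wise apply loops over the candidate training sets.
    package de.ugoe.cs.cpdp.dataprocessing;

    import org.apache.commons.collections4.list.SetUniqueList;
    import weka.core.Instances;

    public class DelegatingProcessor implements ISetWiseProcessingStrategy, IProcessesingStrategy {

        @Override
        public void setParameter(String parameters) {
            // no parameters
        }

        @Override
        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
            for (Instances traindata : traindataSet) {
                apply(testdata, traindata);  // handle each candidate training set separately
            }
        }

        @Override
        public void apply(Instances testdata, Instances traindata) {
            // actual processing of a single test/training pair would go here
        }
    }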
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/InformationGainFilter.java
r10 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 12 26 13 27 /** 14 * Implements an attribute filter that is based on the information gain of each attribute after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on Defect Prediction. 15 * A logistic classifier is trained to separate a random sample of the training data from a random sample of the test data. As standard, the best 50% of attributes are retained. 16 * This ratio can be adjusted using the parameter of the filter (0.5 = 50%). 17 * <br><br> 18 * Best means the least information gain, because this means that the attribute is similar in both test and training data. 28 * Implements an attribute filter that is based on the information gain of each attribute after Z. 29 * He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on 30 * Defect Prediction. A logistic classifier is trained to separate a random sample of the training 31 * data from a random sample of the test data. As standard, the best 50% of attributes are retained. 32 * This ratio can be adjusted using the parameter of the filter (0.5 = 50%). <br> 33 * <br> 34 * Best means the least information gain, because this means that the attribute is similar in both 35 * test and training data. 36 * 19 37 * @author Steffen Herbold 20 38 */ 21 39 public class InformationGainFilter implements ISetWiseProcessingStrategy, IProcessesingStrategy { 22 40 23 /** 24 * size of the random sample that is drawn from both test data and training data 25 */ 26 private final int sampleSize = 500; 27 28 /** 29 * ratio of features that is kept 30 */ 31 private double featureRatio = 0.5; 32 33 /** 34 * Sets the feature ratio. 35 * @param parameters feature ratio 36 */ 37 @Override 38 public void setParameter(String parameters) { 39 if( !"".equals(parameters) ) { 40 featureRatio = Double.parseDouble(parameters); 41 } 42 } 41 /** 42 * size of the random sample that is drawn from both test data and training data 43 */ 44 private final int sampleSize = 500; 43 45 44 /** 45 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 46 */ 47 @Override 48 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 49 for( Instances traindata : traindataSet ) { 50 apply(testdata, traindata, false); 51 } 52 53 } 54 55 /** 56 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 57 */ 58 @Override 59 public void apply(Instances testdata, Instances traindata) { 60 apply(testdata, traindata, true); 61 } 62 63 /** 64 * Internal helper function for the application of the filter to both all data set as well as a single data set. 
65 * @param testdata data of the target product 66 * @param traindata data of the training product 67 * @param removeFromTest defines whether the attributes shall be removed from the test data as well or not 68 */ 69 private void apply(Instances testdata, Instances traindata, boolean removeFromTest) { 70 final Random rand = new Random(1); 71 final int removalNumber = (int) (featureRatio*(testdata.numAttributes()-1)); 72 73 final int classIndex = testdata.classIndex(); 74 75 // sample instances 76 final Instances sample = new Instances(testdata); 77 for( int j=0; j<sampleSize; j++ ) { 78 Instance inst = new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 79 inst.setDataset(sample); 80 inst.setClassValue(1.0); 81 sample.add(inst); 82 inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances()))); 83 inst.setDataset(sample); 84 inst.setClassValue(0.0); 85 sample.add(inst); 86 } 87 88 final double[] gain = new double[sample.numAttributes()]; 89 90 final InfoGainAttributeEval gainEval = new InfoGainAttributeEval(); 91 try { 92 gainEval.buildEvaluator(sample); 93 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 94 //if( sample.classAttribute().equals(sample.attribute(i)) ) { 95 // gain[i] = 0.0; 96 //} else { 97 if( !sample.classAttribute().equals(sample.attribute(i)) ) { 98 gain[i] = gainEval.evaluateAttribute(i); 99 } 100 } 101 } catch (Exception e) { 102 //throw new RuntimeException("could not determine information gain for all attributes", e); 103 // ignore exception; it is caused by attributes that are extremely 104 } 105 106 // select best attributes 107 final double[] gainCopy = Arrays.copyOf(gain, gain.length); 108 Arrays.sort(gainCopy); 109 final double cutoffGain = gainCopy[testdata.numAttributes()-removalNumber]; 110 111 for( int i=testdata.numAttributes()-1; i>=0 ; i-- ) { 112 if( gain[i]>=cutoffGain && i!=classIndex) { 113 traindata.deleteAttributeAt(i); 114 if( removeFromTest ) { 115 testdata.deleteAttributeAt(i); 116 } 117 } 118 } 119 } 46 /** 47 * ratio of features that is kept 48 */ 49 private double featureRatio = 0.5; 50 51 /** 52 * Sets the feature ratio. 53 * 54 * @param parameters 55 * feature ratio 56 */ 57 @Override 58 public void setParameter(String parameters) { 59 if (!"".equals(parameters)) { 60 featureRatio = Double.parseDouble(parameters); 61 } 62 } 63 64 /** 65 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 66 * org.apache.commons.collections4.list.SetUniqueList) 67 */ 68 @Override 69 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 70 for (Instances traindata : traindataSet) { 71 apply(testdata, traindata, false); 72 } 73 74 } 75 76 /** 77 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 78 * weka.core.Instances) 79 */ 80 @Override 81 public void apply(Instances testdata, Instances traindata) { 82 apply(testdata, traindata, true); 83 } 84 85 /** 86 * Internal helper function for the application of the filter to both all data set as well as a 87 * single data set. 
88 * 89 * @param testdata 90 * data of the target product 91 * @param traindata 92 * data of the training product 93 * @param removeFromTest 94 * defines whether the attributes shall be removed from the test data as well or not 95 */ 96 private void apply(Instances testdata, Instances traindata, boolean removeFromTest) { 97 final Random rand = new Random(1); 98 final int removalNumber = (int) (featureRatio * (testdata.numAttributes() - 1)); 99 100 final int classIndex = testdata.classIndex(); 101 102 // sample instances 103 final Instances sample = new Instances(testdata); 104 for (int j = 0; j < sampleSize; j++) { 105 Instance inst = 106 new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 107 inst.setDataset(sample); 108 inst.setClassValue(1.0); 109 sample.add(inst); 110 inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances()))); 111 inst.setDataset(sample); 112 inst.setClassValue(0.0); 113 sample.add(inst); 114 } 115 116 final double[] gain = new double[sample.numAttributes()]; 117 118 final InfoGainAttributeEval gainEval = new InfoGainAttributeEval(); 119 try { 120 gainEval.buildEvaluator(sample); 121 for (int i = 0; i < testdata.numAttributes(); i++) { 122 // if( sample.classAttribute().equals(sample.attribute(i)) ) { 123 // gain[i] = 0.0; 124 // } else { 125 if (!sample.classAttribute().equals(sample.attribute(i))) { 126 gain[i] = gainEval.evaluateAttribute(i); 127 } 128 } 129 } 130 catch (Exception e) { 131 // throw new RuntimeException("could not determine information gain for all attributes", 132 // e); 133 // ignore exception; it is caused by attributes that are extremely 134 } 135 136 // select best attributes 137 final double[] gainCopy = Arrays.copyOf(gain, gain.length); 138 Arrays.sort(gainCopy); 139 final double cutoffGain = gainCopy[testdata.numAttributes() - removalNumber]; 140 141 for (int i = testdata.numAttributes() - 1; i >= 0; i--) { 142 if (gain[i] >= cutoffGain && i != classIndex) { 143 traindata.deleteAttributeAt(i); 144 if (removeFromTest) { 145 testdata.deleteAttributeAt(i); 146 } 147 } 148 } 149 } 120 150 121 151 } -
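A direct, stand-alone invocation could look like the sketch below; the ARFF file names are placeholders and the parameter value follows the Javadoc above (0.5 corresponds to the default of keeping half of the attributes):

    // Minimal stand-alone use of InformationGainFilter (file names are placeholders).
    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;
    import de.ugoe.cs.cpdp.dataprocessing.InformationGainFilter;

    public class InformationGainFilterSketch {
        public static void main(String[] args) throws Exception {
            Instances testdata = DataSource.read("target-product.arff");
            Instances traindata = DataSource.read("training-product.arff");
            testdata.setClassIndex(testdata.numAttributes() - 1);
            traindata.setClassIndex(traindata.numAttributes() - 1);

            InformationGainFilter filter = new InformationGainFilter();
            filter.setParameter("0.5");         // feature ratio as described in the Javadoc above
            filter.apply(testdata, traindata);  // the single-set variant also prunes the test data
        }
    }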
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/LogarithmTransform.java
r40 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Logarithm transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression Models for Predicting Fault-prone Code across Software Projects. 11 * <br><br> 12 * Transform each attribute value x into log(x+1). 24 * Logarithm transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression Models for 25 * Predicting Fault-prone Code across Software Projects. <br> 26 * <br> 27 * Transform each attribute value x into log(x+1). 28 * 13 29 * @author Steffen Herbold 14 30 */ 15 31 public class LogarithmTransform implements ISetWiseProcessingStrategy, IProcessesingStrategy { 16 32 17 /** 18 * Does not have parameters. String is ignored. 19 * @param parameters ignored 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 // dummy 24 } 33 /** 34 * Does not have parameters. String is ignored. 35 * 36 * @param parameters 37 * ignored 38 */ 39 @Override 40 public void setParameter(String parameters) { 41 // dummy 42 } 25 43 26 /** 27 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 28 */ 29 @Override 30 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 31 final Attribute classAttribute = testdata.classAttribute(); 32 33 // preprocess testdata 34 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 35 Instance instance = testdata.instance(i); 36 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 37 if( testdata.attribute(j)!=classAttribute && testdata.attribute(j).isNumeric() ) { 38 if( instance.value(j) < 0 ) { 39 instance.setValue(j, (-1*(Math.log(-1*instance.value(j))))); 40 }else { 41 instance.setValue(j, Math.log(1+instance.value(j))); 42 } 43 } 44 } 45 } 46 47 // preprocess training data 48 for( Instances traindata : traindataSet ) { 49 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 50 Instance instance = traindata.instance(i); 51 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 52 if( traindata.attribute(j)!=classAttribute && traindata.attribute(j).isNumeric() ) { 53 if( instance.value(j) < 0 ) { 54 instance.setValue(j, (-1*(Math.log(-1*instance.value(j))))); 55 }else { 56 instance.setValue(j, Math.log(1+instance.value(j))); 57 } 58 } 59 } 60 } 61 } 62 } 44 /** 45 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 46 * org.apache.commons.collections4.list.SetUniqueList) 47 */ 48 @Override 49 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 50 final Attribute classAttribute = testdata.classAttribute(); 63 51 64 /** 65 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 66 */ 67 @Override 68 public void apply(Instances testdata, Instances traindata) { 69 final Attribute classAttribute = 
testdata.classAttribute(); 70 71 // preprocess testdata 72 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 73 Instance instance = testdata.instance(i); 74 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 75 if( testdata.attribute(j)!=classAttribute && testdata.attribute(j).isNumeric() ) { 76 if( instance.value(j) < 0 ) { 77 instance.setValue(j, (-1*(Math.log(-1*instance.value(j))))); 78 }else { 79 instance.setValue(j, Math.log(1+instance.value(j))); 80 } 81 } 82 } 83 } 84 85 // preprocess training data 86 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 87 Instance instance = traindata.instance(i); 88 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 89 if( traindata.attribute(j)!=classAttribute && traindata.attribute(j).isNumeric() ) { 90 if( instance.value(j) < 0 ) { 91 instance.setValue(j, (-1*(Math.log(-1*instance.value(j))))); 92 }else { 93 instance.setValue(j, Math.log(1+instance.value(j))); 94 } 95 } 96 } 97 } 98 } 52 // preprocess testdata 53 for (int i = 0; i < testdata.numInstances(); i++) { 54 Instance instance = testdata.instance(i); 55 for (int j = 0; j < testdata.numAttributes(); j++) { 56 if (testdata.attribute(j) != classAttribute && testdata.attribute(j).isNumeric()) { 57 if (instance.value(j) < 0) { 58 instance.setValue(j, (-1 * (Math.log(-1 * instance.value(j))))); 59 } 60 else { 61 instance.setValue(j, Math.log(1 + instance.value(j))); 62 } 63 } 64 } 65 } 66 67 // preprocess training data 68 for (Instances traindata : traindataSet) { 69 for (int i = 0; i < traindata.numInstances(); i++) { 70 Instance instance = traindata.instance(i); 71 for (int j = 0; j < testdata.numAttributes(); j++) { 72 if (traindata.attribute(j) != classAttribute && 73 traindata.attribute(j).isNumeric()) 74 { 75 if (instance.value(j) < 0) { 76 instance.setValue(j, (-1 * (Math.log(-1 * instance.value(j))))); 77 } 78 else { 79 instance.setValue(j, Math.log(1 + instance.value(j))); 80 } 81 } 82 } 83 } 84 } 85 } 86 87 /** 88 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 89 * weka.core.Instances) 90 */ 91 @Override 92 public void apply(Instances testdata, Instances traindata) { 93 final Attribute classAttribute = testdata.classAttribute(); 94 95 // preprocess testdata 96 for (int i = 0; i < testdata.numInstances(); i++) { 97 Instance instance = testdata.instance(i); 98 for (int j = 0; j < testdata.numAttributes(); j++) { 99 if (testdata.attribute(j) != classAttribute && testdata.attribute(j).isNumeric()) { 100 if (instance.value(j) < 0) { 101 instance.setValue(j, (-1 * (Math.log(-1 * instance.value(j))))); 102 } 103 else { 104 instance.setValue(j, Math.log(1 + instance.value(j))); 105 } 106 } 107 } 108 } 109 110 // preprocess training data 111 for (int i = 0; i < traindata.numInstances(); i++) { 112 Instance instance = traindata.instance(i); 113 for (int j = 0; j < testdata.numAttributes(); j++) { 114 if (traindata.attribute(j) != classAttribute && traindata.attribute(j).isNumeric()) 115 { 116 if (instance.value(j) < 0) { 117 instance.setValue(j, (-1 * (Math.log(-1 * instance.value(j))))); 118 } 119 else { 120 instance.setValue(j, Math.log(1 + instance.value(j))); 121 } 122 } 123 } 124 } 125 } 99 126 } -
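The transformation maps non-negative values x to log(1 + x) and negative values to -log(-x), as in the code above. A small illustration with invented values:

    // Signed logarithm transform applied to a few invented attribute values.
    public class LogarithmTransformIllustration {
        public static void main(String[] args) {
            double[] values = { 0.0, 9.0, -Math.E };
            for (double x : values) {
                double t = x < 0 ? -Math.log(-x) : Math.log(1 + x);
                System.out.println(x + " -> " + t);  // 0 -> 0, 9 -> approx. 2.303, -e -> approx. -1
            }
        }
    }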
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/MedianAsReference.java
r40 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Median as reference transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression Models for Predicting Fault-prone Code across Software Projects 11 * <br><br> 12 * For each attribute value x, the new value is x + (median of the test data - median of the current project) 24 * Median as reference transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression 25 * Models for Predicting Fault-prone Code across Software Projects <br> 26 * <br> 27 * For each attribute value x, the new value is x + (median of the test data - median of the current 28 * project) 29 * 13 30 * @author Steffen Herbold 14 31 */ 15 32 public class MedianAsReference implements ISetWiseProcessingStrategy, IProcessesingStrategy { 16 33 17 /** 18 * Does not have parameters. String is ignored. 19 * @param parameters ignored 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 // dummy 24 } 34 /** 35 * Does not have parameters. String is ignored. 36 * 37 * @param parameters 38 * ignored 39 */ 40 @Override 41 public void setParameter(String parameters) { 42 // dummy 43 } 25 44 26 /** 27 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 28 */ 29 @Override 30 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 31 final Attribute classAttribute = testdata.classAttribute(); 32 final double[] median = new double[testdata.numAttributes()]; 33 34 // test and train have the same number of attributes 35 Attribute traindataClassAttribute; 36 double[] currentmedian = new double[testdata.numAttributes()]; 37 38 // get medians 39 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 40 if( testdata.attribute(j)!=classAttribute ) { 41 median[j] = testdata.kthSmallestValue(j, (testdata.numInstances()+1)>>1); // (>>2 -> /2) 42 } 43 } 44 45 // preprocess training data 46 for( Instances traindata : traindataSet ) { 47 // get median of current training set 48 traindataClassAttribute = traindata.classAttribute(); 49 for( int j=0 ; j<traindata.numAttributes() ; j++ ) { 50 if( traindata.attribute(j)!=traindataClassAttribute && traindata.attribute(j).isNumeric()) { 51 currentmedian[j] = traindata.kthSmallestValue(j, (traindata.numInstances()+1)>>1); // (>>2 -> /2) 52 } 53 } 54 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 55 Instance instance = traindata.instance(i); 56 for( int j=0 ; j<traindata.numAttributes() ; j++ ) { 57 if( traindata.attribute(j)!=classAttribute && traindata.attribute(j).isNumeric() ) { 58 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); 59 } 60 } 61 } 62 } 63 } 45 /** 46 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 47 * org.apache.commons.collections4.list.SetUniqueList) 
48 */ 49 @Override 50 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 51 final Attribute classAttribute = testdata.classAttribute(); 52 final double[] median = new double[testdata.numAttributes()]; 64 53 65 /** 66 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 67 */ 68 @Override 69 public void apply(Instances testdata, Instances traindata) { 70 final Attribute classAttribute = testdata.classAttribute(); 71 final Attribute traindataClassAttribute = traindata.classAttribute(); 72 final double[] median = new double[testdata.numAttributes()]; 54 // test and train have the same number of attributes 55 Attribute traindataClassAttribute; 56 double[] currentmedian = new double[testdata.numAttributes()]; 73 57 74 // test and train have the same number of attributes 75 double[] currentmedian = new double[testdata.numAttributes()]; 76 77 // get medians 78 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 79 if( testdata.attribute(j)!=classAttribute ) { 80 median[j] = testdata.kthSmallestValue(j, (testdata.numInstances()+1)>>1); // (>>2 -> /2) 81 } 82 } 58 // get medians 59 for (int j = 0; j < testdata.numAttributes(); j++) { 60 if (testdata.attribute(j) != classAttribute) { 61 median[j] = testdata.kthSmallestValue(j, (testdata.numInstances() + 1) >> 1); // (>>2 62 // -> 63 // /2) 64 } 65 } 83 66 84 // get median of current training set 85 for( int j=0 ; j<traindata.numAttributes() ; j++ ) { 86 if( traindata.attribute(j)!=traindataClassAttribute && traindata.attribute(j).isNumeric() ) { 87 currentmedian[j] = traindata.kthSmallestValue(j, (traindata.numInstances()+1)>>1); // (>>2 -> /2) 88 } 89 } 90 91 // preprocess training data 92 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 93 Instance instance = traindata.instance(i); 94 for( int j=0 ; j<traindata.numAttributes() ; j++ ) { 95 if( traindata.attribute(j)!=classAttribute && traindata.attribute(j).isNumeric() ) { 96 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); 97 } 98 } 99 } 100 } 67 // preprocess training data 68 for (Instances traindata : traindataSet) { 69 // get median of current training set 70 traindataClassAttribute = traindata.classAttribute(); 71 for (int j = 0; j < traindata.numAttributes(); j++) { 72 if (traindata.attribute(j) != traindataClassAttribute && 73 traindata.attribute(j).isNumeric()) 74 { 75 currentmedian[j] = 76 traindata.kthSmallestValue(j, (traindata.numInstances() + 1) >> 1); // (>>2 77 // -> 78 // /2) 79 } 80 } 81 for (int i = 0; i < traindata.numInstances(); i++) { 82 Instance instance = traindata.instance(i); 83 for (int j = 0; j < traindata.numAttributes(); j++) { 84 if (traindata.attribute(j) != classAttribute && 85 traindata.attribute(j).isNumeric()) 86 { 87 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); 88 } 89 } 90 } 91 } 92 } 93 94 /** 95 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 96 * weka.core.Instances) 97 */ 98 @Override 99 public void apply(Instances testdata, Instances traindata) { 100 final Attribute classAttribute = testdata.classAttribute(); 101 final Attribute traindataClassAttribute = traindata.classAttribute(); 102 final double[] median = new double[testdata.numAttributes()]; 103 104 // test and train have the same number of attributes 105 double[] currentmedian = new double[testdata.numAttributes()]; 106 107 // get medians 108 for (int j = 0; j < testdata.numAttributes(); j++) { 109 if (testdata.attribute(j) 
!= classAttribute) { 110 median[j] = testdata.kthSmallestValue(j, (testdata.numInstances() + 1) >> 1); // (>>2 111 // -> 112 // /2) 113 } 114 } 115 116 // get median of current training set 117 for (int j = 0; j < traindata.numAttributes(); j++) { 118 if (traindata.attribute(j) != traindataClassAttribute && 119 traindata.attribute(j).isNumeric()) 120 { 121 currentmedian[j] = 122 traindata.kthSmallestValue(j, (traindata.numInstances() + 1) >> 1); // (>>2 -> 123 // /2) 124 } 125 } 126 127 // preprocess training data 128 for (int i = 0; i < traindata.numInstances(); i++) { 129 Instance instance = traindata.instance(i); 130 for (int j = 0; j < traindata.numAttributes(); j++) { 131 if (traindata.attribute(j) != classAttribute && traindata.attribute(j).isNumeric()) 132 { 133 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); 134 } 135 } 136 } 137 } 101 138 102 139 } -
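Each non-class training value x of attribute j is shifted by the difference of the medians, i.e. x + (medianTest[j] - medianTrain[j]); the medians are read via kthSmallestValue at position (numInstances + 1) >> 1. An illustration with invented numbers:

    // Shifting a single training value with invented medians.
    public class MedianAsReferenceIllustration {
        public static void main(String[] args) {
            double medianTest  = 12.0;  // median of attribute j in the test data
            double medianTrain = 30.0;  // median of attribute j in the training data
            double x = 25.0;            // a single training value of attribute j
            double shifted = x + (medianTest - medianTrain);  // 25 + (12 - 30) = 7
            System.out.println(shifted);
        }
    }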
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/NominalAttributeFilter.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 10 24 11 25 /** 12 * Filters the given dataset for an nominal attribute. 13 * Every instance that has a value of thedefined values of the given nominal attribute is removed.26 * Filters the given dataset for an nominal attribute. Every instance that has a value of the 27 * defined values of the given nominal attribute is removed. 14 28 * 15 * 16 * (e.g. param="CONFIDECNE low middle"; all instances where the "CONFIDENCE" attribute 17 * value is"low" or "middle" are removed from the dataset)29 * 30 * (e.g. param="CONFIDECNE low middle"; all instances where the "CONFIDENCE" attribute value is 31 * "low" or "middle" are removed from the dataset) 18 32 */ 19 33 20 public class NominalAttributeFilter implements IProcessesingStrategy {34 public class NominalAttributeFilter implements IProcessesingStrategy { 21 35 22 private String nominalAttributeName = ""; 23 private String[] nominalAttributeValues = new String[]{}; 24 25 /** 26 * Sets the nominal attribute name (first parameter) and the nominal attribute values (other 27 * parameters), which should be removed from the dataset. 
28 * 29 * @param parameters string with the blank-separated parameters (first parameter 30 * is the name of the nominal attribute, everything else are the values) 31 */ 32 @Override 33 public void setParameter(String parameters) { 34 if( parameters!=null ) { 35 String[] parameter = parameters.split(" "); 36 nominalAttributeName = parameter[0]; 37 nominalAttributeValues = Arrays.copyOfRange(parameter, 1, parameter.length); 38 } 39 } 40 41 /* (non-Javadoc) 42 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 43 */ 44 @Override 45 public void apply(Instances testdata, Instances traindata) { 46 int indexOfConfidenceAttribute = -1; 47 48 // Find index of the named confidence attribute to filter for 49 for(int i=0; i<traindata.numAttributes(); i++) { 50 if(traindata.attribute(i).name().equals(nominalAttributeName)) { 51 indexOfConfidenceAttribute = i; 52 } 53 } 54 55 // if it was not found return 56 if(indexOfConfidenceAttribute == -1) { 57 return; 58 } 59 60 // Find index of nominal values 61 Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute); 62 ArrayList<Object> nominalValuesOfConfidenceAttribute = Collections.list(confidenceAttribute.enumerateValues()); 63 ArrayList<Double> indexOfnominalAttributeValues = new ArrayList<Double>(); 64 65 66 for(int k=0; k<nominalValuesOfConfidenceAttribute.size(); k++) { 67 for(String attributeValue : nominalAttributeValues) { 68 if(((String)nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) { 69 indexOfnominalAttributeValues.add((double) k); 70 } 71 } 72 } 36 private String nominalAttributeName = ""; 37 private String[] nominalAttributeValues = new String[] { }; 73 38 74 75 76 77 // Go through all instances and check if nominal attribute equals 78 for(int j=traindata.numInstances()-1; j>=0; j--) { 79 Instance wekaInstance = traindata.get(j); 80 81 // delete all instances where nominal attribute has the value of one of the parameter 82 if(indexOfnominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) { 83 traindata.delete(j); 84 } 85 } 86 } 39 /** 40 * Sets the nominal attribute name (first parameter) and the nominal attribute values (other 41 * parameters), which should be removed from the dataset. 
42 * 43 * @param parameters 44 * string with the blank-separated parameters (first parameter is the name of the 45 * nominal attribute, everything else are the values) 46 */ 47 @Override 48 public void setParameter(String parameters) { 49 if (parameters != null) { 50 String[] parameter = parameters.split(" "); 51 nominalAttributeName = parameter[0]; 52 nominalAttributeValues = Arrays.copyOfRange(parameter, 1, parameter.length); 53 } 54 } 55 56 /* 57 * (non-Javadoc) 58 * 59 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 60 * weka.core.Instances) 61 */ 62 @Override 63 public void apply(Instances testdata, Instances traindata) { 64 int indexOfConfidenceAttribute = -1; 65 66 // Find index of the named confidence attribute to filter for 67 for (int i = 0; i < traindata.numAttributes(); i++) { 68 if (traindata.attribute(i).name().equals(nominalAttributeName)) { 69 indexOfConfidenceAttribute = i; 70 } 71 } 72 73 // if it was not found return 74 if (indexOfConfidenceAttribute == -1) { 75 return; 76 } 77 78 // Find index of nominal values 79 Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute); 80 ArrayList<Object> nominalValuesOfConfidenceAttribute = 81 Collections.list(confidenceAttribute.enumerateValues()); 82 ArrayList<Double> indexOfnominalAttributeValues = new ArrayList<Double>(); 83 84 for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) { 85 for (String attributeValue : nominalAttributeValues) { 86 if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) { 87 indexOfnominalAttributeValues.add((double) k); 88 } 89 } 90 } 91 92 // Go through all instances and check if nominal attribute equals 93 for (int j = traindata.numInstances() - 1; j >= 0; j--) { 94 Instance wekaInstance = traindata.get(j); 95 96 // delete all instances where nominal attribute has the value of one of the parameter 97 if (indexOfnominalAttributeValues.contains(wekaInstance 98 .value(indexOfConfidenceAttribute))) 99 { 100 traindata.delete(j); 101 } 102 } 103 } 87 104 88 105 } -
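A direct invocation mirroring the Javadoc example could look like this; the attribute name CONFIDENCE, its values, and the file names are placeholders:

    // Minimal stand-alone use of NominalAttributeFilter (all names are placeholders).
    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;
    import de.ugoe.cs.cpdp.dataprocessing.NominalAttributeFilter;

    public class NominalAttributeFilterSketch {
        public static void main(String[] args) throws Exception {
            Instances testdata = DataSource.read("target-product.arff");
            Instances traindata = DataSource.read("training-product.arff");
            testdata.setClassIndex(testdata.numAttributes() - 1);
            traindata.setClassIndex(traindata.numAttributes() - 1);

            NominalAttributeFilter filter = new NominalAttributeFilter();
            filter.setParameter("CONFIDENCE low middle");  // attribute name followed by the values to drop
            filter.apply(testdata, traindata);             // removes matching instances from the training data only
        }
    }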
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Normalization.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 9 23 10 24 /** 11 * Normalizes each attribute of each data set separately. 25 * Normalizes each attribute of each data set separately. 26 * 12 27 * @author Steffen Herbold 13 28 */ 14 29 public class Normalization implements ISetWiseProcessingStrategy, IProcessesingStrategy { 15 30 16 /** 17 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 18 */ 19 @Override 20 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 21 final Attribute classAtt = testdata.classAttribute(); 22 23 for( int i=0 ; i<testdata.numAttributes(); i++) { 24 if( !testdata.attribute(i).equals(classAtt) ) { 25 Stats teststats = testdata.attributeStats(i).numericStats; 26 27 double minVal = teststats.min; 28 double maxVal = teststats.max; 29 30 for( Instances traindata : traindataSet ) { 31 Stats trainstats = traindata.attributeStats(i).numericStats; 32 if( minVal>trainstats.min ) { 33 minVal = trainstats.min; 34 } 35 if( maxVal<trainstats.max ) { 36 maxVal = trainstats.max; 37 } 38 } 39 40 for( int j=0 ; j<testdata.numInstances() ; j++ ) { 41 Instance inst = testdata.instance(j); 42 double newValue = (inst.value(i)-minVal)/(maxVal-minVal); 43 inst.setValue(i, newValue); 44 } 45 46 for( Instances traindata : traindataSet ) { 47 for( int j=0 ; j<traindata.numInstances() ; j++ ) { 48 Instance inst = traindata.instance(j); 49 double newValue = (inst.value(i)-minVal)/(maxVal-minVal); 50 inst.setValue(i, newValue); 51 } 52 } 53 } 54 } 55 56 } 57 58 /** 59 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 60 */ 61 @Override 62 public void apply(Instances testdata, Instances traindata) { 63 final Attribute classAtt = testdata.classAttribute(); 64 65 for( int i=0 ; i<testdata.numAttributes(); i++) { 66 if( !testdata.attribute(i).equals(classAtt) ) { 67 Stats teststats = testdata.attributeStats(i).numericStats; 68 69 double minVal = teststats.min; 70 double maxVal = teststats.max; 71 72 Stats trainstats = traindata.attributeStats(i).numericStats; 73 if( minVal>trainstats.min ) { 74 minVal = trainstats.min; 75 } 76 if( maxVal<trainstats.max ) { 77 maxVal = trainstats.max; 78 } 79 80 for( int j=0 ; j<testdata.numInstances() ; j++ ) { 81 Instance inst = testdata.instance(j); 82 double newValue = (inst.value(i)-minVal)/(maxVal-minVal); 83 inst.setValue(i, newValue); 84 } 85 86 for( int j=0 ; j<traindata.numInstances() ; j++ ) { 87 Instance inst = traindata.instance(j); 88 double newValue = (inst.value(i)-minVal)/(maxVal-minVal); 89 inst.setValue(i, newValue); 90 } 91 } 92 } 93 } 31 /** 32 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 33 * org.apache.commons.collections4.list.SetUniqueList) 34 */ 35 @Override 36 public void 
apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 37 final Attribute classAtt = testdata.classAttribute(); 94 38 95 /** 96 * Does not have parameters. String is ignored. 97 * @param parameters ignored 98 */ 99 @Override 100 public void setParameter(String parameters) { 101 // no parameters 102 } 39 for (int i = 0; i < testdata.numAttributes(); i++) { 40 if (!testdata.attribute(i).equals(classAtt)) { 41 Stats teststats = testdata.attributeStats(i).numericStats; 42 43 double minVal = teststats.min; 44 double maxVal = teststats.max; 45 46 for (Instances traindata : traindataSet) { 47 Stats trainstats = traindata.attributeStats(i).numericStats; 48 if (minVal > trainstats.min) { 49 minVal = trainstats.min; 50 } 51 if (maxVal < trainstats.max) { 52 maxVal = trainstats.max; 53 } 54 } 55 56 for (int j = 0; j < testdata.numInstances(); j++) { 57 Instance inst = testdata.instance(j); 58 double newValue = (inst.value(i) - minVal) / (maxVal - minVal); 59 inst.setValue(i, newValue); 60 } 61 62 for (Instances traindata : traindataSet) { 63 for (int j = 0; j < traindata.numInstances(); j++) { 64 Instance inst = traindata.instance(j); 65 double newValue = (inst.value(i) - minVal) / (maxVal - minVal); 66 inst.setValue(i, newValue); 67 } 68 } 69 } 70 } 71 72 } 73 74 /** 75 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 76 * weka.core.Instances) 77 */ 78 @Override 79 public void apply(Instances testdata, Instances traindata) { 80 final Attribute classAtt = testdata.classAttribute(); 81 82 for (int i = 0; i < testdata.numAttributes(); i++) { 83 if (!testdata.attribute(i).equals(classAtt)) { 84 Stats teststats = testdata.attributeStats(i).numericStats; 85 86 double minVal = teststats.min; 87 double maxVal = teststats.max; 88 89 Stats trainstats = traindata.attributeStats(i).numericStats; 90 if (minVal > trainstats.min) { 91 minVal = trainstats.min; 92 } 93 if (maxVal < trainstats.max) { 94 maxVal = trainstats.max; 95 } 96 97 for (int j = 0; j < testdata.numInstances(); j++) { 98 Instance inst = testdata.instance(j); 99 double newValue = (inst.value(i) - minVal) / (maxVal - minVal); 100 inst.setValue(i, newValue); 101 } 102 103 for (int j = 0; j < traindata.numInstances(); j++) { 104 Instance inst = traindata.instance(j); 105 double newValue = (inst.value(i) - minVal) / (maxVal - minVal); 106 inst.setValue(i, newValue); 107 } 108 } 109 } 110 } 111 112 /** 113 * Does not have parameters. String is ignored. 114 * 115 * @param parameters 116 * ignored 117 */ 118 @Override 119 public void setParameter(String parameters) { 120 // no parameters 121 } 103 122 } -
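For each non-class attribute the values are min-max normalized, x' = (x - min) / (max - min), where min and max are taken over the test data and all training data for that attribute. With invented numbers:

    // Min-max normalization of a single value, invented range.
    public class NormalizationIllustration {
        public static void main(String[] args) {
            double minVal = 2.0;   // smallest value of attribute i across test and training data
            double maxVal = 12.0;  // largest value of attribute i across test and training data
            double x = 7.0;        // a single value of attribute i
            double normalized = (x - minVal) / (maxVal - minVal);  // (7 - 2) / 10 = 0.5
            System.out.println(normalized);
        }
    }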
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Oversampling.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Implements oversampling, a strategy for 11 * handling bias in data. In case there are less positive samples (i.e. 12 * defect-prone) samples in the data than negative samples (i.e. 13 * non-defect-prone), the defect-prone entities are over-sampled such that the 14 * number of defect-prone and non-defect-prone instances is the same afterwards. 15 * This means, that some of the defect-prone entities will be more than once 16 * within the data. 24 * Implements oversampling, a strategy for handling bias in data. In case there are less positive 25 * samples (i.e. defect-prone) samples in the data than negative samples (i.e. non-defect-prone), 26 * the defect-prone entities are over-sampled such that the number of defect-prone and 27 * non-defect-prone instances is the same afterwards. This means, that some of the defect-prone 28 * entities will be more than once within the data. 17 29 * 18 30 * @author Steffen Herbold 19 31 */ 20 public class Oversampling implements IProcessesingStrategy, 21 ISetWiseProcessingStrategy { 32 public class Oversampling implements IProcessesingStrategy, ISetWiseProcessingStrategy { 22 33 23 24 25 26 27 28 29 30 31 32 34 /** 35 * Does not have parameters. String is ignored. 36 * 37 * @param parameters 38 * ignored 39 */ 40 @Override 41 public void setParameter(String parameters) { 42 // dummy 43 } 33 44 34 /* 35 * (non-Javadoc) 36 * 37 * @see 38 * de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka. 39 * core.Instances, org.apache.commons.collections4.list.SetUniqueList) 40 */ 41 @Override 42 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 43 for (Instances traindata : traindataSet) { 44 apply(testdata, traindata); 45 } 46 } 45 /* 46 * (non-Javadoc) 47 * 48 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka. core.Instances, 49 * org.apache.commons.collections4.list.SetUniqueList) 50 */ 51 @Override 52 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 53 for (Instances traindata : traindataSet) { 54 apply(testdata, traindata); 55 } 56 } 47 57 48 /* 49 * (non-Javadoc) 50 * 51 * @see 52 * de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. 53 * Instances, weka.core.Instances) 54 */ 55 @Override 56 public void apply(Instances testdata, Instances traindata) { 58 /* 59 * (non-Javadoc) 60 * 61 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. 
Instances, 62 * weka.core.Instances) 63 */ 64 @Override 65 public void apply(Instances testdata, Instances traindata) { 57 66 58 59 60 61 67 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 68 if (counts[1] < counts[0]) { 69 Instances negatives = new Instances(traindata); 70 Instances positives = new Instances(traindata); 62 71 63 64 65 66 67 68 69 70 72 for (int i = traindata.size() - 1; i >= 0; i--) { 73 if (Double.compare(1.0, negatives.get(i).classValue()) == 0) { 74 negatives.remove(i); 75 } 76 if (Double.compare(0.0, positives.get(i).classValue()) == 0) { 77 positives.remove(i); 78 } 79 } 71 80 72 Resample resample = new Resample(); 73 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 74 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative 75 // weniger zurückgegeben 76 resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]); 77 try { 78 resample.setInputFormat(traindata); 79 positives = Filter.useFilter(positives, resample); 80 } catch (Exception e) { 81 throw new RuntimeException(e); 82 } 83 traindata.clear(); 84 for (int i = 0; i < negatives.size(); i++) { 85 traindata.add(negatives.get(i)); 86 } 87 for (int i = 0; i < positives.size(); i++) { 88 traindata.add(positives.get(i)); 89 } 90 } 91 } 81 Resample resample = new Resample(); 82 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 83 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative 84 // weniger zurückgegeben 85 resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]); 86 try { 87 resample.setInputFormat(traindata); 88 positives = Filter.useFilter(positives, resample); 89 } 90 catch (Exception e) { 91 throw new RuntimeException(e); 92 } 93 traindata.clear(); 94 for (int i = 0; i < negatives.size(); i++) { 95 traindata.add(negatives.get(i)); 96 } 97 for (int i = 0; i < positives.size(); i++) { 98 traindata.add(positives.get(i)); 99 } 100 } 101 } 92 102 93 103 } -
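To see why the Resample filter's sample size is set to 100*counts[0]/counts[1] percent, consider hypothetical class counts (not taken from the changeset): with 80 non-defect-prone and 20 defect-prone instances, the defect-prone subset is resampled to 400%, i.e. 80 instances, so both classes are equally frequent afterwards.

    public class OversamplingPercentSketch {
        public static void main(String[] args) {
            // counts[0] = non-defect-prone, counts[1] = defect-prone (hypothetical values)
            int[] counts = { 80, 20 };
            double sampleSizePercent = (100.0 * counts[0]) / counts[1];
            System.out.println(sampleSizePercent);                     // 400.0
            System.out.println(counts[1] * sampleSizePercent / 100.0); // 80.0 defect-prone instances after oversampling
        }
    }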
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Resampling.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Resamples the data with WEKA {@link Resample} to have a uniform distribution among all classes. 24 * Resamples the data with WEKA {@link Resample} to have a uniform distribution among all classes. 25 * 11 26 * @author Steffen Herbold 12 27 */ 13 public class Resampling implements IProcessesingStrategy, 14 ISetWiseProcessingStrategy { 28 public class Resampling implements IProcessesingStrategy, ISetWiseProcessingStrategy { 15 29 16 17 /** 18 * Does not have parameters. String is ignored. 19 * @param parameters ignored 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 // dummy 24 } 30 /** 31 * Does not have parameters. String is ignored. 32 * 33 * @param parameters 34 * ignored 35 */ 36 @Override 37 public void setParameter(String parameters) { 38 // dummy 39 } 25 40 26 /* 27 * (non-Javadoc) 28 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 29 */ 30 @Override 31 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 32 for( Instances traindata : traindataSet ) { 33 apply(testdata, traindata); 34 } 35 } 41 /* 42 * (non-Javadoc) 43 * 44 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, 45 * org.apache.commons.collections4.list.SetUniqueList) 46 */ 47 @Override 48 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 49 for (Instances traindata : traindataSet) { 50 apply(testdata, traindata); 51 } 52 } 36 53 37 /* 38 * (non-Javadoc) 39 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 40 */ 41 @Override 42 public void apply(Instances testdata, Instances traindata) { 43 Resample resample = new Resample(); 44 resample.setSampleSizePercent(100); 45 resample.setBiasToUniformClass(1.0); 46 47 Instances traindataSample; 48 try { 49 resample.setInputFormat(traindata); 50 traindataSample = Filter.useFilter(traindata, resample); 51 } catch (Exception e) { 52 throw new RuntimeException(e); 53 } 54 traindata.clear(); 55 for( int i=0 ; i<traindataSample.size() ; i++ ) { 56 traindata.add(traindataSample.get(i)); 57 } 58 } 54 /* 55 * (non-Javadoc) 56 * 57 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 58 * weka.core.Instances) 59 */ 60 @Override 61 public void apply(Instances testdata, Instances traindata) { 62 Resample resample = new Resample(); 63 resample.setSampleSizePercent(100); 64 resample.setBiasToUniformClass(1.0); 65 66 Instances traindataSample; 67 try { 68 resample.setInputFormat(traindata); 69 traindataSample = Filter.useFilter(traindata, resample); 70 } 71 catch (Exception e) { 72 throw new RuntimeException(e); 73 } 74 traindata.clear(); 75 for (int i = 0; i < 
traindataSample.size(); i++) { 76 traindata.add(traindataSample.get(i)); 77 } 78 } 59 79 60 80 } -
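The following sketch applies the same Resample setup as Resampling.apply to a small hand-built data set; the attribute names and values are invented for illustration. With setBiasToUniformClass(1.0) and a 100% sample size, the class distribution of the output is approximately uniform.

    import java.util.ArrayList;
    import weka.core.Attribute;
    import weka.core.DenseInstance;
    import weka.core.Instances;
    import weka.filters.Filter;
    import weka.filters.supervised.instance.Resample;

    public class ResamplingSketch {
        public static void main(String[] args) throws Exception {
            // toy data set: one numeric metric, one nominal class with 4 "clean" and 1 "buggy" instance
            ArrayList<Attribute> atts = new ArrayList<>();
            atts.add(new Attribute("loc"));
            ArrayList<String> labels = new ArrayList<>();
            labels.add("clean");
            labels.add("buggy");
            atts.add(new Attribute("bug", labels));
            Instances data = new Instances("toy", atts, 5);
            data.setClassIndex(1);
            double[][] rows = { { 10, 0 }, { 20, 0 }, { 30, 0 }, { 40, 0 }, { 50, 1 } };
            for (double[] row : rows) {
                data.add(new DenseInstance(1.0, row));
            }

            // same filter configuration as in Resampling.apply
            Resample resample = new Resample();
            resample.setSampleSizePercent(100);
            resample.setBiasToUniformClass(1.0);
            resample.setInputFormat(data);
            Instances sampled = Filter.useFilter(data, resample);

            int[] counts = sampled.attributeStats(sampled.classIndex()).nominalCounts;
            System.out.println("clean: " + counts[0] + ", buggy: " + counts[1]);
        }
    }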
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/SimulationFilter.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 12 26 * Filter for the Repast Simulation of Software Projects. 13 27 * 14 * Filters the training dataset in the following way: If 0 is no bug 15 * and 1 means there is a bug in this artifact, then this filter 16 * filters the dataset in this way: 28 * Filters the training dataset in the following way: If 0 is no bug and 1 means there is a bug in 29 * this artifact, then this filter filters the dataset in this way: 17 30 * 18 * 10010111000101110101111011101 19 * x--x-x-----x-x---x-x----x---x 31 * 10010111000101110101111011101 x--x-x-----x-x---x-x----x---x 20 32 * 21 * The instances, which are marked with x in this graphic are included 22 * in the newly created datasetand form the trainingsdataset.33 * The instances, which are marked with x in this graphic are included in the newly created dataset 34 * and form the trainingsdataset. 23 35 * 24 36 * @author Fabian Trautsch 25 * 37 * 26 38 */ 27 39 28 public class SimulationFilter implements IProcessesingStrategy {40 public class SimulationFilter implements IProcessesingStrategy { 29 41 30 31 32 * @param parameters ignored 33 */ 34 @Override 35 public void setParameter(String parameters) { 36 // dummy 37 38 } 42 /** 43 * Does not have parameters. String is ignored. 44 * 45 * @param parameters 46 * ignored 47 */ 48 @Override 49 public void setParameter(String parameters) { 50 // dummy 39 51 40 41 /* 42 * (non-Javadoc) 43 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 44 */ 45 @Override 46 public void apply(Instances testdata, Instances traindata) { 47 Instances newDataSet = new Instances(traindata); 48 traindata.delete(); 49 50 HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>(); 51 52 // This is to add all data, where the first occurence of the file has a bug 53 ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>(); 54 55 // Sort dataset (StateID is connected to the date of commit: Lower StateID 56 // means earlier commit than a higher stateID) 57 Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID"); 58 newDataSet.sort(wekaAttribute); 59 60 61 /* 62 * Logical summary: 63 * If there is an instance that dont have a bug, put it into the hashmap (only unique values in there) 64 * 65 * If there is an instance, that hava a bug look up if it is in the hashmap already (this means: 66 * it does not had a bug before!): If this is true add it to a new dataset and remove it from 67 * the hashmap, so that new changes from "nonBug" -> "bug" for this file can be found. 68 * 69 * If the instance has a bug and is not in the hashmap (this means: The file has a bug with its 70 * first occurence or this file only has bugs and not an instance with no bug), then (if it is 71 * not in the arrayList above) add it to the new dataset. 
This way it is possible to get 72 * the first occurence of a file, which has a bug 73 * 74 */ 75 for(int i=0; i<newDataSet.numInstances(); i++) { 76 Instance wekaInstance = newDataSet.instance(i); 52 } 77 53 78 double newBugLabel = wekaInstance.classValue(); 79 Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name"); 80 Double artifactName = wekaInstance.value(wekaArtifactName); 81 82 if(newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) { 83 artifactNames.put(artifactName, wekaInstance); 84 } else if(newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) { 85 artifactNames.put(artifactName, wekaInstance); 86 } else if(newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) { 87 traindata.add(wekaInstance); 88 artifactNames.remove(artifactName); 89 } else if(newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) { 90 if(!firstOccurenceArtifactNames.contains(artifactName)) { 91 traindata.add(wekaInstance); 92 firstOccurenceArtifactNames.add(artifactName); 93 } 94 } 95 } 96 97 98 // If we have a file, that never had a bug (this is, when it is NOT in the 99 // new created dataset, but it is in the HashMap from above) add it to 100 // the new dataset 101 102 double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0); 103 HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames); 104 105 106 for(Double artifactName : artifactNames.keySet()) { 107 108 for(int i=0; i<artifactNamesinNewDataSet.length; i++) { 109 if(artifactNamesinNewDataSet[i] == artifactName) { 110 artifactNamesCopy.remove(artifactName); 111 } 112 } 113 } 114 115 for(Double artifact: artifactNamesCopy.keySet()) { 116 traindata.add(artifactNamesCopy.get(artifact)); 117 } 118 119 } 54 /* 55 * (non-Javadoc) 56 * 57 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 58 * weka.core.Instances) 59 */ 60 @Override 61 public void apply(Instances testdata, Instances traindata) { 62 Instances newDataSet = new Instances(traindata); 63 traindata.delete(); 64 65 HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>(); 66 67 // This is to add all data, where the first occurence of the file has a bug 68 ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>(); 69 70 // Sort dataset (StateID is connected to the date of commit: Lower StateID 71 // means earlier commit than a higher stateID) 72 Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID"); 73 newDataSet.sort(wekaAttribute); 74 75 /* 76 * Logical summary: If there is an instance that dont have a bug, put it into the hashmap 77 * (only unique values in there) 78 * 79 * If there is an instance, that hava a bug look up if it is in the hashmap already (this 80 * means: it does not had a bug before!): If this is true add it to a new dataset and remove 81 * it from the hashmap, so that new changes from "nonBug" -> "bug" for this file can be 82 * found. 83 * 84 * If the instance has a bug and is not in the hashmap (this means: The file has a bug with 85 * its first occurence or this file only has bugs and not an instance with no bug), then (if 86 * it is not in the arrayList above) add it to the new dataset. 
This way it is possible to 87 * get the first occurence of a file, which has a bug 88 */ 89 for (int i = 0; i < newDataSet.numInstances(); i++) { 90 Instance wekaInstance = newDataSet.instance(i); 91 92 double newBugLabel = wekaInstance.classValue(); 93 Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name"); 94 Double artifactName = wekaInstance.value(wekaArtifactName); 95 96 if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) { 97 artifactNames.put(artifactName, wekaInstance); 98 } 99 else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) { 100 artifactNames.put(artifactName, wekaInstance); 101 } 102 else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) { 103 traindata.add(wekaInstance); 104 artifactNames.remove(artifactName); 105 } 106 else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) { 107 if (!firstOccurenceArtifactNames.contains(artifactName)) { 108 traindata.add(wekaInstance); 109 firstOccurenceArtifactNames.add(artifactName); 110 } 111 } 112 } 113 114 // If we have a file, that never had a bug (this is, when it is NOT in the 115 // new created dataset, but it is in the HashMap from above) add it to 116 // the new dataset 117 118 double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0); 119 HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames); 120 121 for (Double artifactName : artifactNames.keySet()) { 122 123 for (int i = 0; i < artifactNamesinNewDataSet.length; i++) { 124 if (artifactNamesinNewDataSet[i] == artifactName) { 125 artifactNamesCopy.remove(artifactName); 126 } 127 } 128 } 129 130 for (Double artifact : artifactNamesCopy.keySet()) { 131 traindata.add(artifactNamesCopy.get(artifact)); 132 } 133 134 } 120 135 121 136 } -
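The selection graphic in the Javadoc can be reproduced for a single artifact with a few lines of plain Java: a state is selected if it is buggy and either the first state of the artifact or directly preceded by a bug-free state. (The filter's additional rule of keeping one instance for artifacts that never have a bug is omitted in this sketch.)

    public class SimulationFilterSketch {
        public static void main(String[] args) {
            String history = "10010111000101110101111011101";
            StringBuilder marks = new StringBuilder();
            for (int i = 0; i < history.length(); i++) {
                boolean bug = history.charAt(i) == '1';
                boolean firstOrAfterClean = i == 0 || history.charAt(i - 1) == '0';
                marks.append(bug && firstOrAfterClean ? 'x' : '-');
            }
            System.out.println(history); // 10010111000101110101111011101
            System.out.println(marks);   // x--x-x-----x-x---x-x----x---x
        }
    }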
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Undersampling.java
r18 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Implements undersampling, a strategy for handling bias in data. In case there are less positive samples (i.e. defect-prone) samples in the 11 * data than negative samples (i.e. non-defect-prone), the non-defect-prone entities are sampled such thatthe number of defect-prone and non-defect-prone instances is the same afterwards. 24 * Implements undersampling, a strategy for handling bias in data. In case there are less positive 25 * samples (i.e. defect-prone) samples in the data than negative samples (i.e. non-defect-prone), 26 * the non-defect-prone entities are sampled such thatthe number of defect-prone and 27 * non-defect-prone instances is the same afterwards. 28 * 12 29 * @author Steffen Herbold 13 30 */ 14 public class Undersampling implements IProcessesingStrategy, 15 ISetWiseProcessingStrategy { 31 public class Undersampling implements IProcessesingStrategy, ISetWiseProcessingStrategy { 16 32 17 18 /** 19 * Does not have parameters. String is ignored. 20 * @param parameters ignored 21 */ 22 @Override 23 public void setParameter(String parameters) { 24 // dummy 25 } 33 /** 34 * Does not have parameters. String is ignored. 
35 * 36 * @param parameters 37 * ignored 38 */ 39 @Override 40 public void setParameter(String parameters) { 41 // dummy 42 } 26 43 27 /* 28 * (non-Javadoc) 29 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 30 */ 31 @Override 32 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 33 for( Instances traindata : traindataSet ) { 34 apply(testdata, traindata); 35 } 36 } 44 /* 45 * (non-Javadoc) 46 * 47 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, 48 * org.apache.commons.collections4.list.SetUniqueList) 49 */ 50 @Override 51 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 52 for (Instances traindata : traindataSet) { 53 apply(testdata, traindata); 54 } 55 } 37 56 38 /* 39 * (non-Javadoc) 40 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 41 */ 42 @Override 43 public void apply(Instances testdata, Instances traindata) { 44 45 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 46 47 if( counts[1]<counts[0] ) { 48 Instances negatives = new Instances(traindata); 49 Instances positives = new Instances(traindata); 50 51 for( int i=traindata.size()-1 ; i>=0 ; i-- ) { 52 if( Double.compare(1.0, negatives.get(i).classValue())==0 ) { 53 negatives.remove(i); 54 } 55 if( Double.compare(0.0, positives.get(i).classValue())==0 ) { 56 positives.remove(i); 57 } 58 } 59 60 Resample resample = new Resample(); 61 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 62 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger zurückgegeben 63 resample.setSampleSizePercent((100.0* counts[1])/counts[0]); 64 try { 65 resample.setInputFormat(traindata); 66 negatives = Filter.useFilter(negatives, resample); 67 } catch (Exception e) { 68 throw new RuntimeException(e); 69 } 70 traindata.clear(); 71 for( int i=0 ; i<negatives.size() ; i++ ) { 72 traindata.add(negatives.get(i)); 73 } 74 for( int i=0 ; i<positives.size() ; i++ ) { 75 traindata.add(positives.get(i)); 76 } 77 } 78 } 57 /* 58 * (non-Javadoc) 59 * 60 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 61 * weka.core.Instances) 62 */ 63 @Override 64 public void apply(Instances testdata, Instances traindata) { 65 66 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 67 68 if (counts[1] < counts[0]) { 69 Instances negatives = new Instances(traindata); 70 Instances positives = new Instances(traindata); 71 72 for (int i = traindata.size() - 1; i >= 0; i--) { 73 if (Double.compare(1.0, negatives.get(i).classValue()) == 0) { 74 negatives.remove(i); 75 } 76 if (Double.compare(0.0, positives.get(i).classValue()) == 0) { 77 positives.remove(i); 78 } 79 } 80 81 Resample resample = new Resample(); 82 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 83 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger 84 // zurückgegeben 85 resample.setSampleSizePercent((100.0 * counts[1]) / counts[0]); 86 try { 87 resample.setInputFormat(traindata); 88 negatives = Filter.useFilter(negatives, resample); 89 } 90 catch (Exception e) { 91 throw new RuntimeException(e); 92 } 93 traindata.clear(); 94 for (int i = 0; i < negatives.size(); i++) { 95 traindata.add(negatives.get(i)); 96 } 97 for (int i = 0; i < positives.size(); i++) { 98 
traindata.add(positives.get(i)); 99 } 100 } 101 } 79 102 80 103 } -
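Conceptually, undersampling reduces the majority class to the size of the minority class. The Undersampling class does this via WEKA's Resample filter with a sample size of 100*counts[1]/counts[0] percent applied to the non-defect-prone subset; the sketch below illustrates the same effect with a plain random draw without replacement on hypothetical counts.

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;
    import java.util.Random;

    public class UndersamplingSketch {
        public static void main(String[] args) {
            // hypothetical distribution: 80 non-defect-prone, 20 defect-prone instances
            List<Integer> cleanIndices = new ArrayList<>();
            for (int i = 0; i < 80; i++) {
                cleanIndices.add(i);
            }
            Collections.shuffle(cleanIndices, new Random(1));
            // keep only as many non-defect-prone instances as there are defect-prone ones
            List<Integer> kept = cleanIndices.subList(0, 20);
            System.out.println(kept.size() + " of 80 non-defect-prone instances kept");
        }
    }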
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ZScoreNormalization.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 14 28 public class ZScoreNormalization implements ISetWiseProcessingStrategy, IProcessesingStrategy { 15 29 16 /** 17 * Does not have parameters. String is ignored. 18 * @param parameters ignored 19 */ 20 @Override 21 public void setParameter(String parameters) { 22 // dummy 23 } 30 /** 31 * Does not have parameters. String is ignored. 32 * 33 * @param parameters 34 * ignored 35 */ 36 @Override 37 public void setParameter(String parameters) { 38 // dummy 39 } 24 40 25 /** 26 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 27 */ 28 @Override 29 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 30 normalize(testdata); 31 for( Instances instances : traindataSet ) { 32 normalize(instances); 33 } 34 } 41 /** 42 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 43 * org.apache.commons.collections4.list.SetUniqueList) 44 */ 45 @Override 46 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 47 normalize(testdata); 48 for (Instances instances : traindataSet) { 49 normalize(instances); 50 } 51 } 35 52 36 /** 37 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 38 */ 39 @Override 40 public void apply(Instances testdata, Instances traindata) { 41 normalize(testdata); 42 normalize(traindata); 43 } 44 45 private void normalize(Instances instances) { 46 instances.toString(); 47 final Attribute classAttribute = instances.classAttribute(); 48 49 final double[] means = new double[instances.numAttributes()]; 50 final double[] stddevs = new double[instances.numAttributes()]; 51 52 // get means and stddevs of data 53 for( int j=0 ; j<instances.numAttributes() ; j++ ) { 54 if( instances.attribute(j)!=classAttribute ) { 55 means[j] = instances.meanOrMode(j); 56 stddevs[j] = Math.sqrt(instances.variance(j)); 57 } 58 } 59 for( int i=0 ; i<instances.numAttributes(); i++) { 60 if( !instances.attribute(i).equals(classAttribute) ) { 61 for( int j=0 ; j<instances.numInstances() ; j++ ) { 62 Instance inst = instances.get(i); 63 double newValue = (inst.value(i)-means[i])/stddevs[i]; 64 if( newValue==Double.NaN ) { 65 System.out.println("foooooo"); 66 } 67 inst.setValue(i, newValue); 68 } 69 } 70 } 71 } 53 /** 54 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 55 * weka.core.Instances) 56 */ 57 @Override 58 public void apply(Instances testdata, Instances traindata) { 59 normalize(testdata); 60 normalize(traindata); 61 } 62 63 private void normalize(Instances instances) { 64 instances.toString(); 65 final Attribute classAttribute = instances.classAttribute(); 66 67 final double[] means = new double[instances.numAttributes()]; 68 final double[] 
stddevs = new double[instances.numAttributes()]; 69 70 // get means and stddevs of data 71 for (int j = 0; j < instances.numAttributes(); j++) { 72 if (instances.attribute(j) != classAttribute) { 73 means[j] = instances.meanOrMode(j); 74 stddevs[j] = Math.sqrt(instances.variance(j)); 75 } 76 } 77 for (int i = 0; i < instances.numAttributes(); i++) { 78 if (!instances.attribute(i).equals(classAttribute)) { 79 for (int j = 0; j < instances.numInstances(); j++) { 80 Instance inst = instances.get(i); 81 double newValue = (inst.value(i) - means[i]) / stddevs[i]; 82 if (newValue == Double.NaN) { 83 System.out.println("foooooo"); 84 } 85 inst.setValue(i, newValue); 86 } 87 } 88 } 89 } 72 90 73 91 } -
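The normalize method above computes z-scores per attribute. Two details are worth noting: in Java, newValue == Double.NaN is always false, so the NaN check above never triggers and Double.isNaN(newValue) is the idiomatic test; and the inner loop over instances fetches instances.get(i), where i is the attribute index, which looks like it should be instances.get(j). The following standalone sketch shows the transformation itself (population standard deviation, toy values, not part of the changeset):

    public class ZScoreSketch {
        public static void main(String[] args) {
            double[] values = { 2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0 };

            double mean = 0.0;
            for (double v : values) {
                mean += v;
            }
            mean /= values.length;

            double var = 0.0;
            for (double v : values) {
                var += (v - mean) * (v - mean);
            }
            double stddev = Math.sqrt(var / values.length);

            for (int i = 0; i < values.length; i++) {
                double z = (values[i] - mean) / stddev;
                values[i] = Double.isNaN(z) ? 0.0 : z; // guards against zero variance
            }
            System.out.println(java.util.Arrays.toString(values)); // [-1.5, -0.5, -0.5, -0.5, 0.0, 0.0, 1.0, 2.0]
        }
    }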
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ZScoreTargetNormalization.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 12 26 * @author Steffen Herbold 13 27 */ 14 public class ZScoreTargetNormalization implements ISetWiseProcessingStrategy, IProcessesingStrategy { 28 public class ZScoreTargetNormalization implements ISetWiseProcessingStrategy, IProcessesingStrategy 29 { 15 30 16 /** 17 * Does not have parameters. String is ignored. 18 * @param parameters ignored 19 */ 20 @Override 21 public void setParameter(String parameters) { 22 // dummy 23 } 31 /** 32 * Does not have parameters. String is ignored. 33 * 34 * @param parameters 35 * ignored 36 */ 37 @Override 38 public void setParameter(String parameters) { 39 // dummy 40 } 24 41 25 /** 26 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 27 */ 28 @Override 29 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 30 final Attribute classAttribute = testdata.classAttribute(); 31 32 final double[] meanTest = new double[testdata.numAttributes()]; 33 final double[] stddevTest = new double[testdata.numAttributes()]; 34 35 // get means of testdata 36 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 37 if( testdata.attribute(j)!=classAttribute ) { 38 meanTest[j] = testdata.meanOrMode(j); 39 stddevTest[j] = Math.sqrt(testdata.variance(j)); 40 } 41 } 42 43 // preprocess test data 44 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 45 Instance instance = testdata.instance(i); 46 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 47 if( testdata.attribute(j)!=classAttribute ) { 48 instance.setValue(j, instance.value(j)-meanTest[j]/stddevTest[j]); 49 } 50 } 51 } 52 53 // preprocess training data 54 for( Instances traindata : traindataSet ) { 55 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 56 Instance instance = traindata.instance(i); 57 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 58 if( testdata.attribute(j)!=classAttribute ) { 59 instance.setValue(j, instance.value(j)-meanTest[j]/stddevTest[j]); 60 } 61 } 62 } 63 } 64 } 42 /** 43 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 44 * org.apache.commons.collections4.list.SetUniqueList) 45 */ 46 @Override 47 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 48 final Attribute classAttribute = testdata.classAttribute(); 65 49 66 /** 67 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 68 */ 69 @Override 70 public void apply(Instances testdata, Instances traindata) { 71 final Attribute classAttribute = testdata.classAttribute(); 72 73 final double[] meanTest = new double[testdata.numAttributes()]; 74 final double[] stddevTest = new double[testdata.numAttributes()]; 75 76 // get means of testdata 77 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 78 if( 
testdata.attribute(j)!=classAttribute ) { 79 meanTest[j] = testdata.meanOrMode(j); 80 stddevTest[j] = Math.sqrt(testdata.variance(j)); 81 } 82 } 83 84 // preprocess test data 85 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 86 Instance instance = testdata.instance(i); 87 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 88 if( testdata.attribute(j)!=classAttribute ) { 89 instance.setValue(j, instance.value(j)-meanTest[j]/stddevTest[j]); 90 } 91 } 92 } 93 94 // preprocess training data 95 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 96 Instance instance = traindata.instance(i); 97 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 98 if( testdata.attribute(j)!=classAttribute ) { 99 instance.setValue(j, instance.value(j)-meanTest[j]/stddevTest[j]); 100 } 101 } 102 } 103 } 50 final double[] meanTest = new double[testdata.numAttributes()]; 51 final double[] stddevTest = new double[testdata.numAttributes()]; 52 53 // get means of testdata 54 for (int j = 0; j < testdata.numAttributes(); j++) { 55 if (testdata.attribute(j) != classAttribute) { 56 meanTest[j] = testdata.meanOrMode(j); 57 stddevTest[j] = Math.sqrt(testdata.variance(j)); 58 } 59 } 60 61 // preprocess test data 62 for (int i = 0; i < testdata.numInstances(); i++) { 63 Instance instance = testdata.instance(i); 64 for (int j = 0; j < testdata.numAttributes(); j++) { 65 if (testdata.attribute(j) != classAttribute) { 66 instance.setValue(j, instance.value(j) - meanTest[j] / stddevTest[j]); 67 } 68 } 69 } 70 71 // preprocess training data 72 for (Instances traindata : traindataSet) { 73 for (int i = 0; i < traindata.numInstances(); i++) { 74 Instance instance = traindata.instance(i); 75 for (int j = 0; j < testdata.numAttributes(); j++) { 76 if (testdata.attribute(j) != classAttribute) { 77 instance.setValue(j, instance.value(j) - meanTest[j] / stddevTest[j]); 78 } 79 } 80 } 81 } 82 } 83 84 /** 85 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 86 * weka.core.Instances) 87 */ 88 @Override 89 public void apply(Instances testdata, Instances traindata) { 90 final Attribute classAttribute = testdata.classAttribute(); 91 92 final double[] meanTest = new double[testdata.numAttributes()]; 93 final double[] stddevTest = new double[testdata.numAttributes()]; 94 95 // get means of testdata 96 for (int j = 0; j < testdata.numAttributes(); j++) { 97 if (testdata.attribute(j) != classAttribute) { 98 meanTest[j] = testdata.meanOrMode(j); 99 stddevTest[j] = Math.sqrt(testdata.variance(j)); 100 } 101 } 102 103 // preprocess test data 104 for (int i = 0; i < testdata.numInstances(); i++) { 105 Instance instance = testdata.instance(i); 106 for (int j = 0; j < testdata.numAttributes(); j++) { 107 if (testdata.attribute(j) != classAttribute) { 108 instance.setValue(j, instance.value(j) - meanTest[j] / stddevTest[j]); 109 } 110 } 111 } 112 113 // preprocess training data 114 for (int i = 0; i < traindata.numInstances(); i++) { 115 Instance instance = traindata.instance(i); 116 for (int j = 0; j < testdata.numAttributes(); j++) { 117 if (testdata.attribute(j) != classAttribute) { 118 instance.setValue(j, instance.value(j) - meanTest[j] / stddevTest[j]); 119 } 120 } 121 } 122 } 104 123 } -
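In both apply methods the standardization is written as instance.value(j) - meanTest[j] / stddevTest[j]; due to Java operator precedence this divides only the mean, not the difference, which is presumably not the intended z-score relative to the test data. A minimal sketch of the parenthesized form with toy numbers:

    public class ZScoreTargetSketch {
        static double standardize(double value, double meanTest, double stddevTest) {
            return (value - meanTest) / stddevTest; // z-score relative to the TEST data statistics
        }

        public static void main(String[] args) {
            System.out.println(standardize(7.0, 5.0, 2.0)); // 1.0
            System.out.println(7.0 - 5.0 / 2.0);            // 4.5, what the unparenthesized expression computes
        }
    }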
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/AbstractCharacteristicSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 14 28 15 29 /** 16 * Abstract class that implements the foundation of setwise data selection strategies using distributional characteristics. 17 * This class provides the means to transform the data sets into their characteristic vectors. 30 * Abstract class that implements the foundation of setwise data selection strategies using 31 * distributional characteristics. This class provides the means to transform the data sets into 32 * their characteristic vectors. 33 * 18 34 * @author Steffen Herbold 19 35 */ 20 public abstract class AbstractCharacteristicSelection implements 21 ISetWiseDataselectionStrategy { 36 public abstract class AbstractCharacteristicSelection implements ISetWiseDataselectionStrategy { 22 37 23 /** 24 * vector with the distributional characteristics 25 */ 26 private String[] characteristics = new String[]{"mean","stddev"}; 27 28 /** 29 * Sets the distributional characteristics. The names of the characteristics are separated by blanks. 30 */ 31 @Override 32 public void setParameter(String parameters) { 33 if( !"".equals(parameters) ) { 34 characteristics = parameters.split(" "); 35 } 36 } 37 38 /** 39 * Transforms the data into the distributional characteristics. The first instance is the test data, followed by the training data. 
40 * @param testdata test data 41 * @param traindataSet training data sets 42 * @return distributional characteristics of the data 43 */ 44 protected Instances characteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) { 45 // setup weka Instances for clustering 46 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 47 48 final Attribute classAtt = testdata.classAttribute(); 49 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 50 Attribute dataAtt = testdata.attribute(i); 51 if( !dataAtt.equals(classAtt) ) { 52 for( String characteristic : characteristics ) { 53 atts.add(new Attribute(dataAtt.name() + "_" + characteristic)); 54 } 55 } 56 } 57 final Instances data = new Instances("distributional_characteristics", atts, 0); 58 59 // setup data for clustering 60 double[] instanceValues = new double[atts.size()]; 61 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 62 Attribute dataAtt = testdata.attribute(i); 63 if( !dataAtt.equals(classAtt) ) { 64 Stats stats = testdata.attributeStats(i).numericStats; 65 for( int j=0; j<characteristics.length; j++ ) { 66 if( "mean".equals(characteristics[j]) ) { 67 instanceValues[i*characteristics.length+j] = stats.mean; 68 } else if( "stddev".equals(characteristics[j])) { 69 instanceValues[i*characteristics.length+j] = stats.stdDev; 70 } else if( "var".equals(characteristics[j])) { 71 instanceValues[i*characteristics.length+j] = testdata.variance(j); 72 } else { 73 throw new RuntimeException("Unkown distributional characteristic: " + characteristics[j]); 74 } 75 } 76 } 77 } 78 data.add(new DenseInstance(1.0, instanceValues)); 79 80 for( Instances traindata : traindataSet ) { 81 instanceValues = new double[atts.size()]; 82 for( int i=0 ; i<traindata.numAttributes() ; i++ ) { 83 Attribute dataAtt = traindata.attribute(i); 84 if( !dataAtt.equals(classAtt) ) { 85 Stats stats = traindata.attributeStats(i).numericStats; 86 for( int j=0; j<characteristics.length; j++ ) { 87 if( "mean".equals(characteristics[j]) ) { 88 instanceValues[i*characteristics.length+j] = stats.mean; 89 } else if( "stddev".equals(characteristics[j])) { 90 instanceValues[i*characteristics.length+j] = stats.stdDev; 91 } else if( "var".equals(characteristics[j])) { 92 instanceValues[i*characteristics.length+j] = testdata.variance(j); 93 } else { 94 throw new RuntimeException("Unkown distributional characteristic: " + characteristics[j]); 95 } 96 } 97 } 98 } 99 Instance instance = new DenseInstance(1.0, instanceValues); 100 101 data.add(instance); 102 } 103 return data; 104 } 105 106 /** 107 * Returns the normalized distributional characteristics of the training data. 108 * @param testdata test data 109 * @param traindataSet training data sets 110 * @return normalized distributional characteristics of the data 111 */ 112 protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) { 113 Instances data = characteristicInstances(testdata, traindataSet); 114 try { 115 final Normalize normalizer = new Normalize(); 116 normalizer.setInputFormat(data); 117 data = Filter.useFilter(data, normalizer); 118 } catch (Exception e) { 119 throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e); 120 } 121 return data; 122 } 38 /** 39 * vector with the distributional characteristics 40 */ 41 private String[] characteristics = new String[] 42 { "mean", "stddev" }; 43 44 /** 45 * Sets the distributional characteristics. 
The names of the characteristics are separated by 46 * blanks. 47 */ 48 @Override 49 public void setParameter(String parameters) { 50 if (!"".equals(parameters)) { 51 characteristics = parameters.split(" "); 52 } 53 } 54 55 /** 56 * Transforms the data into the distributional characteristics. The first instance is the test 57 * data, followed by the training data. 58 * 59 * @param testdata 60 * test data 61 * @param traindataSet 62 * training data sets 63 * @return distributional characteristics of the data 64 */ 65 protected Instances characteristicInstances(Instances testdata, 66 SetUniqueList<Instances> traindataSet) 67 { 68 // setup weka Instances for clustering 69 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 70 71 final Attribute classAtt = testdata.classAttribute(); 72 for (int i = 0; i < testdata.numAttributes(); i++) { 73 Attribute dataAtt = testdata.attribute(i); 74 if (!dataAtt.equals(classAtt)) { 75 for (String characteristic : characteristics) { 76 atts.add(new Attribute(dataAtt.name() + "_" + characteristic)); 77 } 78 } 79 } 80 final Instances data = new Instances("distributional_characteristics", atts, 0); 81 82 // setup data for clustering 83 double[] instanceValues = new double[atts.size()]; 84 for (int i = 0; i < testdata.numAttributes(); i++) { 85 Attribute dataAtt = testdata.attribute(i); 86 if (!dataAtt.equals(classAtt)) { 87 Stats stats = testdata.attributeStats(i).numericStats; 88 for (int j = 0; j < characteristics.length; j++) { 89 if ("mean".equals(characteristics[j])) { 90 instanceValues[i * characteristics.length + j] = stats.mean; 91 } 92 else if ("stddev".equals(characteristics[j])) { 93 instanceValues[i * characteristics.length + j] = stats.stdDev; 94 } 95 else if ("var".equals(characteristics[j])) { 96 instanceValues[i * characteristics.length + j] = testdata.variance(j); 97 } 98 else { 99 throw new RuntimeException("Unkown distributional characteristic: " + 100 characteristics[j]); 101 } 102 } 103 } 104 } 105 data.add(new DenseInstance(1.0, instanceValues)); 106 107 for (Instances traindata : traindataSet) { 108 instanceValues = new double[atts.size()]; 109 for (int i = 0; i < traindata.numAttributes(); i++) { 110 Attribute dataAtt = traindata.attribute(i); 111 if (!dataAtt.equals(classAtt)) { 112 Stats stats = traindata.attributeStats(i).numericStats; 113 for (int j = 0; j < characteristics.length; j++) { 114 if ("mean".equals(characteristics[j])) { 115 instanceValues[i * characteristics.length + j] = stats.mean; 116 } 117 else if ("stddev".equals(characteristics[j])) { 118 instanceValues[i * characteristics.length + j] = stats.stdDev; 119 } 120 else if ("var".equals(characteristics[j])) { 121 instanceValues[i * characteristics.length + j] = testdata.variance(j); 122 } 123 else { 124 throw new RuntimeException("Unkown distributional characteristic: " + 125 characteristics[j]); 126 } 127 } 128 } 129 } 130 Instance instance = new DenseInstance(1.0, instanceValues); 131 132 data.add(instance); 133 } 134 return data; 135 } 136 137 /** 138 * Returns the normalized distributional characteristics of the training data. 
139 * 140 * @param testdata 141 * test data 142 * @param traindataSet 143 * training data sets 144 * @return normalized distributional characteristics of the data 145 */ 146 protected Instances normalizedCharacteristicInstances(Instances testdata, 147 SetUniqueList<Instances> traindataSet) 148 { 149 Instances data = characteristicInstances(testdata, traindataSet); 150 try { 151 final Normalize normalizer = new Normalize(); 152 normalizer.setInputFormat(data); 153 data = Filter.useFilter(data, normalizer); 154 } 155 catch (Exception e) { 156 throw new RuntimeException( 157 "Unexpected exception during normalization of distributional characteristics.", 158 e); 159 } 160 return data; 161 } 123 162 } -
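characteristicInstances lays the characteristics out as one value per (attribute, characteristic) pair, indexed as i * characteristics.length + j. A standalone sketch of such a vector for two attributes and the default characteristics mean and stddev (invented values, population standard deviation, not part of the changeset):

    public class CharacteristicVectorSketch {
        public static void main(String[] args) {
            String[] characteristics = { "mean", "stddev" };
            double[][] attributeColumns = { { 1.0, 3.0, 5.0 }, { 10.0, 10.0, 40.0 } };

            double[] vector = new double[attributeColumns.length * characteristics.length];
            for (int i = 0; i < attributeColumns.length; i++) {
                double mean = 0.0;
                for (double v : attributeColumns[i]) {
                    mean += v;
                }
                mean /= attributeColumns[i].length;

                double var = 0.0;
                for (double v : attributeColumns[i]) {
                    var += (v - mean) * (v - mean);
                }

                vector[i * characteristics.length] = mean;                                             // "mean"
                vector[i * characteristics.length + 1] = Math.sqrt(var / attributeColumns[i].length); // "stddev"
            }
            System.out.println(java.util.Arrays.toString(vector)); // approx. [3.0, 1.63, 20.0, 14.14]
        }
    }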
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/IPointWiseDataselectionStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 6 20 7 21 /** 8 * Interface for pointwise data selection strategies. 22 * Interface for pointwise data selection strategies. 23 * 9 24 * @author Steffen Herbold 10 25 */ 11 26 public interface IPointWiseDataselectionStrategy extends IParameterizable { 12 27 13 /** 14 * Applies the data selection strategy. 15 * @param testdata test data 16 * @param traindata candidate training data 17 * @return the selected training data 18 */ 19 Instances apply(Instances testdata, Instances traindata); 28 /** 29 * Applies the data selection strategy. 30 * 31 * @param testdata 32 * test data 33 * @param traindata 34 * candidate training data 35 * @return the selected training data 36 */ 37 Instances apply(Instances testdata, Instances traindata); 20 38 } -
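For illustration, a minimal hypothetical implementation of this interface (not part of CrossPare) that performs no selection at all and simply returns a copy of the candidate training data; it assumes it is placed in the de.ugoe.cs.cpdp.dataselection package:

    package de.ugoe.cs.cpdp.dataselection;

    import weka.core.Instances;

    public class NoSelection implements IPointWiseDataselectionStrategy {

        @Override
        public void setParameter(String parameters) {
            // no parameters
        }

        @Override
        public Instances apply(Instances testdata, Instances traindata) {
            // pointwise strategies return the selected training data; here: everything
            return new Instances(traindata);
        }
    }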
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/ISetWiseDataselectionStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 9 23 /** 10 24 * Interface for setwise data selection strategies. 25 * 11 26 * @author Steffen Herbold 12 27 */ 13 28 public interface ISetWiseDataselectionStrategy extends IParameterizable { 14 29 15 /** 16 * Applies a setwise data selection strategy. 17 * @param testdata test data for which the training data is selected 18 * @param traindataSet candidate training data 19 */ 20 void apply(Instances testdata, SetUniqueList<Instances> traindataSet); 30 /** 31 * Applies a setwise data selection strategy. 32 * 33 * @param testdata 34 * test data for which the training data is selected 35 * @param traindataSet 36 * candidate training data 37 */ 38 void apply(Instances testdata, SetUniqueList<Instances> traindataSet); 21 39 } -
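Analogously, a hypothetical setwise strategy (not part of the changeset) showing how selection works in place on the SetUniqueList: it drops all candidate training sets below a configurable minimum size, again assuming the de.ugoe.cs.cpdp.dataselection package.

    package de.ugoe.cs.cpdp.dataselection;

    import org.apache.commons.collections4.list.SetUniqueList;

    import weka.core.Instances;

    public class MinSizeSelection implements ISetWiseDataselectionStrategy {

        private int minInstances = 100;

        @Override
        public void setParameter(String parameters) {
            if (!"".equals(parameters)) {
                minInstances = Integer.parseInt(parameters);
            }
        }

        @Override
        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
            // setwise strategies modify the list of candidate training sets in place
            for (int i = traindataSet.size() - 1; i >= 0; i--) {
                if (traindataSet.get(i).numInstances() < minInstances) {
                    traindataSet.remove(i);
                }
            }
        }
    }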
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PetersFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 13 27 14 28 /** 15 * Filter according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction 16 * <br><br> 17 * This filter does not work, the paper has been withdrawn. 29 * Filter according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction <br> 30 * <br> 31 * This filter does not work, the paper has been withdrawn. 32 * 18 33 * @author Steffen Herbold 19 34 */ … … 21 36 public class PetersFilter implements IPointWiseDataselectionStrategy { 22 37 23 24 /** 25 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 26 */ 27 @Override 28 public void setParameter(String parameters) { 29 // dummy 30 } 38 /** 39 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 40 */ 41 @Override 42 public void setParameter(String parameters) { 43 // dummy 44 } 31 45 32 /** 33 * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances, weka.core.Instances) 34 */ 35 @Override 36 public Instances apply(Instances testdata, Instances traindata) { 37 final Attribute classAttribute = testdata.classAttribute(); 38 39 final double[][] testDoubles = new double[testdata.numInstances()][testdata.numAttributes()]; 40 for( int i=0; i<testdata.numInstances() ; i++ ) { 41 Instance instance = testdata.instance(i); 42 int tmp = 0; 43 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 44 if( testdata.attribute(j)!=classAttribute ) { 45 testDoubles[i][tmp++] = instance.value(j); 46 } 47 } 48 } 49 50 final double[][] trainDoubles = new double[traindata.numInstances()][testdata.numAttributes()]; 51 for( int i=0; i<traindata.numInstances() ; i++ ) { 52 Instance instance = traindata.instance(i); 53 int tmp = 0; 54 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 55 if( testdata.attribute(j)!=classAttribute ) { 56 trainDoubles[i][tmp++] = instance.value(j); 57 } 58 } 59 } 60 61 final List<List<Integer>> fanList = new ArrayList<List<Integer>>(testdata.numInstances()); 62 for( int i=0; i<testdata.numInstances(); i++ ) { 63 fanList.add(new LinkedList<Integer>()); 64 } 65 66 for( int i=0; i<traindata.numInstances(); i++ ) { 67 double minDistance = Double.MAX_VALUE; 68 int minIndex = 0; 69 for( int j=0; j<testdata.numInstances(); j++ ) { 70 double distance = MathArrays.distance(trainDoubles[i], testDoubles[j]); 71 if( distance<minDistance ) { 72 minDistance = distance; 73 minIndex = j; 74 } 75 } 76 fanList.get(minIndex).add(i); 77 } 78 79 final SetUniqueList<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 80 for( int i=0; i<testdata.numInstances(); i++ ) { 81 double minDistance = Double.MAX_VALUE; 82 int minIndex = -1; 83 for( Integer j : fanList.get(i) ) { 84 double distance = MathArrays.distance(testDoubles[i], trainDoubles[j]); 85 if( distance<minDistance && distance>0.0d ) { 86 
minDistance = distance; 87 minIndex = j; 88 } 89 } 90 if( minIndex!=-1 ) { 91 selectedIndex.add(minIndex); 92 } 93 } 94 95 final Instances selected = new Instances(testdata); 96 selected.delete(); 97 for( Integer i : selectedIndex) { 98 selected.add(traindata.instance(i)); 99 } 100 return selected; 101 } 46 /** 47 * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances, 48 * weka.core.Instances) 49 */ 50 @Override 51 public Instances apply(Instances testdata, Instances traindata) { 52 final Attribute classAttribute = testdata.classAttribute(); 53 54 final double[][] testDoubles = 55 new double[testdata.numInstances()][testdata.numAttributes()]; 56 for (int i = 0; i < testdata.numInstances(); i++) { 57 Instance instance = testdata.instance(i); 58 int tmp = 0; 59 for (int j = 0; j < testdata.numAttributes(); j++) { 60 if (testdata.attribute(j) != classAttribute) { 61 testDoubles[i][tmp++] = instance.value(j); 62 } 63 } 64 } 65 66 final double[][] trainDoubles = 67 new double[traindata.numInstances()][testdata.numAttributes()]; 68 for (int i = 0; i < traindata.numInstances(); i++) { 69 Instance instance = traindata.instance(i); 70 int tmp = 0; 71 for (int j = 0; j < testdata.numAttributes(); j++) { 72 if (testdata.attribute(j) != classAttribute) { 73 trainDoubles[i][tmp++] = instance.value(j); 74 } 75 } 76 } 77 78 final List<List<Integer>> fanList = new ArrayList<List<Integer>>(testdata.numInstances()); 79 for (int i = 0; i < testdata.numInstances(); i++) { 80 fanList.add(new LinkedList<Integer>()); 81 } 82 83 for (int i = 0; i < traindata.numInstances(); i++) { 84 double minDistance = Double.MAX_VALUE; 85 int minIndex = 0; 86 for (int j = 0; j < testdata.numInstances(); j++) { 87 double distance = MathArrays.distance(trainDoubles[i], testDoubles[j]); 88 if (distance < minDistance) { 89 minDistance = distance; 90 minIndex = j; 91 } 92 } 93 fanList.get(minIndex).add(i); 94 } 95 96 final SetUniqueList<Integer> selectedIndex = 97 SetUniqueList.setUniqueList(new LinkedList<Integer>()); 98 for (int i = 0; i < testdata.numInstances(); i++) { 99 double minDistance = Double.MAX_VALUE; 100 int minIndex = -1; 101 for (Integer j : fanList.get(i)) { 102 double distance = MathArrays.distance(testDoubles[i], trainDoubles[j]); 103 if (distance < minDistance && distance > 0.0d) { 104 minDistance = distance; 105 minIndex = j; 106 } 107 } 108 if (minIndex != -1) { 109 selectedIndex.add(minIndex); 110 } 111 } 112 113 final Instances selected = new Instances(testdata); 114 selected.delete(); 115 for (Integer i : selectedIndex) { 116 selected.add(traindata.instance(i)); 117 } 118 return selected; 119 } 102 120 103 121 } -
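The apply method above works in two passes: every training instance is first attached to its nearest test instance (the fanList), then for every test instance the closest member of its fan is selected, skipping exact duplicates with distance 0. A plain-array sketch of these two passes with invented coordinates (the duplicate check is omitted):

    import java.util.ArrayList;
    import java.util.List;

    public class PetersFilterSketch {

        static double distance(double[] a, double[] b) {
            double sum = 0.0;
            for (int k = 0; k < a.length; k++) {
                sum += (a[k] - b[k]) * (a[k] - b[k]);
            }
            return Math.sqrt(sum);
        }

        public static void main(String[] args) {
            double[][] test = { { 0.0, 0.0 }, { 10.0, 10.0 } };
            double[][] train = { { 1.0, 1.0 }, { 2.0, 2.0 }, { 9.0, 9.0 } };

            // pass 1: attach each training instance to its nearest test instance
            List<List<Integer>> fans = new ArrayList<>();
            for (int i = 0; i < test.length; i++) {
                fans.add(new ArrayList<>());
            }
            for (int i = 0; i < train.length; i++) {
                int closestTest = 0;
                for (int j = 1; j < test.length; j++) {
                    if (distance(train[i], test[j]) < distance(train[i], test[closestTest])) {
                        closestTest = j;
                    }
                }
                fans.get(closestTest).add(i);
            }

            // pass 2: per test instance, keep the nearest member of its fan
            for (int i = 0; i < test.length; i++) {
                int selected = -1;
                for (int j : fans.get(i)) {
                    if (selected == -1 || distance(test[i], train[j]) < distance(test[i], train[selected])) {
                        selected = j;
                    }
                }
                System.out.println("test " + i + " selects training instance " + selected);
            }
        }
    }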
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PointWiseEMClusterSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 14 28 import de.ugoe.cs.util.console.Console; 15 29 16 17 30 /** 18 31 * Use in Config: 19 32 * 20 * Specify number of clusters 21 * -N = Num Clusters 22 * <pointwiseselector name="PointWiseEMClusterSelection" param="-N 10"/> 23 * 24 * Try to determine the number of clusters: 25 * -I 10 = max iterations 26 * -X 5 = 5 folds for cross evaluation 27 * -max = max number of clusters 28 * <pointwiseselector name="PointWiseEMClusterSelection" param="-I 10 -X 5 -max 300"/> 33 * Specify number of clusters -N = Num Clusters <pointwiseselector 34 * name="PointWiseEMClusterSelection" param="-N 10"/> 29 35 * 30 * Don't forget to add: 31 * <preprocessor name="Normalization" param=""/> 36 * Try to determine the number of clusters: -I 10 = max iterations -X 5 = 5 folds for cross 37 * evaluation -max = max number of clusters <pointwiseselector name="PointWiseEMClusterSelection" 38 * param="-I 10 -X 5 -max 300"/> 39 * 40 * Don't forget to add: <preprocessor name="Normalization" param=""/> 32 41 */ 33 42 public class PointWiseEMClusterSelection implements IPointWiseDataselectionStrategy { 34 35 private String[] params;36 37 @Override38 public void setParameter(String parameters) {39 params = parameters.split(" ");40 }41 43 42 43 /** 44 * 1. Cluster the traindata 45 * 2. for each instance in the testdata find the assigned cluster 46 * 3. select only traindata from the clusters we found in our testdata 47 * 48 * @returns the selected training data 49 */ 50 @Override 51 public Instances apply(Instances testdata, Instances traindata) { 52 //final Attribute classAttribute = testdata.classAttribute(); 53 54 final List<Integer> selectedCluster = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 44 private String[] params; 55 45 56 // 1. copy train- and testdata 57 Instances train = new Instances(traindata); 58 Instances test = new Instances(testdata); 59 60 Instances selected = null; 61 62 try { 63 // remove class attribute from traindata 64 Remove filter = new Remove(); 65 filter.setAttributeIndices("" + (train.classIndex() + 1)); 66 filter.setInputFormat(train); 67 train = Filter.useFilter(train, filter); 68 69 Console.traceln(Level.INFO, String.format("starting clustering")); 70 71 // 3. cluster data 72 EM clusterer = new EM(); 73 clusterer.setOptions(params); 74 clusterer.buildClusterer(train); 75 int numClusters = clusterer.getNumClusters(); 76 if ( numClusters == -1) { 77 Console.traceln(Level.INFO, String.format("we have unlimited clusters")); 78 }else { 79 Console.traceln(Level.INFO, String.format("we have: "+numClusters+" clusters")); 80 } 81 82 83 // 4. classify testdata, save cluster int 84 85 // remove class attribute from testdata? 
86 Remove filter2 = new Remove(); 87 filter2.setAttributeIndices("" + (test.classIndex() + 1)); 88 filter2.setInputFormat(test); 89 test = Filter.useFilter(test, filter2); 90 91 int cnum; 92 for( int i=0; i < test.numInstances(); i++ ) { 93 cnum = ((EM)clusterer).clusterInstance(test.get(i)); 46 @Override 47 public void setParameter(String parameters) { 48 params = parameters.split(" "); 49 } 94 50 95 // we dont want doubles (maybe use a hashset instead of list?) 96 if ( !selectedCluster.contains(cnum) ) { 97 selectedCluster.add(cnum); 98 //Console.traceln(Level.INFO, String.format("assigned to cluster: "+cnum)); 99 } 100 } 101 102 Console.traceln(Level.INFO, String.format("our testdata is in: "+selectedCluster.size()+" different clusters")); 103 104 // 5. get cluster membership of our traindata 105 AddCluster cfilter = new AddCluster(); 106 cfilter.setClusterer(clusterer); 107 cfilter.setInputFormat(train); 108 Instances ctrain = Filter.useFilter(train, cfilter); 109 110 111 // 6. for all traindata get the cluster int, if it is in our list of testdata cluster int add the traindata 112 // of this cluster to our returned traindata 113 int cnumber; 114 selected = new Instances(traindata); 115 selected.delete(); 116 117 for ( int j=0; j < ctrain.numInstances(); j++ ) { 118 // get the cluster number from the attributes 119 cnumber = Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes()-1).replace("cluster", "")); 120 121 //Console.traceln(Level.INFO, String.format("instance "+j+" is in cluster: "+cnumber)); 122 if ( selectedCluster.contains(cnumber) ) { 123 // this only works if the index does not change 124 selected.add(traindata.get(j)); 125 // check for differences, just one attribute, we are pretty sure the index does not change 126 if ( traindata.get(j).value(3) != ctrain.get(j).value(3) ) { 127 Console.traceln(Level.WARNING, String.format("we have a difference between train an ctrain!")); 128 } 129 } 130 } 131 132 Console.traceln(Level.INFO, String.format("that leaves us with: "+selected.numInstances()+" traindata instances from "+traindata.numInstances())); 133 }catch( Exception e ) { 134 Console.traceln(Level.WARNING, String.format("ERROR")); 135 throw new RuntimeException("error in pointwise em", e); 136 } 137 138 return selected; 139 } 51 /** 52 * 1. Cluster the traindata 2. for each instance in the testdata find the assigned cluster 3. 53 * select only traindata from the clusters we found in our testdata 54 * 55 * @returns the selected training data 56 */ 57 @Override 58 public Instances apply(Instances testdata, Instances traindata) { 59 // final Attribute classAttribute = testdata.classAttribute(); 60 61 final List<Integer> selectedCluster = 62 SetUniqueList.setUniqueList(new LinkedList<Integer>()); 63 64 // 1. copy train- and testdata 65 Instances train = new Instances(traindata); 66 Instances test = new Instances(testdata); 67 68 Instances selected = null; 69 70 try { 71 // remove class attribute from traindata 72 Remove filter = new Remove(); 73 filter.setAttributeIndices("" + (train.classIndex() + 1)); 74 filter.setInputFormat(train); 75 train = Filter.useFilter(train, filter); 76 77 Console.traceln(Level.INFO, String.format("starting clustering")); 78 79 // 3. 
cluster data 80 EM clusterer = new EM(); 81 clusterer.setOptions(params); 82 clusterer.buildClusterer(train); 83 int numClusters = clusterer.getNumClusters(); 84 if (numClusters == -1) { 85 Console.traceln(Level.INFO, String.format("we have unlimited clusters")); 86 } 87 else { 88 Console.traceln(Level.INFO, String.format("we have: " + numClusters + " clusters")); 89 } 90 91 // 4. classify testdata, save cluster int 92 93 // remove class attribute from testdata? 94 Remove filter2 = new Remove(); 95 filter2.setAttributeIndices("" + (test.classIndex() + 1)); 96 filter2.setInputFormat(test); 97 test = Filter.useFilter(test, filter2); 98 99 int cnum; 100 for (int i = 0; i < test.numInstances(); i++) { 101 cnum = ((EM) clusterer).clusterInstance(test.get(i)); 102 103 // we dont want doubles (maybe use a hashset instead of list?) 104 if (!selectedCluster.contains(cnum)) { 105 selectedCluster.add(cnum); 106 // Console.traceln(Level.INFO, String.format("assigned to cluster: "+cnum)); 107 } 108 } 109 110 Console.traceln(Level.INFO, 111 String.format("our testdata is in: " + selectedCluster.size() + 112 " different clusters")); 113 114 // 5. get cluster membership of our traindata 115 AddCluster cfilter = new AddCluster(); 116 cfilter.setClusterer(clusterer); 117 cfilter.setInputFormat(train); 118 Instances ctrain = Filter.useFilter(train, cfilter); 119 120 // 6. for all traindata get the cluster int, if it is in our list of testdata cluster 121 // int add the traindata 122 // of this cluster to our returned traindata 123 int cnumber; 124 selected = new Instances(traindata); 125 selected.delete(); 126 127 for (int j = 0; j < ctrain.numInstances(); j++) { 128 // get the cluster number from the attributes 129 cnumber = 130 Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes() - 1) 131 .replace("cluster", "")); 132 133 // Console.traceln(Level.INFO, 134 // String.format("instance "+j+" is in cluster: "+cnumber)); 135 if (selectedCluster.contains(cnumber)) { 136 // this only works if the index does not change 137 selected.add(traindata.get(j)); 138 // check for differences, just one attribute, we are pretty sure the index does 139 // not change 140 if (traindata.get(j).value(3) != ctrain.get(j).value(3)) { 141 Console.traceln(Level.WARNING, String 142 .format("we have a difference between train an ctrain!")); 143 } 144 } 145 } 146 147 Console.traceln(Level.INFO, 148 String.format("that leaves us with: " + selected.numInstances() + 149 " traindata instances from " + traindata.numInstances())); 150 } 151 catch (Exception e) { 152 Console.traceln(Level.WARNING, String.format("ERROR")); 153 throw new RuntimeException("error in pointwise em", e); 154 } 155 156 return selected; 157 } 140 158 141 159 } -
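Stripped of the WEKA plumbing, the selection boils down to a set membership test: cluster the training data, determine which clusters the test instances fall into, and keep only training instances from those clusters. A sketch with hypothetical, precomputed cluster labels (in the real strategy they come from the EM clusterer):

    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    public class ClusterSelectionSketch {
        public static void main(String[] args) {
            int[] trainClusters = { 0, 0, 1, 2, 2, 3 }; // cluster id per training instance
            int[] testClusters = { 0, 2 };              // cluster id per test instance

            // clusters that contain at least one test instance
            Set<Integer> selectedClusters = new HashSet<>();
            for (int c : testClusters) {
                selectedClusters.add(c);
            }

            // keep only training instances from those clusters
            List<Integer> selectedTraining = new ArrayList<>();
            for (int i = 0; i < trainClusters.length; i++) {
                if (selectedClusters.contains(trainClusters[i])) {
                    selectedTraining.add(i);
                }
            }
            System.out.println("selected training instances: " + selectedTraining); // [0, 1, 3, 4]
        }
    }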
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SeparatabilitySelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 13 27 14 28 /** 15 * A setwise data selection strategy based on the separatability of the training data from the test data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on Defect Prediction. 16 * <br><br> 17 * This is calculated through the error of a logistic regression classifier that tries to separate the sets. 29 * A setwise data selection strategy based on the separatability of the training data from the test 30 * data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An 31 * Empirical Study on Defect Prediction. <br> 32 * <br> 33 * This is calculated through the error of a logistic regression classifier that tries to separate 34 * the sets. 35 * 18 36 * @author Steffen Herbold 19 37 */ 20 38 public class SeparatabilitySelection implements ISetWiseDataselectionStrategy { 21 39 22 /** 23 * size of the random sample that is drawn from both test data and training data 24 */ 25 private final int sampleSize = 500; 26 27 /** 28 * number of repetitions of the sample drawing 29 */ 30 private final int maxRep = 10; 31 32 /** 33 * number of neighbors that are selected 34 */ 35 private int neighbors = 10; 36 37 /** 38 * Sets the number of neighbors that are selected. 
39 */ 40 @Override 41 public void setParameter(String parameters) { 42 if( !"".equals(parameters) ) { 43 neighbors = Integer.parseInt(parameters); 44 } 45 } 40 /** 41 * size of the random sample that is drawn from both test data and training data 42 */ 43 private final int sampleSize = 500; 46 44 47 /** 48 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 49 */ 50 @Override 51 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 52 final Random rand = new Random(1); 53 54 // calculate distances between testdata and traindata 55 final double[] distances = new double[traindataSet.size()]; 56 57 int i=0; 58 for( Instances traindata : traindataSet ) { 59 double distance = 0.0; 60 for( int rep=0; rep<maxRep ; rep++ ) { 61 // sample instances 62 Instances sample = new Instances(testdata); 63 for( int j=0; j<sampleSize; j++ ) { 64 Instance inst = new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 65 inst.setDataset(sample); 66 inst.setClassValue(1.0); 67 sample.add(inst); 68 inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances()))); 69 inst.setDataset(sample); 70 inst.setClassValue(0.0); 71 sample.add(inst); 72 } 73 74 // calculate separation 75 Evaluation eval; 76 try { 77 eval = new Evaluation(sample); 78 eval.crossValidateModel(new Logistic(), sample, 5, rand); 79 } catch (Exception e) { 80 throw new RuntimeException("cross-validation during calculation of separatability failed", e); 81 } 82 distance += eval.pctCorrect()/100.0; 83 } 84 distances[i++] = 2*((distance/maxRep)-0.5); 85 } 86 87 // select closest neighbors 88 final double[] distancesCopy = Arrays.copyOf(distances, distances.length); 89 Arrays.sort(distancesCopy); 90 final double cutoffDistance = distancesCopy[neighbors]; 91 92 for( i=traindataSet.size()-1; i>=0 ; i-- ) { 93 if( distances[i]>cutoffDistance ) { 94 traindataSet.remove(i); 95 } 96 } 97 } 45 /** 46 * number of repetitions of the sample drawing 47 */ 48 private final int maxRep = 10; 49 50 /** 51 * number of neighbors that are selected 52 */ 53 private int neighbors = 10; 54 55 /** 56 * Sets the number of neighbors that are selected. 
57 */ 58 @Override 59 public void setParameter(String parameters) { 60 if (!"".equals(parameters)) { 61 neighbors = Integer.parseInt(parameters); 62 } 63 } 64 65 /** 66 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 67 * org.apache.commons.collections4.list.SetUniqueList) 68 */ 69 @Override 70 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 71 final Random rand = new Random(1); 72 73 // calculate distances between testdata and traindata 74 final double[] distances = new double[traindataSet.size()]; 75 76 int i = 0; 77 for (Instances traindata : traindataSet) { 78 double distance = 0.0; 79 for (int rep = 0; rep < maxRep; rep++) { 80 // sample instances 81 Instances sample = new Instances(testdata); 82 for (int j = 0; j < sampleSize; j++) { 83 Instance inst = 84 new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 85 inst.setDataset(sample); 86 inst.setClassValue(1.0); 87 sample.add(inst); 88 inst = 89 new DenseInstance( 90 traindata.instance(rand.nextInt(traindata.numInstances()))); 91 inst.setDataset(sample); 92 inst.setClassValue(0.0); 93 sample.add(inst); 94 } 95 96 // calculate separation 97 Evaluation eval; 98 try { 99 eval = new Evaluation(sample); 100 eval.crossValidateModel(new Logistic(), sample, 5, rand); 101 } 102 catch (Exception e) { 103 throw new RuntimeException( 104 "cross-validation during calculation of separatability failed", 105 e); 106 } 107 distance += eval.pctCorrect() / 100.0; 108 } 109 distances[i++] = 2 * ((distance / maxRep) - 0.5); 110 } 111 112 // select closest neighbors 113 final double[] distancesCopy = Arrays.copyOf(distances, distances.length); 114 Arrays.sort(distancesCopy); 115 final double cutoffDistance = distancesCopy[neighbors]; 116 117 for (i = traindataSet.size() - 1; i >= 0; i--) { 118 if (distances[i] > cutoffDistance) { 119 traindataSet.remove(i); 120 } 121 } 122 } 98 123 } -
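One possible way to drive this selector outside of an XML experiment configuration is sketched below. The demo class and the ARFF paths are placeholders, not part of CrossPare; the strategy reduces the passed SetUniqueList in place, keeping the candidate sets that the 5-fold cross-validated logistic regression separates worst from the test data.

import java.util.LinkedList;

import org.apache.commons.collections4.list.SetUniqueList;

import de.ugoe.cs.cpdp.dataselection.SeparatabilitySelection;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class SeparatabilitySelectionDemo {

    public static void main(String[] args) throws Exception {
        // target project (placeholder path); the last attribute is assumed to be the class
        Instances testdata = DataSource.read("target-project.arff");
        testdata.setClassIndex(testdata.numAttributes() - 1);

        // candidate training projects (placeholder paths)
        SetUniqueList<Instances> traindataSet =
            SetUniqueList.setUniqueList(new LinkedList<Instances>());
        for (String path : new String[]
            { "candidate-a.arff", "candidate-b.arff", "candidate-c.arff" })
        {
            Instances candidate = DataSource.read(path);
            candidate.setClassIndex(candidate.numAttributes() - 1);
            traindataSet.add(candidate);
        }

        SeparatabilitySelection selection = new SeparatabilitySelection();
        selection.setParameter("2"); // keep the two hardest-to-separate candidate sets
        selection.apply(testdata, traindataSet); // traindataSet is reduced in place

        System.out.println("remaining candidate sets: " + traindataSet.size());
    }
}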
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMClusterSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 11 25 12 26 /** 13 * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect prediction 27 * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect 28 * prediction 29 * 14 30 * @author Steffen Herbold 15 31 */ 16 32 public class SetWiseEMClusterSelection extends AbstractCharacteristicSelection { 17 18 /** 19 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 20 */ 21 @Override 22 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 23 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 24 final Instance targetInstance = data.instance(0); 25 final List<Instance> candidateInstances = new LinkedList<Instance>(); 26 for( int i=1; i<data.numInstances(); i++ ) { 27 candidateInstances.add(data.instance(i)); 28 } 29 30 // cluster and select 31 try { 32 final EM emeans = new EM(); 33 boolean onlyTarget = true; 34 int targetCluster; 35 int maxNumClusters = candidateInstances.size(); 36 do { // while(onlyTarget) 37 emeans.setMaximumNumberOfClusters(maxNumClusters); 38 emeans.buildClusterer(data); 39 40 targetCluster = emeans.clusterInstance(targetInstance); 41 42 // check if cluster only contains target project 43 for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) { 44 onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster); 45 } 46 maxNumClusters = emeans.numberOfClusters()-1; 47 } while(onlyTarget); 48 49 int numRemoved = 0; 50 for( int i=0 ; i<candidateInstances.size() ; i++ ) { 51 if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) { 52 traindataSet.remove(i-numRemoved++); 53 } 54 } 55 } catch(Exception e) { 56 throw new RuntimeException("error applying setwise EM clustering training data selection", e); 57 } 58 } 33 34 /** 35 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 36 * org.apache.commons.collections4.list.SetUniqueList) 37 */ 38 @Override 39 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 40 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 41 final Instance targetInstance = data.instance(0); 42 final List<Instance> candidateInstances = new LinkedList<Instance>(); 43 for (int i = 1; i < data.numInstances(); i++) { 44 candidateInstances.add(data.instance(i)); 45 } 46 47 // cluster and select 48 try { 49 final EM emeans = new EM(); 50 boolean onlyTarget = true; 51 int targetCluster; 52 int maxNumClusters = candidateInstances.size(); 53 do { // while(onlyTarget) 54 emeans.setMaximumNumberOfClusters(maxNumClusters); 55 emeans.buildClusterer(data); 56 57 targetCluster = 
emeans.clusterInstance(targetInstance); 58 59 // check if cluster only contains target project 60 for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) { 61 onlyTarget &= 62 !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster); 63 } 64 maxNumClusters = emeans.numberOfClusters() - 1; 65 } 66 while (onlyTarget); 67 68 int numRemoved = 0; 69 for (int i = 0; i < candidateInstances.size(); i++) { 70 if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) { 71 traindataSet.remove(i - numRemoved++); 72 } 73 } 74 } 75 catch (Exception e) { 76 throw new RuntimeException( 77 "error applying setwise EM clustering training data selection", 78 e); 79 } 80 } 59 81 } -
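The loop at the core of this strategy can also be read in isolation: EM is rebuilt with a shrinking upper bound on the number of clusters until the target project (instance 0 of the characteristic data) shares its cluster with at least one candidate. The helper below is a hypothetical sketch that mirrors this loop and returns the indices of the candidates in the target's cluster; it is not part of CrossPare.

import java.util.ArrayList;
import java.util.List;

import weka.clusterers.EM;
import weka.core.Instances;

public class SharedClusterSketch {

    // data: row 0 is the target's characteristic vector, rows 1..n are the candidates
    public static List<Integer> candidatesInTargetCluster(Instances data) throws Exception {
        EM em = new EM();
        boolean onlyTarget;
        int targetCluster;
        int maxNumClusters = data.numInstances() - 1;

        do {
            em.setMaximumNumberOfClusters(maxNumClusters);
            em.buildClusterer(data);
            targetCluster = em.clusterInstance(data.instance(0));

            // check whether the target is still alone in its cluster
            onlyTarget = true;
            for (int i = 1; i < data.numInstances() && onlyTarget; i++) {
                onlyTarget &= em.clusterInstance(data.instance(i)) != targetCluster;
            }
            maxNumClusters = em.numberOfClusters() - 1;
        }
        while (onlyTarget);

        List<Integer> selected = new ArrayList<>();
        for (int i = 1; i < data.numInstances(); i++) {
            if (em.clusterInstance(data.instance(i)) == targetCluster) {
                selected.add(i - 1); // candidate index, offset by the target at position 0
            }
        }
        return selected;
    }
}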
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 22 36 * Selects training data by clustering project context factors. 23 37 * 24 * The project context factors used for the clustering are configured in 25 * the XML param attribute, Example: 26 * <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" /> 38 * The project context factors used for the clustering are configured in the XML param attribute, 39 * Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" /> 27 40 */ 28 41 public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy { 29 30 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 31 32 @Override 33 public void setParameter(String parameters) { 34 if( parameters!=null ) { 35 project_context_factors = parameters.split(" "); 36 } 37 } 38 39 /** 40 * Uses the Weka EM-Clustering algorithm to cluster the projects 41 * by their project context factors. 42 * The project context factors are first normalized and then used for clustering. 43 * They can be configured in the configuration param. 44 * 45 * @param testdata 46 * @param traindataSet 47 */ 48 protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) { 49 // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf 50 final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); 51 52 final Instance targetInstance = data.instance(0); 53 final List<Instance> candidateInstances = new LinkedList<Instance>(); 54 for( int i=1; i<data.numInstances(); i++ ) { 55 candidateInstances.add(data.instance(i)); 56 } 57 58 // cluster and select 59 try { 60 final EM emeans = new EM(); 61 boolean onlyTarget = true; 62 int targetCluster; 63 int maxNumClusters = candidateInstances.size(); 64 65 do { // while(onlyTarget) 66 emeans.setMaximumNumberOfClusters(maxNumClusters); 67 emeans.buildClusterer(data); 68 69 targetCluster = emeans.clusterInstance(targetInstance); 70 71 // check if cluster only contains target project 72 for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) { 73 onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster); 74 } 75 maxNumClusters = emeans.numberOfClusters()-1; 76 77 //Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); 78 } while(onlyTarget); 79 80 Console.traceln(Level.INFO, "clusters: " + maxNumClusters); 81 Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); 82 int numRemoved = 0; 83 for( int i=0 ; i<candidateInstances.size() ; i++ ) { 84 if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) { 85 traindataSet.remove(i-numRemoved++); 86 } 87 } 88 Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); 89 } catch(Exception e) { 90 throw new 
RuntimeException("error applying setwise EM clustering training data selection", e); 91 } 92 } 93 94 @Override 95 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 96 // issuetracking und pl muss passen 97 /* 98 int s = traindataSet.size(); 99 Console.traceln(Level.INFO, "remove non matching PL and IssueTracking projects, size now: " + s); 100 this.removeWrongContext(testdata, traindataSet, "PL"); 101 this.removeWrongContext(testdata, traindataSet, "IssueTracking"); 102 s = traindataSet.size(); 103 Console.traceln(Level.INFO, "size after removal: " + s); 104 */ 105 // now cluster 106 this.cluster(testdata, traindataSet); 107 } 108 109 /** 110 * Returns test- and training data with only the project context factors 111 * which were chosen in the configuration. 112 * This is later used for clustering. 113 * 114 * @param testdata 115 * @param traindataSet 116 * @return 117 */ 118 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) { 119 // setup weka Instances for clustering 120 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 121 122 // we only want the project context factors 123 for( String pcf : this.project_context_factors ) { 124 atts.add(new Attribute(pcf)); 125 } 126 127 // set up the data 128 final Instances data = new Instances("project_context_factors", atts, 0); 129 double[] instanceValues = new double[atts.size()]; 130 131 // only project context factors + only one instance per project needed 132 int i = 0; 133 for( String pcf : this.project_context_factors ) { 134 instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); 135 //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]); 136 i++; 137 } 138 data.add(new DenseInstance(1.0, instanceValues)); 139 140 // now for the projects of the training stet 141 for( Instances traindata : traindataSet ) { 142 instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! 
143 i = 0; 144 for( String pcf : this.project_context_factors ) { 145 instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); 146 //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]); 147 i++; 148 } 149 150 data.add(new DenseInstance(1.0, instanceValues)); 151 } 152 153 return data; 154 } 155 156 /** 157 * Delete projects where the project context does not match the training project 158 * 159 * @param testdata 160 * @param traindataSet 161 * @param attribute 162 */ 163 protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet, String attribute) { 164 Set<Instances> remove = new HashSet<Instances>(); 165 for( Instances traindata : traindataSet ) { 166 if( traindata.firstInstance().value(traindata.attribute(attribute)) != testdata.firstInstance().value(testdata.attribute(attribute)) ) { 167 remove.add(traindata); 168 //Console.traceln(Level.WARNING, "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 169 } 170 } 171 172 // now delete the projects from set 173 for( Instances i : remove ) { 174 traindataSet.remove(i); 175 //Console.traceln(Level.INFO, "removing training project from set"); 176 } 177 } 178 179 /** 180 * Normalizes the data before it gets used for clustering 181 * 182 * @param testdata 183 * @param traindataSet 184 * @return 185 */ 186 protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) { 187 Instances data = this.getContextFactors(testdata, traindataSet); 188 try { 189 final Normalize normalizer = new Normalize(); 190 normalizer.setInputFormat(data); 191 data = Filter.useFilter(data, normalizer); 192 } catch (Exception e) { 193 throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e); 194 } 195 return data; 196 } 42 43 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 44 45 @Override 46 public void setParameter(String parameters) { 47 if (parameters != null) { 48 project_context_factors = parameters.split(" "); 49 } 50 } 51 52 /** 53 * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context 54 * factors. The project context factors are first normalized and then used for clustering. They 55 * can be configured in the configuration param. 
56 * 57 * @param testdata 58 * @param traindataSet 59 */ 60 protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) { 61 // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf 62 final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); 63 64 final Instance targetInstance = data.instance(0); 65 final List<Instance> candidateInstances = new LinkedList<Instance>(); 66 for (int i = 1; i < data.numInstances(); i++) { 67 candidateInstances.add(data.instance(i)); 68 } 69 70 // cluster and select 71 try { 72 final EM emeans = new EM(); 73 boolean onlyTarget = true; 74 int targetCluster; 75 int maxNumClusters = candidateInstances.size(); 76 77 do { // while(onlyTarget) 78 emeans.setMaximumNumberOfClusters(maxNumClusters); 79 emeans.buildClusterer(data); 80 81 targetCluster = emeans.clusterInstance(targetInstance); 82 83 // check if cluster only contains target project 84 for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) { 85 onlyTarget &= 86 !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster); 87 } 88 maxNumClusters = emeans.numberOfClusters() - 1; 89 90 // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); 91 } 92 while (onlyTarget); 93 94 Console.traceln(Level.INFO, "clusters: " + maxNumClusters); 95 Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); 96 int numRemoved = 0; 97 for (int i = 0; i < candidateInstances.size(); i++) { 98 if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) { 99 traindataSet.remove(i - numRemoved++); 100 } 101 } 102 Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); 103 } 104 catch (Exception e) { 105 throw new RuntimeException( 106 "error applying setwise EM clustering training data selection", 107 e); 108 } 109 } 110 111 @Override 112 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 113 // issuetracking und pl muss passen 114 /* 115 * int s = traindataSet.size(); Console.traceln(Level.INFO, 116 * "remove non matching PL and IssueTracking projects, size now: " + s); 117 * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata, 118 * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO, 119 * "size after removal: " + s); 120 */ 121 // now cluster 122 this.cluster(testdata, traindataSet); 123 } 124 125 /** 126 * Returns test- and training data with only the project context factors which were chosen in 127 * the configuration. This is later used for clustering. 
128 * 129 * @param testdata 130 * @param traindataSet 131 * @return 132 */ 133 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) 134 { 135 // setup weka Instances for clustering 136 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 137 138 // we only want the project context factors 139 for (String pcf : this.project_context_factors) { 140 atts.add(new Attribute(pcf)); 141 } 142 143 // set up the data 144 final Instances data = new Instances("project_context_factors", atts, 0); 145 double[] instanceValues = new double[atts.size()]; 146 147 // only project context factors + only one instance per project needed 148 int i = 0; 149 for (String pcf : this.project_context_factors) { 150 instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); 151 // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + 152 // instanceValues[i]); 153 i++; 154 } 155 data.add(new DenseInstance(1.0, instanceValues)); 156 157 // now for the projects of the training stet 158 for (Instances traindata : traindataSet) { 159 instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! 160 i = 0; 161 for (String pcf : this.project_context_factors) { 162 instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); 163 // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + 164 // instanceValues[i]); 165 i++; 166 } 167 168 data.add(new DenseInstance(1.0, instanceValues)); 169 } 170 171 return data; 172 } 173 174 /** 175 * Delete projects where the project context does not match the training project 176 * 177 * @param testdata 178 * @param traindataSet 179 * @param attribute 180 */ 181 protected void removeWrongContext(Instances testdata, 182 SetUniqueList<Instances> traindataSet, 183 String attribute) 184 { 185 Set<Instances> remove = new HashSet<Instances>(); 186 for (Instances traindata : traindataSet) { 187 if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata 188 .firstInstance().value(testdata.attribute(attribute))) 189 { 190 remove.add(traindata); 191 // Console.traceln(Level.WARNING, 192 // "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 193 } 194 } 195 196 // now delete the projects from set 197 for (Instances i : remove) { 198 traindataSet.remove(i); 199 // Console.traceln(Level.INFO, "removing training project from set"); 200 } 201 } 202 203 /** 204 * Normalizes the data before it gets used for clustering 205 * 206 * @param testdata 207 * @param traindataSet 208 * @return 209 */ 210 protected Instances normalizedCharacteristicInstances(Instances testdata, 211 SetUniqueList<Instances> traindataSet) 212 { 213 Instances data = this.getContextFactors(testdata, traindataSet); 214 try { 215 final Normalize normalizer = new Normalize(); 216 normalizer.setInputFormat(data); 217 data = Filter.useFilter(data, normalizer); 218 } 219 catch (Exception e) { 220 throw new RuntimeException( 221 "Unexpected exception during normalization of distributional characteristics.", 222 e); 223 } 224 return data; 225 } 197 226 } -
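getContextFactors() reduces every project to a single row that carries only the configured context factors: the target project becomes row 0, followed by one row per candidate project. The snippet below is a hypothetical illustration of the resulting Weka data set, using the factor names from the XML example in the Javadoc and made-up values.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class ContextFactorTable {

    public static Instances build() {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("AFS"));
        atts.add(new Attribute("TND"));
        atts.add(new Attribute("TNC"));

        Instances data = new Instances("project_context_factors", atts, 0);
        data.add(new DenseInstance(1.0, new double[] { 12, 4, 380 })); // target project (values made up)
        data.add(new DenseInstance(1.0, new double[] { 9, 6, 410 }));  // candidate project 1
        data.add(new DenseInstance(1.0, new double[] { 30, 2, 150 })); // candidate project 2
        return data;
    }
}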
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseKNNSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 10 24 11 25 /** 12 * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for cross-project defect prediction 26 * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for 27 * cross-project defect prediction 28 * 13 29 * @author Steffen Herbold 14 30 */ 15 31 public class SetWiseKNNSelection extends AbstractCharacteristicSelection { 16 17 /**18 * number of neighbors selected19 */20 private int k = 1;21 22 /**23 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)24 */25 @Override26 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {27 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);28 29 final Set<Integer> selected = new HashSet<Integer>();30 for( int i=0 ; i<k ; i++ ) {31 int closestIndex = getClosest(data);32 33 selected.add(closestIndex);34 data.delete(closestIndex);35 }36 37 for( int i=traindataSet.size()-1; i>=0 ; i-- ) {38 if( selected.contains(i) ) {39 traindataSet.remove(i);40 }41 }42 }43 44 /**45 * Helper method that determines the index of the instance with the smallest distance to the first instance (index 0).46 * @param data data set47 * @return index of the closest instance48 */49 private int getClosest(Instances data) {50 double closestDistance = Double.MAX_VALUE;51 int closestIndex = 1;52 for( int i=1 ; i<data.numInstances() ; i++ ) {53 double distance = MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i).toDoubleArray());54 if( distance < closestDistance) {55 closestDistance = distance;56 closestIndex = i;57 }58 }59 return closestIndex;60 }61 32 62 /** 63 * Sets the number of neighbors followed by the distributional characteristics, the values are separated by blanks. 
64 * @see AbstractCharacteristicSelection#setParameter(String) 65 */ 66 @Override 67 public void setParameter(String parameters) { 68 if( !"".equals(parameters) ) { 69 final String[] split = parameters.split(" "); 70 k = Integer.parseInt(split[0]); 71 String str = ""; 72 for( int i=1 ; i<split.length; i++ ) { 73 str += split[i]; 74 if( i<split.length-1 ) { 75 str += " "; 76 } 77 } 78 super.setParameter(str); 79 } 80 } 33 /** 34 * number of neighbors selected 35 */ 36 private int k = 1; 37 38 /** 39 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 40 * org.apache.commons.collections4.list.SetUniqueList) 41 */ 42 @Override 43 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 44 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 45 46 final Set<Integer> selected = new HashSet<Integer>(); 47 for (int i = 0; i < k; i++) { 48 int closestIndex = getClosest(data); 49 50 selected.add(closestIndex); 51 data.delete(closestIndex); 52 } 53 54 for (int i = traindataSet.size() - 1; i >= 0; i--) { 55 if (selected.contains(i)) { 56 traindataSet.remove(i); 57 } 58 } 59 } 60 61 /** 62 * Helper method that determines the index of the instance with the smallest distance to the 63 * first instance (index 0). 64 * 65 * @param data 66 * data set 67 * @return index of the closest instance 68 */ 69 private int getClosest(Instances data) { 70 double closestDistance = Double.MAX_VALUE; 71 int closestIndex = 1; 72 for (int i = 1; i < data.numInstances(); i++) { 73 double distance = 74 MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i) 75 .toDoubleArray()); 76 if (distance < closestDistance) { 77 closestDistance = distance; 78 closestIndex = i; 79 } 80 } 81 return closestIndex; 82 } 83 84 /** 85 * Sets the number of neighbors followed by the distributional characteristics, the values are 86 * separated by blanks. 87 * 88 * @see AbstractCharacteristicSelection#setParameter(String) 89 */ 90 @Override 91 public void setParameter(String parameters) { 92 if (!"".equals(parameters)) { 93 final String[] split = parameters.split(" "); 94 k = Integer.parseInt(split[0]); 95 String str = ""; 96 for (int i = 1; i < split.length; i++) { 97 str += split[i]; 98 if (i < split.length - 1) { 99 str += " "; 100 } 101 } 102 super.setParameter(str); 103 } 104 } 81 105 } -
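The distance used here is plain Euclidean distance over the normalized characteristic vectors (MathArrays.distance). The helper below is a hypothetical variant of the k-closest search that does not delete rows from the data set, so the returned indices refer directly to the candidate positions; it is a sketch, not CrossPare code.

import java.util.HashSet;
import java.util.Set;

import org.apache.commons.math3.util.MathArrays;

import weka.core.Instances;

public class KnnCharacteristicSketch {

    // data: row 0 is the target's characteristic vector, rows 1..n are the candidates;
    // returns the 0-based indices of the k candidates closest to the target
    public static Set<Integer> closestCandidates(Instances data, int k) {
        double[] target = data.instance(0).toDoubleArray();
        double[] distances = new double[data.numInstances() - 1];
        for (int i = 1; i < data.numInstances(); i++) {
            distances[i - 1] = MathArrays.distance(target, data.instance(i).toDoubleArray());
        }

        Set<Integer> selected = new HashSet<>();
        for (int n = 0; n < k && n < distances.length; n++) {
            int closest = -1;
            double closestDistance = Double.MAX_VALUE;
            for (int i = 0; i < distances.length; i++) {
                if (!selected.contains(i) && distances[i] < closestDistance) {
                    closestDistance = distances[i];
                    closest = i;
                }
            }
            selected.add(closest);
        }
        return selected;
    }
}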
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/TestAsTraining.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 7 21 /** 8 22 * Uses the test data as training data. 23 * 9 24 * @author Steffen Herbold 10 * 25 * 11 26 */ 12 27 public class TestAsTraining implements ISetWiseDataselectionStrategy { 13 28 14 15 16 17 18 19 20 29 /** 30 * no parameters 31 */ 32 @Override 33 public void setParameter(String parameters) { 34 // dummy 35 } 21 36 22 /**(non-Javadoc) 23 * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 24 */ 25 @Override 26 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 27 traindataSet.clear(); 28 traindataSet.add(new Instances(testdata)); 29 } 37 /** 38 * (non-Javadoc) 39 * 40 * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, 41 * org.apache.commons.collections4.list.SetUniqueList) 42 */ 43 @Override 44 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 45 traindataSet.clear(); 46 traindataSet.add(new Instances(testdata)); 47 } 30 48 31 49 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/TurhanFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 13 27 14 28 /** 15 * Filter according to B. Turhan, T. Menzies, A. Bener, and J. Die Stefano: On the relative value of cross-company and within company defect prediction 29 * Filter according to B. Turhan, T. Menzies, A. Bener, and J. Die Stefano: On the relative value of 30 * cross-company and within company defect prediction 31 * 16 32 * @author Steffen Herbold 17 33 */ 18 34 public class TurhanFilter implements IPointWiseDataselectionStrategy { 19 35 20 /** 21 * number of neighbors that are selected 22 */ 23 private int k = 10; 24 25 /** 26 * Sets the number of neighbors. 27 * @param parameters number of neighbors 28 */ 29 @Override 30 public void setParameter(String parameters) { 31 k = Integer.parseInt(parameters); 32 } 36 /** 37 * number of neighbors that are selected 38 */ 39 private int k = 10; 33 40 34 /** 35 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, weka.core.Instances) 36 */ 37 @Override 38 public Instances apply(Instances testdata, Instances traindata) { 39 final Attribute classAttribute = testdata.classAttribute(); 40 41 final List<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 42 43 final double[][] trainDoubles = new double[traindata.numInstances()][testdata.numAttributes()]; 44 45 for( int i=0; i<traindata.numInstances() ; i++ ) { 46 Instance instance = traindata.instance(i); 47 int tmp = 0; 48 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 49 if( testdata.attribute(j)!=classAttribute ) { 50 trainDoubles[i][tmp++] = instance.value(j); 51 } 52 } 53 } 54 55 for( int i=0; i<testdata.numInstances() ; i++ ) { 56 Instance testIntance = testdata.instance(i); 57 double[] targetVector = new double[testdata.numAttributes()-1]; 58 int tmp = 0; 59 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 60 if( testdata.attribute(j)!=classAttribute ) { 61 targetVector[tmp++] = testIntance.value(j); 62 } 63 } 64 65 double farthestClosestDistance = Double.MAX_VALUE; 66 int farthestClosestIndex = 0; 67 double[] closestDistances = new double[k]; 68 for( int m=0 ; m<closestDistances.length ; m++ ) { 69 closestDistances[m] = Double.MAX_VALUE; 70 } 71 int[] closestIndex = new int[k]; 72 73 for( int n=0; n<traindata.numInstances() ; n++ ) { 74 double distance = MathArrays.distance(targetVector, trainDoubles[n]); 75 76 if( distance<farthestClosestDistance ) { 77 closestIndex[farthestClosestIndex] = n; 78 closestDistances[farthestClosestIndex] = distance; 79 80 farthestClosestIndex = ArrayTools.findMax(closestDistances); 81 farthestClosestDistance = closestDistances[farthestClosestIndex]; 82 } 83 } 84 for( int index : closestIndex ) { 85 selectedIndex.add(index); 86 } 87 } 88 89 final Instances selected = new Instances(testdata); 90 selected.delete(); 91 for( Integer i : selectedIndex) { 92 
selected.add(traindata.instance(i)); 93 } 94 return selected; 95 } 41 /** 42 * Sets the number of neighbors. 43 * 44 * @param parameters 45 * number of neighbors 46 */ 47 @Override 48 public void setParameter(String parameters) { 49 k = Integer.parseInt(parameters); 50 } 51 52 /** 53 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, 54 * weka.core.Instances) 55 */ 56 @Override 57 public Instances apply(Instances testdata, Instances traindata) { 58 final Attribute classAttribute = testdata.classAttribute(); 59 60 final List<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 61 62 final double[][] trainDoubles = 63 new double[traindata.numInstances()][testdata.numAttributes()]; 64 65 for (int i = 0; i < traindata.numInstances(); i++) { 66 Instance instance = traindata.instance(i); 67 int tmp = 0; 68 for (int j = 0; j < testdata.numAttributes(); j++) { 69 if (testdata.attribute(j) != classAttribute) { 70 trainDoubles[i][tmp++] = instance.value(j); 71 } 72 } 73 } 74 75 for (int i = 0; i < testdata.numInstances(); i++) { 76 Instance testIntance = testdata.instance(i); 77 double[] targetVector = new double[testdata.numAttributes() - 1]; 78 int tmp = 0; 79 for (int j = 0; j < testdata.numAttributes(); j++) { 80 if (testdata.attribute(j) != classAttribute) { 81 targetVector[tmp++] = testIntance.value(j); 82 } 83 } 84 85 double farthestClosestDistance = Double.MAX_VALUE; 86 int farthestClosestIndex = 0; 87 double[] closestDistances = new double[k]; 88 for (int m = 0; m < closestDistances.length; m++) { 89 closestDistances[m] = Double.MAX_VALUE; 90 } 91 int[] closestIndex = new int[k]; 92 93 for (int n = 0; n < traindata.numInstances(); n++) { 94 double distance = MathArrays.distance(targetVector, trainDoubles[n]); 95 96 if (distance < farthestClosestDistance) { 97 closestIndex[farthestClosestIndex] = n; 98 closestDistances[farthestClosestIndex] = distance; 99 100 farthestClosestIndex = ArrayTools.findMax(closestDistances); 101 farthestClosestDistance = closestDistances[farthestClosestIndex]; 102 } 103 } 104 for (int index : closestIndex) { 105 selectedIndex.add(index); 106 } 107 } 108 109 final Instances selected = new Instances(testdata); 110 selected.delete(); 111 for (Integer i : selectedIndex) { 112 selected.add(traindata.instance(i)); 113 } 114 return selected; 115 } 96 116 97 117 } -
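In contrast to the setwise strategies, this pointwise filter returns a new, reduced training set instead of pruning a list of candidate sets. A hypothetical driver with placeholder ARFF paths (not part of CrossPare) could look as follows.

import de.ugoe.cs.cpdp.dataselection.TurhanFilter;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class TurhanFilterDemo {

    public static void main(String[] args) throws Exception {
        // placeholder paths; both files are assumed to share the same attribute structure
        Instances testdata = DataSource.read("target-project.arff");
        testdata.setClassIndex(testdata.numAttributes() - 1);
        Instances traindata = DataSource.read("cross-company-data.arff");
        traindata.setClassIndex(traindata.numAttributes() - 1);

        TurhanFilter filter = new TurhanFilter();
        filter.setParameter("10"); // k = 10 nearest training instances per test instance
        Instances reduced = filter.apply(testdata, traindata);

        System.out.println(reduced.numInstances() + " of " + traindata.numInstances() +
            " training instances kept");
    }
}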
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/ARFFxResourceTool.java
r36 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 11 25 * 12 26 * @author Philip Makedonski, Fabian Trautsch 13 * 27 * 14 28 */ 15 29 public class ARFFxResourceTool extends ResourceTool { 16 17 /** 18 * Initializes the Tool Factory, from which the models can be loaded and 19 * inizializes the validator. 20 */ 21 public ARFFxResourceTool(){ 22 super(ARFFxResourceTool.class.getName()); 23 ARFFxPackageImpl.init(); 24 25 // Commented, because simulation has problems with this 26 initializeValidator(); 27 } 28 29 /** 30 * Inizializes the model validator 31 */ 32 @Override 33 protected void initializeValidator(){ 34 super.initializeValidator(); 35 EObjectValidator validator = new EObjectValidator(); 36 EValidator.Registry.INSTANCE.put(ARFFxPackage.eINSTANCE, validator); 37 } 38 30 31 /** 32 * Initializes the Tool Factory, from which the models can be loaded and inizializes the 33 * validator. 34 */ 35 public ARFFxResourceTool() { 36 super(ARFFxResourceTool.class.getName()); 37 ARFFxPackageImpl.init(); 38 39 // Commented, because simulation has problems with this 40 initializeValidator(); 41 } 42 43 /** 44 * Inizializes the model validator 45 */ 46 @Override 47 protected void initializeValidator() { 48 super.initializeValidator(); 49 EObjectValidator validator = new EObjectValidator(); 50 EValidator.Registry.INSTANCE.put(ARFFxPackage.eINSTANCE, validator); 51 } 39 52 40 53 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/DECENTEpsilonModelHandler.java
r36 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 20 34 * 21 35 * @author Philip Makedonski, Fabian Trautsch 22 * 36 * 23 37 */ 24 38 25 39 public class DECENTEpsilonModelHandler { 26 private HashMap<String, Object> metaModelCache = new HashMap<>(); 27 private boolean useDECENTBinary = false; 28 private boolean useARFFxBinary = false; 29 30 public static String metaPath = "./decent/models/"; 31 32 /** 33 * Returns the decent model as IModel instance 34 * 35 * @param decentModelLocation location of the decent model file 36 * @param read indicates if the model should be read from 37 * @param write indicates if data should be written in the model 38 * @return EmFModel (IModel) instance from the decent model, which was loaded 39 * @throws Exception 40 */ 41 public IModel getDECENTModel(String decentModelLocation, boolean read, boolean write) throws Exception { 42 43 EmfModel model; 44 45 if (isUseDECENTBinary()) { 46 unregisterMetaModels(""); 47 if (!read) { 48 new File(decentModelLocation).delete(); 49 new File(decentModelLocation+"bin").delete(); 50 } 51 DECENTResourceTool tool = new DECENTResourceTool(); 52 if (new File(decentModelLocation).exists() && !new File(decentModelLocation+"bin").exists()) { 53 Resource resource = tool.loadResourceFromXMI(decentModelLocation,"decent", DECENTPackage.eINSTANCE); 54 tool.storeBinaryResourceContents(resource.getContents(), decentModelLocation+"bin", "decentbin"); 55 } 56 57 Resource resourceBin = tool.loadResourceFromBinary(decentModelLocation+"bin","decentbin", DECENTPackage.eINSTANCE); 58 //alternative pattern 59 // model = createInMemoryEmfModel("DECENT", resourceLocation, "../DECENT.Meta/model/DECENTv3.ecore", read, write, resourceBin, DECENTPackage.eINSTANCE); 60 // restoreMetaModels(); 61 62 //NOTE: Adding the package is essential as otherwise epsilon breaks 63 model = new InMemoryEmfModel("DECENT", resourceBin, DECENTPackage.eINSTANCE); 64 model.setStoredOnDisposal(write); 65 model.setReadOnLoad(read); 66 model.setCachingEnabled(true); 67 restoreMetaModels(); 68 } else { 69 model = createEmfModel("DECENT", decentModelLocation, metaPath+"DECENTv3.ecore", read, write); 70 } 71 72 return model; 73 } 74 75 /** 76 * Converts the decent model to a binary form 77 * 78 * @param location of the decent model file 79 */ 80 public void convertDECENTModelToBinary(String location) { 81 unregisterMetaModels(""); 82 DECENTResourceTool tool = new DECENTResourceTool(); 83 Resource resource = tool.loadResourceFromXMI(location+"/model.decent","decent", DECENTPackage.eINSTANCE); 84 tool.storeBinaryResourceContents(resource.getContents(), location+"/model.decent"+"bin", "decentbin"); 85 restoreMetaModels(); 86 } 87 88 /** 89 * Converts the decent model to a xmi form 90 * 91 * @param location of the decent model file 92 */ 93 94 public void convertDECENTModelToXMI(String location) { 95 unregisterMetaModels(""); 96 
DECENTResourceTool tool = new DECENTResourceTool(); 97 Resource resource = tool.loadResourceFromBinary(location+"/model.decentbin","decentbin", DECENTPackage.eINSTANCE); 98 restoreMetaModels(); 99 tool.storeResourceContents(resource.getContents(), location+"/model.decent", "decent"); 100 } 101 102 /** 103 * Returns the arffx model as IModel instance 104 * 105 * @param arffxModelLocation location of the arffx model file 106 * @param read indicates if the model should be read from 107 * @param write indicates if data should be written in the model 108 * @return EmFModel (IModel) instance from the arffx model, which was loaded 109 * @throws Exception 110 */ 111 112 public IModel getARFFxModel(String arffxModelLocation, boolean read, boolean write) throws Exception { 113 114 EmfModel model; 115 116 if (isUseARFFxBinary()) { 117 unregisterMetaModels(""); 118 if (!read) { 119 new File(arffxModelLocation).delete(); 120 new File(arffxModelLocation+"bin").delete(); 121 } 122 ARFFxResourceTool tool = new ARFFxResourceTool(); 123 if (new File(arffxModelLocation).exists() && !new File(arffxModelLocation+"bin").exists()) { 124 Resource resource = tool.loadResourceFromXMI(arffxModelLocation,"arffx", ARFFxPackage.eINSTANCE); 125 tool.storeBinaryResourceContents(resource.getContents(), arffxModelLocation+"bin", "arffxbin"); 126 } 127 128 Resource resourceBin = tool.loadResourceFromBinary(arffxModelLocation+"bin","arffxbin", ARFFxPackage.eINSTANCE); 129 //alternative pattern 130 // model = createInMemoryEmfModel("DECENT", resourceLocation, "../DECENT.Meta/model/DECENTv3.ecore", read, write, resourceBin, DECENTPackage.eINSTANCE); 131 // restoreMetaModels(); 132 133 //NOTE: Adding the package is essential as otherwise epsilon breaks 134 model = new InMemoryEmfModel("ARFFx", resourceBin, ARFFxPackage.eINSTANCE); 135 // model.getModelImpl().getURI().toFileString() 136 model.setStoredOnDisposal(write); 137 model.setReadOnLoad(read); 138 model.setCachingEnabled(true); 139 restoreMetaModels(); 140 } else { 141 model = createEmfModel("ARFFx", arffxModelLocation, metaPath+"ARFFx.ecore", read, write); 142 } 143 144 return model; 145 } 146 147 148 /** 149 * Converts an arffx model to a binary version 150 * 151 * @param location of the arffx model 152 */ 153 public void convertARFFxModelToBinary(String location) { 154 unregisterMetaModels(""); 155 ARFFxResourceTool tool = new ARFFxResourceTool(); 156 Resource resource = tool.loadResourceFromXMI(location+"/model.arffx","arffx", ARFFxPackage.eINSTANCE); 157 tool.storeBinaryResourceContents(resource.getContents(), location+"/model.arffx"+"bin", "arffxbin"); 158 restoreMetaModels(); 159 } 160 161 /** 162 * Converts an arffx model to xmi 163 * 164 * @param location of the arffx model 165 */ 166 167 public void convertARFFxModelToXMI(String location) { 168 unregisterMetaModels(""); 169 ARFFxResourceTool tool = new ARFFxResourceTool(); 170 Resource resource = tool.loadResourceFromBinary(location+"/model.arffxbin","arffxbin", DECENTPackage.eINSTANCE); 171 restoreMetaModels(); 172 tool.storeResourceContents(resource.getContents(), location+"/model.arffx", "arffx"); 173 } 174 175 176 /** 177 * Returns the log model as IModel instance 178 * 179 * @param logModelLocation location of the log model file 180 * @param read indicates if the model should be read from 181 * @param write indicates if data should be written in the model 182 * @return EmFModel (IModel) instance from the log model, which was loaded 183 * @throws Exception 184 */ 185 186 public IModel getLOGModel(String 
logModelLocation, boolean read, boolean write) throws Exception { 187 if (!new File(logModelLocation).exists()) { 188 read = false; 189 } 190 IModel model = createEmfModel("LOG", logModelLocation, metaPath +"LOG.ecore", read, write); 191 System.setProperty("epsilon.logFileAvailable", "true"); 192 return model; 193 } 194 195 /** 196 * Creates an EMF Model 197 * 198 * @param name of the emf model 199 * @param model name of the model 200 * @param metamodel name of the metamodel 201 * @param readOnLoad indicates if the model should be read on load 202 * @param storeOnDisposal indicates if the model should be stored on disposal 203 * @return 204 * @throws EolModelLoadingException 205 * @throws URISyntaxException 206 */ 207 208 @SuppressWarnings("deprecation") 209 protected EmfModel createEmfModel(String name, String model, 210 String metamodel, boolean readOnLoad, boolean storeOnDisposal) 211 throws EolModelLoadingException, URISyntaxException { 212 EmfModel emfModel = new EmfModel(); 213 StringProperties properties = new StringProperties(); 214 properties.put(EmfModel.PROPERTY_NAME, name); 215 properties.put(EmfModel.PROPERTY_ALIASES, name); 216 properties.put(EmfModel.PROPERTY_FILE_BASED_METAMODEL_URI, 217 "file:/" + getFile(metamodel).getAbsolutePath()); 218 properties.put(EmfModel.PROPERTY_MODEL_URI, 219 "file:/" + getFile(model).getAbsolutePath()); 220 properties.put(EmfModel.PROPERTY_IS_METAMODEL_FILE_BASED, "true"); 221 properties.put(EmfModel.PROPERTY_READONLOAD, readOnLoad + ""); 222 properties.put(EmfModel.PROPERTY_CACHED, "true"); 223 properties.put(EmfModel.PROPERTY_STOREONDISPOSAL, 224 storeOnDisposal + ""); 225 emfModel.load(properties, ""); 226 //System.out.println(emfModel.allContents()); 227 return emfModel; 228 } 229 230 /** 231 * Returns a new File instance from the given filename 232 * 233 * @param fileName of the file 234 * @return 235 * @throws URISyntaxException 236 */ 237 public File getFile(String fileName) throws URISyntaxException {; 238 return new File(fileName); 239 } 240 241 /** 242 * Restores the metamodels, so that they are registered in the 243 * EPackage registry 244 */ 245 private void restoreMetaModels() { 246 for (String key : metaModelCache .keySet()) { 247 EPackage.Registry.INSTANCE.put(key, metaModelCache.get(key)); 248 }; 249 } 250 251 /** 252 * Unregister the metamodels from the EPackage registry 253 * 254 * @param filter for filtering out certain instances 255 */ 256 private void unregisterMetaModels(String filter) { 257 for (String key : EPackage.Registry.INSTANCE.keySet()) { 258 if (key.contains(filter)) { 259 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 260 } 261 }; 262 for (String key : metaModelCache .keySet()) { 263 EPackage.Registry.INSTANCE.remove(key); 264 }; 265 } 266 267 /** 268 * Returns true if decent binary model is used 269 * @return 270 */ 271 272 public boolean isUseDECENTBinary() { 273 return useDECENTBinary; 274 } 275 276 /** 277 * Sets the boolean which indicates, if the decent binary 278 * model is used 279 * @param useDECENTBinary 280 */ 281 public void setUseDECENTBinary(boolean useDECENTBinary) { 282 this.useDECENTBinary = useDECENTBinary; 283 } 284 285 /** 286 * Returns true if arffx binary model is used 287 * @return 288 */ 289 public boolean isUseARFFxBinary() { 290 return useARFFxBinary; 291 } 292 293 /** 294 * Sets the boolean which indicates, if the arffx binary 295 * model is used 296 * @param useARFFxBinary 297 */ 298 299 public void setUseARFFxBinary(boolean useARFFxBinary) { 300 this.useARFFxBinary = 
useARFFxBinary; 301 } 302 303 40 private HashMap<String, Object> metaModelCache = new HashMap<>(); 41 private boolean useDECENTBinary = false; 42 private boolean useARFFxBinary = false; 43 44 public static String metaPath = "./decent/models/"; 45 46 /** 47 * Returns the decent model as IModel instance 48 * 49 * @param decentModelLocation 50 * location of the decent model file 51 * @param read 52 * indicates if the model should be read from 53 * @param write 54 * indicates if data should be written in the model 55 * @return EmFModel (IModel) instance from the decent model, which was loaded 56 * @throws Exception 57 */ 58 public IModel getDECENTModel(String decentModelLocation, boolean read, boolean write) 59 throws Exception 60 { 61 62 EmfModel model; 63 64 if (isUseDECENTBinary()) { 65 unregisterMetaModels(""); 66 if (!read) { 67 new File(decentModelLocation).delete(); 68 new File(decentModelLocation + "bin").delete(); 69 } 70 DECENTResourceTool tool = new DECENTResourceTool(); 71 if (new File(decentModelLocation).exists() && 72 !new File(decentModelLocation + "bin").exists()) 73 { 74 Resource resource = 75 tool.loadResourceFromXMI(decentModelLocation, "decent", DECENTPackage.eINSTANCE); 76 tool.storeBinaryResourceContents(resource.getContents(), decentModelLocation + 77 "bin", "decentbin"); 78 } 79 80 Resource resourceBin = 81 tool.loadResourceFromBinary(decentModelLocation + "bin", "decentbin", 82 DECENTPackage.eINSTANCE); 83 // alternative pattern 84 // model = createInMemoryEmfModel("DECENT", resourceLocation, 85 // "../DECENT.Meta/model/DECENTv3.ecore", read, write, resourceBin, 86 // DECENTPackage.eINSTANCE); 87 // restoreMetaModels(); 88 89 // NOTE: Adding the package is essential as otherwise epsilon breaks 90 model = new InMemoryEmfModel("DECENT", resourceBin, DECENTPackage.eINSTANCE); 91 model.setStoredOnDisposal(write); 92 model.setReadOnLoad(read); 93 model.setCachingEnabled(true); 94 restoreMetaModels(); 95 } 96 else { 97 model = 98 createEmfModel("DECENT", decentModelLocation, metaPath + "DECENTv3.ecore", read, 99 write); 100 } 101 102 return model; 103 } 104 105 /** 106 * Converts the decent model to a binary form 107 * 108 * @param location 109 * of the decent model file 110 */ 111 public void convertDECENTModelToBinary(String location) { 112 unregisterMetaModels(""); 113 DECENTResourceTool tool = new DECENTResourceTool(); 114 Resource resource = 115 tool.loadResourceFromXMI(location + "/model.decent", "decent", DECENTPackage.eINSTANCE); 116 tool.storeBinaryResourceContents(resource.getContents(), 117 location + "/model.decent" + "bin", "decentbin"); 118 restoreMetaModels(); 119 } 120 121 /** 122 * Converts the decent model to a xmi form 123 * 124 * @param location 125 * of the decent model file 126 */ 127 128 public void convertDECENTModelToXMI(String location) { 129 unregisterMetaModels(""); 130 DECENTResourceTool tool = new DECENTResourceTool(); 131 Resource resource = 132 tool.loadResourceFromBinary(location + "/model.decentbin", "decentbin", 133 DECENTPackage.eINSTANCE); 134 restoreMetaModels(); 135 tool.storeResourceContents(resource.getContents(), location + "/model.decent", "decent"); 136 } 137 138 /** 139 * Returns the arffx model as IModel instance 140 * 141 * @param arffxModelLocation 142 * location of the arffx model file 143 * @param read 144 * indicates if the model should be read from 145 * @param write 146 * indicates if data should be written in the model 147 * @return EmFModel (IModel) instance from the arffx model, which was loaded 148 * @throws Exception 
149 */ 150 151 public IModel getARFFxModel(String arffxModelLocation, boolean read, boolean write) 152 throws Exception 153 { 154 155 EmfModel model; 156 157 if (isUseARFFxBinary()) { 158 unregisterMetaModels(""); 159 if (!read) { 160 new File(arffxModelLocation).delete(); 161 new File(arffxModelLocation + "bin").delete(); 162 } 163 ARFFxResourceTool tool = new ARFFxResourceTool(); 164 if (new File(arffxModelLocation).exists() && 165 !new File(arffxModelLocation + "bin").exists()) 166 { 167 Resource resource = 168 tool.loadResourceFromXMI(arffxModelLocation, "arffx", ARFFxPackage.eINSTANCE); 169 tool.storeBinaryResourceContents(resource.getContents(), 170 arffxModelLocation + "bin", "arffxbin"); 171 } 172 173 Resource resourceBin = 174 tool.loadResourceFromBinary(arffxModelLocation + "bin", "arffxbin", 175 ARFFxPackage.eINSTANCE); 176 // alternative pattern 177 // model = createInMemoryEmfModel("DECENT", resourceLocation, 178 // "../DECENT.Meta/model/DECENTv3.ecore", read, write, resourceBin, 179 // DECENTPackage.eINSTANCE); 180 // restoreMetaModels(); 181 182 // NOTE: Adding the package is essential as otherwise epsilon breaks 183 model = new InMemoryEmfModel("ARFFx", resourceBin, ARFFxPackage.eINSTANCE); 184 // model.getModelImpl().getURI().toFileString() 185 model.setStoredOnDisposal(write); 186 model.setReadOnLoad(read); 187 model.setCachingEnabled(true); 188 restoreMetaModels(); 189 } 190 else { 191 model = 192 createEmfModel("ARFFx", arffxModelLocation, metaPath + "ARFFx.ecore", read, write); 193 } 194 195 return model; 196 } 197 198 /** 199 * Converts an arffx model to a binary version 200 * 201 * @param location 202 * of the arffx model 203 */ 204 public void convertARFFxModelToBinary(String location) { 205 unregisterMetaModels(""); 206 ARFFxResourceTool tool = new ARFFxResourceTool(); 207 Resource resource = 208 tool.loadResourceFromXMI(location + "/model.arffx", "arffx", ARFFxPackage.eINSTANCE); 209 tool.storeBinaryResourceContents(resource.getContents(), location + "/model.arffx" + "bin", 210 "arffxbin"); 211 restoreMetaModels(); 212 } 213 214 /** 215 * Converts an arffx model to xmi 216 * 217 * @param location 218 * of the arffx model 219 */ 220 221 public void convertARFFxModelToXMI(String location) { 222 unregisterMetaModels(""); 223 ARFFxResourceTool tool = new ARFFxResourceTool(); 224 Resource resource = 225 tool.loadResourceFromBinary(location + "/model.arffxbin", "arffxbin", 226 DECENTPackage.eINSTANCE); 227 restoreMetaModels(); 228 tool.storeResourceContents(resource.getContents(), location + "/model.arffx", "arffx"); 229 } 230 231 /** 232 * Returns the log model as IModel instance 233 * 234 * @param logModelLocation 235 * location of the log model file 236 * @param read 237 * indicates if the model should be read from 238 * @param write 239 * indicates if data should be written in the model 240 * @return EmFModel (IModel) instance from the log model, which was loaded 241 * @throws Exception 242 */ 243 244 public IModel getLOGModel(String logModelLocation, boolean read, boolean write) 245 throws Exception 246 { 247 if (!new File(logModelLocation).exists()) { 248 read = false; 249 } 250 IModel model = createEmfModel("LOG", logModelLocation, metaPath + "LOG.ecore", read, write); 251 System.setProperty("epsilon.logFileAvailable", "true"); 252 return model; 253 } 254 255 /** 256 * Creates an EMF Model 257 * 258 * @param name 259 * of the emf model 260 * @param model 261 * name of the model 262 * @param metamodel 263 * name of the metamodel 264 * @param readOnLoad 265 * 
indicates if the model should be read on load 266 * @param storeOnDisposal 267 * indicates if the model should be stored on disposal 268 * @return 269 * @throws EolModelLoadingException 270 * @throws URISyntaxException 271 */ 272 273 @SuppressWarnings("deprecation") 274 protected EmfModel createEmfModel(String name, 275 String model, 276 String metamodel, 277 boolean readOnLoad, 278 boolean storeOnDisposal) throws EolModelLoadingException, 279 URISyntaxException 280 { 281 EmfModel emfModel = new EmfModel(); 282 StringProperties properties = new StringProperties(); 283 properties.put(EmfModel.PROPERTY_NAME, name); 284 properties.put(EmfModel.PROPERTY_ALIASES, name); 285 properties.put(EmfModel.PROPERTY_FILE_BASED_METAMODEL_URI, "file:/" + 286 getFile(metamodel).getAbsolutePath()); 287 properties.put(EmfModel.PROPERTY_MODEL_URI, "file:/" + getFile(model).getAbsolutePath()); 288 properties.put(EmfModel.PROPERTY_IS_METAMODEL_FILE_BASED, "true"); 289 properties.put(EmfModel.PROPERTY_READONLOAD, readOnLoad + ""); 290 properties.put(EmfModel.PROPERTY_CACHED, "true"); 291 properties.put(EmfModel.PROPERTY_STOREONDISPOSAL, storeOnDisposal + ""); 292 emfModel.load(properties, ""); 293 // System.out.println(emfModel.allContents()); 294 return emfModel; 295 } 296 297 /** 298 * Returns a new File instance from the given filename 299 * 300 * @param fileName 301 * of the file 302 * @return 303 * @throws URISyntaxException 304 */ 305 public File getFile(String fileName) throws URISyntaxException { 306 ; 307 return new File(fileName); 308 } 309 310 /** 311 * Restores the metamodels, so that they are registered in the EPackage registry 312 */ 313 private void restoreMetaModels() { 314 for (String key : metaModelCache.keySet()) { 315 EPackage.Registry.INSTANCE.put(key, metaModelCache.get(key)); 316 }; 317 } 318 319 /** 320 * Unregister the metamodels from the EPackage registry 321 * 322 * @param filter 323 * for filtering out certain instances 324 */ 325 private void unregisterMetaModels(String filter) { 326 for (String key : EPackage.Registry.INSTANCE.keySet()) { 327 if (key.contains(filter)) { 328 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 329 } 330 }; 331 for (String key : metaModelCache.keySet()) { 332 EPackage.Registry.INSTANCE.remove(key); 333 }; 334 } 335 336 /** 337 * Returns true if decent binary model is used 338 * 339 * @return 340 */ 341 342 public boolean isUseDECENTBinary() { 343 return useDECENTBinary; 344 } 345 346 /** 347 * Sets the boolean which indicates, if the decent binary model is used 348 * 349 * @param useDECENTBinary 350 */ 351 public void setUseDECENTBinary(boolean useDECENTBinary) { 352 this.useDECENTBinary = useDECENTBinary; 353 } 354 355 /** 356 * Returns true if arffx binary model is used 357 * 358 * @return 359 */ 360 public boolean isUseARFFxBinary() { 361 return useARFFxBinary; 362 } 363 364 /** 365 * Sets the boolean which indicates, if the arffx binary model is used 366 * 367 * @param useARFFxBinary 368 */ 369 370 public void setUseARFFxBinary(boolean useARFFxBinary) { 371 this.useARFFxBinary = useARFFxBinary; 372 } 373 304 374 } -
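For orientation, a minimal usage sketch of the model handler whose diff ends above. The enclosing class name is not visible in this excerpt, so DECENTEpsilonModelHandler is an assumed name and the model location is hypothetical; getDECENTModel, setUseDECENTBinary, and the read/write semantics are taken from the code shown.

import org.eclipse.epsilon.eol.models.IModel;

public class ModelHandlerSketch {
    public static void main(String[] args) throws Exception {
        // Assumed class name; not shown in this excerpt.
        DECENTEpsilonModelHandler handler = new DECENTEpsilonModelHandler();
        handler.setUseDECENTBinary(true); // work on the *.decentbin representation

        // read = true: load existing contents; write = false: discard changes on disposal.
        IModel decentModel = handler.getDECENTModel("./decent/models/project.decent", true, false);

        // ... run Epsilon (EOL/ETL) tasks against decentModel here ...

        decentModel.dispose(); // persists the model only if write was true
    }
}

With read set to false the handler deletes any existing model files before loading, and with write set to true the contents are stored when the IModel is disposed.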
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/DECENTResourceTool.java
r36 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 12 26 * 13 27 * @author Philip Makedonski, Fabian Trautsch 14 * 28 * 15 29 */ 16 30 public class DECENTResourceTool extends ResourceTool { 17 18 /** 19 * Initializes the Tool Factory, from which the models can be loaded and 20 * inizializes the validator. 21 */ 22 public DECENTResourceTool(){ 23 super(DECENTResourceTool.class.getName()); 24 DECENTPackageImpl.init(); 25 this.resourceFactory = new DECENTResourceFactoryImpl(); 26 initializeValidator(); 27 } 28 29 /** 30 * Inizializes the model validator 31 */ 32 @Override 33 protected void initializeValidator(){ 34 super.initializeValidator(); 35 EObjectValidator validator = new EObjectValidator(); 36 EValidator.Registry.INSTANCE.put(DECENTPackage.eINSTANCE, validator); 37 } 38 39 31 32 /** 33 * Initializes the Tool Factory, from which the models can be loaded and inizializes the 34 * validator. 35 */ 36 public DECENTResourceTool() { 37 super(DECENTResourceTool.class.getName()); 38 DECENTPackageImpl.init(); 39 this.resourceFactory = new DECENTResourceFactoryImpl(); 40 initializeValidator(); 41 } 42 43 /** 44 * Inizializes the model validator 45 */ 46 @Override 47 protected void initializeValidator() { 48 super.initializeValidator(); 49 EObjectValidator validator = new EObjectValidator(); 50 EValidator.Registry.INSTANCE.put(DECENTPackage.eINSTANCE, validator); 51 } 40 52 41 53 } -
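As a hedged sketch of DECENTResourceTool in isolation (file names are hypothetical, and the generated DECENT metamodel classes are assumed to be on the classpath), the tool can load an XMI .decent model and store it in the binary form, mirroring convertDECENTModelToBinary above:

import org.eclipse.emf.ecore.resource.Resource;
import de.ugoe.cs.cpdp.decentApp.DECENTResourceTool;

public class DecentConversionSketch {
    public static void main(String[] args) {
        DECENTResourceTool tool = new DECENTResourceTool();
        // DECENTPackage.eINSTANCE is used exactly as in the diff; its package path
        // is not visible in this excerpt, so the corresponding import is left out here.
        Resource resource = tool.loadResourceFromXMI("model.decent", "decent",
                                                     DECENTPackage.eINSTANCE);
        tool.storeBinaryResourceContents(resource.getContents(), "model.decentbin", "decentbin");
    }
}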
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/FileWatcher.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 8 22 * 9 23 * @author Philip Makedonski 10 * 24 * 11 25 */ 12 26 public abstract class FileWatcher extends TimerTask { 13 // Last timestamp 14 private long timeStamp; 15 16 // File to watch 17 private File file; 27 // Last timestamp 28 private long timeStamp; 18 29 19 /** 20 * Constructor 21 * @param file 22 */ 23 public FileWatcher(File file) { 24 this.file = file; 25 this.timeStamp = file.lastModified(); 26 } 30 // File to watch 31 private File file; 27 32 28 /** 29 * Watches a file and executes the onChange Method 30 * if a file is changed 31 */ 32 public final void run() { 33 long timeStamp = file.lastModified(); 33 /** 34 * Constructor 35 * 36 * @param file 37 */ 38 public FileWatcher(File file) { 39 this.file = file; 40 this.timeStamp = file.lastModified(); 41 } 34 42 35 if (this.timeStamp != timeStamp) { 36 this.timeStamp = timeStamp; 37 onChange(file); 38 } 39 } 43 /** 44 * Watches a file and executes the onChange Method if a file is changed 45 */ 46 public final void run() { 47 long timeStamp = file.lastModified(); 40 48 41 protected abstract void onChange(File file); 49 if (this.timeStamp != timeStamp) { 50 this.timeStamp = timeStamp; 51 onChange(file); 52 } 53 } 54 55 protected abstract void onChange(File file); 42 56 } -
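FileWatcher is an abstract TimerTask that polls a file's lastModified timestamp and calls onChange whenever it differs from the last observed value. A minimal sketch, assuming a hypothetical file to watch and a one-second polling period:

import java.io.File;
import java.util.Timer;
import de.ugoe.cs.cpdp.decentApp.FileWatcher;

public class FileWatcherSketch {
    public static void main(String[] args) {
        File watched = new File("results/experiment.csv"); // hypothetical path
        FileWatcher watcher = new FileWatcher(watched) {
            @Override
            protected void onChange(File file) {
                System.out.println("File changed: " + file.getAbsolutePath());
            }
        };
        // Check the timestamp immediately and then once per second.
        new Timer().schedule(watcher, 0, 1000);
    }
}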
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/ResourceTool.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 32 46 * 33 47 * @author Philip Makedonski 34 * 48 * 35 49 */ 36 50 public class ResourceTool { 37 51 38 protected ResourceFactoryImpl resourceFactory = new XMIResourceFactoryImpl(); 39 40 /** 41 * Constructor 42 * @param loggedClass 43 */ 44 public ResourceTool(String loggedClass) { 45 System.setProperty("org.slf4j.simpleLogger.logFile","validation.log"); 46 System.setProperty("org.slf4j.simpleLogger.logFile","System.out"); 47 } 48 49 /** 50 * Initializes the validator 51 */ 52 protected void initializeValidator() { 53 // OCL.initialize(null); 54 String oclDelegateURI = OCLConstants.OCL_DELEGATE_URI+"/Pivot"; 55 56 EOperation.Internal.InvocationDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, 57 new OCLInvocationDelegateFactory(oclDelegateURI)); 58 EStructuralFeature.Internal.SettingDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, 59 new OCLSettingDelegateFactory(oclDelegateURI)); 60 EValidator.ValidationDelegate.Registry.INSTANCE.put(oclDelegateURI, 61 new OCLValidationDelegateFactory(oclDelegateURI)); 62 63 // EStructuralFeature.Internal.SettingDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, 64 // new OCLSettingDelegateFactory.Global()); 65 // QueryDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, new OCLQueryDelegateFactory.Global()); 66 67 } 68 69 /** 70 * Validates the ressource 71 * @param resource to validate 72 */ 73 public void validateResource(Resource resource) { 74 BasicDiagnostic diagnostics = new BasicDiagnostic(); 75 boolean valid = true; 76 for (EObject eo : resource.getContents()) 77 { 78 Map<Object, Object> context = new HashMap<Object, Object>(); 79 boolean validationResult = Diagnostician.INSTANCE.validate(eo, diagnostics, context); 80 showDiagnostics(diagnostics, ""); 81 valid &= validationResult; 82 } 83 84 if (!valid){ 85 System.out.println("Problem with validation!"); 86 } 87 } 88 89 /** 90 * Output method for showing diagnostics for different ressources 91 * @param diagnostics 92 * @param indent 93 */ 94 protected void showDiagnostics(Diagnostic diagnostics, String indent) { 95 indent+=" "; 96 for (Diagnostic d : diagnostics.getChildren()){ 97 System.out.println(indent+d.getSource()); 98 System.out.println(indent+" "+d.getMessage()); 99 showDiagnostics(d,indent); 100 } 101 } 102 103 104 /** 105 * Loads a ressource from XMI 106 * @param inputPath path to the xmi 107 * @param extension of the ressource to load 108 * @param p the given EPackage 109 * @return 110 */ 111 //TODO: workarounds copied from respective methods without EPackage parameter 112 @SuppressWarnings({ "rawtypes", "unchecked" }) 113 public Resource loadResourceFromXMI(String inputPath, String extension, EPackage p) { 114 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 115 Map<String, Object> m = reg.getExtensionToFactoryMap(); 116 m.put(extension, resourceFactory); 
117 ResourceSet resSetIn = new ResourceSetImpl(); 118 //critical part 119 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 120 121 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 122 try { 123 Map options = new HashMap<>(); 124 options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 125 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 126 inputResource.load(options); 127 } catch (IOException e) { 128 e.printStackTrace(); 129 } 130 return inputResource; 131 } 132 133 /** 134 * Loads a ressource from XMI 135 * @param inputPath path to the xmi 136 * @param extension of the ressource to load 137 * @return 138 */ 139 140 @SuppressWarnings({ "rawtypes", "unchecked" }) 141 public Resource loadResourceFromXMI(String inputPath, String extension) { 142 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 143 Map<String, Object> m = reg.getExtensionToFactoryMap(); 144 m.put(extension, resourceFactory); 145 ResourceSet resSetIn = new ResourceSetImpl(); 146 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 147 try { 148 Map options = new HashMap<>(); 149 options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 150 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 151 inputResource.load(options); 152 } catch (IOException e) { 153 e.printStackTrace(); 154 } 155 return inputResource; 156 } 157 158 /** 159 * Gets a ressource from a binary form 160 * @param inputPath path to the binary 161 * @param extension of the model to load 162 * @param p EPackage to put the loaded ressource in 163 * @return 164 */ 165 public Resource getResourceFromBinary(String inputPath, String extension, EPackage p) { 166 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 167 Map<String, Object> m = reg.getExtensionToFactoryMap(); 168 m.put(extension, new Resource.Factory() { 169 170 @Override 171 public Resource createResource(URI uri) { 172 return new BinaryResourceImpl(uri); 173 } 174 175 }); 176 177 ResourceSet resSetIn = new ResourceSetImpl(); 178 //critical part 179 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 180 181 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 182 return inputResource; 183 } 184 185 186 /** 187 * Loads a ressource from a binary form 188 * @param inputPath path to the binary 189 * @param extension of the model to load 190 * @param p EPackage to put the loaded ressource in 191 * @return 192 */ 193 //TODO: workarounds copied from respective methods without EPackage parameter 194 @SuppressWarnings({ "rawtypes" }) 195 public Resource loadResourceFromBinary(String inputPath, String extension, EPackage p) { 196 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 197 Map<String, Object> m = reg.getExtensionToFactoryMap(); 198 m.put(extension, new Resource.Factory() { 199 200 @Override 201 public Resource createResource(URI uri) { 202 return new BinaryResourceImpl(uri); 203 } 204 205 }); 206 207 ResourceSet resSetIn = new ResourceSetImpl(); 208 //critical part 209 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 210 211 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 212 if (new File(inputPath).exists()) { 213 214 try { 215 Map options = new HashMap<>(); 216 // options.put(BinaryResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 217 // 
options.put(BinaryResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 218 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 219 inputResource.load(options); 220 } catch (IOException e) { 221 e.printStackTrace(); 222 } 223 } 224 return inputResource; 225 } 226 227 /** 228 * Loads a ressource from a binary form 229 * 230 * @param inputPath path to the binary 231 * @param extension of the model to load 232 * @return 233 */ 234 @SuppressWarnings({ "rawtypes" }) 235 public Resource loadResourceFromBinary(String inputPath, String extension) { 236 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 237 Map<String, Object> m = reg.getExtensionToFactoryMap(); 238 m.put(extension, new Resource.Factory() { 239 240 @Override 241 public Resource createResource(URI uri) { 242 return new BinaryResourceImpl(uri); 243 } 244 245 }); 246 247 ResourceSet resSetIn = new ResourceSetImpl(); 248 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 249 try { 250 Map options = new HashMap<>(); 251 // options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 252 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 253 inputResource.load(options); 254 } catch (IOException e) { 255 e.printStackTrace(); 256 } 257 return inputResource; 258 } 259 260 /** 261 * Stores the binary resource contents to a given path 262 * 263 * @param contents EList of different EObjects to store 264 * @param outputPath path to store to 265 * @param extension of the model to store 266 */ 267 @SuppressWarnings({ "rawtypes" }) 268 public void storeBinaryResourceContents(EList<EObject> contents, String outputPath, String extension) { 269 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 270 Map<String, Object> m = reg.getExtensionToFactoryMap(); 271 m.put(extension, new Resource.Factory() { 272 273 @Override 274 public Resource createResource(URI uri) { 275 return new BinaryResourceImpl(uri); 276 } 277 278 }); 279 280 ResourceSet resSet = new ResourceSetImpl(); 281 Resource outputResource = resSet.createResource(URI.createURI(outputPath)); 282 outputResource.getContents().addAll(contents); 283 try { 284 Map options = new HashMap<>(); 285 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 286 outputResource.save(options); 287 } catch (IOException e) { 288 e.printStackTrace(); 289 } 290 } 291 292 /** 293 * Stores the resource contents to a given path 294 * 295 * @param contents EList of different EObjects to store 296 * @param outputPath path to store to 297 * @param extension of the model to store 298 */ 299 @SuppressWarnings({ "unchecked", "rawtypes" }) 300 public void storeResourceContents(EList<EObject> contents, String outputPath, String extension) { 301 //TODO: duplicated from loadResourceFromXMI => move to a more appropriate location 302 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 303 Map<String, Object> m = reg.getExtensionToFactoryMap(); 304 m.put(extension, resourceFactory); 305 306 ResourceSet resSet = new ResourceSetImpl(); 307 Resource outputResource = resSet.createResource(URI.createURI(outputPath)); 308 outputResource.getContents().addAll(contents); 309 try { 310 Map options = new HashMap<>(); 311 options.put(XMIResourceImpl.OPTION_ENCODING, "UTF-8"); 312 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 
XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 313 outputResource.save(options); 314 } catch (IOException e) { 315 e.printStackTrace(); 316 } 317 } 318 52 protected ResourceFactoryImpl resourceFactory = new XMIResourceFactoryImpl(); 53 54 /** 55 * Constructor 56 * 57 * @param loggedClass 58 */ 59 public ResourceTool(String loggedClass) { 60 System.setProperty("org.slf4j.simpleLogger.logFile", "validation.log"); 61 System.setProperty("org.slf4j.simpleLogger.logFile", "System.out"); 62 } 63 64 /** 65 * Initializes the validator 66 */ 67 protected void initializeValidator() { 68 // OCL.initialize(null); 69 String oclDelegateURI = OCLConstants.OCL_DELEGATE_URI + "/Pivot"; 70 71 EOperation.Internal.InvocationDelegate.Factory.Registry.INSTANCE 72 .put(oclDelegateURI, new OCLInvocationDelegateFactory(oclDelegateURI)); 73 EStructuralFeature.Internal.SettingDelegate.Factory.Registry.INSTANCE 74 .put(oclDelegateURI, new OCLSettingDelegateFactory(oclDelegateURI)); 75 EValidator.ValidationDelegate.Registry.INSTANCE 76 .put(oclDelegateURI, new OCLValidationDelegateFactory(oclDelegateURI)); 77 78 // EStructuralFeature.Internal.SettingDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, 79 // new OCLSettingDelegateFactory.Global()); 80 // QueryDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, new 81 // OCLQueryDelegateFactory.Global()); 82 83 } 84 85 /** 86 * Validates the ressource 87 * 88 * @param resource 89 * to validate 90 */ 91 public void validateResource(Resource resource) { 92 BasicDiagnostic diagnostics = new BasicDiagnostic(); 93 boolean valid = true; 94 for (EObject eo : resource.getContents()) { 95 Map<Object, Object> context = new HashMap<Object, Object>(); 96 boolean validationResult = Diagnostician.INSTANCE.validate(eo, diagnostics, context); 97 showDiagnostics(diagnostics, ""); 98 valid &= validationResult; 99 } 100 101 if (!valid) { 102 System.out.println("Problem with validation!"); 103 } 104 } 105 106 /** 107 * Output method for showing diagnostics for different ressources 108 * 109 * @param diagnostics 110 * @param indent 111 */ 112 protected void showDiagnostics(Diagnostic diagnostics, String indent) { 113 indent += " "; 114 for (Diagnostic d : diagnostics.getChildren()) { 115 System.out.println(indent + d.getSource()); 116 System.out.println(indent + " " + d.getMessage()); 117 showDiagnostics(d, indent); 118 } 119 } 120 121 /** 122 * Loads a ressource from XMI 123 * 124 * @param inputPath 125 * path to the xmi 126 * @param extension 127 * of the ressource to load 128 * @param p 129 * the given EPackage 130 * @return 131 */ 132 // TODO: workarounds copied from respective methods without EPackage parameter 133 @SuppressWarnings( 134 { "rawtypes", "unchecked" }) 135 public Resource loadResourceFromXMI(String inputPath, String extension, EPackage p) { 136 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 137 Map<String, Object> m = reg.getExtensionToFactoryMap(); 138 m.put(extension, resourceFactory); 139 ResourceSet resSetIn = new ResourceSetImpl(); 140 // critical part 141 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 142 143 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 144 try { 145 Map options = new HashMap<>(); 146 options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 147 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 148 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 149 inputResource.load(options); 150 } 151 catch (IOException e) { 152 e.printStackTrace(); 153 } 154 
return inputResource; 155 } 156 157 /** 158 * Loads a ressource from XMI 159 * 160 * @param inputPath 161 * path to the xmi 162 * @param extension 163 * of the ressource to load 164 * @return 165 */ 166 167 @SuppressWarnings( 168 { "rawtypes", "unchecked" }) 169 public Resource loadResourceFromXMI(String inputPath, String extension) { 170 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 171 Map<String, Object> m = reg.getExtensionToFactoryMap(); 172 m.put(extension, resourceFactory); 173 ResourceSet resSetIn = new ResourceSetImpl(); 174 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 175 try { 176 Map options = new HashMap<>(); 177 options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 178 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 179 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 180 inputResource.load(options); 181 } 182 catch (IOException e) { 183 e.printStackTrace(); 184 } 185 return inputResource; 186 } 187 188 /** 189 * Gets a ressource from a binary form 190 * 191 * @param inputPath 192 * path to the binary 193 * @param extension 194 * of the model to load 195 * @param p 196 * EPackage to put the loaded ressource in 197 * @return 198 */ 199 public Resource getResourceFromBinary(String inputPath, String extension, EPackage p) { 200 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 201 Map<String, Object> m = reg.getExtensionToFactoryMap(); 202 m.put(extension, new Resource.Factory() { 203 204 @Override 205 public Resource createResource(URI uri) { 206 return new BinaryResourceImpl(uri); 207 } 208 209 }); 210 211 ResourceSet resSetIn = new ResourceSetImpl(); 212 // critical part 213 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 214 215 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 216 return inputResource; 217 } 218 219 /** 220 * Loads a ressource from a binary form 221 * 222 * @param inputPath 223 * path to the binary 224 * @param extension 225 * of the model to load 226 * @param p 227 * EPackage to put the loaded ressource in 228 * @return 229 */ 230 // TODO: workarounds copied from respective methods without EPackage parameter 231 @SuppressWarnings( 232 { "rawtypes" }) 233 public Resource loadResourceFromBinary(String inputPath, String extension, EPackage p) { 234 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 235 Map<String, Object> m = reg.getExtensionToFactoryMap(); 236 m.put(extension, new Resource.Factory() { 237 238 @Override 239 public Resource createResource(URI uri) { 240 return new BinaryResourceImpl(uri); 241 } 242 243 }); 244 245 ResourceSet resSetIn = new ResourceSetImpl(); 246 // critical part 247 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 248 249 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 250 if (new File(inputPath).exists()) { 251 252 try { 253 Map options = new HashMap<>(); 254 // options.put(BinaryResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 255 // options.put(BinaryResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 256 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 257 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 258 inputResource.load(options); 259 } 260 catch (IOException e) { 261 e.printStackTrace(); 262 } 263 } 264 return inputResource; 265 } 266 267 /** 268 * Loads a ressource from a binary form 269 * 270 * @param inputPath 271 * path to the binary 272 * @param extension 273 * of the model to load 274 
* @return 275 */ 276 @SuppressWarnings( 277 { "rawtypes" }) 278 public Resource loadResourceFromBinary(String inputPath, String extension) { 279 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 280 Map<String, Object> m = reg.getExtensionToFactoryMap(); 281 m.put(extension, new Resource.Factory() { 282 283 @Override 284 public Resource createResource(URI uri) { 285 return new BinaryResourceImpl(uri); 286 } 287 288 }); 289 290 ResourceSet resSetIn = new ResourceSetImpl(); 291 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 292 try { 293 Map options = new HashMap<>(); 294 // options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 295 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 296 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 297 inputResource.load(options); 298 } 299 catch (IOException e) { 300 e.printStackTrace(); 301 } 302 return inputResource; 303 } 304 305 /** 306 * Stores the binary resource contents to a given path 307 * 308 * @param contents 309 * EList of different EObjects to store 310 * @param outputPath 311 * path to store to 312 * @param extension 313 * of the model to store 314 */ 315 @SuppressWarnings( 316 { "rawtypes" }) 317 public void storeBinaryResourceContents(EList<EObject> contents, 318 String outputPath, 319 String extension) 320 { 321 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 322 Map<String, Object> m = reg.getExtensionToFactoryMap(); 323 m.put(extension, new Resource.Factory() { 324 325 @Override 326 public Resource createResource(URI uri) { 327 return new BinaryResourceImpl(uri); 328 } 329 330 }); 331 332 ResourceSet resSet = new ResourceSetImpl(); 333 Resource outputResource = resSet.createResource(URI.createURI(outputPath)); 334 outputResource.getContents().addAll(contents); 335 try { 336 Map options = new HashMap<>(); 337 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 338 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 339 outputResource.save(options); 340 } 341 catch (IOException e) { 342 e.printStackTrace(); 343 } 344 } 345 346 /** 347 * Stores the resource contents to a given path 348 * 349 * @param contents 350 * EList of different EObjects to store 351 * @param outputPath 352 * path to store to 353 * @param extension 354 * of the model to store 355 */ 356 @SuppressWarnings( 357 { "unchecked", "rawtypes" }) 358 public void storeResourceContents(EList<EObject> contents, String outputPath, String extension) 359 { 360 // TODO: duplicated from loadResourceFromXMI => move to a more appropriate location 361 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 362 Map<String, Object> m = reg.getExtensionToFactoryMap(); 363 m.put(extension, resourceFactory); 364 365 ResourceSet resSet = new ResourceSetImpl(); 366 Resource outputResource = resSet.createResource(URI.createURI(outputPath)); 367 outputResource.getContents().addAll(contents); 368 try { 369 Map options = new HashMap<>(); 370 options.put(XMIResourceImpl.OPTION_ENCODING, "UTF-8"); 371 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 372 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 373 outputResource.save(options); 374 } 375 catch (IOException e) { 376 e.printStackTrace(); 377 } 378 } 319 379 320 380 } -
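A brief sketch of the base ResourceTool on its own, assuming the metamodel packages of the loaded resource are already registered and using hypothetical file names: it loads an XMI resource with the default XMIResourceFactoryImpl, validates it, and writes a copy.

import org.eclipse.emf.ecore.resource.Resource;
import de.ugoe.cs.cpdp.decentApp.ResourceTool;

public class ResourceToolSketch {
    public static void main(String[] args) {
        ResourceTool tool = new ResourceTool(ResourceToolSketch.class.getName());
        Resource resource = tool.loadResourceFromXMI("input.xmi", "xmi"); // hypothetical file
        tool.validateResource(resource); // prints diagnostics and a warning if validation fails
        tool.storeResourceContents(resource.getContents(), "copy.xmi", "xmi");
    }
}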
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/AbstractWekaEvaluation.java
r35 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.eval; 2 16 … … 17 31 18 32 /** 19 * Base class for the evaluation of results of classifiers compatible with the {@link Classifier} interface.20 * For each classifier, the following metrics are calculated:33 * Base class for the evaluation of results of classifiers compatible with the {@link Classifier} 34 * interface. For each classifier, the following metrics are calculated: 21 35 * <ul> 22 * <li>succHe: Success with recall>0.7, precision>0.5</li> 23 * <li>succZi: Success with recall>0.7, precision>0.7</li> 24 * <li>succG75: Success with gscore>0.75</li> 25 * <li>succG60: Success with gscore>0.6</li> 26 * <li>error</li> 27 * <li>recall</li> 28 * <li>precision</li> 29 * <li>fscore</li> 30 * <li>gscore</li> 31 * <li>AUC</li> 32 * <li>AUCEC (weighted by LOC, if applicable; 0.0 if LOC not available)</li> 33 * <li>tpr: true positive rate</li> 34 * <li>tnr: true negative rate</li> 35 * <li>tp: true positives</li> 36 * <li>fp: false positives</li> 37 * <li>tn: true negatives</li> 38 * <li>fn: false negatives</li> 39 * <li>errortrain: training error</li> 40 * <li>recalltrain: training recall</li> 41 * <li>precisiontrain: training precision</li> 42 * <li>succHetrain: training success with recall>0.7 and precision>0.5 43 * </ul> 36 * <li>succHe: Success with recall>0.7, precision>0.5</li> 37 * <li>succZi: Success with recall>0.7, precision>0.7</li> 38 * <li>succG75: Success with gscore>0.75</li> 39 * <li>succG60: Success with gscore>0.6</li> 40 * <li>error</li> 41 * <li>recall</li> 42 * <li>precision</li> 43 * <li>fscore</li> 44 * <li>gscore</li> 45 * <li>AUC</li> 46 * <li>AUCEC (weighted by LOC, if applicable; 0.0 if LOC not available)</li> 47 * <li>tpr: true positive rate</li> 48 * <li>tnr: true negative rate</li> 49 * <li>tp: true positives</li> 50 * <li>fp: false positives</li> 51 * <li>tn: true negatives</li> 52 * <li>fn: false negatives</li> 53 * <li>errortrain: training error</li> 54 * <li>recalltrain: training recall</li> 55 * <li>precisiontrain: training precision</li> 56 * <li>succHetrain: training success with recall>0.7 and precision>0.5 57 * </ul> 58 * 44 59 * @author Steffen Herbold 45 60 */ 46 61 public abstract class AbstractWekaEvaluation implements IEvaluationStrategy { 47 62 48 /** 49 * writer for the evaluation results 50 */ 51 private PrintWriter output = new PrintWriter(System.out); 52 53 private boolean outputIsSystemOut = true; 54 55 /** 56 * Creates the weka evaluator. Allows the creation of the evaluator in different ways, e.g., for cross-validation 57 * or evaluation on the test data. 
58 * @param testdata test data 59 * @param classifier classifier used 60 * @return evaluator 61 */ 62 protected abstract Evaluation createEvaluator(Instances testdata, Classifier classifier); 63 64 /* 65 * (non-Javadoc) 66 * @see de.ugoe.cs.cpdp.eval.EvaluationStrategy#apply(weka.core.Instances, weka.core.Instances, java.util.List, boolean) 67 */ 68 @Override 69 public void apply(Instances testdata, Instances traindata, List<ITrainer> trainers, 70 boolean writeHeader) { 71 final List<Classifier> classifiers = new LinkedList<Classifier>(); 72 for( ITrainer trainer : trainers ) { 73 if( trainer instanceof IWekaCompatibleTrainer ) { 74 classifiers.add(((IWekaCompatibleTrainer) trainer).getClassifier()); 75 } else { 76 throw new RuntimeException("The selected evaluator only support Weka classifiers"); 77 } 78 } 79 80 if( writeHeader ) { 81 output.append("version,size_test,size_training"); 82 for( ITrainer trainer : trainers ) { 83 output.append(",succHe_" + ((IWekaCompatibleTrainer) trainer).getName()); 84 output.append(",succZi_" + ((IWekaCompatibleTrainer) trainer).getName()); 85 output.append(",succG75_" + ((IWekaCompatibleTrainer) trainer).getName()); 86 output.append(",succG60_" + ((IWekaCompatibleTrainer) trainer).getName()); 87 output.append(",error_" + ((IWekaCompatibleTrainer) trainer).getName()); 88 output.append(",recall_" + ((IWekaCompatibleTrainer) trainer).getName()); 89 output.append(",precision_" + ((IWekaCompatibleTrainer) trainer).getName()); 90 output.append(",fscore_" + ((IWekaCompatibleTrainer) trainer).getName()); 91 output.append(",gscore_" + ((IWekaCompatibleTrainer) trainer).getName()); 92 output.append(",mcc_" + ((IWekaCompatibleTrainer) trainer).getName()); 93 output.append(",auc_" + ((IWekaCompatibleTrainer) trainer).getName()); 94 output.append(",aucec_" + ((IWekaCompatibleTrainer) trainer).getName()); 95 output.append(",tpr_" + ((IWekaCompatibleTrainer) trainer).getName()); 96 output.append(",tnr_" + ((IWekaCompatibleTrainer) trainer).getName()); 97 output.append(",tp_" + ((IWekaCompatibleTrainer) trainer).getName()); 98 output.append(",fn_" + ((IWekaCompatibleTrainer) trainer).getName()); 99 output.append(",tn_" + ((IWekaCompatibleTrainer) trainer).getName()); 100 output.append(",fp_" + ((IWekaCompatibleTrainer) trainer).getName()); 101 output.append(",trainerror_" + ((IWekaCompatibleTrainer) trainer).getName()); 102 output.append(",trainrecall_" + ((IWekaCompatibleTrainer) trainer).getName()); 103 output.append(",trainprecision_" + ((IWekaCompatibleTrainer) trainer).getName()); 104 output.append(",trainsuccHe_" + ((IWekaCompatibleTrainer) trainer).getName()); 105 } 106 output.append(StringTools.ENDLINE); 107 } 108 109 output.append(testdata.relationName()); 110 output.append("," + testdata.numInstances()); 111 output.append("," + traindata.numInstances()); 112 113 Evaluation eval = null; 114 Evaluation evalTrain = null; 115 for( Classifier classifier : classifiers ) { 116 eval = createEvaluator(testdata, classifier); 117 evalTrain = createEvaluator(traindata, classifier); 118 119 double pf = eval.numFalsePositives(1)/(eval.numFalsePositives(1)+eval.numTrueNegatives(1)); 120 double gmeasure = 2*eval.recall(1)*(1.0-pf)/(eval.recall(1)+(1.0-pf)); 121 double mcc = 
(eval.numTruePositives(1)*eval.numTrueNegatives(1)-eval.numFalsePositives(1)*eval.numFalseNegatives(1))/Math.sqrt((eval.numTruePositives(1)+eval.numFalsePositives(1))*(eval.numTruePositives(1)+eval.numFalseNegatives(1))*(eval.numTrueNegatives(1)+eval.numFalsePositives(1))*(eval.numTrueNegatives(1)+eval.numFalseNegatives(1))); 122 double aucec = calculateReviewEffort(testdata, classifier); 123 124 if( eval.recall(1)>=0.7 && eval.precision(1) >= 0.5 ) { 125 output.append(",1"); 126 } else { 127 output.append(",0"); 128 } 129 130 if( eval.recall(1)>=0.7 && eval.precision(1) >= 0.7 ) { 131 output.append(",1"); 132 } else { 133 output.append(",0"); 134 } 135 136 if( gmeasure>0.75 ) { 137 output.append(",1"); 138 } else { 139 output.append(",0"); 140 } 141 142 if( gmeasure>0.6 ) { 143 output.append(",1"); 144 } else { 145 output.append(",0"); 146 } 147 148 output.append("," + eval.errorRate()); 149 output.append("," + eval.recall(1)); 150 output.append("," + eval.precision(1)); 151 output.append("," + eval.fMeasure(1)); 152 output.append("," + gmeasure); 153 output.append("," + mcc); 154 output.append("," + eval.areaUnderROC(1)); 155 output.append("," + aucec); 156 output.append("," + eval.truePositiveRate(1)); 157 output.append("," + eval.trueNegativeRate(1)); 158 output.append("," + eval.numTruePositives(1)); 159 output.append("," + eval.numFalseNegatives(1)); 160 output.append("," + eval.numTrueNegatives(1)); 161 output.append("," + eval.numFalsePositives(1)); 162 output.append("," + evalTrain.errorRate()); 163 output.append("," + evalTrain.recall(1)); 164 output.append("," + evalTrain.precision(1)); 165 if( evalTrain.recall(1)>=0.7 && evalTrain.precision(1) >= 0.5 ) { 166 output.append(",1"); 167 } else { 168 output.append(",0"); 169 } 170 } 171 172 output.append(StringTools.ENDLINE); 173 output.flush(); 174 } 175 176 private double calculateReviewEffort(Instances testdata, Classifier classifier) { 177 178 final Attribute loc = testdata.attribute("loc"); 179 if( loc==null ) { 180 return 0.0; 181 } 182 183 final List<Integer> bugPredicted = new ArrayList<>(); 184 final List<Integer> nobugPredicted = new ArrayList<>(); 185 double totalLoc = 0.0d; 186 int totalBugs = 0; 187 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 188 try { 189 if( Double.compare(classifier.classifyInstance(testdata.instance(i)),0.0d)==0 ) { 190 nobugPredicted.add(i); 191 } else { 192 bugPredicted.add(i); 193 } 194 } catch (Exception e) { 195 throw new RuntimeException("unexpected error during the evaluation of the review effort", e); 196 } 197 if(Double.compare(testdata.instance(i).classValue(),1.0d)==0) { 198 totalBugs++; 199 } 200 totalLoc += testdata.instance(i).value(loc); 201 } 202 203 final List<Double> reviewLoc = new ArrayList<>(testdata.numInstances()); 204 final List<Double> bugsFound = new ArrayList<>(testdata.numInstances()); 205 206 double currentBugsFound = 0; 207 208 while( !bugPredicted.isEmpty() ) { 209 double minLoc = Double.MAX_VALUE; 210 int minIndex = -1; 211 for( int i=0 ; i<bugPredicted.size() ; i++ ) { 212 double currentLoc = testdata.instance(bugPredicted.get(i)).value(loc); 213 if( currentLoc<minLoc ) { 214 minIndex = i; 215 minLoc = currentLoc; 216 } 217 } 218 if( minIndex!=-1 ) { 219 reviewLoc.add(minLoc/totalLoc); 220 221 currentBugsFound += testdata.instance(bugPredicted.get(minIndex)).classValue(); 222 bugsFound.add(currentBugsFound); 223 224 bugPredicted.remove(minIndex); 225 } else { 226 throw new RuntimeException("Shouldn't happen!"); 227 } 228 } 229 230 while( 
!nobugPredicted.isEmpty() ) { 231 double minLoc = Double.MAX_VALUE; 232 int minIndex = -1; 233 for( int i=0 ; i<nobugPredicted.size() ; i++ ) { 234 double currentLoc = testdata.instance(nobugPredicted.get(i)).value(loc); 235 if( currentLoc<minLoc ) { 236 minIndex = i; 237 minLoc = currentLoc; 238 } 239 } 240 if( minIndex!=-1 ) { 241 reviewLoc.add(minLoc/totalLoc); 242 243 currentBugsFound += testdata.instance(nobugPredicted.get(minIndex)).classValue(); 244 bugsFound.add(currentBugsFound); 245 nobugPredicted.remove(minIndex); 246 } else { 247 throw new RuntimeException("Shouldn't happen!"); 248 } 249 } 250 251 double auc = 0.0; 252 for( int i=0 ; i<bugsFound.size() ; i++ ) { 253 auc += reviewLoc.get(i)*bugsFound.get(i)/totalBugs; 254 } 255 256 return auc; 257 } 258 259 /* 260 * (non-Javadoc) 261 * @see de.ugoe.cs.cpdp.Parameterizable#setParameter(java.lang.String) 262 */ 263 @Override 264 public void setParameter(String parameters) { 265 if( output!=null && !outputIsSystemOut ) { 266 output.close(); 267 } 268 if( "system.out".equals(parameters) || "".equals(parameters) ) { 269 output = new PrintWriter(System.out); 270 outputIsSystemOut = true; 271 } else { 272 try { 273 output = new PrintWriter(new FileOutputStream(parameters)); 274 outputIsSystemOut = false; 275 } catch (FileNotFoundException e) { 276 throw new RuntimeException(e); 277 } 278 } 279 } 63 /** 64 * writer for the evaluation results 65 */ 66 private PrintWriter output = new PrintWriter(System.out); 67 68 private boolean outputIsSystemOut = true; 69 70 /** 71 * Creates the weka evaluator. Allows the creation of the evaluator in different ways, e.g., for 72 * cross-validation or evaluation on the test data. 73 * 74 * @param testdata 75 * test data 76 * @param classifier 77 * classifier used 78 * @return evaluator 79 */ 80 protected abstract Evaluation createEvaluator(Instances testdata, Classifier classifier); 81 82 /* 83 * (non-Javadoc) 84 * 85 * @see de.ugoe.cs.cpdp.eval.EvaluationStrategy#apply(weka.core.Instances, weka.core.Instances, 86 * java.util.List, boolean) 87 */ 88 @Override 89 public void apply(Instances testdata, 90 Instances traindata, 91 List<ITrainer> trainers, 92 boolean writeHeader) 93 { 94 final List<Classifier> classifiers = new LinkedList<Classifier>(); 95 for (ITrainer trainer : trainers) { 96 if (trainer instanceof IWekaCompatibleTrainer) { 97 classifiers.add(((IWekaCompatibleTrainer) trainer).getClassifier()); 98 } 99 else { 100 throw new RuntimeException("The selected evaluator only support Weka classifiers"); 101 } 102 } 103 104 if (writeHeader) { 105 output.append("version,size_test,size_training"); 106 for (ITrainer trainer : trainers) { 107 output.append(",succHe_" + ((IWekaCompatibleTrainer) trainer).getName()); 108 output.append(",succZi_" + ((IWekaCompatibleTrainer) trainer).getName()); 109 output.append(",succG75_" + ((IWekaCompatibleTrainer) trainer).getName()); 110 output.append(",succG60_" + ((IWekaCompatibleTrainer) trainer).getName()); 111 output.append(",error_" + ((IWekaCompatibleTrainer) trainer).getName()); 112 output.append(",recall_" + ((IWekaCompatibleTrainer) trainer).getName()); 113 output.append(",precision_" + ((IWekaCompatibleTrainer) trainer).getName()); 114 output.append(",fscore_" + ((IWekaCompatibleTrainer) trainer).getName()); 115 output.append(",gscore_" + ((IWekaCompatibleTrainer) trainer).getName()); 116 output.append(",mcc_" + ((IWekaCompatibleTrainer) trainer).getName()); 117 output.append(",auc_" + ((IWekaCompatibleTrainer) trainer).getName()); 118 
output.append(",aucec_" + ((IWekaCompatibleTrainer) trainer).getName()); 119 output.append(",tpr_" + ((IWekaCompatibleTrainer) trainer).getName()); 120 output.append(",tnr_" + ((IWekaCompatibleTrainer) trainer).getName()); 121 output.append(",tp_" + ((IWekaCompatibleTrainer) trainer).getName()); 122 output.append(",fn_" + ((IWekaCompatibleTrainer) trainer).getName()); 123 output.append(",tn_" + ((IWekaCompatibleTrainer) trainer).getName()); 124 output.append(",fp_" + ((IWekaCompatibleTrainer) trainer).getName()); 125 output.append(",trainerror_" + ((IWekaCompatibleTrainer) trainer).getName()); 126 output.append(",trainrecall_" + ((IWekaCompatibleTrainer) trainer).getName()); 127 output.append(",trainprecision_" + ((IWekaCompatibleTrainer) trainer).getName()); 128 output.append(",trainsuccHe_" + ((IWekaCompatibleTrainer) trainer).getName()); 129 } 130 output.append(StringTools.ENDLINE); 131 } 132 133 output.append(testdata.relationName()); 134 output.append("," + testdata.numInstances()); 135 output.append("," + traindata.numInstances()); 136 137 Evaluation eval = null; 138 Evaluation evalTrain = null; 139 for (Classifier classifier : classifiers) { 140 eval = createEvaluator(testdata, classifier); 141 evalTrain = createEvaluator(traindata, classifier); 142 143 double pf = 144 eval.numFalsePositives(1) / (eval.numFalsePositives(1) + eval.numTrueNegatives(1)); 145 double gmeasure = 2 * eval.recall(1) * (1.0 - pf) / (eval.recall(1) + (1.0 - pf)); 146 double mcc = 147 (eval.numTruePositives(1) * eval.numTrueNegatives(1) - eval.numFalsePositives(1) * 148 eval.numFalseNegatives(1)) / 149 Math.sqrt((eval.numTruePositives(1) + eval.numFalsePositives(1)) * 150 (eval.numTruePositives(1) + eval.numFalseNegatives(1)) * 151 (eval.numTrueNegatives(1) + eval.numFalsePositives(1)) * 152 (eval.numTrueNegatives(1) + eval.numFalseNegatives(1))); 153 double aucec = calculateReviewEffort(testdata, classifier); 154 155 if (eval.recall(1) >= 0.7 && eval.precision(1) >= 0.5) { 156 output.append(",1"); 157 } 158 else { 159 output.append(",0"); 160 } 161 162 if (eval.recall(1) >= 0.7 && eval.precision(1) >= 0.7) { 163 output.append(",1"); 164 } 165 else { 166 output.append(",0"); 167 } 168 169 if (gmeasure > 0.75) { 170 output.append(",1"); 171 } 172 else { 173 output.append(",0"); 174 } 175 176 if (gmeasure > 0.6) { 177 output.append(",1"); 178 } 179 else { 180 output.append(",0"); 181 } 182 183 output.append("," + eval.errorRate()); 184 output.append("," + eval.recall(1)); 185 output.append("," + eval.precision(1)); 186 output.append("," + eval.fMeasure(1)); 187 output.append("," + gmeasure); 188 output.append("," + mcc); 189 output.append("," + eval.areaUnderROC(1)); 190 output.append("," + aucec); 191 output.append("," + eval.truePositiveRate(1)); 192 output.append("," + eval.trueNegativeRate(1)); 193 output.append("," + eval.numTruePositives(1)); 194 output.append("," + eval.numFalseNegatives(1)); 195 output.append("," + eval.numTrueNegatives(1)); 196 output.append("," + eval.numFalsePositives(1)); 197 output.append("," + evalTrain.errorRate()); 198 output.append("," + evalTrain.recall(1)); 199 output.append("," + evalTrain.precision(1)); 200 if (evalTrain.recall(1) >= 0.7 && evalTrain.precision(1) >= 0.5) { 201 output.append(",1"); 202 } 203 else { 204 output.append(",0"); 205 } 206 } 207 208 output.append(StringTools.ENDLINE); 209 output.flush(); 210 } 211 212 private double calculateReviewEffort(Instances testdata, Classifier classifier) { 213 214 final Attribute loc = testdata.attribute("loc"); 215 if (loc 
== null) { 216 return 0.0; 217 } 218 219 final List<Integer> bugPredicted = new ArrayList<>(); 220 final List<Integer> nobugPredicted = new ArrayList<>(); 221 double totalLoc = 0.0d; 222 int totalBugs = 0; 223 for (int i = 0; i < testdata.numInstances(); i++) { 224 try { 225 if (Double.compare(classifier.classifyInstance(testdata.instance(i)), 0.0d) == 0) { 226 nobugPredicted.add(i); 227 } 228 else { 229 bugPredicted.add(i); 230 } 231 } 232 catch (Exception e) { 233 throw new RuntimeException( 234 "unexpected error during the evaluation of the review effort", 235 e); 236 } 237 if (Double.compare(testdata.instance(i).classValue(), 1.0d) == 0) { 238 totalBugs++; 239 } 240 totalLoc += testdata.instance(i).value(loc); 241 } 242 243 final List<Double> reviewLoc = new ArrayList<>(testdata.numInstances()); 244 final List<Double> bugsFound = new ArrayList<>(testdata.numInstances()); 245 246 double currentBugsFound = 0; 247 248 while (!bugPredicted.isEmpty()) { 249 double minLoc = Double.MAX_VALUE; 250 int minIndex = -1; 251 for (int i = 0; i < bugPredicted.size(); i++) { 252 double currentLoc = testdata.instance(bugPredicted.get(i)).value(loc); 253 if (currentLoc < minLoc) { 254 minIndex = i; 255 minLoc = currentLoc; 256 } 257 } 258 if (minIndex != -1) { 259 reviewLoc.add(minLoc / totalLoc); 260 261 currentBugsFound += testdata.instance(bugPredicted.get(minIndex)).classValue(); 262 bugsFound.add(currentBugsFound); 263 264 bugPredicted.remove(minIndex); 265 } 266 else { 267 throw new RuntimeException("Shouldn't happen!"); 268 } 269 } 270 271 while (!nobugPredicted.isEmpty()) { 272 double minLoc = Double.MAX_VALUE; 273 int minIndex = -1; 274 for (int i = 0; i < nobugPredicted.size(); i++) { 275 double currentLoc = testdata.instance(nobugPredicted.get(i)).value(loc); 276 if (currentLoc < minLoc) { 277 minIndex = i; 278 minLoc = currentLoc; 279 } 280 } 281 if (minIndex != -1) { 282 reviewLoc.add(minLoc / totalLoc); 283 284 currentBugsFound += testdata.instance(nobugPredicted.get(minIndex)).classValue(); 285 bugsFound.add(currentBugsFound); 286 nobugPredicted.remove(minIndex); 287 } 288 else { 289 throw new RuntimeException("Shouldn't happen!"); 290 } 291 } 292 293 double auc = 0.0; 294 for (int i = 0; i < bugsFound.size(); i++) { 295 auc += reviewLoc.get(i) * bugsFound.get(i) / totalBugs; 296 } 297 298 return auc; 299 } 300 301 /* 302 * (non-Javadoc) 303 * 304 * @see de.ugoe.cs.cpdp.Parameterizable#setParameter(java.lang.String) 305 */ 306 @Override 307 public void setParameter(String parameters) { 308 if (output != null && !outputIsSystemOut) { 309 output.close(); 310 } 311 if ("system.out".equals(parameters) || "".equals(parameters)) { 312 output = new PrintWriter(System.out); 313 outputIsSystemOut = true; 314 } 315 else { 316 try { 317 output = new PrintWriter(new FileOutputStream(parameters)); 318 outputIsSystemOut = false; 319 } 320 catch (FileNotFoundException e) { 321 throw new RuntimeException(e); 322 } 323 } 324 } 280 325 } -
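To illustrate how an evaluation strategy is wired up, a minimal sketch; the output path is hypothetical, and in CrossPare the test data, training data, and trainers come from the experiment configuration rather than being passed in directly:

import java.util.Arrays;
import java.util.List;
import weka.core.Instances;
import de.ugoe.cs.cpdp.eval.IEvaluationStrategy;
import de.ugoe.cs.cpdp.eval.NormalWekaEvaluation;
// The import for ITrainer is omitted; its exact package is not visible in this excerpt.

public class EvaluationSketch {
    public static void evaluate(Instances testdata, Instances traindata, ITrainer trainer) {
        IEvaluationStrategy evaluator = new NormalWekaEvaluation(); // shown further below
        evaluator.setParameter("results/demo.csv"); // "" or "system.out" keeps output on stdout
        List<ITrainer> trainers = Arrays.asList(trainer); // must be IWekaCompatibleTrainer instances
        evaluator.apply(testdata, traindata, trainers, true); // true: write the CSV header line
    }
}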
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/CVWekaEvaluation.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.eval; 2 16 … … 12 26 /** 13 27 * Implements the {@link AbstractWekaEvaluation} for 10-fold cross validation. 28 * 14 29 * @author Steffen Herbold 15 30 */ 16 31 public class CVWekaEvaluation extends AbstractWekaEvaluation { 17 18 /** 19 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, weka.classifiers.Classifier) 20 */ 21 @Override 22 protected Evaluation createEvaluator(Instances testdata, Classifier classifier) { 23 PrintStream errStr = System.err; 24 System.setErr(new PrintStream(new NullOutputStream())); 25 try { 26 final Evaluation eval = new Evaluation(testdata); 27 eval.crossValidateModel(classifier, testdata, 10, new Random(1)); 28 return eval; 29 } catch (Exception e) { 30 throw new RuntimeException(e); 31 } finally { 32 System.setErr(errStr); 33 } 34 } 32 33 /** 34 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, 35 * weka.classifiers.Classifier) 36 */ 37 @Override 38 protected Evaluation createEvaluator(Instances testdata, Classifier classifier) { 39 PrintStream errStr = System.err; 40 System.setErr(new PrintStream(new NullOutputStream())); 41 try { 42 final Evaluation eval = new Evaluation(testdata); 43 eval.crossValidateModel(classifier, testdata, 10, new Random(1)); 44 return eval; 45 } 46 catch (Exception e) { 47 throw new RuntimeException(e); 48 } 49 finally { 50 System.setErr(errStr); 51 } 52 } 35 53 36 54 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/IEvaluationStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.eval; 2 16 … … 9 23 10 24 /** 11 * Interface for evaluation strategies to evaluate the performance of classifiers. 25 * Interface for evaluation strategies to evaluate the performance of classifiers. 26 * 12 27 * @author Steffen Herbold 13 28 */ 14 29 public interface IEvaluationStrategy extends IParameterizable { 15 30 16 /** 17 * Applies the evaluation strategy. 18 * @param testdata test data for the evaluation 19 * @param traindata training data used 20 * @param trainers list of training algorithms used to train the classifiers 21 * @param writeHeader if true, a header line for the results file is written (may not be applicable) 22 */ 23 void apply(Instances testdata, Instances traindata, List<ITrainer> trainers, boolean writeHeader); 31 /** 32 * Applies the evaluation strategy. 33 * 34 * @param testdata 35 * test data for the evaluation 36 * @param traindata 37 * training data used 38 * @param trainers 39 * list of training algorithms used to train the classifiers 40 * @param writeHeader 41 * if true, a header line for the results file is written (may not be applicable) 42 */ 43 void apply(Instances testdata, Instances traindata, List<ITrainer> trainers, boolean writeHeader); 24 44 } -
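Concrete strategies in CrossPare extend AbstractWekaEvaluation (see above and below), but the interface can also be implemented directly. A toy, hypothetical implementation that only reports data set sizes; it covers the two methods visible in this changeset, and IParameterizable may declare nothing beyond setParameter:

import java.util.List;
import weka.core.Instances;
import de.ugoe.cs.cpdp.eval.IEvaluationStrategy;
// The import for ITrainer is omitted; its exact package is not visible in this excerpt.

public class SizeOnlyEvaluation implements IEvaluationStrategy {

    @Override
    public void setParameter(String parameters) {
        // this toy strategy takes no parameters
    }

    @Override
    public void apply(Instances testdata, Instances traindata, List<ITrainer> trainers,
                      boolean writeHeader) {
        System.out.println(testdata.relationName() + "," + testdata.numInstances() + "," +
                           traindata.numInstances());
    }
}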
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/NormalWekaEvaluation.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.eval; 2 16 … … 7 21 /** 8 22 * Implements the {@link AbstractWekaEvaluation} for evaluation on the test data. 23 * 9 24 * @author Steffen Herbold 10 * 25 * 11 26 */ 12 27 public class NormalWekaEvaluation extends AbstractWekaEvaluation { 13 28 14 /** 15 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, weka.classifiers.Classifier) 16 */ 17 @Override 18 protected Evaluation createEvaluator(Instances testdata, Classifier classifier) { 19 try { 20 final Evaluation eval = new Evaluation(testdata); 21 eval.evaluateModel(classifier, testdata); 22 return eval; 23 } catch (Exception e) { 24 throw new RuntimeException(e); 25 } 26 } 29 /** 30 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, 31 * weka.classifiers.Classifier) 32 */ 33 @Override 34 protected Evaluation createEvaluator(Instances testdata, Classifier classifier) { 35 try { 36 final Evaluation eval = new Evaluation(testdata); 37 eval.evaluateModel(classifier, testdata); 38 return eval; 39 } 40 catch (Exception e) { 41 throw new RuntimeException(e); 42 } 43 } 27 44 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/ClassifierCreationExperiment.java
r33 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.execution; 2 16 … … 19 33 20 34 /** 21 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. The steps22 * of this ClassifierCreationExperiment are as follows:35 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. 36 * The steps of this ClassifierCreationExperiment are as follows: 23 37 * <ul> 24 * <li>load the data from the provided data path</li> 25 * <li>check if given resultsdir exists, if not create one</li> 26 * <li>execute the following steps for each data set: 27 * <ul> 28 * <li>load the dataset</li> 29 * <li>set testdata == traindata</li> 30 * <li>preprocess the data</li> 31 * <li>postprocess the data</li> 32 * <li>for each configured trainer do the following:</li> 33 * <ul> 34 * <li>if the classifier should be saved, train it with the dataset</li> 35 * <li>save it in the results dir</li> 36 * <li>For each configured evaluator: Do the evaluation and save results</li> 37 * </ul> 38 * </ul> 38 * <li>load the data from the provided data path</li> 39 * <li>check if given resultsdir exists, if not create one</li> 40 * <li>execute the following steps for each data set: 41 * <ul> 42 * <li>load the dataset</li> 43 * <li>set testdata == traindata</li> 44 * <li>preprocess the data</li> 45 * <li>postprocess the data</li> 46 * <li>for each configured trainer do the following:</li> 47 * <ul> 48 * <li>if the classifier should be saved, train it with the dataset</li> 49 * <li>save it in the results dir</li> 50 * <li>For each configured evaluator: Do the evaluation and save results</li> 39 51 * </ul> 40 * 41 * Note that this class implements {@link IExectuionStrategy}, i.e., each experiment can be started 52 * </ul> 53 * </ul> 54 * 55 * Note that this class implements {@link IExectuionStrategy}, i.e., each experiment can be started 42 56 * in its own thread. 43 57 * … … 46 60 public class ClassifierCreationExperiment implements IExecutionStrategy { 47 61 48 /** 49 * configuration of the experiment 50 */ 51 private final ExperimentConfiguration config; 52 53 /** 54 * Constructor. Creates a new experiment based on a configuration. 55 * @param config configuration of the experiment 56 */ 57 public ClassifierCreationExperiment(ExperimentConfiguration config) { 58 this.config = config; 59 } 60 61 /** 62 * Executes the experiment with the steps as described in the class comment. 
63 * @see Runnable#run() 64 */ 65 @Override 66 public void run() { 67 final List<SoftwareVersion> versions = new LinkedList<>(); 68 69 boolean writeHeader = true; 70 71 for(IVersionLoader loader : config.getLoaders()) { 72 versions.addAll(loader.load()); 73 } 74 62 /** 63 * configuration of the experiment 64 */ 65 private final ExperimentConfiguration config; 75 66 76 File resultsDir = new File(config.getResultsPath()); 77 if (!resultsDir.exists()) { 78 resultsDir.mkdir(); 79 } 80 81 82 int versionCount = 1; 83 for( SoftwareVersion testVersion : versions ) { 84 85 // At first: traindata == testdata 86 Instances testdata = testVersion.getInstances(); 87 Instances traindata = new Instances(testdata); 88 89 // Give the dataset a new name 90 testdata.setRelationName(testVersion.getProject()); 91 92 for( IProcessesingStrategy processor : config.getPreProcessors() ) { 93 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying preprocessor %s", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject(), processor.getClass().getName())); 94 processor.apply(testdata, traindata); 95 } 96 97 for( IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors() ) { 98 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject(), dataselector.getClass().getName())); 99 traindata = dataselector.apply(testdata, traindata); 100 } 101 102 for( IProcessesingStrategy processor : config.getPostProcessors() ) { 103 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject(), processor.getClass().getName())); 104 processor.apply(testdata, traindata); 105 } 106 107 108 109 110 // Trainerlist for evaluation later on 111 List<ITrainer> allTrainers = new LinkedList<>(); 112 113 for( ITrainingStrategy trainer : config.getTrainers() ) { 67 /** 68 * Constructor. Creates a new experiment based on a configuration. 69 * 70 * @param config 71 * configuration of the experiment 72 */ 73 public ClassifierCreationExperiment(ExperimentConfiguration config) { 74 this.config = config; 75 } 114 76 115 // Add trainer to list for evaluation 116 allTrainers.add(trainer); 117 118 // Train classifier 119 trainer.apply(traindata); 120 121 if(config.getSaveClassifier()) { 122 // If classifier should be saved, train him and save him 123 // be careful with typecasting here! 124 IWekaCompatibleTrainer trainerToSave = (IWekaCompatibleTrainer) trainer; 125 //Console.println(trainerToSave.getClassifier().toString()); 126 try { 127 weka.core.SerializationHelper.write(resultsDir.getAbsolutePath()+"/"+trainer.getName()+"-"+testVersion.getProject(), trainerToSave.getClassifier()); 128 } catch (Exception e) { 129 e.printStackTrace(); 130 } 131 132 } 133 } 134 135 136 137 for( IEvaluationStrategy evaluator : config.getEvaluators() ) { 138 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying evaluator %s", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject(), evaluator.getClass().getName())); 77 /** 78 * Executes the experiment with the steps as described in the class comment. 
79 * 80 * @see Runnable#run() 81 */ 82 @Override 83 public void run() { 84 final List<SoftwareVersion> versions = new LinkedList<>(); 139 85 140 if( writeHeader ) { 141 evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName() + ".csv"); 142 } 143 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 144 writeHeader = false; 145 } 146 147 versionCount++; 148 149 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject())); 150 151 } 152 153 } 154 86 boolean writeHeader = true; 87 88 for (IVersionLoader loader : config.getLoaders()) { 89 versions.addAll(loader.load()); 90 } 91 92 File resultsDir = new File(config.getResultsPath()); 93 if (!resultsDir.exists()) { 94 resultsDir.mkdir(); 95 } 96 97 int versionCount = 1; 98 for (SoftwareVersion testVersion : versions) { 99 100 // At first: traindata == testdata 101 Instances testdata = testVersion.getInstances(); 102 Instances traindata = new Instances(testdata); 103 104 // Give the dataset a new name 105 testdata.setRelationName(testVersion.getProject()); 106 107 for (IProcessesingStrategy processor : config.getPreProcessors()) { 108 Console.traceln(Level.FINE, String 109 .format("[%s] [%02d/%02d] %s: applying preprocessor %s", 110 config.getExperimentName(), versionCount, versions.size(), 111 testVersion.getProject(), processor.getClass().getName())); 112 processor.apply(testdata, traindata); 113 } 114 115 for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) { 116 Console.traceln(Level.FINE, String 117 .format("[%s] [%02d/%02d] %s: applying pointwise selection %s", 118 config.getExperimentName(), versionCount, versions.size(), 119 testVersion.getProject(), dataselector.getClass().getName())); 120 traindata = dataselector.apply(testdata, traindata); 121 } 122 123 for (IProcessesingStrategy processor : config.getPostProcessors()) { 124 Console.traceln(Level.FINE, String 125 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 126 config.getExperimentName(), versionCount, versions.size(), 127 testVersion.getProject(), processor.getClass().getName())); 128 processor.apply(testdata, traindata); 129 } 130 131 // Trainerlist for evaluation later on 132 List<ITrainer> allTrainers = new LinkedList<>(); 133 134 for (ITrainingStrategy trainer : config.getTrainers()) { 135 136 // Add trainer to list for evaluation 137 allTrainers.add(trainer); 138 139 // Train classifier 140 trainer.apply(traindata); 141 142 if (config.getSaveClassifier()) { 143 // If classifier should be saved, train him and save him 144 // be careful with typecasting here! 
145 IWekaCompatibleTrainer trainerToSave = (IWekaCompatibleTrainer) trainer; 146 // Console.println(trainerToSave.getClassifier().toString()); 147 try { 148 weka.core.SerializationHelper.write(resultsDir.getAbsolutePath() + "/" + 149 trainer.getName() + "-" + 150 testVersion.getProject(), 151 trainerToSave.getClassifier()); 152 } 153 catch (Exception e) { 154 e.printStackTrace(); 155 } 156 157 } 158 } 159 160 for (IEvaluationStrategy evaluator : config.getEvaluators()) { 161 Console.traceln(Level.FINE, String 162 .format("[%s] [%02d/%02d] %s: applying evaluator %s", 163 config.getExperimentName(), versionCount, versions.size(), 164 testVersion.getProject(), evaluator.getClass().getName())); 165 166 if (writeHeader) { 167 evaluator.setParameter(config.getResultsPath() + "/" + 168 config.getExperimentName() + ".csv"); 169 } 170 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 171 writeHeader = false; 172 } 173 174 versionCount++; 175 176 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", 177 config.getExperimentName(), versionCount, 178 versions.size(), testVersion.getProject())); 179 180 } 181 182 } 183 155 184 } -
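The experiment stores trained classifiers with weka.core.SerializationHelper.write under the path pattern resultsDir/trainerName-project. A minimal sketch of how such a saved model could be read back and applied later; all file names below are hypothetical.

import weka.classifiers.Classifier;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils.DataSource;

public class LoadSavedClassifierSketch {
    public static void main(String[] args) throws Exception {
        // path follows the pattern used above; trainer and project names are made up
        Classifier classifier =
            (Classifier) SerializationHelper.read("results/SomeTrainer-someproject");

        Instances data = DataSource.read("newdata.arff"); // hypothetical data to classify
        data.setClassIndex(data.numAttributes() - 1);

        // predict the class of the first instance
        double prediction = classifier.classifyInstance(data.instance(0));
        System.out.println("predicted class index: " + prediction);
    }
}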
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/CrossProjectExperiment.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.execution; 2 16 … … 25 39 26 40 /** 27 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. The steps of an experiment are as follows: 41 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. 42 * The steps of an experiment are as follows: 28 43 * <ul> 29 * <li>load the data from the provided data path</li> 30 * <li>filter the data sets according to the provided version filters</li> 31 * <li>execute the following steps for each data sets as test data that is not ignored through the test version filter: 32 * <ul> 33 * <li>filter the data sets to setup the candidate training data: 34 * <ul> 35 * <li>remove all data sets from the same project</li> 36 * <li>filter all data sets according to the training data filter 37 * </ul></li> 38 * <li>apply the setwise preprocessors</li> 39 * <li>apply the setwise data selection algorithms</li> 40 * <li>apply the setwise postprocessors</li> 41 * <li>train the setwise training classifiers</li> 42 * <li>unify all remaining training data into one data set</li> 43 * <li>apply the preprocessors</li> 44 * <li>apply the pointwise data selection algorithms</li> 45 * <li>apply the postprocessors</li> 46 * <li>train the normal classifiers</li> 47 * <li>evaluate the results for all trained classifiers on the training data</li> 48 * </ul></li> 44 * <li>load the data from the provided data path</li> 45 * <li>filter the data sets according to the provided version filters</li> 46 * <li>execute the following steps for each data sets as test data that is not ignored through the 47 * test version filter: 48 * <ul> 49 * <li>filter the data sets to setup the candidate training data: 50 * <ul> 51 * <li>remove all data sets from the same project</li> 52 * <li>filter all data sets according to the training data filter 53 * </ul> 54 * </li> 55 * <li>apply the setwise preprocessors</li> 56 * <li>apply the setwise data selection algorithms</li> 57 * <li>apply the setwise postprocessors</li> 58 * <li>train the setwise training classifiers</li> 59 * <li>unify all remaining training data into one data set</li> 60 * <li>apply the preprocessors</li> 61 * <li>apply the pointwise data selection algorithms</li> 62 * <li>apply the postprocessors</li> 63 * <li>train the normal classifiers</li> 64 * <li>evaluate the results for all trained classifiers on the training data</li> 65 * </ul> 66 * </li> 49 67 * </ul> 50 68 * 51 * Note that this class implements {@link Runnable}, i.e., each experiment can be started in its own thread. 69 * Note that this class implements {@link Runnable}, i.e., each experiment can be started in its own 70 * thread. 
71 * 52 72 * @author Steffen Herbold 53 73 */ 54 74 public class CrossProjectExperiment implements IExecutionStrategy { 55 75 56 /** 57 * configuration of the experiment 58 */ 59 private final ExperimentConfiguration config; 60 61 /** 62 * Constructor. Creates a new experiment based on a configuration. 63 * @param config configuration of the experiment 64 */ 65 public CrossProjectExperiment(ExperimentConfiguration config) { 66 this.config = config; 67 } 68 69 /** 70 * Executes the experiment with the steps as described in the class comment. 71 * @see Runnable#run() 72 */ 73 @Override 74 public void run() { 75 final List<SoftwareVersion> versions = new LinkedList<>(); 76 77 for(IVersionLoader loader : config.getLoaders()) { 78 versions.addAll(loader.load()); 79 } 80 81 for( IVersionFilter filter : config.getVersionFilters() ) { 82 filter.apply(versions); 83 } 84 boolean writeHeader = true; 85 int versionCount = 1; 86 int testVersionCount = 0; 87 88 for( SoftwareVersion testVersion : versions ) { 89 if( isVersion(testVersion, config.getTestVersionFilters()) ) { 90 testVersionCount++; 91 } 92 } 93 94 // sort versions 95 Collections.sort(versions); 96 97 for( SoftwareVersion testVersion : versions ) { 98 if( isVersion(testVersion, config.getTestVersionFilters()) ) { 99 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion())); 100 101 // Setup testdata and training data 102 Instances testdata = testVersion.getInstances(); 103 String testProject = testVersion.getProject(); 104 SetUniqueList<Instances> traindataSet = SetUniqueList.setUniqueList(new LinkedList<Instances>()); 105 for( SoftwareVersion trainingVersion : versions ) { 106 if( isVersion(trainingVersion, config.getTrainingVersionFilters()) ) { 107 if( trainingVersion!=testVersion ) { 108 if( !trainingVersion.getProject().equals(testProject) ) { 109 traindataSet.add(trainingVersion.getInstances()); 110 } 111 } 112 } 113 } 114 115 for( ISetWiseProcessingStrategy processor : config.getSetWisePreprocessors() ) { 116 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise preprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 117 processor.apply(testdata, traindataSet); 118 } 119 for( ISetWiseDataselectionStrategy dataselector : config.getSetWiseSelectors() ) { 120 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise selection %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), dataselector.getClass().getName())); 121 dataselector.apply(testdata, traindataSet); 122 } 123 for( ISetWiseProcessingStrategy processor : config.getSetWisePostprocessors() ) { 124 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 125 processor.apply(testdata, traindataSet); 126 } 127 for( ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers() ) { 128 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise trainer %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), setwiseTrainer.getName())); 129 setwiseTrainer.apply(traindataSet); 130 } 131 Instances traindata = makeSingleTrainingSet(traindataSet); 132 for( IProcessesingStrategy processor : 
config.getPreProcessors() ) { 133 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying preprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 134 processor.apply(testdata, traindata); 135 } 136 for( IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors() ) { 137 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), dataselector.getClass().getName())); 138 traindata = dataselector.apply(testdata, traindata); 139 } 140 for( IProcessesingStrategy processor : config.getPostProcessors() ) { 141 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 142 processor.apply(testdata, traindata); 143 } 144 for( ITrainingStrategy trainer : config.getTrainers() ) { 145 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying trainer %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), trainer.getName())); 146 trainer.apply(traindata); 147 } 148 File resultsDir = new File(config.getResultsPath()); 149 if (!resultsDir.exists()) { 150 resultsDir.mkdir(); 151 } 152 for( IEvaluationStrategy evaluator : config.getEvaluators() ) { 153 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying evaluator %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), evaluator.getClass().getName())); 154 List<ITrainer> allTrainers = new LinkedList<>(); 155 for( ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers() ) { 156 allTrainers.add(setwiseTrainer); 157 } 158 for( ITrainingStrategy trainer : config.getTrainers() ) { 159 allTrainers.add(trainer); 160 } 161 if( writeHeader ) { 162 evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName() + ".csv"); 163 } 164 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 165 writeHeader = false; 166 } 167 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion())); 168 versionCount++; 169 } 170 } 171 } 172 173 /** 174 * Helper method that checks if a version passes all filters. 175 * @param version version that is checked 176 * @param filters list of the filters 177 * @return true, if the version passes all filters, false otherwise 178 */ 179 private boolean isVersion(SoftwareVersion version, List<IVersionFilter> filters) { 180 boolean result = true; 181 for( IVersionFilter filter : filters) { 182 result &= !filter.apply(version); 183 } 184 return result; 185 } 186 187 /** 188 * Helper method that combines a set of Weka {@link Instances} sets into a single {@link Instances} set. 
189 * @param traindataSet set of {@link Instances} to be combines 190 * @return single {@link Instances} set 191 */ 192 public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) { 193 Instances traindataFull = null; 194 for( Instances traindata : traindataSet) { 195 if( traindataFull==null ) { 196 traindataFull = new Instances(traindata); 197 } else { 198 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 199 traindataFull.add(traindata.instance(i)); 200 } 201 } 202 } 203 return traindataFull; 204 } 76 /** 77 * configuration of the experiment 78 */ 79 private final ExperimentConfiguration config; 80 81 /** 82 * Constructor. Creates a new experiment based on a configuration. 83 * 84 * @param config 85 * configuration of the experiment 86 */ 87 public CrossProjectExperiment(ExperimentConfiguration config) { 88 this.config = config; 89 } 90 91 /** 92 * Executes the experiment with the steps as described in the class comment. 93 * 94 * @see Runnable#run() 95 */ 96 @Override 97 public void run() { 98 final List<SoftwareVersion> versions = new LinkedList<>(); 99 100 for (IVersionLoader loader : config.getLoaders()) { 101 versions.addAll(loader.load()); 102 } 103 104 for (IVersionFilter filter : config.getVersionFilters()) { 105 filter.apply(versions); 106 } 107 boolean writeHeader = true; 108 int versionCount = 1; 109 int testVersionCount = 0; 110 111 for (SoftwareVersion testVersion : versions) { 112 if (isVersion(testVersion, config.getTestVersionFilters())) { 113 testVersionCount++; 114 } 115 } 116 117 // sort versions 118 Collections.sort(versions); 119 120 for (SoftwareVersion testVersion : versions) { 121 if (isVersion(testVersion, config.getTestVersionFilters())) { 122 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting", 123 config.getExperimentName(), versionCount, 124 testVersionCount, 125 testVersion.getVersion())); 126 127 // Setup testdata and training data 128 Instances testdata = testVersion.getInstances(); 129 String testProject = testVersion.getProject(); 130 SetUniqueList<Instances> traindataSet = 131 SetUniqueList.setUniqueList(new LinkedList<Instances>()); 132 for (SoftwareVersion trainingVersion : versions) { 133 if (isVersion(trainingVersion, config.getTrainingVersionFilters())) { 134 if (trainingVersion != testVersion) { 135 if (!trainingVersion.getProject().equals(testProject)) { 136 traindataSet.add(trainingVersion.getInstances()); 137 } 138 } 139 } 140 } 141 142 for (ISetWiseProcessingStrategy processor : config.getSetWisePreprocessors()) { 143 Console.traceln(Level.FINE, String 144 .format("[%s] [%02d/%02d] %s: applying setwise preprocessor %s", 145 config.getExperimentName(), versionCount, testVersionCount, 146 testVersion.getVersion(), processor.getClass().getName())); 147 processor.apply(testdata, traindataSet); 148 } 149 for (ISetWiseDataselectionStrategy dataselector : config.getSetWiseSelectors()) { 150 Console.traceln(Level.FINE, String 151 .format("[%s] [%02d/%02d] %s: applying setwise selection %s", 152 config.getExperimentName(), versionCount, testVersionCount, 153 testVersion.getVersion(), dataselector.getClass().getName())); 154 dataselector.apply(testdata, traindataSet); 155 } 156 for (ISetWiseProcessingStrategy processor : config.getSetWisePostprocessors()) { 157 Console.traceln(Level.FINE, String 158 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 159 config.getExperimentName(), versionCount, testVersionCount, 160 testVersion.getVersion(), processor.getClass().getName())); 161 
processor.apply(testdata, traindataSet); 162 } 163 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { 164 Console.traceln(Level.FINE, String 165 .format("[%s] [%02d/%02d] %s: applying setwise trainer %s", 166 config.getExperimentName(), versionCount, testVersionCount, 167 testVersion.getVersion(), setwiseTrainer.getName())); 168 setwiseTrainer.apply(traindataSet); 169 } 170 Instances traindata = makeSingleTrainingSet(traindataSet); 171 for (IProcessesingStrategy processor : config.getPreProcessors()) { 172 Console.traceln(Level.FINE, String 173 .format("[%s] [%02d/%02d] %s: applying preprocessor %s", 174 config.getExperimentName(), versionCount, testVersionCount, 175 testVersion.getVersion(), processor.getClass().getName())); 176 processor.apply(testdata, traindata); 177 } 178 for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) 179 { 180 Console.traceln(Level.FINE, String 181 .format("[%s] [%02d/%02d] %s: applying pointwise selection %s", 182 config.getExperimentName(), versionCount, testVersionCount, 183 testVersion.getVersion(), dataselector.getClass().getName())); 184 traindata = dataselector.apply(testdata, traindata); 185 } 186 for (IProcessesingStrategy processor : config.getPostProcessors()) { 187 Console.traceln(Level.FINE, String 188 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 189 config.getExperimentName(), versionCount, testVersionCount, 190 testVersion.getVersion(), processor.getClass().getName())); 191 processor.apply(testdata, traindata); 192 } 193 for (ITrainingStrategy trainer : config.getTrainers()) { 194 Console.traceln(Level.FINE, String 195 .format("[%s] [%02d/%02d] %s: applying trainer %s", 196 config.getExperimentName(), versionCount, testVersionCount, 197 testVersion.getVersion(), trainer.getName())); 198 trainer.apply(traindata); 199 } 200 File resultsDir = new File(config.getResultsPath()); 201 if (!resultsDir.exists()) { 202 resultsDir.mkdir(); 203 } 204 for (IEvaluationStrategy evaluator : config.getEvaluators()) { 205 Console.traceln(Level.FINE, String 206 .format("[%s] [%02d/%02d] %s: applying evaluator %s", 207 config.getExperimentName(), versionCount, testVersionCount, 208 testVersion.getVersion(), evaluator.getClass().getName())); 209 List<ITrainer> allTrainers = new LinkedList<>(); 210 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { 211 allTrainers.add(setwiseTrainer); 212 } 213 for (ITrainingStrategy trainer : config.getTrainers()) { 214 allTrainers.add(trainer); 215 } 216 if (writeHeader) { 217 evaluator.setParameter(config.getResultsPath() + "/" + 218 config.getExperimentName() + ".csv"); 219 } 220 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 221 writeHeader = false; 222 } 223 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", 224 config.getExperimentName(), versionCount, 225 testVersionCount, 226 testVersion.getVersion())); 227 versionCount++; 228 } 229 } 230 } 231 232 /** 233 * Helper method that checks if a version passes all filters. 
234 * 235 * @param version 236 * version that is checked 237 * @param filters 238 * list of the filters 239 * @return true, if the version passes all filters, false otherwise 240 */ 241 private boolean isVersion(SoftwareVersion version, List<IVersionFilter> filters) { 242 boolean result = true; 243 for (IVersionFilter filter : filters) { 244 result &= !filter.apply(version); 245 } 246 return result; 247 } 248 249 /** 250 * Helper method that combines a set of Weka {@link Instances} sets into a single 251 * {@link Instances} set. 252 * 253 * @param traindataSet 254 * set of {@link Instances} to be combines 255 * @return single {@link Instances} set 256 */ 257 public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) { 258 Instances traindataFull = null; 259 for (Instances traindata : traindataSet) { 260 if (traindataFull == null) { 261 traindataFull = new Instances(traindata); 262 } 263 else { 264 for (int i = 0; i < traindata.numInstances(); i++) { 265 traindataFull.add(traindata.instance(i)); 266 } 267 } 268 } 269 return traindataFull; 270 } 205 271 } -
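makeSingleTrainingSet is the step that unifies all remaining candidate training versions into a single training set. A minimal usage sketch; the ARFF file names are placeholders, and the SetUniqueList import is assumed to be the Apache Commons Collections implementation used by CrossPare.

import java.util.LinkedList;

import org.apache.commons.collections4.list.SetUniqueList;

import de.ugoe.cs.cpdp.execution.CrossProjectExperiment;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class CombineTrainingDataSketch {
    public static void main(String[] args) throws Exception {
        // two candidate training versions; both must share the same attribute structure
        Instances versionA = DataSource.read("projectA-1.0.arff");
        Instances versionB = DataSource.read("projectB-2.0.arff");

        SetUniqueList<Instances> traindataSet =
            SetUniqueList.setUniqueList(new LinkedList<Instances>());
        traindataSet.add(versionA);
        traindataSet.add(versionB);

        Instances traindata = CrossProjectExperiment.makeSingleTrainingSet(traindataSet);
        System.out.println("combined training instances: " + traindata.numInstances());
    }
}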
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/IExecutionStrategy.java
r32 r41
1 // Copyright 2015 Georg-August-Universität Göttingen, Germany
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
1 15 package de.ugoe.cs.cpdp.execution;
2 16
… …
4 18
5 19 /**
6 * Interface that must be implemented from the different experiments
7 * (e.g.ClassifierCreationExeperiment) to be runnable by {@link Runner}
20 * Interface that must be implemented from the different experiments (e.g.
21 * ClassifierCreationExeperiment) to be runnable by {@link Runner}
8 22 *
9 23 * @author Fabian Trautsch
10 *
24 *
11 25 */
12 public interface IExecutionStrategy extends Runnable {
26 public interface IExecutionStrategy extends Runnable {
13 27
14 28 }
-
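Because IExecutionStrategy extends Runnable, every experiment can be handed to a plain Thread, which is why each experiment can be started in its own thread. A minimal sketch with a no-op strategy standing in for a real experiment such as CrossProjectExperiment:

package de.ugoe.cs.cpdp.execution;

public class ExecutionStrategySketch {
    public static void main(String[] args) throws InterruptedException {
        // placeholder strategy; a real setup would construct a concrete experiment instead
        IExecutionStrategy experiment = new IExecutionStrategy() {
            @Override
            public void run() {
                System.out.println("experiment running in " + Thread.currentThread().getName());
            }
        };

        Thread thread = new Thread(experiment);
        thread.start();
        thread.join(); // wait until the experiment has finished
    }
}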
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/RelaxedCrossProjectExperiment.java
r39 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.execution; 2 16 … … 25 39 26 40 /** 27 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. The steps of an experiment are as follows: 41 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. 42 * The steps of an experiment are as follows: 28 43 * <ul> 29 * <li>load the data from the provided data path</li> 30 * <li>filter the data sets according to the provided version filters</li> 31 * <li>execute the following steps for each data sets as test data that is not ignored through the test version filter: 32 * <ul> 33 * <li>filter the data sets to setup the candidate training data: 34 * <ul> 35 * <li>filter all data sets according to the training data filter 36 * </ul></li> 37 * <li>apply the setwise preprocessors</li> 38 * <li>apply the setwise data selection algorithms</li> 39 * <li>apply the setwise postprocessors</li> 40 * <li>train the setwise training classifiers</li> 41 * <li>unify all remaining training data into one data set</li> 42 * <li>apply the preprocessors</li> 43 * <li>apply the pointwise data selection algorithms</li> 44 * <li>apply the postprocessors</li> 45 * <li>train the normal classifiers</li> 46 * <li>evaluate the results for all trained classifiers on the training data</li> 47 * </ul></li> 44 * <li>load the data from the provided data path</li> 45 * <li>filter the data sets according to the provided version filters</li> 46 * <li>execute the following steps for each data sets as test data that is not ignored through the 47 * test version filter: 48 * <ul> 49 * <li>filter the data sets to setup the candidate training data: 50 * <ul> 51 * <li>filter all data sets according to the training data filter 52 * </ul> 53 * </li> 54 * <li>apply the setwise preprocessors</li> 55 * <li>apply the setwise data selection algorithms</li> 56 * <li>apply the setwise postprocessors</li> 57 * <li>train the setwise training classifiers</li> 58 * <li>unify all remaining training data into one data set</li> 59 * <li>apply the preprocessors</li> 60 * <li>apply the pointwise data selection algorithms</li> 61 * <li>apply the postprocessors</li> 62 * <li>train the normal classifiers</li> 63 * <li>evaluate the results for all trained classifiers on the training data</li> 64 * </ul> 65 * </li> 48 66 * </ul> 49 67 * 50 * Note that this class implements {@link Runnable}, i.e., each experiment can be started in its own thread. 68 * Note that this class implements {@link Runnable}, i.e., each experiment can be started in its own 69 * thread. 70 * 51 71 * @author Steffen Herbold 52 72 */ 53 73 public class RelaxedCrossProjectExperiment implements IExecutionStrategy { 54 74 55 /** 56 * configuration of the experiment 57 */ 58 private final ExperimentConfiguration config; 59 60 /** 61 * Constructor. 
Creates a new experiment based on a configuration. 62 * @param config configuration of the experiment 63 */ 64 public RelaxedCrossProjectExperiment(ExperimentConfiguration config) { 65 this.config = config; 66 } 67 68 /** 69 * Executes the experiment with the steps as described in the class comment. 70 * @see Runnable#run() 71 */ 72 @Override 73 public void run() { 74 final List<SoftwareVersion> versions = new LinkedList<>(); 75 76 for(IVersionLoader loader : config.getLoaders()) { 77 versions.addAll(loader.load()); 78 } 79 80 for( IVersionFilter filter : config.getVersionFilters() ) { 81 filter.apply(versions); 82 } 83 boolean writeHeader = true; 84 int versionCount = 1; 85 int testVersionCount = 0; 86 87 for( SoftwareVersion testVersion : versions ) { 88 if( isVersion(testVersion, config.getTestVersionFilters()) ) { 89 testVersionCount++; 90 } 91 } 92 93 // sort versions 94 Collections.sort(versions); 95 96 for( SoftwareVersion testVersion : versions ) { 97 if( isVersion(testVersion, config.getTestVersionFilters()) ) { 98 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion())); 99 100 // Setup testdata and training data 101 Instances testdata = testVersion.getInstances(); 102 String testProject = testVersion.getProject(); 103 SetUniqueList<Instances> traindataSet = SetUniqueList.setUniqueList(new LinkedList<Instances>()); 104 for( SoftwareVersion trainingVersion : versions ) { 105 if( isVersion(trainingVersion, config.getTrainingVersionFilters()) ) { 106 if( trainingVersion!=testVersion ) { 107 if( trainingVersion.getProject().equals(testProject) ) { 108 if( trainingVersion.compareTo(testVersion)<0 ) { 109 // only add if older 110 traindataSet.add(trainingVersion.getInstances()); 111 } 112 } else { 113 traindataSet.add(trainingVersion.getInstances()); 114 } 115 } 116 } 117 } 118 119 for( ISetWiseProcessingStrategy processor : config.getSetWisePreprocessors() ) { 120 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise preprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 121 processor.apply(testdata, traindataSet); 122 } 123 for( ISetWiseDataselectionStrategy dataselector : config.getSetWiseSelectors() ) { 124 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise selection %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), dataselector.getClass().getName())); 125 dataselector.apply(testdata, traindataSet); 126 } 127 for( ISetWiseProcessingStrategy processor : config.getSetWisePostprocessors() ) { 128 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 129 processor.apply(testdata, traindataSet); 130 } 131 for( ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers() ) { 132 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise trainer %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), setwiseTrainer.getName())); 133 setwiseTrainer.apply(traindataSet); 134 } 135 Instances traindata = makeSingleTrainingSet(traindataSet); 136 for( IProcessesingStrategy processor : config.getPreProcessors() ) { 137 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: 
applying preprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 138 processor.apply(testdata, traindata); 139 } 140 for( IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors() ) { 141 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), dataselector.getClass().getName())); 142 traindata = dataselector.apply(testdata, traindata); 143 } 144 for( IProcessesingStrategy processor : config.getPostProcessors() ) { 145 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 146 processor.apply(testdata, traindata); 147 } 148 for( ITrainingStrategy trainer : config.getTrainers() ) { 149 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying trainer %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), trainer.getName())); 150 trainer.apply(traindata); 151 } 152 File resultsDir = new File(config.getResultsPath()); 153 if (!resultsDir.exists()) { 154 resultsDir.mkdir(); 155 } 156 for( IEvaluationStrategy evaluator : config.getEvaluators() ) { 157 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying evaluator %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), evaluator.getClass().getName())); 158 List<ITrainer> allTrainers = new LinkedList<>(); 159 for( ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers() ) { 160 allTrainers.add(setwiseTrainer); 161 } 162 for( ITrainingStrategy trainer : config.getTrainers() ) { 163 allTrainers.add(trainer); 164 } 165 if( writeHeader ) { 166 evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName() + ".csv"); 167 } 168 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 169 writeHeader = false; 170 } 171 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion())); 172 versionCount++; 173 } 174 } 175 } 176 177 /** 178 * Helper method that checks if a version passes all filters. 179 * @param version version that is checked 180 * @param filters list of the filters 181 * @return true, if the version passes all filters, false otherwise 182 */ 183 private boolean isVersion(SoftwareVersion version, List<IVersionFilter> filters) { 184 boolean result = true; 185 for( IVersionFilter filter : filters) { 186 result &= !filter.apply(version); 187 } 188 return result; 189 } 190 191 /** 192 * Helper method that combines a set of Weka {@link Instances} sets into a single {@link Instances} set. 
193 * @param traindataSet set of {@link Instances} to be combines 194 * @return single {@link Instances} set 195 */ 196 public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) { 197 Instances traindataFull = null; 198 for( Instances traindata : traindataSet) { 199 if( traindataFull==null ) { 200 traindataFull = new Instances(traindata); 201 } else { 202 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 203 traindataFull.add(traindata.instance(i)); 204 } 205 } 206 } 207 return traindataFull; 208 } 75 /** 76 * configuration of the experiment 77 */ 78 private final ExperimentConfiguration config; 79 80 /** 81 * Constructor. Creates a new experiment based on a configuration. 82 * 83 * @param config 84 * configuration of the experiment 85 */ 86 public RelaxedCrossProjectExperiment(ExperimentConfiguration config) { 87 this.config = config; 88 } 89 90 /** 91 * Executes the experiment with the steps as described in the class comment. 92 * 93 * @see Runnable#run() 94 */ 95 @Override 96 public void run() { 97 final List<SoftwareVersion> versions = new LinkedList<>(); 98 99 for (IVersionLoader loader : config.getLoaders()) { 100 versions.addAll(loader.load()); 101 } 102 103 for (IVersionFilter filter : config.getVersionFilters()) { 104 filter.apply(versions); 105 } 106 boolean writeHeader = true; 107 int versionCount = 1; 108 int testVersionCount = 0; 109 110 for (SoftwareVersion testVersion : versions) { 111 if (isVersion(testVersion, config.getTestVersionFilters())) { 112 testVersionCount++; 113 } 114 } 115 116 // sort versions 117 Collections.sort(versions); 118 119 for (SoftwareVersion testVersion : versions) { 120 if (isVersion(testVersion, config.getTestVersionFilters())) { 121 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting", 122 config.getExperimentName(), versionCount, 123 testVersionCount, 124 testVersion.getVersion())); 125 126 // Setup testdata and training data 127 Instances testdata = testVersion.getInstances(); 128 String testProject = testVersion.getProject(); 129 SetUniqueList<Instances> traindataSet = 130 SetUniqueList.setUniqueList(new LinkedList<Instances>()); 131 for (SoftwareVersion trainingVersion : versions) { 132 if (isVersion(trainingVersion, config.getTrainingVersionFilters())) { 133 if (trainingVersion != testVersion) { 134 if (trainingVersion.getProject().equals(testProject)) { 135 if (trainingVersion.compareTo(testVersion) < 0) { 136 // only add if older 137 traindataSet.add(trainingVersion.getInstances()); 138 } 139 } 140 else { 141 traindataSet.add(trainingVersion.getInstances()); 142 } 143 } 144 } 145 } 146 147 for (ISetWiseProcessingStrategy processor : config.getSetWisePreprocessors()) { 148 Console.traceln(Level.FINE, String 149 .format("[%s] [%02d/%02d] %s: applying setwise preprocessor %s", 150 config.getExperimentName(), versionCount, testVersionCount, 151 testVersion.getVersion(), processor.getClass().getName())); 152 processor.apply(testdata, traindataSet); 153 } 154 for (ISetWiseDataselectionStrategy dataselector : config.getSetWiseSelectors()) { 155 Console.traceln(Level.FINE, String 156 .format("[%s] [%02d/%02d] %s: applying setwise selection %s", 157 config.getExperimentName(), versionCount, testVersionCount, 158 testVersion.getVersion(), dataselector.getClass().getName())); 159 dataselector.apply(testdata, traindataSet); 160 } 161 for (ISetWiseProcessingStrategy processor : config.getSetWisePostprocessors()) { 162 Console.traceln(Level.FINE, String 163 .format("[%s] [%02d/%02d] %s: applying setwise 
postprocessor %s", 164 config.getExperimentName(), versionCount, testVersionCount, 165 testVersion.getVersion(), processor.getClass().getName())); 166 processor.apply(testdata, traindataSet); 167 } 168 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { 169 Console.traceln(Level.FINE, String 170 .format("[%s] [%02d/%02d] %s: applying setwise trainer %s", 171 config.getExperimentName(), versionCount, testVersionCount, 172 testVersion.getVersion(), setwiseTrainer.getName())); 173 setwiseTrainer.apply(traindataSet); 174 } 175 Instances traindata = makeSingleTrainingSet(traindataSet); 176 for (IProcessesingStrategy processor : config.getPreProcessors()) { 177 Console.traceln(Level.FINE, String 178 .format("[%s] [%02d/%02d] %s: applying preprocessor %s", 179 config.getExperimentName(), versionCount, testVersionCount, 180 testVersion.getVersion(), processor.getClass().getName())); 181 processor.apply(testdata, traindata); 182 } 183 for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) 184 { 185 Console.traceln(Level.FINE, String 186 .format("[%s] [%02d/%02d] %s: applying pointwise selection %s", 187 config.getExperimentName(), versionCount, testVersionCount, 188 testVersion.getVersion(), dataselector.getClass().getName())); 189 traindata = dataselector.apply(testdata, traindata); 190 } 191 for (IProcessesingStrategy processor : config.getPostProcessors()) { 192 Console.traceln(Level.FINE, String 193 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 194 config.getExperimentName(), versionCount, testVersionCount, 195 testVersion.getVersion(), processor.getClass().getName())); 196 processor.apply(testdata, traindata); 197 } 198 for (ITrainingStrategy trainer : config.getTrainers()) { 199 Console.traceln(Level.FINE, String 200 .format("[%s] [%02d/%02d] %s: applying trainer %s", 201 config.getExperimentName(), versionCount, testVersionCount, 202 testVersion.getVersion(), trainer.getName())); 203 trainer.apply(traindata); 204 } 205 File resultsDir = new File(config.getResultsPath()); 206 if (!resultsDir.exists()) { 207 resultsDir.mkdir(); 208 } 209 for (IEvaluationStrategy evaluator : config.getEvaluators()) { 210 Console.traceln(Level.FINE, String 211 .format("[%s] [%02d/%02d] %s: applying evaluator %s", 212 config.getExperimentName(), versionCount, testVersionCount, 213 testVersion.getVersion(), evaluator.getClass().getName())); 214 List<ITrainer> allTrainers = new LinkedList<>(); 215 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { 216 allTrainers.add(setwiseTrainer); 217 } 218 for (ITrainingStrategy trainer : config.getTrainers()) { 219 allTrainers.add(trainer); 220 } 221 if (writeHeader) { 222 evaluator.setParameter(config.getResultsPath() + "/" + 223 config.getExperimentName() + ".csv"); 224 } 225 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 226 writeHeader = false; 227 } 228 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", 229 config.getExperimentName(), versionCount, 230 testVersionCount, 231 testVersion.getVersion())); 232 versionCount++; 233 } 234 } 235 } 236 237 /** 238 * Helper method that checks if a version passes all filters. 
239 * 240 * @param version 241 * version that is checked 242 * @param filters 243 * list of the filters 244 * @return true, if the version passes all filters, false otherwise 245 */ 246 private boolean isVersion(SoftwareVersion version, List<IVersionFilter> filters) { 247 boolean result = true; 248 for (IVersionFilter filter : filters) { 249 result &= !filter.apply(version); 250 } 251 return result; 252 } 253 254 /** 255 * Helper method that combines a set of Weka {@link Instances} sets into a single 256 * {@link Instances} set. 257 * 258 * @param traindataSet 259 * set of {@link Instances} to be combines 260 * @return single {@link Instances} set 261 */ 262 public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) { 263 Instances traindataFull = null; 264 for (Instances traindata : traindataSet) { 265 if (traindataFull == null) { 266 traindataFull = new Instances(traindata); 267 } 268 else { 269 for (int i = 0; i < traindata.numInstances(); i++) { 270 traindataFull.add(traindata.instance(i)); 271 } 272 } 273 } 274 return traindataFull; 275 } 209 276 } -
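The only difference to CrossProjectExperiment lies in the candidate selection: versions of the same project are kept as training data if they are strictly older than the test version. A sketch of that rule in isolation; the SoftwareVersion import path is assumed from the CrossPare code base.

import de.ugoe.cs.cpdp.versions.SoftwareVersion; // package assumed

public class RelaxedSelectionSketch {

    /**
     * Returns true if trainingVersion may serve as training data for testVersion under the
     * relaxed rule: other projects are always allowed, the same project only with older versions.
     */
    public static boolean isCandidate(SoftwareVersion trainingVersion, SoftwareVersion testVersion) {
        if (trainingVersion == testVersion) {
            return false;
        }
        if (trainingVersion.getProject().equals(testVersion.getProject())) {
            return trainingVersion.compareTo(testVersion) < 0; // only older same-project versions
        }
        return true;
    }
}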
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/ARFFFolderLoader.java
r4 r41
1 // Copyright 2015 Georg-August-Universität Göttingen, Germany
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
1 15 package de.ugoe.cs.cpdp.loader;
2 16
… …
9 23 public class ARFFFolderLoader extends AbstractFolderLoader {
10 24
11 12 13 14 15 16 17 18 19
25 /*
26 * (non-Javadoc)
27 *
28 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader()
29 */
30 @Override
31 protected SingleVersionLoader getSingleLoader() {
32 return new ARFFLoader();
33 }
20 34 }
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/ARFFLoader.java
r6 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 15 29 public class ARFFLoader implements SingleVersionLoader { 16 30 17 /* 18 * (non-Javadoc) 19 * 20 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 21 */ 22 @Override 23 public Instances load(File file) { 24 BufferedReader reader; 25 Instances data; 26 try { 27 reader = new BufferedReader(new FileReader(file)); 28 data = new Instances(reader); 29 reader.close(); 30 } catch (IOException e) { 31 throw new RuntimeException("error reading file: " + file.getName(), e); 32 } 31 /* 32 * (non-Javadoc) 33 * 34 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 35 */ 36 @Override 37 public Instances load(File file) { 38 BufferedReader reader; 39 Instances data; 40 try { 41 reader = new BufferedReader(new FileReader(file)); 42 data = new Instances(reader); 43 reader.close(); 44 } 45 catch (IOException e) { 46 throw new RuntimeException("error reading file: " + file.getName(), e); 47 } 33 48 34 35 49 // setting class attribute 50 data.setClassIndex(data.numAttributes() - 1); 36 51 37 38 52 return data; 53 } 39 54 40 /* 41 * (non-Javadoc) 42 * 43 * @see 44 * de.ugoe.cs.cpdp.loader.SingleVersionLoader#filenameFilter(java.lang.String 45 * ) 46 */ 47 @Override 48 public boolean filenameFilter(String filename) { 49 return filename.endsWith(".arff"); 50 } 55 /* 56 * (non-Javadoc) 57 * 58 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#filenameFilter(java.lang.String ) 59 */ 60 @Override 61 public boolean filenameFilter(String filename) { 62 return filename.endsWith(".arff"); 63 } 51 64 52 65 } -
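A minimal usage sketch for the loader above; the file name is hypothetical, and the sketch is placed in the de.ugoe.cs.cpdp.loader package so it can use the SingleVersionLoader interface directly.

package de.ugoe.cs.cpdp.loader;

import java.io.File;

import weka.core.Instances;

public class ArffLoaderSketch {
    public static void main(String[] args) {
        SingleVersionLoader loader = new ARFFLoader();
        File file = new File("some-version.arff"); // hypothetical input file

        if (loader.filenameFilter(file.getName())) {
            Instances data = loader.load(file);
            // the loader has already set the last attribute as the class attribute
            System.out.println(data.relationName() + ": " + data.numInstances() +
                               " instances, class attribute = " + data.classAttribute().name());
        }
    }
}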
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIChangeFolderLoader.java
r38 r41
1 // Copyright 2015 Georg-August-Universität Göttingen, Germany
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
1 15 package de.ugoe.cs.cpdp.loader;
2 16
3 17 public class AUDIChangeFolderLoader extends AbstractFolderLoader {
4 18
5 6 7 8 9 10 11 12 13
19 /*
20 * (non-Javadoc)
21 *
22 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader()
23 */
24 @Override
25 protected SingleVersionLoader getSingleLoader() {
26 return new AUDIChangeLoader();
27 }
14 28 }
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIChangeLoader.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 15 29 /** 16 30 * TODO 31 * 17 32 * @author sherbold 18 * 33 * 19 34 */ 20 35 class AUDIChangeLoader implements SingleVersionLoader { 21 22 private class EntityRevisionPair implements Comparable<EntityRevisionPair> { 23 private final String entity; 24 private final int revision; 25 26 public EntityRevisionPair(String entity, int revision) { 27 this.entity = entity; 28 this.revision = revision; 29 } 30 31 @Override 32 public boolean equals(Object other) { 33 if( !(other instanceof EntityRevisionPair) ) { 34 return false; 35 } else { 36 return compareTo((EntityRevisionPair) other)==0; 37 } 38 } 39 40 @Override 41 public int hashCode() { 42 return entity.hashCode()+revision; 43 } 44 45 @Override 46 public int compareTo(EntityRevisionPair other) { 47 int strCmp = this.entity.compareTo(other.entity); 48 if( strCmp!=0 ) { 49 return strCmp; 50 } 51 return Integer.compare(revision, other.revision); 52 } 53 54 @Override 55 public String toString() { 56 return entity+"@"+revision; 57 } 58 } 59 60 @Override 61 public Instances load(File file) { 62 final String[] lines; 63 String[] lineSplit; 64 String[] lineSplitBug; 65 66 try { 67 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 68 } catch (IOException e) { 69 throw new RuntimeException(e); 70 } 71 72 // information about bugs are in another file 73 String path = file.getAbsolutePath(); 74 path = path.substring(0, path.length()-14) + "repro.csv"; 75 final String[] linesBug; 76 try { 77 linesBug = FileTools.getLinesFromFile(path); 78 } catch (IOException e) { 79 throw new RuntimeException(e); 80 } 81 82 int revisionIndex=-1; 83 int bugIndex=-1; 84 lineSplitBug = linesBug[0].split(";"); 85 for( int j=0; j<lineSplitBug.length ; j++ ) { 86 if( lineSplitBug[j].equals("svnrev") ) { 87 revisionIndex=j; 88 } 89 if( lineSplitBug[j].equals("num_bugs_trace") ) { 90 bugIndex=j; 91 } 92 } 93 if( revisionIndex<0 ) { 94 throw new RuntimeException("could not find SVN revisions"); 95 } 96 if( bugIndex<0 ) { 97 throw new RuntimeException("could not find bug information"); 98 } 99 100 int metricsStartIndex=-1; 101 int metricsEndIndex=-1; 102 lineSplit = lines[0].split(";"); 103 for( int j=0; j<lineSplit.length ; j++ ) { 104 if( lineSplit[j].equals("lm_LOC") ) { 105 metricsStartIndex=j; 106 } 107 if( lineSplit[j].equals("h_E") ) { 108 metricsEndIndex=j; 109 } 110 } 111 if( metricsStartIndex<0 ) { 112 throw new RuntimeException("could not find first metric, i.e., lm_LOC"); 113 } 114 if( metricsEndIndex<0 ) { 115 throw new RuntimeException("could not find last metric, i.e., h_E"); 116 } 117 int numMetrics = metricsEndIndex-metricsStartIndex+1; 118 119 // create sets of all filenames and revisions 120 SortedMap<EntityRevisionPair, Integer> entityRevisionPairs = new TreeMap<>(); 121 for( int i=1; i<linesBug.length ; i++ ) { 122 lineSplitBug = linesBug[i].split(";"); 123 
entityRevisionPairs.put(new EntityRevisionPair(lineSplitBug[0], Integer.parseInt(lineSplitBug[revisionIndex])), i); 124 } 125 126 127 // prepare weka instances 128 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 129 lineSplit = lines[0].split(";"); 130 for (int j = metricsStartIndex; j<=metricsEndIndex; j++) { 131 atts.add(new Attribute(lineSplit[j]+"_delta")); 132 } 133 for (int j = metricsStartIndex; j<=metricsEndIndex; j++) { 134 atts.add(new Attribute(lineSplit[j]+"_abs")); 135 } 136 final ArrayList<String> classAttVals = new ArrayList<String>(); 137 classAttVals.add("0"); 138 classAttVals.add("1"); 139 final Attribute classAtt = new Attribute("bug", classAttVals); 140 atts.add(classAtt); 141 142 final Instances data = new Instances(file.getName(), atts, 0); 143 data.setClass(classAtt); 144 145 // create data 146 String lastFile = null; 147 double[] lastValues = null; 148 int lastNumBugs = 0; 149 for( Entry<EntityRevisionPair, Integer> entry : entityRevisionPairs.entrySet() ) { 150 try { 151 // first get values 152 lineSplit = lines[entry.getValue()].split(";"); 153 lineSplitBug = linesBug[entry.getValue()].split(";"); 154 int i=0; 155 double[] values = new double[numMetrics]; 156 for(int j=metricsStartIndex ; j<=metricsEndIndex ; j++ ) { 157 values[i] = Double.parseDouble(lineSplit[j]); 158 i++; 159 } 160 int numBugs = Integer.parseInt(lineSplitBug[bugIndex]); 161 162 // then check if an entity must be created 163 if( entry.getKey().entity.equals(lastFile)) { 164 // create new instance 165 double[] instanceValues = new double[2*numMetrics+1]; 166 for( int j=0; j<numMetrics; j++ ) { 167 instanceValues[j] = values[j]-lastValues[j]; 168 instanceValues[j+numMetrics]= values[j]; 169 } 170 // check if any value>0 171 boolean changeOccured = false; 172 for( int j=0; j<numMetrics; j++ ) { 173 if( instanceValues[j]>0 ) { 174 changeOccured = true; 175 } 176 } 177 if( changeOccured ) { 178 instanceValues[instanceValues.length-1] = numBugs<=lastNumBugs ? 
0 : 1; 179 data.add(new DenseInstance(1.0, instanceValues)); 180 } 181 } 182 lastFile = entry.getKey().entity; 183 lastValues = values; 184 lastNumBugs = numBugs; 185 } catch(IllegalArgumentException e) { 186 System.err.println("error in line " + entry.getValue() + ": " + e.getMessage()); 187 System.err.println("metrics line: " + lines[entry.getValue()]); 188 System.err.println("bugs line: " + linesBug[entry.getValue()]); 189 System.err.println("line is ignored"); 190 } 191 } 192 193 return data; 194 } 195 196 /* 197 * (non-Javadoc) 198 * 199 * @see 200 * de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( 201 * java.io.File) 202 */ 203 204 public Instances load(File file, String dummy) { 205 final String[] lines; 206 try { 207 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 208 } catch (IOException e) { 209 throw new RuntimeException(e); 210 } 211 212 // information about bugs are in another file 213 String path = file.getAbsolutePath(); 214 path = path.substring(0, path.length()-14) + "repro.csv"; 215 final String[] linesBug; 216 try { 217 linesBug = FileTools.getLinesFromFile(path); 218 } catch (IOException e) { 219 throw new RuntimeException(e); 220 } 221 222 // configure Instances 223 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 224 225 String[] lineSplit = lines[0].split(";"); 226 // ignore first three/four and last two columns 227 int offset; 228 if( lineSplit[3].equals("project_rev") ) { 229 offset = 4; 230 } else { 231 offset = 3; 232 } 233 for (int j = 0; j < lineSplit.length - (offset+2); j++) { 234 atts.add(new Attribute(lineSplit[j + offset])); 235 } 236 final ArrayList<String> classAttVals = new ArrayList<String>(); 237 classAttVals.add("0"); 238 classAttVals.add("1"); 239 final Attribute classAtt = new Attribute("bug", classAttVals); 240 atts.add(classAtt); 241 242 final Instances data = new Instances(file.getName(), atts, 0); 243 data.setClass(classAtt); 244 245 // fetch data 246 for (int i = 1; i < lines.length; i++) { 247 boolean validInstance = true; 248 lineSplit = lines[i].split(";"); 249 String[] lineSplitBug = linesBug[i].split(";"); 250 double[] values = new double[data.numAttributes()]; 251 for (int j = 0; validInstance && j < values.length-1; j++) { 252 if( lineSplit[j + offset].trim().isEmpty() ) { 253 validInstance = false; 254 } else { 255 values[j] = Double.parseDouble(lineSplit[j + offset].trim()); 256 } 257 } 258 if( offset==3 ) { 259 values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1; 260 } else { 261 values[values.length - 1] = lineSplitBug[8].equals("0") ? 
0 : 1; 262 } 263 264 if( validInstance ) { 265 data.add(new DenseInstance(1.0, values)); 266 } else { 267 System.out.println("instance " + i + " is invalid"); 268 } 269 } 270 return data; 271 } 272 273 /* 274 * (non-Javadoc) 275 * 276 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 277 * filenameFilter(java.lang.String) 278 */ 279 @Override 280 public boolean filenameFilter(String filename) { 281 return filename.endsWith("src.csv"); 282 } 36 37 private class EntityRevisionPair implements Comparable<EntityRevisionPair> { 38 private final String entity; 39 private final int revision; 40 41 public EntityRevisionPair(String entity, int revision) { 42 this.entity = entity; 43 this.revision = revision; 44 } 45 46 @Override 47 public boolean equals(Object other) { 48 if (!(other instanceof EntityRevisionPair)) { 49 return false; 50 } 51 else { 52 return compareTo((EntityRevisionPair) other) == 0; 53 } 54 } 55 56 @Override 57 public int hashCode() { 58 return entity.hashCode() + revision; 59 } 60 61 @Override 62 public int compareTo(EntityRevisionPair other) { 63 int strCmp = this.entity.compareTo(other.entity); 64 if (strCmp != 0) { 65 return strCmp; 66 } 67 return Integer.compare(revision, other.revision); 68 } 69 70 @Override 71 public String toString() { 72 return entity + "@" + revision; 73 } 74 } 75 76 @Override 77 public Instances load(File file) { 78 final String[] lines; 79 String[] lineSplit; 80 String[] lineSplitBug; 81 82 try { 83 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 84 } 85 catch (IOException e) { 86 throw new RuntimeException(e); 87 } 88 89 // information about bugs are in another file 90 String path = file.getAbsolutePath(); 91 path = path.substring(0, path.length() - 14) + "repro.csv"; 92 final String[] linesBug; 93 try { 94 linesBug = FileTools.getLinesFromFile(path); 95 } 96 catch (IOException e) { 97 throw new RuntimeException(e); 98 } 99 100 int revisionIndex = -1; 101 int bugIndex = -1; 102 lineSplitBug = linesBug[0].split(";"); 103 for (int j = 0; j < lineSplitBug.length; j++) { 104 if (lineSplitBug[j].equals("svnrev")) { 105 revisionIndex = j; 106 } 107 if (lineSplitBug[j].equals("num_bugs_trace")) { 108 bugIndex = j; 109 } 110 } 111 if (revisionIndex < 0) { 112 throw new RuntimeException("could not find SVN revisions"); 113 } 114 if (bugIndex < 0) { 115 throw new RuntimeException("could not find bug information"); 116 } 117 118 int metricsStartIndex = -1; 119 int metricsEndIndex = -1; 120 lineSplit = lines[0].split(";"); 121 for (int j = 0; j < lineSplit.length; j++) { 122 if (lineSplit[j].equals("lm_LOC")) { 123 metricsStartIndex = j; 124 } 125 if (lineSplit[j].equals("h_E")) { 126 metricsEndIndex = j; 127 } 128 } 129 if (metricsStartIndex < 0) { 130 throw new RuntimeException("could not find first metric, i.e., lm_LOC"); 131 } 132 if (metricsEndIndex < 0) { 133 throw new RuntimeException("could not find last metric, i.e., h_E"); 134 } 135 int numMetrics = metricsEndIndex - metricsStartIndex + 1; 136 137 // create sets of all filenames and revisions 138 SortedMap<EntityRevisionPair, Integer> entityRevisionPairs = new TreeMap<>(); 139 for (int i = 1; i < linesBug.length; i++) { 140 lineSplitBug = linesBug[i].split(";"); 141 entityRevisionPairs.put(new EntityRevisionPair(lineSplitBug[0], Integer 142 .parseInt(lineSplitBug[revisionIndex])), i); 143 } 144 145 // prepare weka instances 146 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 147 lineSplit = lines[0].split(";"); 148 for (int j = metricsStartIndex; j <= 
metricsEndIndex; j++) { 149 atts.add(new Attribute(lineSplit[j] + "_delta")); 150 } 151 for (int j = metricsStartIndex; j <= metricsEndIndex; j++) { 152 atts.add(new Attribute(lineSplit[j] + "_abs")); 153 } 154 final ArrayList<String> classAttVals = new ArrayList<String>(); 155 classAttVals.add("0"); 156 classAttVals.add("1"); 157 final Attribute classAtt = new Attribute("bug", classAttVals); 158 atts.add(classAtt); 159 160 final Instances data = new Instances(file.getName(), atts, 0); 161 data.setClass(classAtt); 162 163 // create data 164 String lastFile = null; 165 double[] lastValues = null; 166 int lastNumBugs = 0; 167 for (Entry<EntityRevisionPair, Integer> entry : entityRevisionPairs.entrySet()) { 168 try { 169 // first get values 170 lineSplit = lines[entry.getValue()].split(";"); 171 lineSplitBug = linesBug[entry.getValue()].split(";"); 172 int i = 0; 173 double[] values = new double[numMetrics]; 174 for (int j = metricsStartIndex; j <= metricsEndIndex; j++) { 175 values[i] = Double.parseDouble(lineSplit[j]); 176 i++; 177 } 178 int numBugs = Integer.parseInt(lineSplitBug[bugIndex]); 179 180 // then check if an entity must be created 181 if (entry.getKey().entity.equals(lastFile)) { 182 // create new instance 183 double[] instanceValues = new double[2 * numMetrics + 1]; 184 for (int j = 0; j < numMetrics; j++) { 185 instanceValues[j] = values[j] - lastValues[j]; 186 instanceValues[j + numMetrics] = values[j]; 187 } 188 // check if any value>0 189 boolean changeOccured = false; 190 for (int j = 0; j < numMetrics; j++) { 191 if (instanceValues[j] > 0) { 192 changeOccured = true; 193 } 194 } 195 if (changeOccured) { 196 instanceValues[instanceValues.length - 1] = numBugs <= lastNumBugs ? 0 : 1; 197 data.add(new DenseInstance(1.0, instanceValues)); 198 } 199 } 200 lastFile = entry.getKey().entity; 201 lastValues = values; 202 lastNumBugs = numBugs; 203 } 204 catch (IllegalArgumentException e) { 205 System.err.println("error in line " + entry.getValue() + ": " + e.getMessage()); 206 System.err.println("metrics line: " + lines[entry.getValue()]); 207 System.err.println("bugs line: " + linesBug[entry.getValue()]); 208 System.err.println("line is ignored"); 209 } 210 } 211 212 return data; 213 } 214 215 /* 216 * (non-Javadoc) 217 * 218 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( java.io.File) 219 */ 220 221 public Instances load(File file, String dummy) { 222 final String[] lines; 223 try { 224 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 225 } 226 catch (IOException e) { 227 throw new RuntimeException(e); 228 } 229 230 // information about bugs are in another file 231 String path = file.getAbsolutePath(); 232 path = path.substring(0, path.length() - 14) + "repro.csv"; 233 final String[] linesBug; 234 try { 235 linesBug = FileTools.getLinesFromFile(path); 236 } 237 catch (IOException e) { 238 throw new RuntimeException(e); 239 } 240 241 // configure Instances 242 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 243 244 String[] lineSplit = lines[0].split(";"); 245 // ignore first three/four and last two columns 246 int offset; 247 if (lineSplit[3].equals("project_rev")) { 248 offset = 4; 249 } 250 else { 251 offset = 3; 252 } 253 for (int j = 0; j < lineSplit.length - (offset + 2); j++) { 254 atts.add(new Attribute(lineSplit[j + offset])); 255 } 256 final ArrayList<String> classAttVals = new ArrayList<String>(); 257 classAttVals.add("0"); 258 classAttVals.add("1"); 259 final Attribute classAtt = new Attribute("bug", 
classAttVals); 260 atts.add(classAtt); 261 262 final Instances data = new Instances(file.getName(), atts, 0); 263 data.setClass(classAtt); 264 265 // fetch data 266 for (int i = 1; i < lines.length; i++) { 267 boolean validInstance = true; 268 lineSplit = lines[i].split(";"); 269 String[] lineSplitBug = linesBug[i].split(";"); 270 double[] values = new double[data.numAttributes()]; 271 for (int j = 0; validInstance && j < values.length - 1; j++) { 272 if (lineSplit[j + offset].trim().isEmpty()) { 273 validInstance = false; 274 } 275 else { 276 values[j] = Double.parseDouble(lineSplit[j + offset].trim()); 277 } 278 } 279 if (offset == 3) { 280 values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1; 281 } 282 else { 283 values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1; 284 } 285 286 if (validInstance) { 287 data.add(new DenseInstance(1.0, values)); 288 } 289 else { 290 System.out.println("instance " + i + " is invalid"); 291 } 292 } 293 return data; 294 } 295 296 /* 297 * (non-Javadoc) 298 * 299 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 300 * filenameFilter(java.lang.String) 301 */ 302 @Override 303 public boolean filenameFilter(String filename) { 304 return filename.endsWith("src.csv"); 305 } 283 306 284 307 } -
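The loader above turns consecutive revisions of the same entity into training instances: for every EntityRevisionPair it subtracts the previous revision's metric values to obtain the *_delta attributes, keeps the current values as the *_abs attributes, drops pairs in which no delta is positive, and sets the class to 1 only if the number of bugs increased. A minimal sketch of that construction, with a hypothetical helper name that is not part of CrossPare:

    // Sketch only: mirrors the delta/abs/label logic of the loader above.
    static double[] churnInstance(double[] prev, double[] curr, int prevBugs, int currBugs) {
        double[] values = new double[2 * curr.length + 1];
        for (int j = 0; j < curr.length; j++) {
            values[j] = curr[j] - prev[j];       // <metric>_delta
            values[j + curr.length] = curr[j];   // <metric>_abs
        }
        values[values.length - 1] = currBugs <= prevBugs ? 0 : 1; // class attribute "bug"
        return values;
    }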
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIDataLoader.java
r35 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 12 26 /** 13 27 * TODO 28 * 14 29 * @author sherbold 15 * 30 * 16 31 */ 17 32 class AUDIDataLoader implements SingleVersionLoader { 18 33 19 /* 20 * (non-Javadoc) 21 * 22 * @see 23 * de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( 24 * java.io.File) 25 */ 26 @Override 27 public Instances load(File file) { 28 final String[] lines; 29 try { 30 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 31 } catch (IOException e) { 32 throw new RuntimeException(e); 33 } 34 35 // information about bugs are in another file 36 String path = file.getAbsolutePath(); 37 path = path.substring(0, path.length()-14) + "repro.csv"; 38 final String[] linesBug; 39 try { 40 linesBug = FileTools.getLinesFromFile(path); 41 } catch (IOException e) { 42 throw new RuntimeException(e); 43 } 44 45 // configure Instances 46 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 34 /* 35 * (non-Javadoc) 36 * 37 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( java.io.File) 38 */ 39 @Override 40 public Instances load(File file) { 41 final String[] lines; 42 try { 43 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 44 } 45 catch (IOException e) { 46 throw new RuntimeException(e); 47 } 47 48 48 String[] lineSplit = lines[0].split(";"); 49 // ignore first three/four and last two columns 50 int offset; 51 if( lineSplit[3].equals("project_rev") ) { 52 offset = 4; 53 } else { 54 offset = 3; 55 } 56 for (int j = 0; j < lineSplit.length - (offset+2); j++) { 57 atts.add(new Attribute(lineSplit[j + offset])); 58 } 59 final ArrayList<String> classAttVals = new ArrayList<String>(); 60 classAttVals.add("0"); 61 classAttVals.add("1"); 62 final Attribute classAtt = new Attribute("bug", classAttVals); 63 atts.add(classAtt); 49 // information about bugs are in another file 50 String path = file.getAbsolutePath(); 51 path = path.substring(0, path.length() - 14) + "repro.csv"; 52 final String[] linesBug; 53 try { 54 linesBug = FileTools.getLinesFromFile(path); 55 } 56 catch (IOException e) { 57 throw new RuntimeException(e); 58 } 64 59 65 final Instances data = new Instances(file.getName(), atts, 0); 66 data.setClass(classAtt);60 // configure Instances 61 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 67 62 68 // fetch data 69 for (int i = 1; i < lines.length; i++) { 70 boolean validInstance = true; 71 lineSplit = lines[i].split(";"); 72 String[] lineSplitBug = linesBug[i].split(";"); 73 double[] values = new double[data.numAttributes()]; 74 for (int j = 0; validInstance && j < values.length-1; j++) { 75 if( lineSplit[j + offset].trim().isEmpty() ) { 76 validInstance = false; 77 } else { 78 values[j] = Double.parseDouble(lineSplit[j + offset].trim()); 79 } 80 } 81 if( offset==3 ) { 82 values[values.length - 1] = lineSplitBug[7].equals("0") ? 
0 : 1; 83 } else { 84 values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1; 85 } 86 87 if( validInstance ) { 88 data.add(new DenseInstance(1.0, values)); 89 } else { 90 System.out.println("instance " + i + " is invalid"); 91 } 92 } 93 return data; 94 } 63 String[] lineSplit = lines[0].split(";"); 64 // ignore first three/four and last two columns 65 int offset; 66 if (lineSplit[3].equals("project_rev")) { 67 offset = 4; 68 } 69 else { 70 offset = 3; 71 } 72 for (int j = 0; j < lineSplit.length - (offset + 2); j++) { 73 atts.add(new Attribute(lineSplit[j + offset])); 74 } 75 final ArrayList<String> classAttVals = new ArrayList<String>(); 76 classAttVals.add("0"); 77 classAttVals.add("1"); 78 final Attribute classAtt = new Attribute("bug", classAttVals); 79 atts.add(classAtt); 95 80 96 /* 97 * (non-Javadoc) 98 * 99 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 100 * filenameFilter(java.lang.String) 101 */ 102 @Override 103 public boolean filenameFilter(String filename) { 104 return filename.endsWith("src.csv"); 105 } 81 final Instances data = new Instances(file.getName(), atts, 0); 82 data.setClass(classAtt); 83 84 // fetch data 85 for (int i = 1; i < lines.length; i++) { 86 boolean validInstance = true; 87 lineSplit = lines[i].split(";"); 88 String[] lineSplitBug = linesBug[i].split(";"); 89 double[] values = new double[data.numAttributes()]; 90 for (int j = 0; validInstance && j < values.length - 1; j++) { 91 if (lineSplit[j + offset].trim().isEmpty()) { 92 validInstance = false; 93 } 94 else { 95 values[j] = Double.parseDouble(lineSplit[j + offset].trim()); 96 } 97 } 98 if (offset == 3) { 99 values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1; 100 } 101 else { 102 values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1; 103 } 104 105 if (validInstance) { 106 data.add(new DenseInstance(1.0, values)); 107 } 108 else { 109 System.out.println("instance " + i + " is invalid"); 110 } 111 } 112 return data; 113 } 114 115 /* 116 * (non-Javadoc) 117 * 118 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 119 * filenameFilter(java.lang.String) 120 */ 121 @Override 122 public boolean filenameFilter(String filename) { 123 return filename.endsWith("src.csv"); 124 } 106 125 107 126 } -
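AUDIDataLoader reads the metrics from a semicolon-separated src.csv file and the class label from a companion repro.csv in the same folder. The header check switches between two layouts: if the fourth column is project_rev, the metric columns start at offset 4, otherwise at offset 3, and the last two columns are always skipped; the label comes from column index 7 or 8 of the bug file, with any value other than "0" mapped to class "1". A purely illustrative condensation of that decision (variable names are hypothetical):

    int offset = header[3].equals("project_rev") ? 4 : 3;     // where the metric columns begin
    int bugColumn = (offset == 3) ? 7 : 8;                    // index into the repro.csv row
    double label = bugRow[bugColumn].equals("0") ? 0.0 : 1.0; // binary "bug" class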
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIFolderLoader.java
r35 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 3 17 public class AUDIFolderLoader extends AbstractFolderLoader { 4 18 5 6 7 8 9 10 11 12 13 19 /* 20 * (non-Javadoc) 21 * 22 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 23 */ 24 @Override 25 protected SingleVersionLoader getSingleLoader() { 26 return new AUDIDataLoader(); 27 } 14 28 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AbstractFolderLoader.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 10 24 11 25 /** 12 * Abstract class for loading data from a folder. The subfolders of a defined 13 * folder define the projects, the file contained in the subfolder are the 14 * versions of a project. 26 * Abstract class for loading data from a folder. The subfolders of a defined folder define the 27 * projects, the file contained in the subfolder are the versions of a project. 15 28 * 16 29 * @author Steffen Herbold … … 18 31 public abstract class AbstractFolderLoader implements IVersionLoader { 19 32 20 21 22 23 33 /** 34 * Path of the data. 35 */ 36 protected String path = ""; 24 37 25 26 27 28 29 30 31 38 /** 39 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#setLocation(java.lang.String) 40 */ 41 @Override 42 public void setLocation(String location) { 43 path = location; 44 } 32 45 33 34 35 36 37 38 46 /** 47 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#load() 48 */ 49 @Override 50 public List<SoftwareVersion> load() { 51 final List<SoftwareVersion> versions = new LinkedList<SoftwareVersion>(); 39 52 40 41 53 final File dataDir = new File(path); 54 final SingleVersionLoader instancesLoader = getSingleLoader(); 42 55 43 for (File projectDir : dataDir.listFiles()) { 44 if (projectDir.isDirectory()) { 45 String projectName = projectDir.getName(); 46 for (File versionFile : projectDir.listFiles()) { 47 if (versionFile.isFile() 48 && instancesLoader.filenameFilter(versionFile 49 .getName())) { 50 String versionName = versionFile.getName(); 51 Instances data = instancesLoader.load(versionFile); 52 versions.add(new SoftwareVersion(projectName, 53 versionName, data)); 54 } 55 } 56 } 57 } 58 return versions; 59 } 56 for (File projectDir : dataDir.listFiles()) { 57 if (projectDir.isDirectory()) { 58 String projectName = projectDir.getName(); 59 for (File versionFile : projectDir.listFiles()) { 60 if (versionFile.isFile() && 61 instancesLoader.filenameFilter(versionFile.getName())) 62 { 63 String versionName = versionFile.getName(); 64 Instances data = instancesLoader.load(versionFile); 65 versions.add(new SoftwareVersion(projectName, versionName, data)); 66 } 67 } 68 } 69 } 70 return versions; 71 } 60 72 61 /** 62 * Returns the concrete {@link SingleVersionLoader} to be used with this 63 * folder loader. 64 * 65 * @return 66 */ 67 abstract protected SingleVersionLoader getSingleLoader(); 73 /** 74 * Returns the concrete {@link SingleVersionLoader} to be used with this folder loader. 75 * 76 * @return 77 */ 78 abstract protected SingleVersionLoader getSingleLoader(); 68 79 } -
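AbstractFolderLoader expects one directory per project below the configured location, with each file inside a project directory holding one version; which files are picked up is decided by the filenameFilter of the concrete SingleVersionLoader. A hypothetical layout for the CSV loader (all names are illustrative only):

    promise/                 <- value passed to setLocation(...)
        ant/                 <- project directory
            ant-1.5.csv      <- one SoftwareVersion per accepted file
            ant-1.6.csv
        camel/
            camel-1.0.csv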
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVDataLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 11 25 12 26 /** 13 * Loads the instances for a software version from a CSV file of the PROMISE 14 * data set mined byJurezko and Madeyski.27 * Loads the instances for a software version from a CSV file of the PROMISE data set mined by 28 * Jurezko and Madeyski. 15 29 * 16 30 * @author Steffen Herbold … … 18 32 class CSVDataLoader implements SingleVersionLoader { 19 33 20 /* 21 * (non-Javadoc) 22 * 23 * @see 24 * de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( 25 * java.io.File) 26 */ 27 @Override 28 public Instances load(File file) { 29 final String[] lines; 30 try { 31 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 32 } catch (IOException e) { 33 throw new RuntimeException(e); 34 } 34 /* 35 * (non-Javadoc) 36 * 37 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( java.io.File) 38 */ 39 @Override 40 public Instances load(File file) { 41 final String[] lines; 42 try { 43 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 44 } 45 catch (IOException e) { 46 throw new RuntimeException(e); 47 } 35 48 36 37 49 // configure Instances 50 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 38 51 39 40 41 42 43 44 45 46 47 52 String[] lineSplit = lines[0].split(","); 53 for (int j = 0; j < lineSplit.length - 4; j++) { 54 atts.add(new Attribute(lineSplit[j + 3])); 55 } 56 final ArrayList<String> classAttVals = new ArrayList<String>(); 57 classAttVals.add("0"); 58 classAttVals.add("1"); 59 final Attribute classAtt = new Attribute("bug", classAttVals); 60 atts.add(classAtt); 48 61 49 50 62 final Instances data = new Instances(file.getName(), atts, 0); 63 data.setClass(classAtt); 51 64 52 // fetch data 53 for (int i = 1; i < lines.length; i++) { 54 lineSplit = lines[i].split(","); 55 double[] values = new double[lineSplit.length - 3]; 56 for (int j = 0; j < values.length - 1; j++) { 57 values[j] = Double.parseDouble(lineSplit[j + 3].trim()); 58 } 59 values[values.length - 1] = lineSplit[lineSplit.length - 1].trim() 60 .equals("0") ? 0 : 1; 61 data.add(new DenseInstance(1.0, values)); 62 } 65 // fetch data 66 for (int i = 1; i < lines.length; i++) { 67 lineSplit = lines[i].split(","); 68 double[] values = new double[lineSplit.length - 3]; 69 for (int j = 0; j < values.length - 1; j++) { 70 values[j] = Double.parseDouble(lineSplit[j + 3].trim()); 71 } 72 values[values.length - 1] = lineSplit[lineSplit.length - 1].trim().equals("0") ? 0 : 1; 73 data.add(new DenseInstance(1.0, values)); 74 } 63 75 64 65 76 return data; 77 } 66 78 67 68 69 70 71 72 73 74 75 76 79 /* 80 * (non-Javadoc) 81 * 82 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 83 * filenameFilter(java.lang.String) 84 */ 85 @Override 86 public boolean filenameFilter(String filename) { 87 return filename.endsWith(".csv"); 88 } 77 89 78 90 } -
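CSVDataLoader skips the first three columns of each row, treats the remaining columns up to the last one as numeric metric attributes, and binarizes the final column: a defect count of "0" becomes class 0, everything else class 1. Assuming a row shaped like the Jureczko PROMISE files (all values below are made up for illustration):

    // hypothetical PROMISE-style row; first three fields skipped, last field is the defect count
    ant,1.7,org.apache.tools.ant.Main,3,1,0, ... ,2   ->  metrics = {3, 1, 0, ...}, class = "1" (2 > 0)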
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVFolderLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 3 17 /** 4 * Implements the {@link AbstractFolderLoader} for data from the PROMISE 5 * repository mined by Jurezkoand Madeyski.18 * Implements the {@link AbstractFolderLoader} for data from the PROMISE repository mined by Jurezko 19 * and Madeyski. 6 20 * 7 21 * @author Steffen Herbold … … 9 23 public class CSVFolderLoader extends AbstractFolderLoader { 10 24 11 12 13 14 15 16 17 18 19 25 /* 26 * (non-Javadoc) 27 * 28 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 29 */ 30 @Override 31 protected SingleVersionLoader getSingleLoader() { 32 return new CSVDataLoader(); 33 } 20 34 21 35 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVMockusDataLoader.java
r29 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 10 24 import de.ugoe.cs.util.FileTools; 11 25 12 13 26 class CSVMockusDataLoader implements SingleVersionLoader { 14 27 15 @Override 16 public Instances load(File file) { 17 final String[] lines; 18 try { 19 20 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 21 } catch (IOException e) { 22 throw new RuntimeException(e); 23 } 24 25 26 // configure Instances 27 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 28 @Override 29 public Instances load(File file) { 30 final String[] lines; 31 try { 28 32 29 String[] lineSplit = lines[0].split(","); 30 for (int j = 0; j < lineSplit.length - 3; j++) { 31 atts.add(new Attribute(lineSplit[j + 2])); 32 } 33 34 final ArrayList<String> classAttVals = new ArrayList<String>(); 35 classAttVals.add("0"); 36 classAttVals.add("1"); 37 final Attribute classAtt = new Attribute("bug", classAttVals); 38 atts.add(classAtt); 33 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 34 } 35 catch (IOException e) { 36 throw new RuntimeException(e); 37 } 39 38 40 final Instances data = new Instances(file.getName(), atts, 0); 41 data.setClass(classAtt);39 // configure Instances 40 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 42 41 43 // fetch data 44 for (int i = 1; i < lines.length; i++) { 45 lineSplit = lines[i].split(","); 46 double[] values = new double[lineSplit.length - 2]; 47 for (int j = 0; j < values.length - 1; j++) { 48 values[j] = Double.parseDouble(lineSplit[j + 2].trim()); 49 } 50 values[values.length - 1] = lineSplit[lineSplit.length - 1].trim() 51 .equals("0") ? 0 : 1; 52 data.add(new DenseInstance(1.0, values)); 53 } 42 String[] lineSplit = lines[0].split(","); 43 for (int j = 0; j < lineSplit.length - 3; j++) { 44 atts.add(new Attribute(lineSplit[j + 2])); 45 } 54 46 55 return data; 56 } 47 final ArrayList<String> classAttVals = new ArrayList<String>(); 48 classAttVals.add("0"); 49 classAttVals.add("1"); 50 final Attribute classAtt = new Attribute("bug", classAttVals); 51 atts.add(classAtt); 57 52 58 @Override 59 public boolean filenameFilter(String filename) { 60 return filename.endsWith(".csv"); 61 } 53 final Instances data = new Instances(file.getName(), atts, 0); 54 data.setClass(classAtt); 55 56 // fetch data 57 for (int i = 1; i < lines.length; i++) { 58 lineSplit = lines[i].split(","); 59 double[] values = new double[lineSplit.length - 2]; 60 for (int j = 0; j < values.length - 1; j++) { 61 values[j] = Double.parseDouble(lineSplit[j + 2].trim()); 62 } 63 values[values.length - 1] = lineSplit[lineSplit.length - 1].trim().equals("0") ? 0 : 1; 64 data.add(new DenseInstance(1.0, values)); 65 } 66 67 return data; 68 } 69 70 @Override 71 public boolean filenameFilter(String filename) { 72 return filename.endsWith(".csv"); 73 } 62 74 63 75 } 64 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVMockusFolderLoader.java
r28 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 3 17 public class CSVMockusFolderLoader extends AbstractFolderLoader { 4 18 5 6 7 8 19 @Override 20 protected SingleVersionLoader getSingleLoader() { 21 return new CSVMockusDataLoader(); 22 } 9 23 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/DecentDataLoader.java
r36 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 38 52 39 53 /** 40 * Class for loading a decent model file. 41 * Loads a decent model file and (if no arff file is present) and does the 42 * following conversions: 43 * DECENT -> ARFFX -> ARFF 54 * Class for loading a decent model file. Loads a decent model file and (if no arff file is present) 55 * and does the following conversions: DECENT -> ARFFX -> ARFF 44 56 * 45 57 * @author Fabian Trautsch 46 * 58 * 47 59 */ 48 public class DecentDataLoader implements SingleVersionLoader{ 49 50 // Model Handler for Decent Models 51 private DECENTEpsilonModelHandler modelHandler = new DECENTEpsilonModelHandler(); 52 53 // Set log level 54 String logLevel = "1"; 55 String logToFile = "false"; 56 57 // This list contains attributes, that should be removed before building the arff file 58 private static List<String> attributeFilter = new LinkedList<String>(); 59 60 // This list contains all names of the different artifacts 61 private static Set<String> artifactNames = new LinkedHashSet<String>(); 62 63 // Name of the class attribute. 64 private static final String classAttributeName = "LABEL.Artifact.Target.BugFix.AverageWeight"; 65 66 67 private int getIndexOfArtifactName(String artifactName) { 68 int index = -1; 69 if(artifactNames.contains(artifactName)) { 70 int i=0; 71 for(String nameInSet: artifactNames) { 72 if(nameInSet.equals(artifactName)) { 73 index = i; 74 } else { 75 i++; 76 } 77 } 78 } 79 80 return index; 81 } 82 83 /** 84 * Defines attributes, that should be removed before building the 85 * ARFF File from. 
86 */ 87 private void setAttributeFilter() { 88 attributeFilter.add("Agent.Name"); 89 90 } 91 92 /** 93 * Saves the dataset as arff after transformation (decent->arffx) and 94 * filtering 95 * 96 * @param dataSet the WEKA dataset to save 97 * @param arffLocation location where it should be saved to 98 */ 99 public void save(Instances dataSet, String arffLocation) { 100 101 102 ArffSaver saver = new ArffSaver(); 103 saver.setInstances(dataSet); 104 try { 105 saver.setFile(new File(arffLocation)); 106 saver.writeBatch(); 107 } catch (IOException e) { 108 Console.printerrln("Cannot save the file to path: "+arffLocation); 109 e.printStackTrace(); 110 } 111 } 112 113 114 /** 115 * Loads the given decent file and tranform it from decent->arffx->arff 116 * @return Instances in WEKA format 117 */ 118 @Override 119 public Instances load(File file) { 120 121 // Set attributeFilter 122 setAttributeFilter(); 123 124 // Register MetaModels 125 try { 126 registerMetaModels(); 127 } catch (Exception e1) { 128 Console.printerrln("Metamodels cannot be registered!"); 129 e1.printStackTrace(); 130 } 131 132 // Set location of decent and arffx Model 133 String decentModelLocation = file.getAbsolutePath(); 134 String pathToDecentModelFolder = decentModelLocation.substring(0,decentModelLocation.lastIndexOf(File.separator)); 135 String arffxModelLocation = pathToDecentModelFolder+"/model.arffx"; 136 String logModelLocation = pathToDecentModelFolder+"/model.log"; 137 String arffLocation = pathToDecentModelFolder+"/model.arff"; 138 139 // If arff File exists, load from it! 140 if(new File(arffLocation).exists()) { 141 System.out.println("Loading arff File..."); 142 BufferedReader reader; 143 Instances data = null; 144 try { 145 reader = new BufferedReader(new FileReader(arffLocation)); 146 data = new Instances(reader); 147 reader.close(); 148 } catch (FileNotFoundException e) { 149 Console.printerrln("File with path: "+arffLocation+" was not found."); 150 e.printStackTrace(); 151 } catch (IOException e) { 152 Console.printerrln("File with path: "+arffLocation+" cannot be read."); 153 e.printStackTrace(); 154 } 155 156 // Set class attribute if not set 157 if(data.classIndex() == -1) { 158 Attribute classAttribute = data.attribute(classAttributeName); 159 data.setClass(classAttribute); 160 } 161 162 163 return data; 164 } 165 166 // Location of EOL Scripts 167 String preprocess = "./decent/epsilon/query/preprocess.eol"; 168 String arffxToArffSource = "./decent/epsilon/query/addLabels.eol"; 169 170 // Set Log Properties 171 System.setProperty("epsilon.logLevel", logLevel); 172 System.setProperty("epsilon.logToFile", logToFile); 173 System.setProperty("epsilon.logFileAvailable", "false"); 174 175 // Set decent2arffx Properties 176 System.setProperty("epsilon.transformation.decent2arffx.skipSource", "false"); 177 System.setProperty("epsilon.transformation.decent2arffx.type", "code"); 178 179 180 181 // Preprocess Data, transform from decent2arffx 182 try { 183 IEolExecutableModule preProcessModule = loadModule(preprocess); 184 IModel preProcessDecentModel = modelHandler.getDECENTModel(decentModelLocation, true, true); 185 IModel preProcessArffxarffxModel = modelHandler.getARFFxModel(arffxModelLocation, false, true); 186 preProcessModule.getContext().getModelRepository().addModel(preProcessDecentModel); 187 preProcessModule.getContext().getModelRepository().addModel(preProcessArffxarffxModel); 188 execute(preProcessModule, logModelLocation); 189 preProcessDecentModel.dispose(); 190 preProcessArffxarffxModel.dispose(); 
191 preProcessModule.reset(); 192 } catch (URISyntaxException e) { 193 Console.printerrln("URI Syntax for decent or arffx model is wrong."); 194 e.printStackTrace(); 195 } catch (Exception e) { 196 e.printStackTrace(); 197 } 198 199 200 201 202 // Transform to arff, for label and confidence attributes 203 try { 204 IEolExecutableModule arffxToArffModule = loadModule(arffxToArffSource); 205 IModel arffxToArffArffxModel = modelHandler.getARFFxModel(arffxModelLocation, true, true); 206 arffxToArffModule.getContext().getModelRepository().addModel(arffxToArffArffxModel); 207 execute(arffxToArffModule, logModelLocation); 208 arffxToArffArffxModel.dispose(); 209 // can be stored and retained alternatively 210 arffxToArffModule.reset(); 211 } catch (URISyntaxException e) { 212 Console.printerrln("URI Syntax for arffx model is wrong."); 213 e.printStackTrace(); 214 } catch (Exception e) { 215 e.printStackTrace(); 216 } 217 218 // Unregister MetaModels, otherwise cast will fail 219 HashMap<String, Object> metaModelCache = new HashMap<>(); 220 for (String key : EPackage.Registry.INSTANCE.keySet()) { 221 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 222 }; 223 224 for (String key : metaModelCache .keySet()) { 225 EPackage.Registry.INSTANCE.remove(key); 226 }; 227 228 229 // Workaround to gernerate a usable URI. Absolute path is not 230 // possible, therefore we need to construct a relative path 231 232 URL location = DecentDataLoader.class.getProtectionDomain().getCodeSource().getLocation(); 233 String basePath = location.getFile(); 234 235 // Location is the bin folder, so we need to delete the last 4 characters 236 basePath = basePath.substring(0, basePath.length() - 4); 237 String relativePath = new File(basePath).toURI().relativize(new File(arffxModelLocation).toURI()).getPath(); 238 239 // Loard arffx file and create WEKA Instances 240 ARFFxResourceTool tool = new ARFFxResourceTool(); 241 Resource resource = tool.loadResourceFromXMI(relativePath, "arffx"); 242 243 Instances dataSet = null; 244 for(EObject o: resource.getContents()) { 245 Model m = (Model) o; 246 dataSet = createWekaDataFormat(m); 247 248 for(Instance i : m.getData()) { 249 createWekaInstance(dataSet, i); 250 } 251 } 252 253 // Set class attribute 254 Attribute classAttribute = dataSet.attribute(classAttributeName); 255 dataSet.setClass(classAttribute); 256 257 // Save as ARFF 258 save(dataSet, arffLocation); 259 260 return dataSet; 261 262 } 263 264 265 /** 266 * Creates a WekaInstance from an ARFFX Model Instance 267 * 268 * @param dataSet WekaInstance dataset, where the arffx model instances should be 269 * added to 270 * @param i arffx model instance 271 */ 272 private void createWekaInstance(Instances dataSet, Instance i) { 273 double[] values = new double[dataSet.numAttributes()]; 274 int j=0; 275 276 for(Value value : i.getValues()) { 277 String dataValue = value.getContent(); 278 String attributeName = value.getOfAttribute().getName(); 279 280 if(attributeFilter.contains(attributeName)) { 281 continue; 282 } 283 284 // Is value a LABEL.* attribute? 285 if(isLabel(attributeName)) { 286 values[j] = dataSet.attribute(j).indexOfValue(dataValue); 287 } else if (isConfidenceLabel(attributeName)){ 288 // Is value a CONFIDENCE.* attribute? 289 values[j] = dataSet.attribute(j).indexOfValue(dataValue); 290 } else if(attributeName.equals("Artifact.Name")){ 291 // Is it the name of the artifact? 
292 artifactNames.add(dataValue); 293 values[j] = getIndexOfArtifactName(dataValue); 294 } else { 295 // Is it a numeric value? 296 values[j] = Double.parseDouble(dataValue); 297 } 298 299 j++; 300 } 301 302 DenseInstance inst = new DenseInstance(1.0, values); 303 dataSet.add(inst); 304 } 305 306 /** 307 * Creates a Weka Instances set out of a arffx model 308 * @param m arffx model 309 * @return 310 */ 311 private Instances createWekaDataFormat(Model m) { 312 313 // Bad solution, can be enhanced (continue in for loop) 314 ArrayList<Attribute> datasetAttributes = new ArrayList<Attribute>(); 315 for(de.ugoe.cs.cpdp.decentApp.models.arffx.Attribute attribute :m.getAttributes()) { 316 String attributeName = attribute.getName(); 317 318 if(attributeFilter.contains(attributeName)) { 319 continue; 320 } 321 322 Attribute wekaAttr; 323 324 // Is attribute a LABEL.* attribute? 325 if(isLabel(attributeName)) { 326 // Classattribute 327 final ArrayList<String> classAttVals = new ArrayList<String>(); 328 classAttVals.add("false"); 329 classAttVals.add("true"); 330 wekaAttr = new Attribute(attributeName, classAttVals); 331 } else if(isConfidenceLabel(attributeName)){ 332 // Is attribute a CONFIDENCE.* attribute? 333 ArrayList<String> labels = new ArrayList<String>(); 334 labels.add("high"); 335 labels.add("low"); 336 wekaAttr = new Attribute(attributeName, labels); 337 } else { 338 // Is it a numeric attribute? 339 wekaAttr = new Attribute(attributeName); 340 } 341 342 datasetAttributes.add(wekaAttr); 343 } 344 345 346 return new Instances("test-dataset", datasetAttributes, 0); 347 } 348 349 /** 350 * Helper methods which indicates if the given value starts with "LABEL" 351 * 352 * @param value to test 353 * @return 354 */ 355 private boolean isLabel(String value) { 356 if(value.length()>= 5 && value.substring(0, 5).equals("LABEL")) { 357 return true; 358 } 359 360 return false; 361 } 362 363 /** 364 * Helper method which indicates if the given value starts with "CONFIDENCE" 365 * @param value to test 366 * @return 367 */ 368 private boolean isConfidenceLabel(String value) { 369 if(value.length()>= 10 && value.substring(0, 10).equals("CONFIDENCE")) { 370 return true; 371 } 372 373 return false; 374 } 375 376 377 /** 378 * Returns if a filename ends with ".decent" 379 * @return 380 */ 381 @Override 382 public boolean filenameFilter(String filename) { 383 return filename.endsWith(".decent"); 384 } 385 386 /** 387 * Helper method for executing a eol scripts and adding the log model beforehand 388 * @param module module to execute 389 * @param logModelLocation location of the log model 390 * @throws Exception 391 */ 392 private void execute(IEolExecutableModule module, String logModelLocation) 393 throws Exception { 394 IModel logModel = modelHandler.getLOGModel(logModelLocation, true, true); 395 module.getContext().getModelRepository().addModel(logModel); 396 module.execute(); 397 logModel.dispose(); 398 } 399 400 /** 401 * Loads the module from a given source 402 * 403 * @param source where the module is (e.g. 
eol script) 404 * @return 405 * @throws Exception 406 * @throws URISyntaxException 407 */ 408 private IEolExecutableModule loadModule(String source) throws Exception, 409 URISyntaxException { 410 411 IEolExecutableModule module = null; 412 if (source.endsWith("etl")) { 413 module = new EtlModule(); 414 } else if (source.endsWith("eol")) { 415 module = new EolModule(); 416 } else { 417 418 } 419 420 module.parse(modelHandler.getFile(source)); 421 422 if (module.getParseProblems().size() > 0) { 423 Console.printerrln("Parse error occured..."); 424 for (ParseProblem problem : module.getParseProblems()) { 425 System.err.println(problem.toString()); 426 } 427 // System.exit(-1); 428 } 429 430 return module; 431 } 432 433 /** 434 * Helper method for registering the metamodels 435 * @throws Exception 436 */ 437 private void registerMetaModels() throws Exception { 438 String metaModelsPath = DECENTEpsilonModelHandler.metaPath; 439 File metaModelsLocation = new File(metaModelsPath); 440 for (File file : metaModelsLocation.listFiles()) { 441 if (file.getName().endsWith(".ecore")) { 442 EmfUtil.register(URI.createFileURI(file.getAbsolutePath()), EPackage.Registry.INSTANCE); 443 } 444 } 445 } 446 60 public class DecentDataLoader implements SingleVersionLoader { 61 62 // Model Handler for Decent Models 63 private DECENTEpsilonModelHandler modelHandler = new DECENTEpsilonModelHandler(); 64 65 // Set log level 66 String logLevel = "1"; 67 String logToFile = "false"; 68 69 // This list contains attributes, that should be removed before building the arff file 70 private static List<String> attributeFilter = new LinkedList<String>(); 71 72 // This list contains all names of the different artifacts 73 private static Set<String> artifactNames = new LinkedHashSet<String>(); 74 75 // Name of the class attribute. 76 private static final String classAttributeName = "LABEL.Artifact.Target.BugFix.AverageWeight"; 77 78 private int getIndexOfArtifactName(String artifactName) { 79 int index = -1; 80 if (artifactNames.contains(artifactName)) { 81 int i = 0; 82 for (String nameInSet : artifactNames) { 83 if (nameInSet.equals(artifactName)) { 84 index = i; 85 } 86 else { 87 i++; 88 } 89 } 90 } 91 92 return index; 93 } 94 95 /** 96 * Defines attributes, that should be removed before building the ARFF File from. 
97 */ 98 private void setAttributeFilter() { 99 attributeFilter.add("Agent.Name"); 100 101 } 102 103 /** 104 * Saves the dataset as arff after transformation (decent->arffx) and filtering 105 * 106 * @param dataSet 107 * the WEKA dataset to save 108 * @param arffLocation 109 * location where it should be saved to 110 */ 111 public void save(Instances dataSet, String arffLocation) { 112 113 ArffSaver saver = new ArffSaver(); 114 saver.setInstances(dataSet); 115 try { 116 saver.setFile(new File(arffLocation)); 117 saver.writeBatch(); 118 } 119 catch (IOException e) { 120 Console.printerrln("Cannot save the file to path: " + arffLocation); 121 e.printStackTrace(); 122 } 123 } 124 125 /** 126 * Loads the given decent file and tranform it from decent->arffx->arff 127 * 128 * @return Instances in WEKA format 129 */ 130 @Override 131 public Instances load(File file) { 132 133 // Set attributeFilter 134 setAttributeFilter(); 135 136 // Register MetaModels 137 try { 138 registerMetaModels(); 139 } 140 catch (Exception e1) { 141 Console.printerrln("Metamodels cannot be registered!"); 142 e1.printStackTrace(); 143 } 144 145 // Set location of decent and arffx Model 146 String decentModelLocation = file.getAbsolutePath(); 147 String pathToDecentModelFolder = 148 decentModelLocation.substring(0, decentModelLocation.lastIndexOf(File.separator)); 149 String arffxModelLocation = pathToDecentModelFolder + "/model.arffx"; 150 String logModelLocation = pathToDecentModelFolder + "/model.log"; 151 String arffLocation = pathToDecentModelFolder + "/model.arff"; 152 153 // If arff File exists, load from it! 154 if (new File(arffLocation).exists()) { 155 System.out.println("Loading arff File..."); 156 BufferedReader reader; 157 Instances data = null; 158 try { 159 reader = new BufferedReader(new FileReader(arffLocation)); 160 data = new Instances(reader); 161 reader.close(); 162 } 163 catch (FileNotFoundException e) { 164 Console.printerrln("File with path: " + arffLocation + " was not found."); 165 e.printStackTrace(); 166 } 167 catch (IOException e) { 168 Console.printerrln("File with path: " + arffLocation + " cannot be read."); 169 e.printStackTrace(); 170 } 171 172 // Set class attribute if not set 173 if (data.classIndex() == -1) { 174 Attribute classAttribute = data.attribute(classAttributeName); 175 data.setClass(classAttribute); 176 } 177 178 return data; 179 } 180 181 // Location of EOL Scripts 182 String preprocess = "./decent/epsilon/query/preprocess.eol"; 183 String arffxToArffSource = "./decent/epsilon/query/addLabels.eol"; 184 185 // Set Log Properties 186 System.setProperty("epsilon.logLevel", logLevel); 187 System.setProperty("epsilon.logToFile", logToFile); 188 System.setProperty("epsilon.logFileAvailable", "false"); 189 190 // Set decent2arffx Properties 191 System.setProperty("epsilon.transformation.decent2arffx.skipSource", "false"); 192 System.setProperty("epsilon.transformation.decent2arffx.type", "code"); 193 194 // Preprocess Data, transform from decent2arffx 195 try { 196 IEolExecutableModule preProcessModule = loadModule(preprocess); 197 IModel preProcessDecentModel = 198 modelHandler.getDECENTModel(decentModelLocation, true, true); 199 IModel preProcessArffxarffxModel = 200 modelHandler.getARFFxModel(arffxModelLocation, false, true); 201 preProcessModule.getContext().getModelRepository().addModel(preProcessDecentModel); 202 preProcessModule.getContext().getModelRepository().addModel(preProcessArffxarffxModel); 203 execute(preProcessModule, logModelLocation); 204 
preProcessDecentModel.dispose(); 205 preProcessArffxarffxModel.dispose(); 206 preProcessModule.reset(); 207 } 208 catch (URISyntaxException e) { 209 Console.printerrln("URI Syntax for decent or arffx model is wrong."); 210 e.printStackTrace(); 211 } 212 catch (Exception e) { 213 e.printStackTrace(); 214 } 215 216 // Transform to arff, for label and confidence attributes 217 try { 218 IEolExecutableModule arffxToArffModule = loadModule(arffxToArffSource); 219 IModel arffxToArffArffxModel = 220 modelHandler.getARFFxModel(arffxModelLocation, true, true); 221 arffxToArffModule.getContext().getModelRepository().addModel(arffxToArffArffxModel); 222 execute(arffxToArffModule, logModelLocation); 223 arffxToArffArffxModel.dispose(); 224 // can be stored and retained alternatively 225 arffxToArffModule.reset(); 226 } 227 catch (URISyntaxException e) { 228 Console.printerrln("URI Syntax for arffx model is wrong."); 229 e.printStackTrace(); 230 } 231 catch (Exception e) { 232 e.printStackTrace(); 233 } 234 235 // Unregister MetaModels, otherwise cast will fail 236 HashMap<String, Object> metaModelCache = new HashMap<>(); 237 for (String key : EPackage.Registry.INSTANCE.keySet()) { 238 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 239 }; 240 241 for (String key : metaModelCache.keySet()) { 242 EPackage.Registry.INSTANCE.remove(key); 243 }; 244 245 // Workaround to gernerate a usable URI. Absolute path is not 246 // possible, therefore we need to construct a relative path 247 248 URL location = DecentDataLoader.class.getProtectionDomain().getCodeSource().getLocation(); 249 String basePath = location.getFile(); 250 251 // Location is the bin folder, so we need to delete the last 4 characters 252 basePath = basePath.substring(0, basePath.length() - 4); 253 String relativePath = 254 new File(basePath).toURI().relativize(new File(arffxModelLocation).toURI()).getPath(); 255 256 // Loard arffx file and create WEKA Instances 257 ARFFxResourceTool tool = new ARFFxResourceTool(); 258 Resource resource = tool.loadResourceFromXMI(relativePath, "arffx"); 259 260 Instances dataSet = null; 261 for (EObject o : resource.getContents()) { 262 Model m = (Model) o; 263 dataSet = createWekaDataFormat(m); 264 265 for (Instance i : m.getData()) { 266 createWekaInstance(dataSet, i); 267 } 268 } 269 270 // Set class attribute 271 Attribute classAttribute = dataSet.attribute(classAttributeName); 272 dataSet.setClass(classAttribute); 273 274 // Save as ARFF 275 save(dataSet, arffLocation); 276 277 return dataSet; 278 279 } 280 281 /** 282 * Creates a WekaInstance from an ARFFX Model Instance 283 * 284 * @param dataSet 285 * WekaInstance dataset, where the arffx model instances should be added to 286 * @param i 287 * arffx model instance 288 */ 289 private void createWekaInstance(Instances dataSet, Instance i) { 290 double[] values = new double[dataSet.numAttributes()]; 291 int j = 0; 292 293 for (Value value : i.getValues()) { 294 String dataValue = value.getContent(); 295 String attributeName = value.getOfAttribute().getName(); 296 297 if (attributeFilter.contains(attributeName)) { 298 continue; 299 } 300 301 // Is value a LABEL.* attribute? 302 if (isLabel(attributeName)) { 303 values[j] = dataSet.attribute(j).indexOfValue(dataValue); 304 } 305 else if (isConfidenceLabel(attributeName)) { 306 // Is value a CONFIDENCE.* attribute? 307 values[j] = dataSet.attribute(j).indexOfValue(dataValue); 308 } 309 else if (attributeName.equals("Artifact.Name")) { 310 // Is it the name of the artifact? 
311 artifactNames.add(dataValue); 312 values[j] = getIndexOfArtifactName(dataValue); 313 } 314 else { 315 // Is it a numeric value? 316 values[j] = Double.parseDouble(dataValue); 317 } 318 319 j++; 320 } 321 322 DenseInstance inst = new DenseInstance(1.0, values); 323 dataSet.add(inst); 324 } 325 326 /** 327 * Creates a Weka Instances set out of a arffx model 328 * 329 * @param m 330 * arffx model 331 * @return 332 */ 333 private Instances createWekaDataFormat(Model m) { 334 335 // Bad solution, can be enhanced (continue in for loop) 336 ArrayList<Attribute> datasetAttributes = new ArrayList<Attribute>(); 337 for (de.ugoe.cs.cpdp.decentApp.models.arffx.Attribute attribute : m.getAttributes()) { 338 String attributeName = attribute.getName(); 339 340 if (attributeFilter.contains(attributeName)) { 341 continue; 342 } 343 344 Attribute wekaAttr; 345 346 // Is attribute a LABEL.* attribute? 347 if (isLabel(attributeName)) { 348 // Classattribute 349 final ArrayList<String> classAttVals = new ArrayList<String>(); 350 classAttVals.add("false"); 351 classAttVals.add("true"); 352 wekaAttr = new Attribute(attributeName, classAttVals); 353 } 354 else if (isConfidenceLabel(attributeName)) { 355 // Is attribute a CONFIDENCE.* attribute? 356 ArrayList<String> labels = new ArrayList<String>(); 357 labels.add("high"); 358 labels.add("low"); 359 wekaAttr = new Attribute(attributeName, labels); 360 } 361 else { 362 // Is it a numeric attribute? 363 wekaAttr = new Attribute(attributeName); 364 } 365 366 datasetAttributes.add(wekaAttr); 367 } 368 369 return new Instances("test-dataset", datasetAttributes, 0); 370 } 371 372 /** 373 * Helper methods which indicates if the given value starts with "LABEL" 374 * 375 * @param value 376 * to test 377 * @return 378 */ 379 private boolean isLabel(String value) { 380 if (value.length() >= 5 && value.substring(0, 5).equals("LABEL")) { 381 return true; 382 } 383 384 return false; 385 } 386 387 /** 388 * Helper method which indicates if the given value starts with "CONFIDENCE" 389 * 390 * @param value 391 * to test 392 * @return 393 */ 394 private boolean isConfidenceLabel(String value) { 395 if (value.length() >= 10 && value.substring(0, 10).equals("CONFIDENCE")) { 396 return true; 397 } 398 399 return false; 400 } 401 402 /** 403 * Returns if a filename ends with ".decent" 404 * 405 * @return 406 */ 407 @Override 408 public boolean filenameFilter(String filename) { 409 return filename.endsWith(".decent"); 410 } 411 412 /** 413 * Helper method for executing a eol scripts and adding the log model beforehand 414 * 415 * @param module 416 * module to execute 417 * @param logModelLocation 418 * location of the log model 419 * @throws Exception 420 */ 421 private void execute(IEolExecutableModule module, String logModelLocation) throws Exception { 422 IModel logModel = modelHandler.getLOGModel(logModelLocation, true, true); 423 module.getContext().getModelRepository().addModel(logModel); 424 module.execute(); 425 logModel.dispose(); 426 } 427 428 /** 429 * Loads the module from a given source 430 * 431 * @param source 432 * where the module is (e.g. 
eol script) 433 * @return 434 * @throws Exception 435 * @throws URISyntaxException 436 */ 437 private IEolExecutableModule loadModule(String source) throws Exception, URISyntaxException { 438 439 IEolExecutableModule module = null; 440 if (source.endsWith("etl")) { 441 module = new EtlModule(); 442 } 443 else if (source.endsWith("eol")) { 444 module = new EolModule(); 445 } 446 else { 447 448 } 449 450 module.parse(modelHandler.getFile(source)); 451 452 if (module.getParseProblems().size() > 0) { 453 Console.printerrln("Parse error occured..."); 454 for (ParseProblem problem : module.getParseProblems()) { 455 System.err.println(problem.toString()); 456 } 457 // System.exit(-1); 458 } 459 460 return module; 461 } 462 463 /** 464 * Helper method for registering the metamodels 465 * 466 * @throws Exception 467 */ 468 private void registerMetaModels() throws Exception { 469 String metaModelsPath = DECENTEpsilonModelHandler.metaPath; 470 File metaModelsLocation = new File(metaModelsPath); 471 for (File file : metaModelsLocation.listFiles()) { 472 if (file.getName().endsWith(".ecore")) { 473 EmfUtil.register(URI.createFileURI(file.getAbsolutePath()), 474 EPackage.Registry.INSTANCE); 475 } 476 } 477 } 478 447 479 } -
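DecentDataLoader caches its result: if a model.arff already exists next to the .decent model it is loaded directly, otherwise the DECENT model is transformed via the preprocess.eol and addLabels.eol scripts into an ARFFx model, converted to Weka Instances, and written back as model.arff so subsequent runs skip the Epsilon transformations. A condensed sketch of that control flow, paraphrasing the load(File) method above rather than adding new behaviour:

    // paraphrased sketch of the caching in load(File); not a drop-in replacement
    File cached = new File(pathToDecentModelFolder + "/model.arff");
    if (cached.exists()) {
        Instances data = new Instances(new BufferedReader(new FileReader(cached)));
        if (data.classIndex() == -1) {
            data.setClass(data.attribute(classAttributeName)); // "LABEL.Artifact.Target.BugFix.AverageWeight"
        }
        return data;
    }
    // otherwise: run the EOL transformations, build Instances from the ARFFx model,
    // then save(dataSet, arffLocation) so the next call hits the cache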
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/DecentFolderLoader.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 16 30 public class DecentFolderLoader extends AbstractFolderLoader { 17 31 18 /* 19 * (non-Javadoc) 20 * 21 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 22 */ 23 @Override 24 protected SingleVersionLoader getSingleLoader() { 25 return new DecentDataLoader(); 26 } 27 28 /** 29 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#load() 30 */ 31 @Override 32 public List<SoftwareVersion> load() { 33 final List<SoftwareVersion> versions = new LinkedList<SoftwareVersion>(); 32 /* 33 * (non-Javadoc) 34 * 35 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 36 */ 37 @Override 38 protected SingleVersionLoader getSingleLoader() { 39 return new DecentDataLoader(); 40 } 34 41 35 final File dataDir = new File(path); 36 final SingleVersionLoader instancesLoader = getSingleLoader(); 42 /** 43 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#load() 44 */ 45 @Override 46 public List<SoftwareVersion> load() { 47 final List<SoftwareVersion> versions = new LinkedList<SoftwareVersion>(); 37 48 38 String projectName = dataDir.getName(); 39 40 41 /* 42 * The following lines make it possible, that we can have two different possibilities 43 * to load data: 44 * 1) From one project (e.g. /decent/input/project1) 45 * 2) From more than one project (e.g. /decent/input/) 46 * 47 * Requirement is, that we have a folder structure like this: 48 * "/decent/input/project1/model.decent, /decent/input/project2/model.decent, ..." 49 * 50 * In the first one the "else" is executed, therefore it will just search the folder "project1" 51 * for a "model.decent" file. In the second one, it will look into each folder and searches for 52 * "model.decent" files. 53 */ 54 for (File projectDir : dataDir.listFiles()) { 55 if (projectDir.isDirectory()) { 56 projectName = projectDir.getName(); 57 for (File versionFile : projectDir.listFiles()) { 58 loadDataFromFile(versionFile,instancesLoader, projectName, versions); 59 } 60 } else { 61 loadDataFromFile(projectDir, instancesLoader, projectName, versions); 62 } 63 } 64 return versions; 65 } 66 67 /** 68 * Loads data from a file and adds the instances from the load method to the 69 * versions List. 
70 * 71 * @param versionFile file to load from 72 * @param instancesLoader loader that should be used 73 * @param projectName name of the project which was loaded 74 * @param versions list, where the weka instances are added to 75 */ 76 77 private void loadDataFromFile(File versionFile, 78 SingleVersionLoader instancesLoader, String projectName, List<SoftwareVersion> versions) { 79 if (versionFile.isFile() 80 && instancesLoader.filenameFilter(versionFile 81 .getName())) { 82 String versionName = versionFile.getName(); 83 Instances data = instancesLoader.load(versionFile); 84 versions.add(new SoftwareVersion(projectName, 85 versionName, data)); 86 } 87 } 49 final File dataDir = new File(path); 50 final SingleVersionLoader instancesLoader = getSingleLoader(); 51 52 String projectName = dataDir.getName(); 53 54 /* 55 * The following lines make it possible, that we can have two different possibilities to 56 * load data: 1) From one project (e.g. /decent/input/project1) 2) From more than one 57 * project (e.g. /decent/input/) 58 * 59 * Requirement is, that we have a folder structure like this: 60 * "/decent/input/project1/model.decent, /decent/input/project2/model.decent, ..." 61 * 62 * In the first one the "else" is executed, therefore it will just search the folder 63 * "project1" for a "model.decent" file. In the second one, it will look into each folder 64 * and searches for "model.decent" files. 65 */ 66 for (File projectDir : dataDir.listFiles()) { 67 if (projectDir.isDirectory()) { 68 projectName = projectDir.getName(); 69 for (File versionFile : projectDir.listFiles()) { 70 loadDataFromFile(versionFile, instancesLoader, projectName, versions); 71 } 72 } 73 else { 74 loadDataFromFile(projectDir, instancesLoader, projectName, versions); 75 } 76 } 77 return versions; 78 } 79 80 /** 81 * Loads data from a file and adds the instances from the load method to the versions List. 82 * 83 * @param versionFile 84 * file to load from 85 * @param instancesLoader 86 * loader that should be used 87 * @param projectName 88 * name of the project which was loaded 89 * @param versions 90 * list, where the weka instances are added to 91 */ 92 93 private void loadDataFromFile(File versionFile, 94 SingleVersionLoader instancesLoader, 95 String projectName, 96 List<SoftwareVersion> versions) 97 { 98 if (versionFile.isFile() && instancesLoader.filenameFilter(versionFile.getName())) { 99 String versionName = versionFile.getName(); 100 Instances data = instancesLoader.load(versionFile); 101 versions.add(new SoftwareVersion(projectName, versionName, data)); 102 } 103 } 88 104 89 105 } -
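DecentFolderLoader therefore accepts both layouts described in the comment above: pointing the location at a single project folder that directly contains a model.decent file, or at a parent folder whose subdirectories each contain one. Using the paths from that comment:

    decent/input/project1/model.decent      <- setLocation("decent/input/project1") loads one project
    decent/input/project1/model.decent
    decent/input/project2/model.decent      <- setLocation("decent/input") iterates over all project folders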
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/IDecentVersionLoader.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 5 19 import de.ugoe.cs.cpdp.versions.SoftwareVersion; 6 20 7 public interface IDecentVersionLoader extends IVersionLoader {8 9 21 public interface IDecentVersionLoader extends IVersionLoader { 22 23 public List<SoftwareVersion> load(List<String> decentAttributes); 10 24 11 25 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/IVersionLoader.java
r32 → r41: Apache License 2.0 header added; the method Javadoc re-wrapped and re-indented. The r41 interface:

public interface IVersionLoader {

    /**
     * Sets the location of the data.
     *
     * @param location
     *            location of the data
     */
    public void setLocation(String location);

    /**
     * Loads the data.
     *
     * @return the data
     */
    public List<SoftwareVersion> load();

}
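Taken together with AbstractFolderLoader above, this is the whole public loading API. A minimal usage sketch, assuming (as the load() implementation above suggests) that AbstractFolderLoader implements IVersionLoader and that setLocation() sets the path scanned by load(); the data directory is hypothetical:

    import java.util.List;

    import de.ugoe.cs.cpdp.loader.IVersionLoader;
    import de.ugoe.cs.cpdp.loader.NasaARFFFolderLoader;
    import de.ugoe.cs.cpdp.versions.SoftwareVersion;

    public class LoaderUsageSketch {
        public static void main(String[] args) {
            // NasaARFFFolderLoader scans the given directory (or its project subdirectories)
            // for *.arff files and turns each file into one SoftwareVersion
            IVersionLoader loader = new NasaARFFFolderLoader();
            loader.setLocation("/data/nasa-mdp"); // hypothetical data directory

            List<SoftwareVersion> versions = loader.load();
            System.out.println("loaded " + versions.size() + " versions");
        }
    }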
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/NasaARFFFolderLoader.java
r4 → r41: Apache License 2.0 header added; the class Javadoc re-wrapped to one line and the body re-indented. The r41 class:

/**
 * Implements the {@link AbstractFolderLoader} for the NASA/SOFTLAB/MDP data set.
 *
 * @author Steffen Herbold
 */
public class NasaARFFFolderLoader extends AbstractFolderLoader {

    /*
     * (non-Javadoc)
     *
     * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader()
     */
    @Override
    protected SingleVersionLoader getSingleLoader() {
        return new NasaARFFLoader();
    }

}
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/NasaARFFLoader.java
r4 → r41: Apache License 2.0 header added; the class is otherwise only reformatted (4-space indentation, wrapped Javadoc and long lines, else/catch moved to their own lines). The behaviour is unchanged in both revisions:

- The constructor fills attributeNameMap with the attribute aliases of the "ar" projects (e.g. total_loc → LOC_TOTAL, cyclomatic_complexity → CYCLOMATIC_COMPLEXITY) and of KC2 (e.g. loc → LOC_TOTAL, v(g) → CYCLOMATIC_COMPLEXITY, iv(g) → DESIGN_COMPLEXITY), maps the class attributes defects, Defective, problems and label to bug, and fixes the canonical attribute order in attributeOrder, ending with bug.
- load(File) reads the ARFF file, sets the last attribute as class, renames attributes via attributeNameMap, builds a 1-based index string from attributeOrder and applies Weka's Reorder filter (attributes that are not listed are implicitly removed); it throws a RuntimeException if the resulting number of attributes does not match attributeOrder.
- It then appends a nominal attribute "bug-new" with the labels 0,1 via Weka's Add filter, maps the original class onto it so that defective instances receive the value 1 (if the first class label is Y, yes or true, matches of that label are marked, otherwise matches of the second label), restores the relation name, makes "bug-new" the class attribute and deletes the old class attribute. (The error message "Error while normalizing the bug nonminal values" carries a typo, "nonminal" instead of "nominal", in both revisions.)
- filenameFilter(String) accepts files ending in ".arff".
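The attribute reordering in load() is done with Weka's Reorder filter; the following standalone sketch shows the same mechanism in isolation (the index string here is illustrative, in the loader it is built from attributeOrder):

    import weka.core.Instances;
    import weka.filters.Filter;
    import weka.filters.unsupervised.attribute.Reorder;

    public class ReorderSketch {

        /**
         * Returns a copy of data with the attributes in the order given by the 1-based
         * index string, e.g. "3,1,4"; attributes that are not listed are removed, which is
         * how the loader drops unwanted attributes.
         */
        public static Instances reorder(Instances data, String orderString) throws Exception {
            Reorder reorder = new Reorder();
            reorder.setOptions(new String[] { "-R", orderString });
            reorder.setInputFormat(data);
            return Filter.useFilter(data, reorder);
        }
    }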
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/SingleVersionLoader.java
r4 → r41: Apache License 2.0 header added; Javadoc re-wrapped and body re-indented. The r41 interface (with the parameter name in the Javadoc aligned to the signature):

/**
 * Interface for version loaders, i.e., loading of a set of instances from a file.
 *
 * @author Steffen Herbold
 */
public interface SingleVersionLoader {

    /**
     * Loads the instances.
     *
     * @param file
     *            handle to the file of the instances
     * @return the instances
     */
    Instances load(File file);

    /**
     * Defines a filter for the files to be loaded; only strings that end with the filter are
     * considered.
     *
     * @param endsWith
     *            string defining the filename filter
     * @return true if a filename shall be considered
     */
    boolean filenameFilter(String endsWith);
}
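A new data format therefore only needs two small classes: a SingleVersionLoader for one file and an AbstractFolderLoader subclass that returns it. The following sketch is hypothetical (not part of this changeset) and uses Weka's CSVLoader as an example file reader:

    package de.ugoe.cs.cpdp.loader;

    import java.io.File;
    import java.io.IOException;

    import weka.core.Instances;
    import weka.core.converters.CSVLoader;

    /** Hypothetical loader for one CSV file; the last column is treated as the class. */
    public class CsvSingleVersionLoader implements SingleVersionLoader {

        @Override
        public Instances load(File file) {
            try {
                CSVLoader csvLoader = new CSVLoader();
                csvLoader.setSource(file);
                Instances data = csvLoader.getDataSet();
                data.setClassIndex(data.numAttributes() - 1);
                return data;
            }
            catch (IOException e) {
                throw new RuntimeException("Error reading data", e);
            }
        }

        @Override
        public boolean filenameFilter(String filename) {
            return filename.endsWith(".csv");
        }
    }

    /** Hypothetical folder loader that plugs the CSV loader into AbstractFolderLoader. */
    class CsvFolderLoader extends AbstractFolderLoader {

        @Override
        protected SingleVersionLoader getSingleLoader() {
            return new CsvSingleVersionLoader();
        }
    }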
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/FixClass.java
r31 → r41: Apache License 2.0 header added; the class declaration is wrapped over two lines (public class FixClass extends AbstractClassifier implements ITrainingStrategy, IWekaCompatibleTrainer) and the body re-indented; no functional change. FixClass remains a trivial trainer: getCapabilities() enables all attribute and class types including missing values and sets the minimum number of instances to 0, setOptions(String[]) reads the fixed class value from the -C option, setParameter(String) splits the parameter string at spaces and forwards it to setOptions (printing the stack trace on error), buildClassifier() and apply() do nothing, and classifyInstance(Instance) always returns the configured fixedClassValue (0.0 by default).
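Because FixClass ignores its training data, it can serve as a simple baseline in experiments. A minimal sketch of using it directly, assuming an already loaded Weka Instances object (the helper class is hypothetical):

    import weka.core.Instance;
    import weka.core.Instances;

    import de.ugoe.cs.cpdp.training.FixClass;

    public class FixClassSketch {

        /** Predicts every instance as defective by fixing the class value to 1.0. */
        public static int[] predictAllDefective(Instances data) {
            FixClass trainer = new FixClass();
            trainer.setParameter("-C 1.0"); // -C sets the fixed class value
            trainer.apply(data);            // training is a no-op for FixClass

            int[] predictions = new int[data.numInstances()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance instance = data.instance(i);
                predictions[i] = (int) trainer.classifyInstance(instance);
            }
            return predictions;
        }
    }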
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ISetWiseTrainingStrategy.java
r2 → r41: Apache License 2.0 header added; re-indented. The interface itself is unchanged:

// Bagging Strategy: separate models for each training data set
public interface ISetWiseTrainingStrategy extends ITrainer {

    void apply(SetUniqueList<Instances> traindataSet);

    String getName();
}
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ITrainer.java
r2 → r41: Apache License 2.0 header added; no other change is visible in the listing for this interface.
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ITrainingStrategy.java
r6 → r41: Apache License 2.0 header added; re-indented. The interface itself is unchanged:

public interface ITrainingStrategy extends ITrainer {

    void apply(Instances traindata);

    String getName();
}
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/IWekaCompatibleTrainer.java
r24 → r41: Apache License 2.0 header added; re-indented. The interface itself is unchanged:

public interface IWekaCompatibleTrainer extends ITrainer {

    Classifier getClassifier();

    String getName();
}
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/QuadTree.java
r23 → r41: Apache License 2.0 header added; the class is otherwise only reformatted (4-space indentation, wrapped Javadoc, array initializers and long conditions split over several lines, else/catch on their own lines); no functional change. In both revisions QuadTree works as follows:

- Each node holds a payload of QuadTreePayload<Instance> items, its parent, up to four children (child_nw, child_ne, child_se, child_sw), its quadrant bounds x and y, and a level used only for debugging. The static fields size (total number of instances), alpha (minimum quadrant size) and verbose control the algorithm; the static collections ccluster (cluster payloads) and csize (bounding boxes per cluster, needed later to assign test instances to a cluster) collect the results.
- getDensity() is the number of payload instances divided by QuadTree.size; getNumbers() returns the payload size.
- split() sorts the payload by x and by y to obtain the two medians (a TODO, in German, notes that the duplicated median code should be unified), distributes the payload into the four quadrants, and throws an exception if a child would receive the complete payload again; this stops the recursion on that node and avoids an endless loop when many points share the median coordinates. recursiveSplit() keeps splitting until a quadrant holds fewer than alpha instances.
- getList() collects the leaf quadrants via generateList() and sorts them by density; isNeighbour() checks for overlapping quadrant bounds.
- gridClustering() implements the pruning of Menzies et al. ("Local versus Global Lessons for Defect Prediction and Effort Estimation", IEEE Transactions on Software Engineering, vol. 39, no. 6, pp. 822-834, June 2013): take the densest remaining leaf, set stop_rule to half its density, merge all neighbouring leaves whose density exceeds stop_rule into one cluster, record the merged bounding boxes in csize, remove the merged leaves from the list, and repeat until the list is empty. printInfo() prints the number and sizes of the resulting clusters.
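Driving the class from outside follows the same order as the methods above: configure the static fields, split recursively, then cluster the leaves. A minimal sketch under these assumptions; QuadTreePayload is referenced by QuadTree but defined elsewhere in CrossPare, so its import is omitted here, and the alpha value and coordinate ranges are illustrative:

    import java.util.ArrayList;

    import weka.core.Instance;

    import de.ugoe.cs.cpdp.training.QuadTree;

    public class QuadTreeClusteringSketch {

        /** Splits the payload into quadrants and merges dense neighbouring leaves into clusters. */
        public static void cluster(ArrayList<QuadTreePayload<Instance>> payload) {
            QuadTree.size = payload.size();              // global instance count used for the density
            QuadTree.alpha = Math.sqrt(payload.size());  // illustrative minimum quadrant size

            QuadTree root = new QuadTree(null, payload);
            root.setSize(new double[] { 0.0, 1.0 }, new double[] { 0.0, 1.0 }); // normalized ranges

            root.recursiveSplit(root);                       // split until quadrants fall below alpha
            ArrayList<QuadTree> leaves = root.getList(root); // leaf quadrants sorted by density
            root.gridClustering(leaves);                     // fills QuadTree.ccluster and QuadTree.csize

            root.printInfo();
        }
    }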
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/RandomClass.java
r38 → r41: Apache License 2.0 header added; the class declaration is wrapped over two lines and the body re-indented; no functional change. RandomClass still assigns a uniformly random label from the hardcoded fixedClassValues {0.0, 1.0} in classifyInstance(), while setParameter(), buildClassifier() and apply() do nothing; the Javadoc keeps the note that the label range could later be taken from the XML configuration.
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaBaggingTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 18 32 /** 19 33 * Programmatic WekaBaggingTraining 20 *21 * first parameter is Trainer Name.22 * second parameter is class name23 34 * 24 * all subsequent parameters are configuration params (for example for trees) 25 * Cross Validation params always come last and are prepended with -CVPARAM 35 * first parameter is Trainer Name. second parameter is class name 36 * 37 * all subsequent parameters are configuration params (for example for trees) Cross Validation 38 * params always come last and are prepended with -CVPARAM 26 39 * 27 40 * XML Configurations for Weka Classifiers: 41 * 28 42 * <pre> 29 43 * {@code … … 37 51 public class WekaBaggingTraining extends WekaBaseTraining implements ISetWiseTrainingStrategy { 38 52 39 private final TraindatasetBagging classifier = new TraindatasetBagging(); 40 41 @Override 42 public Classifier getClassifier() { 43 return classifier; 44 } 45 46 @Override 47 public void apply(SetUniqueList<Instances> traindataSet) { 48 PrintStream errStr = System.err; 49 System.setErr(new PrintStream(new NullOutputStream())); 50 try { 51 classifier.buildClassifier(traindataSet); 52 } catch (Exception e) { 53 throw new RuntimeException(e); 54 } finally { 55 System.setErr(errStr); 56 } 57 } 58 59 public class TraindatasetBagging extends AbstractClassifier { 60 61 private static final long serialVersionUID = 1L; 53 private final TraindatasetBagging classifier = new TraindatasetBagging(); 62 54 63 private List<Instances> trainingData = null; 64 65 private List<Classifier> classifiers = null; 66 67 @Override 68 public double classifyInstance(Instance instance) { 69 if( classifiers==null ) { 70 return 0.0; 71 } 72 73 double classification = 0.0; 74 for( int i=0 ; i<classifiers.size(); i++ ) { 75 Classifier classifier = classifiers.get(i); 76 Instances traindata = trainingData.get(i); 77 78 Set<String> attributeNames = new HashSet<>(); 79 for( int j=0; j<traindata.numAttributes(); j++ ) { 80 attributeNames.add(traindata.attribute(j).name()); 81 } 82 83 double[] values = new double[traindata.numAttributes()]; 84 int index = 0; 85 for( int j=0; j<instance.numAttributes(); j++ ) { 86 if( attributeNames.contains(instance.attribute(j).name())) { 87 values[index] = instance.value(j); 88 index++; 89 } 90 } 91 92 Instances tmp = new Instances(traindata); 93 tmp.clear(); 94 Instance instCopy = new DenseInstance(instance.weight(), values); 95 instCopy.setDataset(tmp); 96 try { 97 classification += classifier.classifyInstance(instCopy); 98 } catch (Exception e) { 99 throw new RuntimeException("bagging classifier could not classify an instance", e); 100 } 101 } 102 classification /= classifiers.size(); 103 return (classification>=0.5) ? 
1.0 : 0.0; 104 } 105 106 public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { 107 classifiers = new LinkedList<>(); 108 trainingData = new LinkedList<>(); 109 for( Instances traindata : traindataSet ) { 110 Classifier classifier = setupClassifier(); 111 classifier.buildClassifier(traindata); 112 classifiers.add(classifier); 113 trainingData.add(new Instances(traindata)); 114 } 115 } 116 117 @Override 118 public void buildClassifier(Instances traindata) throws Exception { 119 classifiers = new LinkedList<>(); 120 trainingData = new LinkedList<>(); 121 final Classifier classifier = setupClassifier(); 122 classifier.buildClassifier(traindata); 123 classifiers.add(classifier); 124 trainingData.add(new Instances(traindata)); 125 } 126 } 55 @Override 56 public Classifier getClassifier() { 57 return classifier; 58 } 59 60 @Override 61 public void apply(SetUniqueList<Instances> traindataSet) { 62 PrintStream errStr = System.err; 63 System.setErr(new PrintStream(new NullOutputStream())); 64 try { 65 classifier.buildClassifier(traindataSet); 66 } 67 catch (Exception e) { 68 throw new RuntimeException(e); 69 } 70 finally { 71 System.setErr(errStr); 72 } 73 } 74 75 public class TraindatasetBagging extends AbstractClassifier { 76 77 private static final long serialVersionUID = 1L; 78 79 private List<Instances> trainingData = null; 80 81 private List<Classifier> classifiers = null; 82 83 @Override 84 public double classifyInstance(Instance instance) { 85 if (classifiers == null) { 86 return 0.0; 87 } 88 89 double classification = 0.0; 90 for (int i = 0; i < classifiers.size(); i++) { 91 Classifier classifier = classifiers.get(i); 92 Instances traindata = trainingData.get(i); 93 94 Set<String> attributeNames = new HashSet<>(); 95 for (int j = 0; j < traindata.numAttributes(); j++) { 96 attributeNames.add(traindata.attribute(j).name()); 97 } 98 99 double[] values = new double[traindata.numAttributes()]; 100 int index = 0; 101 for (int j = 0; j < instance.numAttributes(); j++) { 102 if (attributeNames.contains(instance.attribute(j).name())) { 103 values[index] = instance.value(j); 104 index++; 105 } 106 } 107 108 Instances tmp = new Instances(traindata); 109 tmp.clear(); 110 Instance instCopy = new DenseInstance(instance.weight(), values); 111 instCopy.setDataset(tmp); 112 try { 113 classification += classifier.classifyInstance(instCopy); 114 } 115 catch (Exception e) { 116 throw new RuntimeException("bagging classifier could not classify an instance", 117 e); 118 } 119 } 120 classification /= classifiers.size(); 121 return (classification >= 0.5) ? 1.0 : 0.0; 122 } 123 124 public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { 125 classifiers = new LinkedList<>(); 126 trainingData = new LinkedList<>(); 127 for (Instances traindata : traindataSet) { 128 Classifier classifier = setupClassifier(); 129 classifier.buildClassifier(traindata); 130 classifiers.add(classifier); 131 trainingData.add(new Instances(traindata)); 132 } 133 } 134 135 @Override 136 public void buildClassifier(Instances traindata) throws Exception { 137 classifiers = new LinkedList<>(); 138 trainingData = new LinkedList<>(); 139 final Classifier classifier = setupClassifier(); 140 classifier.buildClassifier(traindata); 141 classifiers.add(classifier); 142 trainingData.add(new Instances(traindata)); 143 } 144 } 127 145 } -
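The classifyInstance implementation of TraindatasetBagging above is effectively a majority vote: every per-dataset classifier contributes a crisp 0/1 prediction, the mean is taken, and values of 0.5 and above map to 1.0. A condensed sketch of just that combination step, assuming crisp binary predictions (class and method names are illustrative, not from this changeset):

import java.util.List;

import weka.classifiers.Classifier;
import weka.core.Instance;

// Vote combination as done in TraindatasetBagging: average the crisp predictions
// of the per-dataset classifiers and threshold the mean at 0.5.
public final class BaggingVote {

    public static double vote(List<Classifier> classifiers, Instance inst) throws Exception {
        double sum = 0.0;
        for (Classifier c : classifiers) {
            sum += c.classifyInstance(inst); // each prediction is 0.0 or 1.0
        }
        double mean = sum / classifiers.size();
        return (mean >= 0.5) ? 1.0 : 0.0;    // ties resolve to the 1.0 class
    }
}

Averaging crisp predictions and thresholding at 0.5 is equivalent to a majority vote in which ties go to the 1.0 class.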
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaBaseTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 15 29 * Allows specification of the Weka classifier and its params in the XML experiment configuration. 16 30 * 17 * Important conventions of the XML format: 18 * Cross Validation params always come last and are prepended with -CVPARAM19 * Example: <trainer name="WekaTraining"param="RandomForestLocal weka.classifiers.trees.RandomForest -CVPARAM I 5 25 5"/>31 * Important conventions of the XML format: Cross Validation params always come last and are 32 * prepended with -CVPARAM Example: <trainer name="WekaTraining" 33 * param="RandomForestLocal weka.classifiers.trees.RandomForest -CVPARAM I 5 25 5"/> 20 34 */ 21 35 public abstract class WekaBaseTraining implements IWekaCompatibleTrainer { 22 23 protected Classifier classifier = null;24 protected String classifierClassName;25 protected String classifierName;26 protected String[] classifierParams;27 28 @Override29 public void setParameter(String parameters) {30 String[] params = parameters.split(" ");31 36 32 // first part of the params is the classifierName (e.g. SMORBF) 33 classifierName = params[0]; 34 35 // the following parameters can be copied from weka! 36 37 // second param is classifierClassName (e.g. weka.classifiers.functions.SMO) 38 classifierClassName = params[1]; 39 40 // rest are params to the specified classifier (e.g. 
-K weka.classifiers.functions.supportVector.RBFKernel) 41 classifierParams = Arrays.copyOfRange(params, 2, params.length); 42 43 classifier = setupClassifier(); 44 } 37 protected Classifier classifier = null; 38 protected String classifierClassName; 39 protected String classifierName; 40 protected String[] classifierParams; 45 41 46 @Override 47 public Classifier getClassifier() { 48 return classifier; 49 } 42 @Override 43 public void setParameter(String parameters) { 44 String[] params = parameters.split(" "); 50 45 51 public Classifier setupClassifier() { 52 Classifier cl = null; 53 try{ 54 @SuppressWarnings("rawtypes") 55 Class c = Class.forName(classifierClassName); 56 Classifier obj = (Classifier) c.newInstance(); 57 58 // Filter out -CVPARAM, these are special because they do not belong to the Weka classifier class as parameters 59 String[] param = Arrays.copyOf(classifierParams, classifierParams.length); 60 String[] cvparam = {}; 61 boolean cv = false; 62 for ( int i=0; i < classifierParams.length; i++ ) { 63 if(classifierParams[i].equals("-CVPARAM")) { 64 // rest of array are cvparam 65 cvparam = Arrays.copyOfRange(classifierParams, i+1, classifierParams.length); 66 67 // before this we have normal params 68 param = Arrays.copyOfRange(classifierParams, 0, i); 69 70 cv = true; 71 break; 72 } 73 } 74 75 // set classifier params 76 ((OptionHandler)obj).setOptions(param); 77 cl = obj; 78 79 // we have cross val params 80 // cant check on cvparam.length here, it may not be initialized 81 if(cv) { 82 final CVParameterSelection ps = new CVParameterSelection(); 83 ps.setClassifier(obj); 84 ps.setNumFolds(5); 85 //ps.addCVParameter("I 5 25 5"); 86 for( int i=1 ; i<cvparam.length/4 ; i++ ) { 87 ps.addCVParameter(Arrays.asList(Arrays.copyOfRange(cvparam, 0, 4*i)).toString().replaceAll(", ", " ").replaceAll("^\\[|\\]$", "")); 88 } 89 90 cl = ps; 91 } 46 // first part of the params is the classifierName (e.g. SMORBF) 47 classifierName = params[0]; 92 48 93 }catch(ClassNotFoundException e) { 94 Console.traceln(Level.WARNING, String.format("class not found: %s", e.toString())); 95 e.printStackTrace(); 96 } catch (InstantiationException e) { 97 Console.traceln(Level.WARNING, String.format("Instantiation Exception: %s", e.toString())); 98 e.printStackTrace(); 99 } catch (IllegalAccessException e) { 100 Console.traceln(Level.WARNING, String.format("Illegal Access Exception: %s", e.toString())); 101 e.printStackTrace(); 102 } catch (Exception e) { 103 Console.traceln(Level.WARNING, String.format("Exception: %s", e.toString())); 104 e.printStackTrace(); 105 } 106 107 return cl; 108 } 49 // the following parameters can be copied from weka! 109 50 110 @Override 111 public String getName() { 112 return classifierName; 113 } 114 51 // second param is classifierClassName (e.g. weka.classifiers.functions.SMO) 52 classifierClassName = params[1]; 53 54 // rest are params to the specified classifier (e.g. 
-K 55 // weka.classifiers.functions.supportVector.RBFKernel) 56 classifierParams = Arrays.copyOfRange(params, 2, params.length); 57 58 classifier = setupClassifier(); 59 } 60 61 @Override 62 public Classifier getClassifier() { 63 return classifier; 64 } 65 66 public Classifier setupClassifier() { 67 Classifier cl = null; 68 try { 69 @SuppressWarnings("rawtypes") 70 Class c = Class.forName(classifierClassName); 71 Classifier obj = (Classifier) c.newInstance(); 72 73 // Filter out -CVPARAM, these are special because they do not belong to the Weka 74 // classifier class as parameters 75 String[] param = Arrays.copyOf(classifierParams, classifierParams.length); 76 String[] cvparam = { }; 77 boolean cv = false; 78 for (int i = 0; i < classifierParams.length; i++) { 79 if (classifierParams[i].equals("-CVPARAM")) { 80 // rest of array are cvparam 81 cvparam = Arrays.copyOfRange(classifierParams, i + 1, classifierParams.length); 82 83 // before this we have normal params 84 param = Arrays.copyOfRange(classifierParams, 0, i); 85 86 cv = true; 87 break; 88 } 89 } 90 91 // set classifier params 92 ((OptionHandler) obj).setOptions(param); 93 cl = obj; 94 95 // we have cross val params 96 // cant check on cvparam.length here, it may not be initialized 97 if (cv) { 98 final CVParameterSelection ps = new CVParameterSelection(); 99 ps.setClassifier(obj); 100 ps.setNumFolds(5); 101 // ps.addCVParameter("I 5 25 5"); 102 for (int i = 1; i < cvparam.length / 4; i++) { 103 ps.addCVParameter(Arrays.asList(Arrays.copyOfRange(cvparam, 0, 4 * i)) 104 .toString().replaceAll(", ", " ").replaceAll("^\\[|\\]$", "")); 105 } 106 107 cl = ps; 108 } 109 110 } 111 catch (ClassNotFoundException e) { 112 Console.traceln(Level.WARNING, String.format("class not found: %s", e.toString())); 113 e.printStackTrace(); 114 } 115 catch (InstantiationException e) { 116 Console.traceln(Level.WARNING, 117 String.format("Instantiation Exception: %s", e.toString())); 118 e.printStackTrace(); 119 } 120 catch (IllegalAccessException e) { 121 Console.traceln(Level.WARNING, 122 String.format("Illegal Access Exception: %s", e.toString())); 123 e.printStackTrace(); 124 } 125 catch (Exception e) { 126 Console.traceln(Level.WARNING, String.format("Exception: %s", e.toString())); 127 e.printStackTrace(); 128 } 129 130 return cl; 131 } 132 133 @Override 134 public String getName() { 135 return classifierName; 136 } 137 115 138 } -
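setupClassifier() above instantiates the configured classifier reflectively and, when -CVPARAM groups are present, wraps it in weka.classifiers.meta.CVParameterSelection. Note that the loop over the CV parameters starts at i = 1 and is bounded by cvparam.length / 4, so for a single group such as "I 5 25 5" no addCVParameter call is made as written. A minimal sketch (helper class name illustrative) of the wrapping intended for the example from the class comment, "RandomForestLocal weka.classifiers.trees.RandomForest -CVPARAM I 5 25 5":

import weka.classifiers.Classifier;
import weka.classifiers.meta.CVParameterSelection;
import weka.classifiers.trees.RandomForest;

// Intended result of the -CVPARAM convention: the base classifier wrapped in
// CVParameterSelection with 5 folds and one parameter range to optimize.
public final class CvParamExample {

    public static Classifier setup() throws Exception {
        RandomForest base = new RandomForest();
        CVParameterSelection ps = new CVParameterSelection();
        ps.setClassifier(base);
        ps.setNumFolds(5);
        ps.addCVParameter("I 5 25 5"); // optimize option -I between 5 and 25 in 5 steps
        return ps;                     // buildClassifier(...) is invoked later by the trainer
    }
}

When buildClassifier is eventually called by the trainer, CVParameterSelection runs an internal 5-fold cross validation over the -I range and trains the base classifier with the best value found.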
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalEMTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 24 38 * WekaLocalEMTraining 25 39 * 26 * Local Trainer with EM Clustering for data partitioning. 27 * Currently supports only EM Clustering. 28 * 29 * 1. Cluster training data 30 * 2. for each cluster train a classifier with training data from cluster 40 * Local Trainer with EM Clustering for data partitioning. Currently supports only EM Clustering. 41 * 42 * 1. Cluster training data 2. for each cluster train a classifier with training data from cluster 31 43 * 3. match test data instance to a cluster, then classify with classifier from the cluster 32 44 * 33 * XML configuration: 34 * <!-- because of clustering --> 35 * <preprocessor name="Normalization" param=""/> 36 * 37 * <!-- cluster trainer --> 38 * <trainer name="WekaLocalEMTraining" param="NaiveBayes weka.classifiers.bayes.NaiveBayes" /> 45 * XML configuration: <!-- because of clustering --> <preprocessor name="Normalization" param=""/> 46 * 47 * <!-- cluster trainer --> <trainer name="WekaLocalEMTraining" 48 * param="NaiveBayes weka.classifiers.bayes.NaiveBayes" /> 39 49 */ 40 50 public class WekaLocalEMTraining extends WekaBaseTraining implements ITrainingStrategy { 41 51 42 private final TraindatasetCluster classifier = new TraindatasetCluster(); 43 44 @Override 45 public Classifier getClassifier() { 46 return classifier; 47 } 48 49 @Override 50 public void apply(Instances traindata) { 51 PrintStream errStr = System.err; 52 System.setErr(new PrintStream(new NullOutputStream())); 53 try { 54 classifier.buildClassifier(traindata); 55 } catch (Exception e) { 56 throw new RuntimeException(e); 57 } finally { 58 System.setErr(errStr); 59 } 60 } 61 62 63 public class TraindatasetCluster extends AbstractClassifier { 64 65 private static final long serialVersionUID = 1L; 66 67 private EM clusterer = null; 68 69 private HashMap<Integer, Classifier> cclassifier; 70 private HashMap<Integer, Instances> ctraindata; 71 72 73 /** 74 * Helper method that gives us a clean instance copy with 75 * the values of the instancelist of the first parameter. 
76 * 77 * @param instancelist with attributes 78 * @param instance with only values 79 * @return copy of the instance 80 */ 81 private Instance createInstance(Instances instances, Instance instance) { 82 // attributes for feeding instance to classifier 83 Set<String> attributeNames = new HashSet<>(); 84 for( int j=0; j<instances.numAttributes(); j++ ) { 85 attributeNames.add(instances.attribute(j).name()); 86 } 87 88 double[] values = new double[instances.numAttributes()]; 89 int index = 0; 90 for( int j=0; j<instance.numAttributes(); j++ ) { 91 if( attributeNames.contains(instance.attribute(j).name())) { 92 values[index] = instance.value(j); 93 index++; 94 } 95 } 96 97 Instances tmp = new Instances(instances); 98 tmp.clear(); 99 Instance instCopy = new DenseInstance(instance.weight(), values); 100 instCopy.setDataset(tmp); 101 102 return instCopy; 103 } 104 105 @Override 106 public double classifyInstance(Instance instance) { 107 double ret = 0; 108 try { 109 // 1. copy the instance (keep the class attribute) 110 Instances traindata = ctraindata.get(0); 111 Instance classInstance = createInstance(traindata, instance); 112 113 // 2. remove class attribute before clustering 114 Remove filter = new Remove(); 115 filter.setAttributeIndices("" + (traindata.classIndex() + 1)); 116 filter.setInputFormat(traindata); 117 traindata = Filter.useFilter(traindata, filter); 118 119 // 3. copy the instance (without the class attribute) for clustering 120 Instance clusterInstance = createInstance(traindata, instance); 121 122 // 4. match instance without class attribute to a cluster number 123 int cnum = clusterer.clusterInstance(clusterInstance); 124 125 // 5. classify instance with class attribute to the classifier of that cluster number 126 ret = cclassifier.get(cnum).classifyInstance(classInstance); 127 128 }catch( Exception e ) { 129 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!")); 130 throw new RuntimeException(e); 131 } 132 return ret; 133 } 134 135 @Override 136 public void buildClassifier(Instances traindata) throws Exception { 137 138 // 1. copy training data 139 Instances train = new Instances(traindata); 140 141 // 2. remove class attribute for clustering 142 Remove filter = new Remove(); 143 filter.setAttributeIndices("" + (train.classIndex() + 1)); 144 filter.setInputFormat(train); 145 train = Filter.useFilter(train, filter); 146 147 // new objects 148 cclassifier = new HashMap<Integer, Classifier>(); 149 ctraindata = new HashMap<Integer, Instances>(); 150 151 Instances ctrain; 152 int maxNumClusters = train.size(); 153 boolean sufficientInstancesInEachCluster; 154 do { // while(onlyTarget) 155 sufficientInstancesInEachCluster = true; 156 clusterer = new EM(); 157 clusterer.setMaximumNumberOfClusters(maxNumClusters); 158 clusterer.buildClusterer(train); 159 160 // 4. 
get cluster membership of our traindata 161 //AddCluster cfilter = new AddCluster(); 162 //cfilter.setClusterer(clusterer); 163 //cfilter.setInputFormat(train); 164 //Instances ctrain = Filter.useFilter(train, cfilter); 165 166 ctrain = new Instances(train); 167 ctraindata = new HashMap<>(); 168 169 // get traindata per cluster 170 for ( int j=0; j < ctrain.numInstances(); j++ ) { 171 // get the cluster number from the attributes, subract 1 because if we clusterInstance we get 0-n, and this is 1-n 172 //cnumber = Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes()-1).replace("cluster", "")) - 1; 173 174 int cnumber = clusterer.clusterInstance(ctrain.get(j)); 175 // add training data to list of instances for this cluster number 176 if ( !ctraindata.containsKey(cnumber) ) { 177 ctraindata.put(cnumber, new Instances(traindata)); 178 ctraindata.get(cnumber).delete(); 179 } 180 ctraindata.get(cnumber).add(traindata.get(j)); 181 } 182 183 for( Entry<Integer,Instances> entry : ctraindata.entrySet() ) { 184 Instances instances = entry.getValue(); 185 int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 186 for( int count : counts ) { 187 sufficientInstancesInEachCluster &= count>0; 188 } 189 sufficientInstancesInEachCluster &= instances.numInstances()>=5; 190 } 191 maxNumClusters = clusterer.numberOfClusters()-1; 192 } while(!sufficientInstancesInEachCluster); 193 194 // train one classifier per cluster, we get the cluster number from the training data 195 Iterator<Integer> clusternumber = ctraindata.keySet().iterator(); 196 while ( clusternumber.hasNext() ) { 197 int cnumber = clusternumber.next(); 198 cclassifier.put(cnumber,setupClassifier()); 199 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber)); 200 201 //Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber)); 202 } 203 } 204 } 52 private final TraindatasetCluster classifier = new TraindatasetCluster(); 53 54 @Override 55 public Classifier getClassifier() { 56 return classifier; 57 } 58 59 @Override 60 public void apply(Instances traindata) { 61 PrintStream errStr = System.err; 62 System.setErr(new PrintStream(new NullOutputStream())); 63 try { 64 classifier.buildClassifier(traindata); 65 } 66 catch (Exception e) { 67 throw new RuntimeException(e); 68 } 69 finally { 70 System.setErr(errStr); 71 } 72 } 73 74 public class TraindatasetCluster extends AbstractClassifier { 75 76 private static final long serialVersionUID = 1L; 77 78 private EM clusterer = null; 79 80 private HashMap<Integer, Classifier> cclassifier; 81 private HashMap<Integer, Instances> ctraindata; 82 83 /** 84 * Helper method that gives us a clean instance copy with the values of the instancelist of 85 * the first parameter. 
86 * 87 * @param instancelist 88 * with attributes 89 * @param instance 90 * with only values 91 * @return copy of the instance 92 */ 93 private Instance createInstance(Instances instances, Instance instance) { 94 // attributes for feeding instance to classifier 95 Set<String> attributeNames = new HashSet<>(); 96 for (int j = 0; j < instances.numAttributes(); j++) { 97 attributeNames.add(instances.attribute(j).name()); 98 } 99 100 double[] values = new double[instances.numAttributes()]; 101 int index = 0; 102 for (int j = 0; j < instance.numAttributes(); j++) { 103 if (attributeNames.contains(instance.attribute(j).name())) { 104 values[index] = instance.value(j); 105 index++; 106 } 107 } 108 109 Instances tmp = new Instances(instances); 110 tmp.clear(); 111 Instance instCopy = new DenseInstance(instance.weight(), values); 112 instCopy.setDataset(tmp); 113 114 return instCopy; 115 } 116 117 @Override 118 public double classifyInstance(Instance instance) { 119 double ret = 0; 120 try { 121 // 1. copy the instance (keep the class attribute) 122 Instances traindata = ctraindata.get(0); 123 Instance classInstance = createInstance(traindata, instance); 124 125 // 2. remove class attribute before clustering 126 Remove filter = new Remove(); 127 filter.setAttributeIndices("" + (traindata.classIndex() + 1)); 128 filter.setInputFormat(traindata); 129 traindata = Filter.useFilter(traindata, filter); 130 131 // 3. copy the instance (without the class attribute) for clustering 132 Instance clusterInstance = createInstance(traindata, instance); 133 134 // 4. match instance without class attribute to a cluster number 135 int cnum = clusterer.clusterInstance(clusterInstance); 136 137 // 5. classify instance with class attribute to the classifier of that cluster 138 // number 139 ret = cclassifier.get(cnum).classifyInstance(classInstance); 140 141 } 142 catch (Exception e) { 143 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!")); 144 throw new RuntimeException(e); 145 } 146 return ret; 147 } 148 149 @Override 150 public void buildClassifier(Instances traindata) throws Exception { 151 152 // 1. copy training data 153 Instances train = new Instances(traindata); 154 155 // 2. remove class attribute for clustering 156 Remove filter = new Remove(); 157 filter.setAttributeIndices("" + (train.classIndex() + 1)); 158 filter.setInputFormat(train); 159 train = Filter.useFilter(train, filter); 160 161 // new objects 162 cclassifier = new HashMap<Integer, Classifier>(); 163 ctraindata = new HashMap<Integer, Instances>(); 164 165 Instances ctrain; 166 int maxNumClusters = train.size(); 167 boolean sufficientInstancesInEachCluster; 168 do { // while(onlyTarget) 169 sufficientInstancesInEachCluster = true; 170 clusterer = new EM(); 171 clusterer.setMaximumNumberOfClusters(maxNumClusters); 172 clusterer.buildClusterer(train); 173 174 // 4. 
get cluster membership of our traindata 175 // AddCluster cfilter = new AddCluster(); 176 // cfilter.setClusterer(clusterer); 177 // cfilter.setInputFormat(train); 178 // Instances ctrain = Filter.useFilter(train, cfilter); 179 180 ctrain = new Instances(train); 181 ctraindata = new HashMap<>(); 182 183 // get traindata per cluster 184 for (int j = 0; j < ctrain.numInstances(); j++) { 185 // get the cluster number from the attributes, subract 1 because if we 186 // clusterInstance we get 0-n, and this is 1-n 187 // cnumber = 188 // Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes()-1).replace("cluster", 189 // "")) - 1; 190 191 int cnumber = clusterer.clusterInstance(ctrain.get(j)); 192 // add training data to list of instances for this cluster number 193 if (!ctraindata.containsKey(cnumber)) { 194 ctraindata.put(cnumber, new Instances(traindata)); 195 ctraindata.get(cnumber).delete(); 196 } 197 ctraindata.get(cnumber).add(traindata.get(j)); 198 } 199 200 for (Entry<Integer, Instances> entry : ctraindata.entrySet()) { 201 Instances instances = entry.getValue(); 202 int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 203 for (int count : counts) { 204 sufficientInstancesInEachCluster &= count > 0; 205 } 206 sufficientInstancesInEachCluster &= instances.numInstances() >= 5; 207 } 208 maxNumClusters = clusterer.numberOfClusters() - 1; 209 } 210 while (!sufficientInstancesInEachCluster); 211 212 // train one classifier per cluster, we get the cluster number from the training data 213 Iterator<Integer> clusternumber = ctraindata.keySet().iterator(); 214 while (clusternumber.hasNext()) { 215 int cnumber = clusternumber.next(); 216 cclassifier.put(cnumber, setupClassifier()); 217 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber)); 218 219 // Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber)); 220 } 221 } 222 } 205 223 } -
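WekaLocalEMTraining above clusters the class-stripped training data with EM, trains one base classifier per cluster, and at prediction time routes each instance to the classifier of its cluster. A compressed sketch of the cluster-then-train step, assuming the class attribute has already been removed into trainNoClass and using NaiveBayes in place of setupClassifier() (all names illustrative); the retry loop that shrinks the maximum number of clusters until every cluster contains both classes and at least five instances is omitted:

import java.util.HashMap;
import java.util.Map;

import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayes;
import weka.clusterers.EM;
import weka.core.Instances;

public final class LocalEmSketch {

    // traindata keeps the class attribute, trainNoClass is the same data with it removed
    public static Map<Integer, Classifier> train(Instances traindata, Instances trainNoClass)
        throws Exception
    {
        EM clusterer = new EM();
        clusterer.buildClusterer(trainNoClass);

        // collect the training instances (with class attribute) per cluster
        Map<Integer, Instances> perCluster = new HashMap<>();
        for (int i = 0; i < trainNoClass.numInstances(); i++) {
            int c = clusterer.clusterInstance(trainNoClass.get(i));
            if (!perCluster.containsKey(c)) {
                Instances empty = new Instances(traindata);
                empty.delete();
                perCluster.put(c, empty);
            }
            perCluster.get(c).add(traindata.get(i));
        }

        // one classifier per cluster (NaiveBayes stands in for setupClassifier())
        Map<Integer, Classifier> classifiers = new HashMap<>();
        for (Map.Entry<Integer, Instances> e : perCluster.entrySet()) {
            Classifier cl = new NaiveBayes();
            cl.buildClassifier(e.getValue());
            classifiers.put(e.getKey(), cl);
        }
        return classifiers;
    }
}

At prediction time the same EM model assigns the new instance to a cluster and the corresponding classifier from the map is used, mirroring classifyInstance() in TraindatasetCluster above.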
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalFQTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 24 38 25 39 /** 26 * Trainer with reimplementation of WHERE clustering algorithm from: 27 * Tim Menzies, Andrew Butcher, David Cok, Andrian Marcus, Lucas Layman, 28 * Forrest Shull, Burak Turhan, Thomas Zimmermann, 29 * "Local versus Global Lessons for Defect Prediction and Effort Estimation," 30 * IEEE Transactions on Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013 40 * Trainer with reimplementation of WHERE clustering algorithm from: Tim Menzies, Andrew Butcher, 41 * David Cok, Andrian Marcus, Lucas Layman, Forrest Shull, Burak Turhan, Thomas Zimmermann, 42 * "Local versus Global Lessons for Defect Prediction and Effort Estimation," IEEE Transactions on 43 * Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013 31 44 * 32 * With WekaLocalFQTraining we do the following: 33 * 1) Run the Fastmap algorithm on all training data, let it calculate the 2 most significant34 * dimensions and projections of each instance to these dimensions35 * 2) With these 2 dimensions we span a QuadTree which gets recursively split on median(x) and median(y) values.36 * 3) We cluster the QuadTree nodes together if they have similar density (50%)37 * 4) We save the clusters and their training data38 * 5) We only use clusters with > ALPHA instances (currently Math.sqrt(SIZE)), rest is discarded with the training data of this cluster39 * 6) We train a Weka classifier for each cluster with the clusters training data40 * 7) We recalculate Fastmap distances for a single instance with the old pivots and then try to find a cluster containing the coords of the instance.41 * 7.1.) If we can not find a cluster (due to coords outside of all clusters) we find the nearest cluster.42 * 8) We classify the Instance with theclassifier and traindata from the Cluster we found in 7.45 * With WekaLocalFQTraining we do the following: 1) Run the Fastmap algorithm on all training data, 46 * let it calculate the 2 most significant dimensions and projections of each instance to these 47 * dimensions 2) With these 2 dimensions we span a QuadTree which gets recursively split on 48 * median(x) and median(y) values. 3) We cluster the QuadTree nodes together if they have similar 49 * density (50%) 4) We save the clusters and their training data 5) We only use clusters with > 50 * ALPHA instances (currently Math.sqrt(SIZE)), rest is discarded with the training data of this 51 * cluster 6) We train a Weka classifier for each cluster with the clusters training data 7) We 52 * recalculate Fastmap distances for a single instance with the old pivots and then try to find a 53 * cluster containing the coords of the instance. 7.1.) If we can not find a cluster (due to coords 54 * outside of all clusters) we find the nearest cluster. 8) We classify the Instance with the 55 * classifier and traindata from the Cluster we found in 7. 
43 56 */ 44 57 public class WekaLocalFQTraining extends WekaBaseTraining implements ITrainingStrategy { 45 46 private final TraindatasetCluster classifier = new TraindatasetCluster(); 47 48 @Override 49 public Classifier getClassifier() { 50 return classifier; 51 } 52 53 @Override 54 public void apply(Instances traindata) { 55 PrintStream errStr = System.err; 56 System.setErr(new PrintStream(new NullOutputStream())); 57 try { 58 classifier.buildClassifier(traindata); 59 } catch (Exception e) { 60 throw new RuntimeException(e); 61 } finally { 62 System.setErr(errStr); 63 } 64 } 65 66 67 public class TraindatasetCluster extends AbstractClassifier { 68 69 private static final long serialVersionUID = 1L; 70 71 /* classifier per cluster */ 72 private HashMap<Integer, Classifier> cclassifier; 73 74 /* instances per cluster */ 75 private HashMap<Integer, Instances> ctraindata; 76 77 /* holds the instances and indices of the pivot objects of the Fastmap calculation in buildClassifier*/ 78 private HashMap<Integer, Instance> cpivots; 79 80 /* holds the indices of the pivot objects for x,y and the dimension [x,y][dimension]*/ 81 private int[][] cpivotindices; 82 83 /* holds the sizes of the cluster multiple "boxes" per cluster */ 84 private HashMap<Integer, ArrayList<Double[][]>> csize; 85 86 /* debug vars */ 87 @SuppressWarnings("unused") 88 private boolean show_biggest = true; 89 90 @SuppressWarnings("unused") 91 private int CFOUND = 0; 92 @SuppressWarnings("unused") 93 private int CNOTFOUND = 0; 94 95 96 private Instance createInstance(Instances instances, Instance instance) { 97 // attributes for feeding instance to classifier 98 Set<String> attributeNames = new HashSet<>(); 99 for( int j=0; j<instances.numAttributes(); j++ ) { 100 attributeNames.add(instances.attribute(j).name()); 101 } 102 103 double[] values = new double[instances.numAttributes()]; 104 int index = 0; 105 for( int j=0; j<instance.numAttributes(); j++ ) { 106 if( attributeNames.contains(instance.attribute(j).name())) { 107 values[index] = instance.value(j); 108 index++; 109 } 110 } 111 112 Instances tmp = new Instances(instances); 113 tmp.clear(); 114 Instance instCopy = new DenseInstance(instance.weight(), values); 115 instCopy.setDataset(tmp); 116 117 return instCopy; 118 } 119 120 /** 121 * Because Fastmap saves only the image not the values of the attributes it used 122 * we can not use the old data directly to classify single instances to clusters. 123 * 124 * To classify a single instance we do a new fastmap computation with only the instance and 125 * the old pivot elements. 126 * 127 * After that we find the cluster with our fastmap result for x and y. 
128 */ 129 @Override 130 public double classifyInstance(Instance instance) { 131 132 double ret = 0; 133 try { 134 // classinstance gets passed to classifier 135 Instances traindata = ctraindata.get(0); 136 Instance classInstance = createInstance(traindata, instance); 137 138 // this one keeps the class attribute 139 Instances traindata2 = ctraindata.get(1); 140 141 // remove class attribute before clustering 142 Remove filter = new Remove(); 143 filter.setAttributeIndices("" + (traindata.classIndex() + 1)); 144 filter.setInputFormat(traindata); 145 traindata = Filter.useFilter(traindata, filter); 146 Instance clusterInstance = createInstance(traindata, instance); 147 148 Fastmap FMAP = new Fastmap(2); 149 EuclideanDistance dist = new EuclideanDistance(traindata); 150 151 // we set our pivot indices [x=0,y=1][dimension] 152 int[][] npivotindices = new int[2][2]; 153 npivotindices[0][0] = 1; 154 npivotindices[1][0] = 2; 155 npivotindices[0][1] = 3; 156 npivotindices[1][1] = 4; 157 158 // build temp dist matrix (2 pivots per dimension + 1 instance we want to classify) 159 // the instance we want to classify comes first after that the pivot elements in the order defined above 160 double[][] distmat = new double[2*FMAP.target_dims+1][2*FMAP.target_dims+1]; 161 distmat[0][0] = 0; 162 distmat[0][1] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[0][0])); 163 distmat[0][2] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[1][0])); 164 distmat[0][3] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[0][1])); 165 distmat[0][4] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[1][1])); 166 167 distmat[1][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), clusterInstance); 168 distmat[1][1] = 0; 169 distmat[1][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[1][0])); 170 distmat[1][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[0][1])); 171 distmat[1][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[1][1])); 172 173 distmat[2][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), clusterInstance); 174 distmat[2][1] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[0][0])); 175 distmat[2][2] = 0; 176 distmat[2][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[0][1])); 177 distmat[2][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[1][1])); 178 179 distmat[3][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), clusterInstance); 180 distmat[3][1] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[0][0])); 181 distmat[3][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[1][0])); 182 distmat[3][3] = 0; 183 distmat[3][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[1][1])); 184 185 distmat[4][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), clusterInstance); 186 distmat[4][1] = 
dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[0][0])); 187 distmat[4][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[1][0])); 188 distmat[4][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[0][1])); 189 distmat[4][4] = 0; 190 191 192 /* debug output: show biggest distance found within the new distance matrix 193 double biggest = 0; 194 for(int i=0; i < distmat.length; i++) { 195 for(int j=0; j < distmat[0].length; j++) { 196 if(biggest < distmat[i][j]) { 197 biggest = distmat[i][j]; 198 } 199 } 200 } 201 if(this.show_biggest) { 202 Console.traceln(Level.INFO, String.format(""+clusterInstance)); 203 Console.traceln(Level.INFO, String.format("biggest distances: "+ biggest)); 204 this.show_biggest = false; 205 } 206 */ 207 208 FMAP.setDistmat(distmat); 209 FMAP.setPivots(npivotindices); 210 FMAP.calculate(); 211 double[][] x = FMAP.getX(); 212 double[] proj = x[0]; 213 214 // debug output: show the calculated distance matrix, our result vektor for the instance and the complete result matrix 215 /* 216 Console.traceln(Level.INFO, "distmat:"); 217 for(int i=0; i<distmat.length; i++){ 218 for(int j=0; j<distmat[0].length; j++){ 219 Console.trace(Level.INFO, String.format("%20s", distmat[i][j])); 220 } 221 Console.traceln(Level.INFO, ""); 222 } 223 224 Console.traceln(Level.INFO, "vector:"); 225 for(int i=0; i < proj.length; i++) { 226 Console.trace(Level.INFO, String.format("%20s", proj[i])); 227 } 228 Console.traceln(Level.INFO, ""); 229 230 Console.traceln(Level.INFO, "resultmat:"); 231 for(int i=0; i<x.length; i++){ 232 for(int j=0; j<x[0].length; j++){ 233 Console.trace(Level.INFO, String.format("%20s", x[i][j])); 234 } 235 Console.traceln(Level.INFO, ""); 236 } 237 */ 238 239 // now we iterate over all clusters (well, boxes of sizes per cluster really) and save the number of the 240 // cluster in which we are 241 int cnumber; 242 int found_cnumber = -1; 243 Iterator<Integer> clusternumber = this.csize.keySet().iterator(); 244 while ( clusternumber.hasNext() && found_cnumber == -1) { 245 cnumber = clusternumber.next(); 246 247 // now iterate over the boxes of the cluster and hope we find one (cluster could have been removed) 248 // or we are too far away from any cluster because of the fastmap calculation with the initial pivot objects 249 for ( int box=0; box < this.csize.get(cnumber).size(); box++ ) { 250 Double[][] current = this.csize.get(cnumber).get(box); 251 252 if(proj[0] >= current[0][0] && proj[0] <= current[0][1] && // x 253 proj[1] >= current[1][0] && proj[1] <= current[1][1]) { // y 254 found_cnumber = cnumber; 255 } 256 } 257 } 258 259 // we want to count how often we are really inside a cluster 260 //if ( found_cnumber == -1 ) { 261 // CNOTFOUND += 1; 262 //}else { 263 // CFOUND += 1; 264 //} 265 266 // now it can happen that we do not find a cluster because we deleted it previously (too few instances) 267 // or we get bigger distance measures from weka so that we are completely outside of our clusters. 268 // in these cases we just find the nearest cluster to our instance and use it for classification. 
269 // to do that we use the EuclideanDistance again to compare our distance to all other Instances 270 // then we take the cluster of the closest weka instance 271 dist = new EuclideanDistance(traindata2); 272 if( !this.ctraindata.containsKey(found_cnumber) ) { 273 double min_distance = Double.MAX_VALUE; 274 clusternumber = ctraindata.keySet().iterator(); 275 while ( clusternumber.hasNext() ) { 276 cnumber = clusternumber.next(); 277 for(int i=0; i < ctraindata.get(cnumber).size(); i++) { 278 if(dist.distance(instance, ctraindata.get(cnumber).get(i)) <= min_distance) { 279 found_cnumber = cnumber; 280 min_distance = dist.distance(instance, ctraindata.get(cnumber).get(i)); 281 } 282 } 283 } 284 } 285 286 // here we have the cluster where an instance has the minimum distance between itself and the 287 // instance we want to classify 288 // if we still have not found a cluster we exit because something is really wrong 289 if( found_cnumber == -1 ) { 290 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster with full search!")); 291 throw new RuntimeException("cluster not found with full search"); 292 } 293 294 // classify the passed instance with the cluster we found and its training data 295 ret = cclassifier.get(found_cnumber).classifyInstance(classInstance); 296 297 }catch( Exception e ) { 298 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!")); 299 throw new RuntimeException(e); 300 } 301 return ret; 302 } 303 304 @Override 305 public void buildClassifier(Instances traindata) throws Exception { 306 307 //Console.traceln(Level.INFO, String.format("found: "+ CFOUND + ", notfound: " + CNOTFOUND)); 308 this.show_biggest = true; 309 310 cclassifier = new HashMap<Integer, Classifier>(); 311 ctraindata = new HashMap<Integer, Instances>(); 312 cpivots = new HashMap<Integer, Instance>(); 313 cpivotindices = new int[2][2]; 314 315 // 1. copy traindata 316 Instances train = new Instances(traindata); 317 Instances train2 = new Instances(traindata); // this one keeps the class attribute 318 319 // 2. remove class attribute for clustering 320 Remove filter = new Remove(); 321 filter.setAttributeIndices("" + (train.classIndex() + 1)); 322 filter.setInputFormat(train); 323 train = Filter.useFilter(train, filter); 324 325 // 3. calculate distance matrix (needed for Fastmap because it starts at dimension 1) 326 double biggest = 0; 327 EuclideanDistance dist = new EuclideanDistance(train); 328 double[][] distmat = new double[train.size()][train.size()]; 329 for( int i=0; i < train.size(); i++ ) { 330 for( int j=0; j < train.size(); j++ ) { 331 distmat[i][j] = dist.distance(train.get(i), train.get(j)); 332 if( distmat[i][j] > biggest ) { 333 biggest = distmat[i][j]; 334 } 335 } 336 } 337 //Console.traceln(Level.INFO, String.format("biggest distances: "+ biggest)); 338 339 // 4. 
run fastmap for 2 dimensions on the distance matrix 340 Fastmap FMAP = new Fastmap(2); 341 FMAP.setDistmat(distmat); 342 FMAP.calculate(); 343 344 cpivotindices = FMAP.getPivots(); 345 346 double[][] X = FMAP.getX(); 347 distmat = new double[0][0]; 348 System.gc(); 349 350 // quadtree payload generation 351 ArrayList<QuadTreePayload<Instance>> qtp = new ArrayList<QuadTreePayload<Instance>>(); 352 353 // we need these for the sizes of the quadrants 354 double[] big = {0,0}; 355 double[] small = {Double.MAX_VALUE,Double.MAX_VALUE}; 356 357 // set quadtree payload values and get max and min x and y values for size 358 for( int i=0; i<X.length; i++ ){ 359 if(X[i][0] >= big[0]) { 360 big[0] = X[i][0]; 361 } 362 if(X[i][1] >= big[1]) { 363 big[1] = X[i][1]; 364 } 365 if(X[i][0] <= small[0]) { 366 small[0] = X[i][0]; 367 } 368 if(X[i][1] <= small[1]) { 369 small[1] = X[i][1]; 370 } 371 QuadTreePayload<Instance> tmp = new QuadTreePayload<Instance>(X[i][0], X[i][1], train2.get(i)); 372 qtp.add(tmp); 373 } 374 375 //Console.traceln(Level.INFO, String.format("size for cluster ("+small[0]+","+small[1]+") - ("+big[0]+","+big[1]+")")); 376 377 // 5. generate quadtree 378 QuadTree TREE = new QuadTree(null, qtp); 379 QuadTree.size = train.size(); 380 QuadTree.alpha = Math.sqrt(train.size()); 381 QuadTree.ccluster = new ArrayList<ArrayList<QuadTreePayload<Instance>>>(); 382 QuadTree.csize = new HashMap<Integer, ArrayList<Double[][]>>(); 383 384 //Console.traceln(Level.INFO, String.format("Generate QuadTree with "+ QuadTree.size + " size, Alpha: "+ QuadTree.alpha+ "")); 385 386 // set the size and then split the tree recursively at the median value for x, y 387 TREE.setSize(new double[] {small[0], big[0]}, new double[] {small[1], big[1]}); 388 389 // recursive split und grid clustering eher static 390 TREE.recursiveSplit(TREE); 391 392 // generate list of nodes sorted by density (childs only) 393 ArrayList<QuadTree> l = new ArrayList<QuadTree>(TREE.getList(TREE)); 394 395 // recursive grid clustering (tree pruning), the values are stored in ccluster 396 TREE.gridClustering(l); 397 398 // wir iterieren durch die cluster und sammeln uns die instanzen daraus 399 //ctraindata.clear(); 400 for( int i=0; i < QuadTree.ccluster.size(); i++ ) { 401 ArrayList<QuadTreePayload<Instance>> current = QuadTree.ccluster.get(i); 402 403 // i is the clusternumber 404 // we only allow clusters with Instances > ALPHA, other clusters are not considered! 
405 //if(current.size() > QuadTree.alpha) { 406 if( current.size() > 4 ) { 407 for( int j=0; j < current.size(); j++ ) { 408 if( !ctraindata.containsKey(i) ) { 409 ctraindata.put(i, new Instances(train2)); 410 ctraindata.get(i).delete(); 411 } 412 ctraindata.get(i).add(current.get(j).getInst()); 413 } 414 }else{ 415 Console.traceln(Level.INFO, String.format("drop cluster, only: " + current.size() + " instances")); 416 } 417 } 418 419 // here we keep things we need later on 420 // QuadTree sizes for later use (matching new instances) 421 this.csize = new HashMap<Integer, ArrayList<Double[][]>>(QuadTree.csize); 422 423 // pivot elements 424 //this.cpivots.clear(); 425 for( int i=0; i < FMAP.PA[0].length; i++ ) { 426 this.cpivots.put(FMAP.PA[0][i], (Instance)train.get(FMAP.PA[0][i]).copy()); 427 } 428 for( int j=0; j < FMAP.PA[0].length; j++ ) { 429 this.cpivots.put(FMAP.PA[1][j], (Instance)train.get(FMAP.PA[1][j]).copy()); 430 } 431 432 433 /* debug output 434 int pnumber; 435 Iterator<Integer> pivotnumber = cpivots.keySet().iterator(); 436 while ( pivotnumber.hasNext() ) { 437 pnumber = pivotnumber.next(); 438 Console.traceln(Level.INFO, String.format("pivot: "+pnumber+ " inst: "+cpivots.get(pnumber))); 439 } 440 */ 441 442 // train one classifier per cluster, we get the cluster number from the traindata 443 int cnumber; 444 Iterator<Integer> clusternumber = ctraindata.keySet().iterator(); 445 //cclassifier.clear(); 446 447 //int traindata_count = 0; 448 while ( clusternumber.hasNext() ) { 449 cnumber = clusternumber.next(); 450 cclassifier.put(cnumber,setupClassifier()); // this is the classifier used for the cluster 451 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber)); 452 //Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber)); 453 //traindata_count += ctraindata.get(cnumber).size(); 454 //Console.traceln(Level.INFO, String.format("building classifier in cluster "+cnumber +" with "+ ctraindata.get(cnumber).size() +" traindata instances")); 455 } 456 457 // add all traindata 458 //Console.traceln(Level.INFO, String.format("traindata in all clusters: " + traindata_count)); 459 } 460 } 461 462 463 /** 464 * Payload for the QuadTree. 465 * x and y are the calculated Fastmap values. 466 * T is a weka instance. 467 */ 468 public class QuadTreePayload<T> { 469 470 public double x; 471 public double y; 472 private T inst; 473 474 public QuadTreePayload(double x, double y, T value) { 475 this.x = x; 476 this.y = y; 477 this.inst = value; 478 } 479 480 public T getInst() { 481 return this.inst; 482 } 483 } 484 485 486 /** 487 * Fastmap implementation 488 * 489 * Faloutsos, C., & Lin, K. I. (1995). 490 * FastMap: A fast algorithm for indexing, data-mining and visualization of traditional and multimedia datasets 491 * (Vol. 24, No. 2, pp. 163-174). ACM. 
492 */ 493 public class Fastmap { 494 495 /*N x k Array, at the end, the i-th row will be the image of the i-th object*/ 496 private double[][] X; 497 498 /*2 x k pivot Array one pair per recursive call*/ 499 private int[][] PA; 500 501 /*Objects we got (distance matrix)*/ 502 private double[][] O; 503 504 /*column of X currently updated (also the dimension)*/ 505 private int col = 0; 506 507 /*number of dimensions we want*/ 508 private int target_dims = 0; 509 510 // if we already have the pivot elements 511 private boolean pivot_set = false; 512 513 514 public Fastmap(int k) { 515 this.target_dims = k; 516 } 517 518 /** 519 * Sets the distance matrix 520 * and params that depend on this 521 * @param O 522 */ 523 public void setDistmat(double[][] O) { 524 this.O = O; 525 int N = O.length; 526 this.X = new double[N][this.target_dims]; 527 this.PA = new int[2][this.target_dims]; 528 } 529 530 /** 531 * Set pivot elements, we need that to classify instances 532 * after the calculation is complete (because we then want to reuse 533 * only the pivot elements). 534 * 535 * @param pi 536 */ 537 public void setPivots(int[][] pi) { 538 this.pivot_set = true; 539 this.PA = pi; 540 } 541 542 /** 543 * Return the pivot elements that were chosen during the calculation 544 * 545 * @return 546 */ 547 public int[][] getPivots() { 548 return this.PA; 549 } 550 551 /** 552 * The distance function for euclidean distance 553 * 554 * Acts according to equation 4 of the fastmap paper 555 * 556 * @param x x index of x image (if k==0 x object) 557 * @param y y index of y image (if k==0 y object) 558 * @param kdimensionality 559 * @return distance 560 */ 561 private double dist(int x, int y, int k) { 562 563 // basis is object distance, we get this from our distance matrix 564 double tmp = this.O[x][y] * this.O[x][y]; 565 566 // decrease by projections 567 for( int i=0; i < k; i++ ) { 568 double tmp2 = (this.X[x][i] - this.X[y][i]); 569 tmp -= tmp2 * tmp2; 570 } 571 572 return Math.abs(tmp); 573 } 574 575 /** 576 * Find the object farthest from the given index 577 * This method is a helper Method for findDistandObjects 578 * 579 * @param index of the object 580 * @return index of the farthest object from the given index 581 */ 582 private int findFarthest(int index) { 583 double furthest = Double.MIN_VALUE; 584 int ret = 0; 585 586 for( int i=0; i < O.length; i++ ) { 587 double dist = this.dist(i, index, this.col); 588 if( i != index && dist > furthest ) { 589 furthest = dist; 590 ret = i; 591 } 592 } 593 return ret; 594 } 595 596 /** 597 * Finds the pivot objects 598 * 599 * This method is basically algorithm 1 of the fastmap paper. 600 * 601 * @return 2 indexes of the choosen pivot objects 602 */ 603 private int[] findDistantObjects() { 604 // 1. choose object randomly 605 Random r = new Random(); 606 int obj = r.nextInt(this.O.length); 607 608 // 2. find farthest object from randomly chosen object 609 int idx1 = this.findFarthest(obj); 610 611 // 3. find farthest object from previously farthest object 612 int idx2 = this.findFarthest(idx1); 613 614 return new int[] {idx1, idx2}; 615 } 616 617 /** 618 * Calculates the new k-vector values (projections) 619 * 620 * This is basically algorithm 2 of the fastmap paper. 621 * We just added the possibility to pre-set the pivot elements because 622 * we need to classify single instances after the computation is already done. 
623 * 624 * @param dims dimensionality 625 */ 626 public void calculate() { 627 628 for( int k=0; k < this.target_dims; k++ ) { 629 // 2) choose pivot objects 630 if ( !this.pivot_set ) { 631 int[] pivots = this.findDistantObjects(); 632 633 // 3) record ids of pivot objects 634 this.PA[0][this.col] = pivots[0]; 635 this.PA[1][this.col] = pivots[1]; 636 } 637 638 // 4) inter object distances are zero (this.X is initialized with 0 so we just continue) 639 if( this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col) == 0 ) { 640 continue; 641 } 642 643 // 5) project the objects on the line between the pivots 644 double dxy = this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col); 645 for( int i=0; i < this.O.length; i++ ) { 646 647 double dix = this.dist(i, this.PA[0][this.col], this.col); 648 double diy = this.dist(i, this.PA[1][this.col], this.col); 649 650 double tmp = (dix + dxy - diy) / (2 * Math.sqrt(dxy)); 651 652 // save the projection 653 this.X[i][this.col] = tmp; 654 } 655 656 this.col += 1; 657 } 658 } 659 660 /** 661 * returns the result matrix of the projections 662 * 663 * @return calculated result 664 */ 665 public double[][] getX() { 666 return this.X; 667 } 668 } 58 59 private final TraindatasetCluster classifier = new TraindatasetCluster(); 60 61 @Override 62 public Classifier getClassifier() { 63 return classifier; 64 } 65 66 @Override 67 public void apply(Instances traindata) { 68 PrintStream errStr = System.err; 69 System.setErr(new PrintStream(new NullOutputStream())); 70 try { 71 classifier.buildClassifier(traindata); 72 } 73 catch (Exception e) { 74 throw new RuntimeException(e); 75 } 76 finally { 77 System.setErr(errStr); 78 } 79 } 80 81 public class TraindatasetCluster extends AbstractClassifier { 82 83 private static final long serialVersionUID = 1L; 84 85 /* classifier per cluster */ 86 private HashMap<Integer, Classifier> cclassifier; 87 88 /* instances per cluster */ 89 private HashMap<Integer, Instances> ctraindata; 90 91 /* 92 * holds the instances and indices of the pivot objects of the Fastmap calculation in 93 * buildClassifier 94 */ 95 private HashMap<Integer, Instance> cpivots; 96 97 /* holds the indices of the pivot objects for x,y and the dimension [x,y][dimension] */ 98 private int[][] cpivotindices; 99 100 /* holds the sizes of the cluster multiple "boxes" per cluster */ 101 private HashMap<Integer, ArrayList<Double[][]>> csize; 102 103 /* debug vars */ 104 @SuppressWarnings("unused") 105 private boolean show_biggest = true; 106 107 @SuppressWarnings("unused") 108 private int CFOUND = 0; 109 @SuppressWarnings("unused") 110 private int CNOTFOUND = 0; 111 112 private Instance createInstance(Instances instances, Instance instance) { 113 // attributes for feeding instance to classifier 114 Set<String> attributeNames = new HashSet<>(); 115 for (int j = 0; j < instances.numAttributes(); j++) { 116 attributeNames.add(instances.attribute(j).name()); 117 } 118 119 double[] values = new double[instances.numAttributes()]; 120 int index = 0; 121 for (int j = 0; j < instance.numAttributes(); j++) { 122 if (attributeNames.contains(instance.attribute(j).name())) { 123 values[index] = instance.value(j); 124 index++; 125 } 126 } 127 128 Instances tmp = new Instances(instances); 129 tmp.clear(); 130 Instance instCopy = new DenseInstance(instance.weight(), values); 131 instCopy.setDataset(tmp); 132 133 return instCopy; 134 } 135 136 /** 137 * Because Fastmap saves only the image not the values of the attributes it used we can not 138 * use the old data 
directly to classify single instances to clusters. 139 * 140 * To classify a single instance we do a new fastmap computation with only the instance and 141 * the old pivot elements. 142 * 143 * After that we find the cluster with our fastmap result for x and y. 144 */ 145 @Override 146 public double classifyInstance(Instance instance) { 147 148 double ret = 0; 149 try { 150 // classinstance gets passed to classifier 151 Instances traindata = ctraindata.get(0); 152 Instance classInstance = createInstance(traindata, instance); 153 154 // this one keeps the class attribute 155 Instances traindata2 = ctraindata.get(1); 156 157 // remove class attribute before clustering 158 Remove filter = new Remove(); 159 filter.setAttributeIndices("" + (traindata.classIndex() + 1)); 160 filter.setInputFormat(traindata); 161 traindata = Filter.useFilter(traindata, filter); 162 Instance clusterInstance = createInstance(traindata, instance); 163 164 Fastmap FMAP = new Fastmap(2); 165 EuclideanDistance dist = new EuclideanDistance(traindata); 166 167 // we set our pivot indices [x=0,y=1][dimension] 168 int[][] npivotindices = new int[2][2]; 169 npivotindices[0][0] = 1; 170 npivotindices[1][0] = 2; 171 npivotindices[0][1] = 3; 172 npivotindices[1][1] = 4; 173 174 // build temp dist matrix (2 pivots per dimension + 1 instance we want to classify) 175 // the instance we want to classify comes first after that the pivot elements in the 176 // order defined above 177 double[][] distmat = new double[2 * FMAP.target_dims + 1][2 * FMAP.target_dims + 1]; 178 distmat[0][0] = 0; 179 distmat[0][1] = 180 dist.distance(clusterInstance, 181 this.cpivots.get((Integer) this.cpivotindices[0][0])); 182 distmat[0][2] = 183 dist.distance(clusterInstance, 184 this.cpivots.get((Integer) this.cpivotindices[1][0])); 185 distmat[0][3] = 186 dist.distance(clusterInstance, 187 this.cpivots.get((Integer) this.cpivotindices[0][1])); 188 distmat[0][4] = 189 dist.distance(clusterInstance, 190 this.cpivots.get((Integer) this.cpivotindices[1][1])); 191 192 distmat[1][0] = 193 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 194 clusterInstance); 195 distmat[1][1] = 0; 196 distmat[1][2] = 197 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 198 this.cpivots.get((Integer) this.cpivotindices[1][0])); 199 distmat[1][3] = 200 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 201 this.cpivots.get((Integer) this.cpivotindices[0][1])); 202 distmat[1][4] = 203 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 204 this.cpivots.get((Integer) this.cpivotindices[1][1])); 205 206 distmat[2][0] = 207 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 208 clusterInstance); 209 distmat[2][1] = 210 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 211 this.cpivots.get((Integer) this.cpivotindices[0][0])); 212 distmat[2][2] = 0; 213 distmat[2][3] = 214 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 215 this.cpivots.get((Integer) this.cpivotindices[0][1])); 216 distmat[2][4] = 217 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 218 this.cpivots.get((Integer) this.cpivotindices[1][1])); 219 220 distmat[3][0] = 221 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 222 clusterInstance); 223 distmat[3][1] = 224 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 225 this.cpivots.get((Integer) this.cpivotindices[0][0])); 226 distmat[3][2] = 227 dist.distance(this.cpivots.get((Integer) 
this.cpivotindices[0][1]), 228 this.cpivots.get((Integer) this.cpivotindices[1][0])); 229 distmat[3][3] = 0; 230 distmat[3][4] = 231 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 232 this.cpivots.get((Integer) this.cpivotindices[1][1])); 233 234 distmat[4][0] = 235 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 236 clusterInstance); 237 distmat[4][1] = 238 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 239 this.cpivots.get((Integer) this.cpivotindices[0][0])); 240 distmat[4][2] = 241 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 242 this.cpivots.get((Integer) this.cpivotindices[1][0])); 243 distmat[4][3] = 244 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 245 this.cpivots.get((Integer) this.cpivotindices[0][1])); 246 distmat[4][4] = 0; 247 248 /* 249 * debug output: show biggest distance found within the new distance matrix double 250 * biggest = 0; for(int i=0; i < distmat.length; i++) { for(int j=0; j < 251 * distmat[0].length; j++) { if(biggest < distmat[i][j]) { biggest = distmat[i][j]; 252 * } } } if(this.show_biggest) { Console.traceln(Level.INFO, 253 * String.format(""+clusterInstance)); Console.traceln(Level.INFO, 254 * String.format("biggest distances: "+ biggest)); this.show_biggest = false; } 255 */ 256 257 FMAP.setDistmat(distmat); 258 FMAP.setPivots(npivotindices); 259 FMAP.calculate(); 260 double[][] x = FMAP.getX(); 261 double[] proj = x[0]; 262 263 // debug output: show the calculated distance matrix, our result vektor for the 264 // instance and the complete result matrix 265 /* 266 * Console.traceln(Level.INFO, "distmat:"); for(int i=0; i<distmat.length; i++){ 267 * for(int j=0; j<distmat[0].length; j++){ Console.trace(Level.INFO, 268 * String.format("%20s", distmat[i][j])); } Console.traceln(Level.INFO, ""); } 269 * 270 * Console.traceln(Level.INFO, "vector:"); for(int i=0; i < proj.length; i++) { 271 * Console.trace(Level.INFO, String.format("%20s", proj[i])); } 272 * Console.traceln(Level.INFO, ""); 273 * 274 * Console.traceln(Level.INFO, "resultmat:"); for(int i=0; i<x.length; i++){ for(int 275 * j=0; j<x[0].length; j++){ Console.trace(Level.INFO, String.format("%20s", 276 * x[i][j])); } Console.traceln(Level.INFO, ""); } 277 */ 278 279 // now we iterate over all clusters (well, boxes of sizes per cluster really) and 280 // save the number of the 281 // cluster in which we are 282 int cnumber; 283 int found_cnumber = -1; 284 Iterator<Integer> clusternumber = this.csize.keySet().iterator(); 285 while (clusternumber.hasNext() && found_cnumber == -1) { 286 cnumber = clusternumber.next(); 287 288 // now iterate over the boxes of the cluster and hope we find one (cluster could 289 // have been removed) 290 // or we are too far away from any cluster because of the fastmap calculation 291 // with the initial pivot objects 292 for (int box = 0; box < this.csize.get(cnumber).size(); box++) { 293 Double[][] current = this.csize.get(cnumber).get(box); 294 295 if (proj[0] >= current[0][0] && proj[0] <= current[0][1] && // x 296 proj[1] >= current[1][0] && proj[1] <= current[1][1]) 297 { // y 298 found_cnumber = cnumber; 299 } 300 } 301 } 302 303 // we want to count how often we are really inside a cluster 304 // if ( found_cnumber == -1 ) { 305 // CNOTFOUND += 1; 306 // }else { 307 // CFOUND += 1; 308 // } 309 310 // now it can happen that we do not find a cluster because we deleted it previously 311 // (too few instances) 312 // or we get bigger distance measures from weka 
so that we are completely outside of 313 // our clusters. 314 // in these cases we just find the nearest cluster to our instance and use it for 315 // classification. 316 // to do that we use the EuclideanDistance again to compare our distance to all 317 // other Instances 318 // then we take the cluster of the closest weka instance 319 dist = new EuclideanDistance(traindata2); 320 if (!this.ctraindata.containsKey(found_cnumber)) { 321 double min_distance = Double.MAX_VALUE; 322 clusternumber = ctraindata.keySet().iterator(); 323 while (clusternumber.hasNext()) { 324 cnumber = clusternumber.next(); 325 for (int i = 0; i < ctraindata.get(cnumber).size(); i++) { 326 if (dist.distance(instance, ctraindata.get(cnumber).get(i)) <= min_distance) 327 { 328 found_cnumber = cnumber; 329 min_distance = 330 dist.distance(instance, ctraindata.get(cnumber).get(i)); 331 } 332 } 333 } 334 } 335 336 // here we have the cluster where an instance has the minimum distance between 337 // itself and the 338 // instance we want to classify 339 // if we still have not found a cluster we exit because something is really wrong 340 if (found_cnumber == -1) { 341 Console.traceln(Level.INFO, String 342 .format("ERROR matching instance to cluster with full search!")); 343 throw new RuntimeException("cluster not found with full search"); 344 } 345 346 // classify the passed instance with the cluster we found and its training data 347 ret = cclassifier.get(found_cnumber).classifyInstance(classInstance); 348 349 } 350 catch (Exception e) { 351 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!")); 352 throw new RuntimeException(e); 353 } 354 return ret; 355 } 356 357 @Override 358 public void buildClassifier(Instances traindata) throws Exception { 359 360 // Console.traceln(Level.INFO, String.format("found: "+ CFOUND + ", notfound: " + 361 // CNOTFOUND)); 362 this.show_biggest = true; 363 364 cclassifier = new HashMap<Integer, Classifier>(); 365 ctraindata = new HashMap<Integer, Instances>(); 366 cpivots = new HashMap<Integer, Instance>(); 367 cpivotindices = new int[2][2]; 368 369 // 1. copy traindata 370 Instances train = new Instances(traindata); 371 Instances train2 = new Instances(traindata); // this one keeps the class attribute 372 373 // 2. remove class attribute for clustering 374 Remove filter = new Remove(); 375 filter.setAttributeIndices("" + (train.classIndex() + 1)); 376 filter.setInputFormat(train); 377 train = Filter.useFilter(train, filter); 378 379 // 3. calculate distance matrix (needed for Fastmap because it starts at dimension 1) 380 double biggest = 0; 381 EuclideanDistance dist = new EuclideanDistance(train); 382 double[][] distmat = new double[train.size()][train.size()]; 383 for (int i = 0; i < train.size(); i++) { 384 for (int j = 0; j < train.size(); j++) { 385 distmat[i][j] = dist.distance(train.get(i), train.get(j)); 386 if (distmat[i][j] > biggest) { 387 biggest = distmat[i][j]; 388 } 389 } 390 } 391 // Console.traceln(Level.INFO, String.format("biggest distances: "+ biggest)); 392 393 // 4. 
run fastmap for 2 dimensions on the distance matrix 394 Fastmap FMAP = new Fastmap(2); 395 FMAP.setDistmat(distmat); 396 FMAP.calculate(); 397 398 cpivotindices = FMAP.getPivots(); 399 400 double[][] X = FMAP.getX(); 401 distmat = new double[0][0]; 402 System.gc(); 403 404 // quadtree payload generation 405 ArrayList<QuadTreePayload<Instance>> qtp = new ArrayList<QuadTreePayload<Instance>>(); 406 407 // we need these for the sizes of the quadrants 408 double[] big = 409 { 0, 0 }; 410 double[] small = 411 { Double.MAX_VALUE, Double.MAX_VALUE }; 412 413 // set quadtree payload values and get max and min x and y values for size 414 for (int i = 0; i < X.length; i++) { 415 if (X[i][0] >= big[0]) { 416 big[0] = X[i][0]; 417 } 418 if (X[i][1] >= big[1]) { 419 big[1] = X[i][1]; 420 } 421 if (X[i][0] <= small[0]) { 422 small[0] = X[i][0]; 423 } 424 if (X[i][1] <= small[1]) { 425 small[1] = X[i][1]; 426 } 427 QuadTreePayload<Instance> tmp = 428 new QuadTreePayload<Instance>(X[i][0], X[i][1], train2.get(i)); 429 qtp.add(tmp); 430 } 431 432 // Console.traceln(Level.INFO, 433 // String.format("size for cluster ("+small[0]+","+small[1]+") - ("+big[0]+","+big[1]+")")); 434 435 // 5. generate quadtree 436 QuadTree TREE = new QuadTree(null, qtp); 437 QuadTree.size = train.size(); 438 QuadTree.alpha = Math.sqrt(train.size()); 439 QuadTree.ccluster = new ArrayList<ArrayList<QuadTreePayload<Instance>>>(); 440 QuadTree.csize = new HashMap<Integer, ArrayList<Double[][]>>(); 441 442 // Console.traceln(Level.INFO, String.format("Generate QuadTree with "+ QuadTree.size + 443 // " size, Alpha: "+ QuadTree.alpha+ "")); 444 445 // set the size and then split the tree recursively at the median value for x, y 446 TREE.setSize(new double[] 447 { small[0], big[0] }, new double[] 448 { small[1], big[1] }); 449 450 // recursive split und grid clustering eher static 451 TREE.recursiveSplit(TREE); 452 453 // generate list of nodes sorted by density (childs only) 454 ArrayList<QuadTree> l = new ArrayList<QuadTree>(TREE.getList(TREE)); 455 456 // recursive grid clustering (tree pruning), the values are stored in ccluster 457 TREE.gridClustering(l); 458 459 // wir iterieren durch die cluster und sammeln uns die instanzen daraus 460 // ctraindata.clear(); 461 for (int i = 0; i < QuadTree.ccluster.size(); i++) { 462 ArrayList<QuadTreePayload<Instance>> current = QuadTree.ccluster.get(i); 463 464 // i is the clusternumber 465 // we only allow clusters with Instances > ALPHA, other clusters are not considered! 
466 // if(current.size() > QuadTree.alpha) { 467 if (current.size() > 4) { 468 for (int j = 0; j < current.size(); j++) { 469 if (!ctraindata.containsKey(i)) { 470 ctraindata.put(i, new Instances(train2)); 471 ctraindata.get(i).delete(); 472 } 473 ctraindata.get(i).add(current.get(j).getInst()); 474 } 475 } 476 else { 477 Console.traceln(Level.INFO, 478 String.format("drop cluster, only: " + current.size() + 479 " instances")); 480 } 481 } 482 483 // here we keep things we need later on 484 // QuadTree sizes for later use (matching new instances) 485 this.csize = new HashMap<Integer, ArrayList<Double[][]>>(QuadTree.csize); 486 487 // pivot elements 488 // this.cpivots.clear(); 489 for (int i = 0; i < FMAP.PA[0].length; i++) { 490 this.cpivots.put(FMAP.PA[0][i], (Instance) train.get(FMAP.PA[0][i]).copy()); 491 } 492 for (int j = 0; j < FMAP.PA[0].length; j++) { 493 this.cpivots.put(FMAP.PA[1][j], (Instance) train.get(FMAP.PA[1][j]).copy()); 494 } 495 496 /* 497 * debug output int pnumber; Iterator<Integer> pivotnumber = 498 * cpivots.keySet().iterator(); while ( pivotnumber.hasNext() ) { pnumber = 499 * pivotnumber.next(); Console.traceln(Level.INFO, String.format("pivot: "+pnumber+ 500 * " inst: "+cpivots.get(pnumber))); } 501 */ 502 503 // train one classifier per cluster, we get the cluster number from the traindata 504 int cnumber; 505 Iterator<Integer> clusternumber = ctraindata.keySet().iterator(); 506 // cclassifier.clear(); 507 508 // int traindata_count = 0; 509 while (clusternumber.hasNext()) { 510 cnumber = clusternumber.next(); 511 cclassifier.put(cnumber, setupClassifier()); // this is the classifier used for the 512 // cluster 513 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber)); 514 // Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber)); 515 // traindata_count += ctraindata.get(cnumber).size(); 516 // Console.traceln(Level.INFO, 517 // String.format("building classifier in cluster "+cnumber +" with "+ 518 // ctraindata.get(cnumber).size() +" traindata instances")); 519 } 520 521 // add all traindata 522 // Console.traceln(Level.INFO, String.format("traindata in all clusters: " + 523 // traindata_count)); 524 } 525 } 526 527 /** 528 * Payload for the QuadTree. x and y are the calculated Fastmap values. T is a weka instance. 529 */ 530 public class QuadTreePayload<T> { 531 532 public double x; 533 public double y; 534 private T inst; 535 536 public QuadTreePayload(double x, double y, T value) { 537 this.x = x; 538 this.y = y; 539 this.inst = value; 540 } 541 542 public T getInst() { 543 return this.inst; 544 } 545 } 546 547 /** 548 * Fastmap implementation 549 * 550 * Faloutsos, C., & Lin, K. I. (1995). FastMap: A fast algorithm for indexing, data-mining and 551 * visualization of traditional and multimedia datasets (Vol. 24, No. 2, pp. 163-174). ACM. 
552 */ 553 public class Fastmap { 554 555 /* N x k Array, at the end, the i-th row will be the image of the i-th object */ 556 private double[][] X; 557 558 /* 2 x k pivot Array one pair per recursive call */ 559 private int[][] PA; 560 561 /* Objects we got (distance matrix) */ 562 private double[][] O; 563 564 /* column of X currently updated (also the dimension) */ 565 private int col = 0; 566 567 /* number of dimensions we want */ 568 private int target_dims = 0; 569 570 // if we already have the pivot elements 571 private boolean pivot_set = false; 572 573 public Fastmap(int k) { 574 this.target_dims = k; 575 } 576 577 /** 578 * Sets the distance matrix and params that depend on this 579 * 580 * @param O 581 */ 582 public void setDistmat(double[][] O) { 583 this.O = O; 584 int N = O.length; 585 this.X = new double[N][this.target_dims]; 586 this.PA = new int[2][this.target_dims]; 587 } 588 589 /** 590 * Set pivot elements, we need that to classify instances after the calculation is complete 591 * (because we then want to reuse only the pivot elements). 592 * 593 * @param pi 594 */ 595 public void setPivots(int[][] pi) { 596 this.pivot_set = true; 597 this.PA = pi; 598 } 599 600 /** 601 * Return the pivot elements that were chosen during the calculation 602 * 603 * @return 604 */ 605 public int[][] getPivots() { 606 return this.PA; 607 } 608 609 /** 610 * The distance function for euclidean distance 611 * 612 * Acts according to equation 4 of the fastmap paper 613 * 614 * @param x 615 * x index of x image (if k==0 x object) 616 * @param y 617 * y index of y image (if k==0 y object) 618 * @param kdimensionality 619 * @return distance 620 */ 621 private double dist(int x, int y, int k) { 622 623 // basis is object distance, we get this from our distance matrix 624 double tmp = this.O[x][y] * this.O[x][y]; 625 626 // decrease by projections 627 for (int i = 0; i < k; i++) { 628 double tmp2 = (this.X[x][i] - this.X[y][i]); 629 tmp -= tmp2 * tmp2; 630 } 631 632 return Math.abs(tmp); 633 } 634 635 /** 636 * Find the object farthest from the given index This method is a helper Method for 637 * findDistandObjects 638 * 639 * @param index 640 * of the object 641 * @return index of the farthest object from the given index 642 */ 643 private int findFarthest(int index) { 644 double furthest = Double.MIN_VALUE; 645 int ret = 0; 646 647 for (int i = 0; i < O.length; i++) { 648 double dist = this.dist(i, index, this.col); 649 if (i != index && dist > furthest) { 650 furthest = dist; 651 ret = i; 652 } 653 } 654 return ret; 655 } 656 657 /** 658 * Finds the pivot objects 659 * 660 * This method is basically algorithm 1 of the fastmap paper. 661 * 662 * @return 2 indexes of the choosen pivot objects 663 */ 664 private int[] findDistantObjects() { 665 // 1. choose object randomly 666 Random r = new Random(); 667 int obj = r.nextInt(this.O.length); 668 669 // 2. find farthest object from randomly chosen object 670 int idx1 = this.findFarthest(obj); 671 672 // 3. find farthest object from previously farthest object 673 int idx2 = this.findFarthest(idx1); 674 675 return new int[] 676 { idx1, idx2 }; 677 } 678 679 /** 680 * Calculates the new k-vector values (projections) 681 * 682 * This is basically algorithm 2 of the fastmap paper. We just added the possibility to 683 * pre-set the pivot elements because we need to classify single instances after the 684 * computation is already done. 
685 * 686 * @param dims 687 * dimensionality 688 */ 689 public void calculate() { 690 691 for (int k = 0; k < this.target_dims; k++) { 692 // 2) choose pivot objects 693 if (!this.pivot_set) { 694 int[] pivots = this.findDistantObjects(); 695 696 // 3) record ids of pivot objects 697 this.PA[0][this.col] = pivots[0]; 698 this.PA[1][this.col] = pivots[1]; 699 } 700 701 // 4) inter object distances are zero (this.X is initialized with 0 so we just 702 // continue) 703 if (this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col) == 0) { 704 continue; 705 } 706 707 // 5) project the objects on the line between the pivots 708 double dxy = this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col); 709 for (int i = 0; i < this.O.length; i++) { 710 711 double dix = this.dist(i, this.PA[0][this.col], this.col); 712 double diy = this.dist(i, this.PA[1][this.col], this.col); 713 714 double tmp = (dix + dxy - diy) / (2 * Math.sqrt(dxy)); 715 716 // save the projection 717 this.X[i][this.col] = tmp; 718 } 719 720 this.col += 1; 721 } 722 } 723 724 /** 725 * returns the result matrix of the projections 726 * 727 * @return calculated result 728 */ 729 public double[][] getX() { 730 return this.X; 731 } 732 } 669 733 } -
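The projection computed in calculate() is the standard FastMap formula: with pivot objects a and b, and with dist() returning the squared residual distances d(a,i)^2, d(a,b)^2 and d(b,i)^2, an object i is mapped to x_i = (d(a,i)^2 + d(a,b)^2 - d(b,i)^2) / (2 * d(a,b)), which is why only the denominator takes a square root in the code above. A minimal sketch of driving the Fastmap helper the same way buildClassifier() does, assuming an N x N distance matrix named distmat has already been computed (for example with Weka's EuclideanDistance):

    Fastmap fm = new Fastmap(2);       // target dimensionality k = 2
    fm.setDistmat(distmat);            // N x N pairwise distances
    fm.calculate();                    // picks pivots and projects all N objects
    double[][] coords = fm.getX();     // N x 2 image, one row per object
    int[][] pivots = fm.getPivots();   // pivot indices, reused via setPivots() when mapping new instances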
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 11 25 /** 12 26 * Programmatic WekaTraining 13 *14 * first parameter is Trainer Name.15 * second parameter is class name16 27 * 17 * all subsequent parameters are configuration params (for example for trees) 18 * Cross Validation params always come last and are prepended with -CVPARAM 28 * first parameter is Trainer Name. second parameter is class name 29 * 30 * all subsequent parameters are configuration params (for example for trees) Cross Validation 31 * params always come last and are prepended with -CVPARAM 19 32 * 20 33 * XML Configurations for Weka Classifiers: 34 * 21 35 * <pre> 22 36 * {@code … … 30 44 public class WekaTraining extends WekaBaseTraining implements ITrainingStrategy { 31 45 32 @Override 33 public void apply(Instances traindata) { 34 PrintStream errStr = System.err; 35 System.setErr(new PrintStream(new NullOutputStream())); 36 try { 37 if(classifier == null) { 38 Console.traceln(Level.WARNING, String.format("classifier null!")); 39 } 40 classifier.buildClassifier(traindata); 41 } catch (Exception e) { 42 throw new RuntimeException(e); 43 } finally { 44 System.setErr(errStr); 45 } 46 } 46 @Override 47 public void apply(Instances traindata) { 48 PrintStream errStr = System.err; 49 System.setErr(new PrintStream(new NullOutputStream())); 50 try { 51 if (classifier == null) { 52 Console.traceln(Level.WARNING, String.format("classifier null!")); 53 } 54 classifier.buildClassifier(traindata); 55 } 56 catch (Exception e) { 57 throw new RuntimeException(e); 58 } 59 finally { 60 System.setErr(errStr); 61 } 62 } 47 63 } -
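The parameter convention described in the class comment can be illustrated with a hypothetical configuration (the project's actual XML example is elided from this excerpt, so the trainer name, classifier, and options below are purely illustrative): a trainer name such as MyJ48Trainer, the fully qualified Weka class name such as weka.classifiers.trees.J48, the classifier's own options such as -C 0.25 -M 2, and, if cross-validated parameter tuning is wanted, the tuning settings appended last and prefixed with -CVPARAM, roughly:

    MyJ48Trainer weka.classifiers.trees.J48 -C 0.25 -M 2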
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/AbstractVersionFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 6 20 /** 7 21 * Implements a skeletal {@link IVersionFilter}. 22 * 8 23 * @author Steffen Herbold 9 24 */ 10 25 public abstract class AbstractVersionFilter implements IVersionFilter { 11 26 12 13 14 15 16 17 18 for( final Iterator<SoftwareVersion> iter=versions.iterator() ; iter.hasNext() ;) {19 20 21 if( apply(version)) {22 23 24 25 26 27 27 /** 28 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(java.util.List) 29 */ 30 @Override 31 public int apply(List<SoftwareVersion> versions) { 32 int removed = 0; 33 for (final Iterator<SoftwareVersion> iter = versions.iterator(); iter.hasNext();) { 34 SoftwareVersion version = iter.next(); 35 36 if (apply(version)) { 37 iter.remove(); 38 removed++; 39 } 40 } 41 return removed; 42 } 28 43 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/IVersionFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 6 20 7 21 /** 8 * Implements the interface for a {@link SoftwareVersion} filter. 22 * Implements the interface for a {@link SoftwareVersion} filter. 23 * 9 24 * @author Steffen Herbold 10 25 */ 11 26 public interface IVersionFilter extends IParameterizable { 12 27 13 /** 14 * Applies the filter to a single version. 15 * @param version the version 16 * @return true if filter applies to version, false otherwise 17 */ 18 boolean apply(SoftwareVersion version); 19 20 /** 21 * Applies the filter a a list of versions. Versions were the filter applies are automatically removed from the list. 22 * @param versions list of versions 23 * @return number of removed versions 24 */ 25 int apply(List<SoftwareVersion> versions); 28 /** 29 * Applies the filter to a single version. 30 * 31 * @param version 32 * the version 33 * @return true if filter applies to version, false otherwise 34 */ 35 boolean apply(SoftwareVersion version); 36 37 /** 38 * Applies the filter a a list of versions. Versions were the filter applies are automatically 39 * removed from the list. 40 * 41 * @param versions 42 * list of versions 43 * @return number of removed versions 44 */ 45 int apply(List<SoftwareVersion> versions); 26 46 } -
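A concrete filter only has to provide the single-version check; the skeletal apply(List) above then removes every matching version in place and reports the count. A minimal hypothetical example (the filter class and its criterion are illustrative and not part of CrossPare):

    // hypothetical filter: drop versions without a project name
    public class EmptyProjectNameFilter extends AbstractVersionFilter {

        @Override
        public boolean apply(SoftwareVersion version) {
            return version.getProject() == null || version.getProject().isEmpty();
        }

        @Override
        public void setParameter(String parameters) {
            // this filter needs no parameters
        }
    }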
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/MaxInstanceNumberFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 3 17 /** 4 * Applies to large data sets. All data sets that have more than the required maximum number of instances are removed. 18 * Applies to large data sets. All data sets that have more than the required maximum number of 19 * instances are removed. 20 * 5 21 * @author Steffen Herbold 6 22 */ 7 23 public class MaxInstanceNumberFilter extends AbstractVersionFilter { 8 24 9 /** 10 * maximum number of instances required 11 */ 12 private int maxInstances = 0; 13 14 /** 15 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 16 */ 17 @Override 18 public boolean apply(SoftwareVersion version) { 19 return version.getInstances().numInstances()>maxInstances; 20 } 25 /** 26 * maximum number of instances required 27 */ 28 private int maxInstances = 0; 21 29 22 /** 23 * Sets the minimal number of instances. 24 * @param parameters number of instances 25 */ 26 @Override 27 public void setParameter(String parameters) { 28 maxInstances = Integer.parseInt(parameters); 29 } 30 /** 31 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 32 */ 33 @Override 34 public boolean apply(SoftwareVersion version) { 35 return version.getInstances().numInstances() > maxInstances; 36 } 37 38 /** 39 * Sets the minimal number of instances. 40 * 41 * @param parameters 42 * number of instances 43 */ 44 @Override 45 public void setParameter(String parameters) { 46 maxInstances = Integer.parseInt(parameters); 47 } 30 48 31 49 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/MinClassNumberFilter.java
r26 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 4 18 5 19 /** 6 * Applies to small data sets. All data sets that do not have the required minimal number of instances in each class (i.e., positive, negative) are removed. 20 * Applies to small data sets. All data sets that do not have the required minimal number of 21 * instances in each class (i.e., positive, negative) are removed. 22 * 7 23 * @author Steffen Herbold 8 24 */ 9 25 public class MinClassNumberFilter extends AbstractVersionFilter { 10 26 11 /** 12 * minimal number of instances required 13 */ 14 private int minInstances = 0; 15 16 /** 17 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 18 */ 19 @Override 20 public boolean apply(SoftwareVersion version) { 21 Instances instances = version.getInstances(); 22 int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 23 boolean toSmall = false; 24 for( int count : counts ) { 25 toSmall |= count<minInstances; 26 } 27 return toSmall; 28 } 27 /** 28 * minimal number of instances required 29 */ 30 private int minInstances = 0; 29 31 30 /** 31 * Sets the minimal number of instances for each class. 32 * @param parameters number of instances 33 */ 34 @Override 35 public void setParameter(String parameters) { 36 minInstances = Integer.parseInt(parameters); 37 } 32 /** 33 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 34 */ 35 @Override 36 public boolean apply(SoftwareVersion version) { 37 Instances instances = version.getInstances(); 38 int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 39 boolean toSmall = false; 40 for (int count : counts) { 41 toSmall |= count < minInstances; 42 } 43 return toSmall; 44 } 45 46 /** 47 * Sets the minimal number of instances for each class. 48 * 49 * @param parameters 50 * number of instances 51 */ 52 @Override 53 public void setParameter(String parameters) { 54 minInstances = Integer.parseInt(parameters); 55 } 38 56 39 57 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/MinInstanceNumberFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 3 17 /** 4 * Applies to small data sets. All data sets that do not have the required minimal number of instances are removed. 18 * Applies to small data sets. All data sets that do not have the required minimal number of 19 * instances are removed. 20 * 5 21 * @author Steffen Herbold 6 22 */ 7 23 public class MinInstanceNumberFilter extends AbstractVersionFilter { 8 24 9 /** 10 * minimal number of instances required 11 */ 12 private int minInstances = 0; 13 14 /** 15 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 16 */ 17 @Override 18 public boolean apply(SoftwareVersion version) { 19 return version.getInstances().numInstances()<minInstances; 20 } 25 /** 26 * minimal number of instances required 27 */ 28 private int minInstances = 0; 21 29 22 /** 23 * Sets the minimal number of instances. 24 * @param parameters number of instances 25 */ 26 @Override 27 public void setParameter(String parameters) { 28 minInstances = Integer.parseInt(parameters); 29 } 30 /** 31 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 32 */ 33 @Override 34 public boolean apply(SoftwareVersion version) { 35 return version.getInstances().numInstances() < minInstances; 36 } 37 38 /** 39 * Sets the minimal number of instances. 40 * 41 * @param parameters 42 * number of instances 43 */ 44 @Override 45 public void setParameter(String parameters) { 46 minInstances = Integer.parseInt(parameters); 47 } 30 48 31 49 } -
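The three threshold filters above are all configured with a single integer passed as a string to setParameter. A short sketch, assuming a List<SoftwareVersion> named versions is available and using example thresholds:

    IVersionFilter minSize = new MinInstanceNumberFilter();
    minSize.setParameter("100");     // drop versions with fewer than 100 instances
    IVersionFilter maxSize = new MaxInstanceNumberFilter();
    maxSize.setParameter("10000");   // drop versions with more than 10000 instances
    IVersionFilter minClass = new MinClassNumberFilter();
    minClass.setParameter("5");      // drop versions with fewer than 5 instances in one of the classes
    int removed = minSize.apply(versions) + maxSize.apply(versions) + minClass.apply(versions);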
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/SoftwareVersion.java
r27 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 4 18 5 19 /** 6 * Data class for software versions. 20 * Data class for software versions. 21 * 7 22 * @author Steffen Herbold 8 23 */ 9 24 public class SoftwareVersion implements Comparable<SoftwareVersion> { 10 25 11 /** 12 * name of the project 13 */ 14 private final String project; 15 16 /** 17 * version of the project 18 */ 19 private final String version; 26 /** 27 * name of the project 28 */ 29 private final String project; 20 30 21 /** 22 * data of the version 23 */ 24 private final Instances instances; 25 26 /** 27 * Constructor. Creates a new version. 28 * @param project name of the project 29 * @param version name of the version 30 * @param instances data of the version 31 */ 32 public SoftwareVersion(String project, String version, Instances instances) { 33 this.project = project; 34 this.version = version; 35 this.instances = instances; 36 } 37 38 /** 39 * returns the project name 40 * @return project name 41 */ 42 public String getProject() { 43 return project; 44 } 45 46 /** 47 * returns the name of the version 48 * @return name of the version 49 */ 50 public String getVersion() { 51 return version; 52 } 53 54 /** 55 * returns the data of the version 56 * @return data 57 */ 58 public Instances getInstances() { 59 return new Instances(instances); 60 } 31 /** 32 * version of the project 33 */ 34 private final String version; 61 35 62 /** 63 * Compares first based on project name and then based on version. Only string comparisons are performed. 64 * @see java.lang.Comparable#compareTo(java.lang.Object) 65 */ 66 @Override 67 public int compareTo(SoftwareVersion o) { 68 int projectStrCmp = 0; 69 if( project!=null ) { 70 projectStrCmp = project.compareTo(o.project); 71 } 72 if( projectStrCmp==0 && version!=null ) { 73 return version.compareTo(o.version); 74 } else { 75 return projectStrCmp; 76 } 77 } 36 /** 37 * data of the version 38 */ 39 private final Instances instances; 40 41 /** 42 * Constructor. Creates a new version. 43 * 44 * @param project 45 * name of the project 46 * @param version 47 * name of the version 48 * @param instances 49 * data of the version 50 */ 51 public SoftwareVersion(String project, String version, Instances instances) { 52 this.project = project; 53 this.version = version; 54 this.instances = instances; 55 } 56 57 /** 58 * returns the project name 59 * 60 * @return project name 61 */ 62 public String getProject() { 63 return project; 64 } 65 66 /** 67 * returns the name of the version 68 * 69 * @return name of the version 70 */ 71 public String getVersion() { 72 return version; 73 } 74 75 /** 76 * returns the data of the version 77 * 78 * @return data 79 */ 80 public Instances getInstances() { 81 return new Instances(instances); 82 } 83 84 /** 85 * Compares first based on project name and then based on version. Only string comparisons are 86 * performed. 
87 * 88 * @see java.lang.Comparable#compareTo(java.lang.Object) 89 */ 90 @Override 91 public int compareTo(SoftwareVersion o) { 92 int projectStrCmp = 0; 93 if (project != null) { 94 projectStrCmp = project.compareTo(o.project); 95 } 96 if (projectStrCmp == 0 && version != null) { 97 return version.compareTo(o.version); 98 } 99 else { 100 return projectStrCmp; 101 } 102 } 78 103 } -
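Because SoftwareVersion implements Comparable, lists of versions can be sorted directly, first by project name and then by version string. A one-line sketch, assuming a List<SoftwareVersion> named versions and an import of java.util.Collections:

    Collections.sort(versions);  // orders by project, then by version
    // the ordering is purely lexicographic, so a version "10.0" sorts before "2.0"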
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/UnbalancedFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 4 18 5 19 /** 6 * Removes unbalanced data sets in terms of classification. All data sets that are outside of the quantil defined 7 * by setParameter (default=0.1) are removed. 20 * Removes unbalanced data sets in terms of classification. All data sets that are outside of the 21 * quantil defined by setParameter (default=0.1) are removed. 22 * 8 23 * @author Steffen Herbold 9 24 */ 10 25 public class UnbalancedFilter extends AbstractVersionFilter { 11 26 12 /** 13 * quantil where outside lying versions are removed 14 */ 15 private double quantil = 0.1; 16 17 /** 18 * Sets the quantil. 19 * @param parameters the quantil as string 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 quantil = Double.parseDouble(parameters); 24 } 27 /** 28 * quantil where outside lying versions are removed 29 */ 30 private double quantil = 0.1; 25 31 26 /** 27 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 28 */ 29 @Override 30 public boolean apply(SoftwareVersion version) { 31 final Instances instances = version.getInstances(); 32 33 final int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 34 return ((double) counts[0])/instances.numInstances() >= (1-quantil) || 35 ((double) counts[0])/instances.numInstances() <= (quantil); 36 } 32 /** 33 * Sets the quantil. 34 * 35 * @param parameters 36 * the quantil as string 37 */ 38 @Override 39 public void setParameter(String parameters) { 40 quantil = Double.parseDouble(parameters); 41 } 42 43 /** 44 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 45 */ 46 @Override 47 public boolean apply(SoftwareVersion version) { 48 final Instances instances = version.getInstances(); 49 50 final int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 51 return ((double) counts[0]) / instances.numInstances() >= (1 - quantil) || 52 ((double) counts[0]) / instances.numInstances() <= (quantil); 53 } 37 54 38 55 } -
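For example, with the default quantil of 0.1, a version whose first class covers 96 of 100 instances gives 0.96 >= 0.9 and is removed; one where that class covers only 3 of 100 instances gives 0.03 <= 0.1 and is removed as well; a 60/40 split falls inside the interval and the version is kept.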
trunk/CrossPare/src/de/ugoe/cs/cpdp/wekaclassifier/FixClass.java
r30 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.wekaclassifier; 2 3 16 4 17 import weka.classifiers.AbstractClassifier; … … 16 29 public class FixClass extends AbstractClassifier { 17 30 18 31 private static final long serialVersionUID = 1L; 19 32 20 33 private double fixedClassValue = 0.0d; 21 34 22 23 24 35 public FixClass() { 36 // TODO Auto-generated constructor stub 37 } 25 38 26 27 28 29 30 31 32 33 34 39 /** 40 * Returns default capabilities of the classifier. 41 * 42 * @return the capabilities of this classifier 43 */ 44 @Override 45 public Capabilities getCapabilities() { 46 Capabilities result = super.getCapabilities(); 47 result.disableAll(); 35 48 36 37 38 39 40 41 42 49 // attributes 50 result.enable(Capability.NOMINAL_ATTRIBUTES); 51 result.enable(Capability.NUMERIC_ATTRIBUTES); 52 result.enable(Capability.DATE_ATTRIBUTES); 53 result.enable(Capability.STRING_ATTRIBUTES); 54 result.enable(Capability.RELATIONAL_ATTRIBUTES); 55 result.enable(Capability.MISSING_VALUES); 43 56 44 45 46 47 57 // class 58 result.enable(Capability.NOMINAL_CLASS); 59 result.enable(Capability.NUMERIC_CLASS); 60 result.enable(Capability.MISSING_CLASS_VALUES); 48 61 49 50 62 // instances 63 result.setMinimumNumberInstances(0); 51 64 52 53 65 return result; 66 } 54 67 55 56 57 58 68 @Override 69 public void setOptions(String[] options) throws Exception { 70 fixedClassValue = Double.parseDouble(Utils.getOption('C', options)); 71 } 59 72 60 61 62 63 73 @Override 74 public double classifyInstance(Instance instance) { 75 return fixedClassValue; 76 } 64 77 65 66 67 68 78 @Override 79 public void buildClassifier(Instances traindata) throws Exception { 80 // do nothing 81 } 69 82 }
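A minimal usage sketch of FixClass, assuming an Instances object named traindata with its class attribute set (the option value is just an example):

    FixClass fix = new FixClass();
    fix.setOptions(new String[] { "-C", "1.0" });  // always predict class value 1.0
    fix.buildClassifier(traindata);                // intentionally a no-op
    double prediction = fix.classifyInstance(traindata.get(0));  // returns 1.0 for any instance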