Changeset 41 for trunk/CrossPare/src/de/ugoe
- Timestamp: 09/24/15 10:59:05
- Location: trunk/CrossPare/src/de/ugoe/cs/cpdp
- Files: 84 edited
- ExperimentConfiguration.java (modified) (3 diffs)
- ExperimentConfigurationException.java (modified) (1 diff)
- IParameterizable.java (modified) (1 diff)
- Runner.java (modified) (2 diffs)
- dataprocessing/AttributeNonRemoval.java (modified) (2 diffs)
- dataprocessing/AttributeRemoval.java (modified) (2 diffs)
- dataprocessing/AverageStandardization.java (modified) (2 diffs)
- dataprocessing/BiasedWeights.java (modified) (2 diffs)
- dataprocessing/DataGravitation.java (modified) (2 diffs)
- dataprocessing/IProcessesingStrategy.java (modified) (2 diffs)
- dataprocessing/ISetWiseProcessingStrategy.java (modified) (2 diffs)
- dataprocessing/InformationGainFilter.java (modified) (2 diffs)
- dataprocessing/LogarithmTransform.java (modified) (2 diffs)
- dataprocessing/MedianAsReference.java (modified) (2 diffs)
- dataprocessing/NominalAttributeFilter.java (modified) (2 diffs)
- dataprocessing/Normalization.java (modified) (2 diffs)
- dataprocessing/Oversampling.java (modified) (2 diffs)
- dataprocessing/Resampling.java (modified) (2 diffs)
- dataprocessing/SimulationFilter.java (modified) (2 diffs)
- dataprocessing/Undersampling.java (modified) (2 diffs)
- dataprocessing/ZScoreNormalization.java (modified) (2 diffs)
- dataprocessing/ZScoreTargetNormalization.java (modified) (2 diffs)
- dataselection/AbstractCharacteristicSelection.java (modified) (2 diffs)
- dataselection/IPointWiseDataselectionStrategy.java (modified) (2 diffs)
- dataselection/ISetWiseDataselectionStrategy.java (modified) (2 diffs)
- dataselection/PetersFilter.java (modified) (3 diffs)
- dataselection/PointWiseEMClusterSelection.java (modified) (2 diffs)
- dataselection/SeparatabilitySelection.java (modified) (2 diffs)
- dataselection/SetWiseEMClusterSelection.java (modified) (2 diffs)
- dataselection/SetWiseEMContextSelection.java (modified) (2 diffs)
- dataselection/SetWiseKNNSelection.java (modified) (2 diffs)
- dataselection/TestAsTraining.java (modified) (2 diffs)
- dataselection/TurhanFilter.java (modified) (2 diffs)
- decentApp/ARFFxResourceTool.java (modified) (2 diffs)
- decentApp/DECENTEpsilonModelHandler.java (modified) (2 diffs)
- decentApp/DECENTResourceTool.java (modified) (2 diffs)
- decentApp/FileWatcher.java (modified) (2 diffs)
- decentApp/ResourceTool.java (modified) (2 diffs)
- eval/AbstractWekaEvaluation.java (modified) (2 diffs)
- eval/CVWekaEvaluation.java (modified) (2 diffs)
- eval/IEvaluationStrategy.java (modified) (2 diffs)
- eval/NormalWekaEvaluation.java (modified) (2 diffs)
- execution/ClassifierCreationExperiment.java (modified) (3 diffs)
- execution/CrossProjectExperiment.java (modified) (2 diffs)
- execution/IExecutionStrategy.java (modified) (2 diffs)
- execution/RelaxedCrossProjectExperiment.java (modified) (2 diffs)
- loader/ARFFFolderLoader.java (modified) (2 diffs)
- loader/ARFFLoader.java (modified) (2 diffs)
- loader/AUDIChangeFolderLoader.java (modified) (1 diff)
- loader/AUDIChangeLoader.java (modified) (2 diffs)
- loader/AUDIDataLoader.java (modified) (2 diffs)
- loader/AUDIFolderLoader.java (modified) (1 diff)
- loader/AbstractFolderLoader.java (modified) (3 diffs)
- loader/CSVDataLoader.java (modified) (3 diffs)
- loader/CSVFolderLoader.java (modified) (2 diffs)
- loader/CSVMockusDataLoader.java (modified) (2 diffs)
- loader/CSVMockusFolderLoader.java (modified) (1 diff)
- loader/DecentDataLoader.java (modified) (2 diffs)
- loader/DecentFolderLoader.java (modified) (2 diffs)
- loader/IDecentVersionLoader.java (modified) (2 diffs)
- loader/IVersionLoader.java (modified) (2 diffs)
- loader/NasaARFFFolderLoader.java (modified) (2 diffs)
- loader/NasaARFFLoader.java (modified) (3 diffs)
- loader/SingleVersionLoader.java (modified) (3 diffs)
- training/FixClass.java (modified) (2 diffs)
- training/ISetWiseTrainingStrategy.java (modified) (2 diffs)
- training/ITrainer.java (modified) (1 diff)
- training/ITrainingStrategy.java (modified) (2 diffs)
- training/IWekaCompatibleTrainer.java (modified) (2 diffs)
- training/QuadTree.java (modified) (2 diffs)
- training/RandomClass.java (modified) (2 diffs)
- training/WekaBaggingTraining.java (modified) (3 diffs)
- training/WekaBaseTraining.java (modified) (2 diffs)
- training/WekaLocalEMTraining.java (modified) (2 diffs)
- training/WekaLocalFQTraining.java (modified) (2 diffs)
- training/WekaTraining.java (modified) (3 diffs)
- versions/AbstractVersionFilter.java (modified) (2 diffs)
- versions/IVersionFilter.java (modified) (2 diffs)
- versions/MaxInstanceNumberFilter.java (modified) (1 diff)
- versions/MinClassNumberFilter.java (modified) (2 diffs)
- versions/MinInstanceNumberFilter.java (modified) (1 diff)
- versions/SoftwareVersion.java (modified) (2 diffs)
- versions/UnbalancedFilter.java (modified) (2 diffs)
- wekaclassifier/FixClass.java (modified) (2 diffs)
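The change common to all 84 files is the Apache License 2.0 header prepended at the top of each file. Taken from the diffs in this changeset, it reads:

// Copyright 2015 Georg-August-Universität Göttingen, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.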
trunk/CrossPare/src/de/ugoe/cs/cpdp/ExperimentConfiguration.java
r32 → r41 (3 diffs): Prepends the Apache License, Version 2.0 header (Copyright 2015 Georg-August-Universität Göttingen, Germany) and reformats the entire class without functional changes. The class and member Javadoc is reflowed with the @param/@return tags on their own lines, long statements such as the reflective Class.forName(...).newInstance() calls in startElement() and the string concatenations in toString() are wrapped across lines, catch clauses are moved onto their own lines, and spacing around keywords and parentheses is normalized (e.g. "if( reader!=null )" becomes "if (reader != null)").
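For orientation, a minimal usage sketch of the class, based only on the constructor and getters visible in the diff; the path argument is illustrative and the configuration file itself is XML:

// Assumes imports of de.ugoe.cs.cpdp.ExperimentConfiguration,
// de.ugoe.cs.cpdp.ExperimentConfigurationException and de.ugoe.cs.cpdp.loader.IVersionLoader.
static void printConfig(String path) throws ExperimentConfigurationException {
    ExperimentConfiguration config = new ExperimentConfiguration(path);
    System.out.println(config.getExperimentName());
    for (IVersionLoader loader : config.getLoaders()) {
        // each <loader name="..." datalocation="..."/> element in the XML
        // becomes one IVersionLoader instance via startElement()
        System.out.println(loader);
    }
}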
trunk/CrossPare/src/de/ugoe/cs/cpdp/ExperimentConfigurationException.java
r2 → r41 (1 diff): Prepends the same Apache License header and re-indents the exception class (serialVersionUID and the four constructors delegating to Exception); no functional changes.
trunk/CrossPare/src/de/ugoe/cs/cpdp/IParameterizable.java
r2 → r41 (1 diff): Prepends the Apache License header and reflows the interface Javadoc and the setParameter(String) declaration; no functional changes.
trunk/CrossPare/src/de/ugoe/cs/cpdp/Runner.java
r38 → r41 (2 diffs): Prepends the Apache License header and reformats main() and createConfig() in the same style: the catch blocks around the reflective instantiation of the configured IExecutionStrategy are placed on their own lines and the long Console.printerrln() messages are wrapped; no functional changes.
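createConfig() looks up a constructor that takes an ExperimentConfiguration and then hands the instance to an ExecutorService, so an execution strategy referenced from a configuration is effectively a Runnable with that constructor. A hedged sketch of the expected shape, assuming IExecutionStrategy declares no members beyond Runnable's run(); the class name and body are hypothetical:

package de.ugoe.cs.cpdp.execution;

import de.ugoe.cs.cpdp.ExperimentConfiguration;

// Hypothetical strategy: only the single-argument constructor and the
// IExecutionStrategy interface (used as a Runnable by Runner) are required.
public class MyExperiment implements IExecutionStrategy {

    private final ExperimentConfiguration config;

    public MyExperiment(ExperimentConfiguration config) {
        this.config = config;
    }

    @Override
    public void run() {
        // experiment logic would go here; the configuration is available via config
    }
}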
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AttributeNonRemoval.java
r32 → r41 (2 diffs): Prepends the Apache License header and reformats the class (Javadoc reflow, spacing, wrapped method signatures); the setParameter() and both apply() implementations are unchanged.
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AttributeRemoval.java
r2 → r41 (formatting only): the same Apache License 2.0 header is added and the file is reformatted; the logic is unchanged.

AttributeRemoval is the counterpart of AttributeNonRemoval: it removes attributes from all data sets by name. setParameter(String) takes the blank-separated names of the attributes to remove; removal of attributes whose names contain blanks is currently not supported. Both apply() variants delete every named attribute from the test data and from the training data. (@author Steffen Herbold)
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AverageStandardization.java
r2 → r41 (formatting only): the Apache License 2.0 header is added and the file is reformatted; the logic is unchanged.

AverageStandardization implements the standardization procedure after Watanabe et al.: Adapting a Fault Prediction Model to Allow Inter Language Reuse. In comparison to Watanabe et al., the training data is transformed instead of the test data, because otherwise the approach would not be feasible with multiple projects. For every non-class attribute j the mean of the test data and the mean of the training data are computed, and each training value is rescaled to value * meanTest[j] / meanTrain[j]. setParameter(String) is ignored, as the strategy has no parameters. (@author Steffen Herbold)
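A minimal standalone sketch of that rescaling on plain arrays; the numbers are made up, and this is not the Weka-based CrossPare implementation, just the same formula.

```java
import java.util.Arrays;

public class AverageStandardizationSketch {
    public static void main(String[] args) {
        double[][] test  = { { 10, 200 }, { 30, 400 } };   // hypothetical test data, 2 attributes
        double[][] train = { { 1, 20 },   { 3, 60 } };     // hypothetical training data

        double[] meanTest  = columnMeans(test);
        double[] meanTrain = columnMeans(train);

        // rescale every training value so that the training mean matches the test mean
        for (double[] row : train) {
            for (int j = 0; j < row.length; j++) {
                row[j] = row[j] * meanTest[j] / meanTrain[j];
            }
        }
        System.out.println(Arrays.deepToString(train)); // [[10.0, 150.0], [30.0, 450.0]]
    }

    static double[] columnMeans(double[][] data) {
        double[] means = new double[data[0].length];
        for (double[] row : data) {
            for (int j = 0; j < row.length; j++) {
                means[j] += row[j] / data.length;
            }
        }
        return means;
    }
}
```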
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/BiasedWeights.java
r2 → r41 (formatting only): the Apache License 2.0 header is added and the file is reformatted; the logic is unchanged.

BiasedWeights sets a bias on the instance weights of the training data. With the default bias of 0.5, the total weight of the positive (i.e. fault-prone) instances equals the total weight of the negative (i.e. non-fault-prone) instances; other values distribute the weight according to the bias, where values below 0.5 favor the negative and values above 0.5 favor the positive instances. setParameter(String) parses the bias as a double. The helper setBiasedWeights(Instances) reads the nominal class counts and assigns weightNegatives = ((1 - bias) * numInstances) / counts[0] to the negative and weightPositives = (bias * numInstances) / counts[1] to the positive instances. (@author Steffen Herbold)
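A quick numeric check of that weighting rule, written as plain Java with hypothetical class counts instead of Weka Instances:

```java
public class BiasedWeightsSketch {
    public static void main(String[] args) {
        double bias = 0.5;          // default: equal total weight for both classes
        int numInstances = 100;
        int negatives = 80;         // counts[0]
        int positives = 20;         // counts[1]

        double weightNegatives = ((1 - bias) * numInstances) / negatives; // 0.625
        double weightPositives = (bias * numInstances) / positives;       // 2.5

        // both classes now carry the same total weight: 80*0.625 = 20*2.5 = 50
        System.out.println(weightNegatives + " " + weightPositives);
        System.out.println(negatives * weightNegatives + " = " + positives * weightPositives);
    }
}
```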
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/DataGravitation.java
r10 → r41 (formatting only): the Apache License 2.0 header is added and the file is reformatted; the logic is unchanged.

DataGravitation implements the data weighting approach suggested by Y. Ma, G. Luo, X. Zeng, and A. Chen: Transfer learning for cross-company software defect prediction. A training instance is weighted higher the more of its attribute values fall within the minimum/maximum range of the corresponding attribute in the test data: with s such attributes the raw weight is s / sqrt(numAttributes - s), and the weights are then normalized to sum up to the number of training instances. The set-wise apply() applies the point-wise version to every training data set; setParameter(String) is ignored. (@author Steffen Herbold)
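The weight computation can be sketched without Weka. The data below is made up, and as in the original code the class attribute is counted in numAttributes but excluded from the range check, so the denominator never becomes zero.

```java
import java.util.Arrays;

public class DataGravitationSketch {
    public static void main(String[] args) {
        // hypothetical per-attribute min/max of the test data; the last column is the class
        double[] min = { 0.0, 0.0, 0.0 };
        double[] max = { 100.0, 10.0, 1.0 };
        int classIndex = 2;                          // class attribute is not compared
        double[][] train = { { 50.0, 5.0, 1.0 },     // both metrics inside the test ranges
                             { 200.0, 5.0, 0.0 },    // one metric inside
                             { 200.0, 50.0, 1.0 } }; // no metric inside

        int numAttributes = min.length;              // includes the class attribute
        double[] weights = new double[train.length];
        double weightSum = 0.0;
        for (int i = 0; i < train.length; i++) {
            int similar = 0;
            for (int j = 0; j < numAttributes; j++) {
                if (j != classIndex && train[i][j] >= min[j] && train[i][j] <= max[j]) {
                    similar++;
                }
            }
            weights[i] = similar / Math.sqrt(numAttributes - similar);
            weightSum += weights[i];
        }
        // normalize so that the weights sum up to the number of training instances
        for (int i = 0; i < weights.length; i++) {
            weights[i] = weights[i] * train.length / weightSum;
        }
        System.out.println(Arrays.toString(weights)); // highest weight for the most similar instance
    }
}
```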
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/IProcessesingStrategy.java
r2 → r41 (formatting only): the Apache License 2.0 header is added and the interface is reformatted; no content change.

IProcessesingStrategy extends IParameterizable and describes a data processing strategy that is applied to the test data and a single set of training data via apply(Instances testdata, Instances traindata). (@author Steffen Herbold)
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ISetWiseProcessingStrategy.java
r2 → r41 (formatting only): the Apache License 2.0 header is added and the interface is reformatted; no content change.

ISetWiseProcessingStrategy extends IParameterizable and describes a data processing strategy that is applied to the test data and multiple sets of training data via apply(Instances testdata, SetUniqueList&lt;Instances&gt; traindataSet). (@author Steffen Herbold)
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/InformationGainFilter.java
r10 → r41 (formatting only): the Apache License 2.0 header is added and the file is reformatted; the logic is unchanged.

InformationGainFilter implements an attribute filter based on the information gain of each attribute, after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on Defect Prediction. A random sample of 500 instances is drawn from the test data (artificial class value 1) and from the training data (artificial class value 0), and the information gain of every attribute with respect to this separation is evaluated with Weka's InfoGainAttributeEval. By default the best 50% of the attributes are retained; the ratio can be adjusted via the filter parameter (0.5 = 50%). Best means the least information gain, because a low gain indicates that the attribute is distributed similarly in test and training data. The internal helper apply(testdata, traindata, removeFromTest) deletes every attribute whose gain reaches the cutoff (taken from the sorted gains) from the training data and, in the point-wise variant, also from the test data; exceptions during the gain evaluation are ignored. (@author Steffen Herbold)
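The selection step at the end (sort the gains, derive a cutoff from the feature ratio, drop every attribute whose gain reaches the cutoff) is easy to illustrate without Weka. The gain values below are made up; in the real filter the class attribute keeps a gain of 0.0 and is additionally protected by an index check.

```java
import java.util.Arrays;

public class GainCutoffSketch {
    public static void main(String[] args) {
        // hypothetical information gain per attribute
        double[] gain = { 0.02, 0.40, 0.05, 0.31, 0.10, 0.00 };
        double featureRatio = 0.5;                                    // filter parameter
        int removalNumber = (int) (featureRatio * (gain.length - 1)); // 2 attributes to drop

        double[] sorted = Arrays.copyOf(gain, gain.length);
        Arrays.sort(sorted);
        double cutoffGain = sorted[gain.length - removalNumber];      // sorted[4] = 0.31

        // drop every attribute whose gain reaches the cutoff, iterating from the right
        for (int i = gain.length - 1; i >= 0; i--) {
            if (gain[i] >= cutoffGain) {
                System.out.println("drop attribute " + i + " (gain " + gain[i] + ")");
            }
        }
    }
}
```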
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/LogarithmTransform.java
r40 → r41 (formatting only): the Apache License 2.0 header is added and the file is reformatted; the logic is unchanged.

LogarithmTransform implements the logarithm transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression Models for Predicting Fault-prone Code across Software Projects. Every numeric non-class attribute value x of the test data and of all training data is replaced by log(1 + x); negative values are mapped to -log(-x). setParameter(String) is ignored. (@author Steffen Herbold)
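A one-method standalone sketch of that value mapping (plain Java, not the Weka loops from the class):

```java
public class LogarithmTransformSketch {
    // log(1 + x) for non-negative values, -log(-x) for negative values,
    // mirroring the branch in LogarithmTransform.apply()
    static double transform(double x) {
        return x < 0 ? -Math.log(-x) : Math.log(1 + x);
    }

    public static void main(String[] args) {
        double[] values = { 0.0, 1.0, 99.0, -2.0 };
        for (double v : values) {
            System.out.printf("%6.2f -> %7.4f%n", v, transform(v));
        }
    }
}
```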
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/MedianAsReference.java
r40 → r41 (formatting only): the Apache License 2.0 header is added and the file is reformatted; the logic is unchanged.

MedianAsReference implements the median-as-reference transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression Models for Predicting Fault-prone Code across Software Projects. For each numeric non-class attribute the median of the test data is determined via kthSmallestValue(j, (numInstances + 1) &gt;&gt; 1); every value x of a training data set is then replaced by x + (median of the test data - median of the current project). setParameter(String) is ignored. (@author Steffen Herbold)
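The shift itself is a single line; a minimal sketch with made-up medians and values:

```java
import java.util.Arrays;

public class MedianAsReferenceSketch {
    public static void main(String[] args) {
        double medianTest  = 40.0;   // hypothetical median of the attribute in the test data
        double medianTrain = 25.0;   // hypothetical median in the current training project
        double[] trainValues = { 10.0, 25.0, 70.0 };

        // x -> x + (median of the test data - median of the current project)
        for (int i = 0; i < trainValues.length; i++) {
            trainValues[i] = trainValues[i] + (medianTest - medianTrain);
        }
        System.out.println(Arrays.toString(trainValues)); // [25.0, 40.0, 85.0]
    }
}
```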
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/NominalAttributeFilter.java
r32 → r41 (formatting only): the Apache License 2.0 header is added and the file is reformatted; the logic is unchanged.

NominalAttributeFilter implements IProcessesingStrategy and filters the training data by a nominal attribute: every instance whose value of that attribute is one of the defined values is removed (e.g. with param="CONFIDENCE low middle", all instances whose "CONFIDENCE" attribute value is "low" or "middle" are removed from the data set). setParameter(String) takes blank-separated parameters, the first being the name of the nominal attribute and all further ones the values to filter out. apply() looks up the attribute by name, resolves the indices of the given nominal values, and deletes the matching training instances; if the attribute is not found, nothing is removed.
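A standalone sketch of the same removal rule over a simple string column; the attribute name and values are taken from the example in the Javadoc, and the Weka index-based lookup is omitted.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class NominalAttributeFilterSketch {
    public static void main(String[] args) {
        // parameter string: attribute name followed by the values to remove
        String[] parameter = "CONFIDENCE low middle".split(" ");
        String attributeName = parameter[0];
        List<String> valuesToRemove =
            Arrays.asList(Arrays.copyOfRange(parameter, 1, parameter.length));

        // hypothetical training data: one nominal CONFIDENCE value per instance
        List<String> confidence =
            new ArrayList<>(Arrays.asList("high", "low", "middle", "high"));

        // iterate backwards and delete matching instances, like the original apply()
        for (int i = confidence.size() - 1; i >= 0; i--) {
            if (valuesToRemove.contains(confidence.get(i))) {
                confidence.remove(i);
            }
        }
        System.out.println(attributeName + ": " + confidence); // CONFIDENCE: [high, high]
    }
}
```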
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Normalization.java
r2 r41 (dataprocessing/Normalization.java)
- Prepends the Apache License, Version 2.0 header (Copyright 2015 Georg-August-Universität Göttingen) and reformats the whole file to the new code style (4-space indentation, spaces around operators and keywords, rewrapped Javadoc); the logic is unchanged.
- The class normalizes each attribute of each data set separately (min-max scaling): for every non-class attribute it determines the minimum and maximum over the test data and all candidate training sets (setwise variant), or over the test data and the single training set (pairwise variant), and then rescales every value to (value - min) / (max - min) in both the test and the training data. setParameter() ignores its argument because the strategy has no parameters.
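The min-max computation is simple enough to reproduce outside the strategy classes. Below is a minimal sketch of the pairwise case; the class and method names are illustrative (not part of CrossPare), and unlike the original it skips constant attributes instead of dividing by zero.

```java
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.experiment.Stats;

public class MinMaxSketch {

    /** Rescales every non-class attribute of testdata and traindata to [0,1] using the joint min/max. */
    public static void normalizeMinMax(Instances testdata, Instances traindata) {
        final Attribute classAtt = testdata.classAttribute();
        for (int i = 0; i < testdata.numAttributes(); i++) {
            if (testdata.attribute(i).equals(classAtt)) {
                continue;
            }
            Stats test = testdata.attributeStats(i).numericStats;
            Stats train = traindata.attributeStats(i).numericStats;
            double min = Math.min(test.min, train.min);
            double max = Math.max(test.max, train.max);
            if (max == min) {
                continue; // constant attribute: nothing to rescale (avoids division by zero)
            }
            for (Instances data : new Instances[] { testdata, traindata }) {
                for (int j = 0; j < data.numInstances(); j++) {
                    Instance inst = data.instance(j);
                    inst.setValue(i, (inst.value(i) - min) / (max - min));
                }
            }
        }
    }
}
```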
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Oversampling.java
r38 r41
- Same r41 change as above: Apache License 2.0 header added, code reformatted to the new style; no functional change.
- Oversampling handles class imbalance in the training data: if there are fewer defect-prone (positive) instances than non-defect-prone ones, the defect-prone instances are over-sampled with WEKA's Resample filter until both classes are equally frequent, so some defect-prone entities occur more than once. The implementation copies the data into a positives-only and a negatives-only set, resamples the positives to (100 * counts[0]) / counts[1] percent, and rebuilds the training data from both parts. A TODO kept in the code remarks (in German) that without a +0.01 correction one negative instance too few is returned for tomcat, xerces-1.2, and jedit-4.0. The setwise variant simply applies the pairwise variant to every training set; there are no parameters.
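For reference, a condensed sketch of the oversampling step as the original implements it (split off the positive class, blow it up to the size of the negative class, rebuild the training data); OversampleSketch is an illustrative name and the RuntimeException wrapping follows the original.

```java
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.supervised.instance.Resample;

public class OversampleSketch {

    /** Oversamples the defect-prone (class value 1.0) instances until both classes are equally frequent. */
    public static void oversample(Instances traindata) {
        int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
        if (counts[1] >= counts[0]) {
            return; // positives are not the minority, nothing to do
        }
        Instances negatives = new Instances(traindata);
        Instances positives = new Instances(traindata);
        for (int i = traindata.size() - 1; i >= 0; i--) {
            if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
                negatives.remove(i); // drop positives from the negative copy
            }
            if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
                positives.remove(i); // drop negatives from the positive copy
            }
        }
        try {
            Resample resample = new Resample();
            resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
            resample.setInputFormat(traindata);
            positives = Filter.useFilter(positives, resample);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        traindata.clear();
        for (int i = 0; i < negatives.size(); i++) {
            traindata.add(negatives.get(i));
        }
        for (int i = 0; i < positives.size(); i++) {
            traindata.add(positives.get(i));
        }
    }
}
```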
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Resampling.java
r38 r41
- Same r41 change: license header added and code reformatted; no functional change.
- Resampling resamples the training data with WEKA's Resample filter (sample size 100%, bias to uniform class 1.0) so that the classes are uniformly distributed afterwards; the setwise variant applies this to every candidate training set. There are no parameters.
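The whole strategy boils down to a three-line filter configuration. A minimal usage sketch, with an illustrative class and method name:

```java
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.supervised.instance.Resample;

public class ResampleSketch {

    /** Returns a copy of data resampled to 100% of its size with a uniform class distribution. */
    public static Instances resampleUniform(Instances data) throws Exception {
        Resample resample = new Resample();
        resample.setSampleSizePercent(100);
        resample.setBiasToUniformClass(1.0); // 1.0 = fully uniform distribution over the classes
        resample.setInputFormat(data);
        return Filter.useFilter(data, resample);
    }
}
```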
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/SimulationFilter.java
r32 r41
- Same r41 change: license header added and code reformatted; no functional change. Note that the reformatter joins the two-line example in the Javadoc ("10010111000101110101111011101" over "x--x-x-----x-x---x-x----x---x") onto a single line, which loses the vertical alignment of the markers.
- SimulationFilter (author Fabian Trautsch) prepares training data for the Repast simulation of software projects. It sorts the instances by Artifact.Target.StateID (lower StateID = earlier commit) and then keeps, per artifact identified by Artifact.Name, the instances where the artifact changes from "no bug" (class 0) to "bug" (class 1), plus the first occurrence of artifacts that are already buggy when they first appear; artifacts that never have a bug contribute their last bug-free state. Bookkeeping is done with a HashMap of artifact names to instances and a list of first occurrences.
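A condensed sketch of the selection logic under the same assumptions as the original (class value 1.0 marks a buggy instance, Artifact.Target.StateID orders instances by commit, Artifact.Name is a numeric artifact identifier); SimulationSelectSketch is an illustrative name, and the bookkeeping uses sets instead of the original's list/HashMap combination.

```java
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;

public class SimulationSelectSketch {

    /** Keeps, per artifact, the clean-to-buggy transitions, first occurrences that are already buggy,
     *  and the last state of artifacts that never had a bug. */
    public static void select(Instances traindata) {
        Instances sorted = new Instances(traindata);
        traindata.delete();
        sorted.sort(sorted.attribute("Artifact.Target.StateID")); // lower StateID = earlier commit

        Attribute nameAtt = sorted.attribute("Artifact.Name");
        Map<Double, Instance> lastCleanState = new HashMap<>();
        Set<Double> firstOccurrenceBuggy = new HashSet<>();
        Set<Double> keptArtifacts = new HashSet<>();

        for (int i = 0; i < sorted.numInstances(); i++) {
            Instance inst = sorted.instance(i);
            double name = inst.value(nameAtt);
            if (inst.classValue() == 0.0) {
                lastCleanState.put(name, inst);       // remember the latest clean state
            } else if (lastCleanState.containsKey(name)) {
                traindata.add(inst);                  // clean -> buggy transition
                keptArtifacts.add(name);
                lastCleanState.remove(name);
            } else if (firstOccurrenceBuggy.add(name)) {
                traindata.add(inst);                  // artifact is buggy at its first occurrence
                keptArtifacts.add(name);
            }
        }
        // artifacts that never contributed an instance above keep their last clean state
        for (Map.Entry<Double, Instance> entry : lastCleanState.entrySet()) {
            if (!keptArtifacts.contains(entry.getKey())) {
                traindata.add(entry.getValue());
            }
        }
    }
}
```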
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Undersampling.java
r18 r41
- Same r41 change: license header added and code reformatted; no functional change (the Javadoc typo "such thatthe" is carried over).
- Undersampling is the counterpart to Oversampling: if there are fewer defect-prone instances than non-defect-prone ones, the non-defect-prone instances are down-sampled with WEKA's Resample filter to (100 * counts[1]) / counts[0] percent, so that both classes are equally frequent afterwards. The implementation mirrors the Oversampling sketch above with the resampled subset and the percentage swapped, and it carries the same TODO about the +0.01 correction. There are no parameters.
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ZScoreNormalization.java
r38 r41
- Same r41 change: license header added and code reformatted; no functional change.
- ZScoreNormalization standardizes every non-class attribute to zero mean and unit variance (z-scores), separately for the test data and each training set. There are no parameters.
- Two issues are visible in the (unchanged) normalize() method: the inner loop fetches instances.get(i), the attribute index, instead of instances.get(j), the instance index; and the guard "newValue == Double.NaN" can never be true in Java because NaN compares unequal to everything, including itself, so the debug output ("foooooo" on System.out) is unreachable; Double.isNaN(newValue) would be the correct check. The leading instances.toString() call has no effect.
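A corrected sketch of the per-attribute z-score step; the class name is illustrative, the instance loop uses the instance index, and NaN cannot arise because constant attributes are skipped. The transformation is otherwise the one the original intends.

```java
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;

public class ZScoreSketch {

    /** Standardizes every non-class attribute of data to zero mean and unit variance. */
    public static void standardize(Instances data) {
        Attribute classAtt = data.classAttribute();
        for (int i = 0; i < data.numAttributes(); i++) {
            if (data.attribute(i).equals(classAtt)) {
                continue;
            }
            double mean = data.meanOrMode(i);
            double stddev = Math.sqrt(data.variance(i));
            if (stddev == 0.0) {
                continue; // constant attribute: leave it untouched instead of dividing by zero
            }
            for (int j = 0; j < data.numInstances(); j++) {
                Instance inst = data.instance(j);
                inst.setValue(i, (inst.value(i) - mean) / stddev);
            }
        }
    }
}
```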
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ZScoreTargetNormalization.java
r38 r41
- Same r41 change: license header added and code reformatted; no functional change. The reformatted class declaration now wraps with the opening brace on its own line.
- ZScoreTargetNormalization standardizes both the test data and the training data (every candidate set in the setwise variant) using the per-attribute mean and standard deviation of the test (target) data. There are no parameters.
- Note that the update expression "instance.value(j) - meanTest[j] / stddevTest[j]" has no parentheses; because division binds tighter than subtraction, it subtracts mean/stddev from the value instead of computing (value - mean) / stddev. This holds in both revisions.
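A sketch of what the target-based standardization presumably intends, with the parentheses in place; the class name is illustrative, only the training-data half is shown (the original transforms the test data the same way), and attributes with zero standard deviation are skipped as an extra guard.

```java
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;

public class ZScoreTargetSketch {

    /** Standardizes traindata using the per-attribute mean/stddev of the test (target) data. */
    public static void standardizeAgainstTarget(Instances testdata, Instances traindata) {
        Attribute classAtt = testdata.classAttribute();
        double[] mean = new double[testdata.numAttributes()];
        double[] stddev = new double[testdata.numAttributes()];
        for (int j = 0; j < testdata.numAttributes(); j++) {
            if (testdata.attribute(j) != classAtt) {
                mean[j] = testdata.meanOrMode(j);
                stddev[j] = Math.sqrt(testdata.variance(j));
            }
        }
        for (int i = 0; i < traindata.numInstances(); i++) {
            Instance inst = traindata.instance(i);
            for (int j = 0; j < testdata.numAttributes(); j++) {
                if (testdata.attribute(j) != classAtt && stddev[j] != 0.0) {
                    inst.setValue(j, (inst.value(j) - mean[j]) / stddev[j]); // note the parentheses
                }
            }
        }
    }
}
```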
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/AbstractCharacteristicSelection.java
r2 r41
- Same r41 change: license header added and code reformatted; no functional change.
- AbstractCharacteristicSelection is the base class for setwise data selection via distributional characteristics. The default characteristics are mean and stddev; setParameter() accepts a blank-separated list, and "var" is also understood. characteristicInstances() builds a WEKA data set with one attribute per (original attribute, characteristic) pair, where the first instance describes the test data and each further instance one candidate training set; normalizedCharacteristicInstances() additionally runs WEKA's Normalize filter over these characteristic vectors. Unknown characteristic names raise a RuntimeException (the message typo "Unkown" is retained).
- In the "var" branch the value is taken as testdata.variance(j), where j is the index into the characteristics array rather than the attribute index, and the test data is used even inside the training-data loop; this looks like a copy-and-paste slip present in both revisions ("mean" and "stddev" are taken correctly from the attribute's numeric stats).
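A minimal sketch of how such a characteristic vector is built for a single data set, restricted to the default mean/stddev characteristics; the class and method names are illustrative, and the running index k replaces the original's i * characteristics.length + j computation.

```java
import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.experiment.Stats;

public class CharacteristicsSketch {

    /** Builds a one-instance data set of per-attribute mean/stddev values describing data. */
    public static Instances describe(Instances data) {
        ArrayList<Attribute> atts = new ArrayList<>();
        Attribute classAtt = data.classAttribute();
        for (int i = 0; i < data.numAttributes(); i++) {
            if (!data.attribute(i).equals(classAtt)) {
                atts.add(new Attribute(data.attribute(i).name() + "_mean"));
                atts.add(new Attribute(data.attribute(i).name() + "_stddev"));
            }
        }
        Instances characteristics = new Instances("distributional_characteristics", atts, 0);

        double[] values = new double[atts.size()];
        int k = 0;
        for (int i = 0; i < data.numAttributes(); i++) {
            if (!data.attribute(i).equals(classAtt)) {
                Stats stats = data.attributeStats(i).numericStats;
                values[k++] = stats.mean;
                values[k++] = stats.stdDev;
            }
        }
        characteristics.add(new DenseInstance(1.0, values));
        return characteristics;
    }
}
```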
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/IPointWiseDataselectionStrategy.java
r2 r41
- Same r41 change: license header added and Javadoc reformatted.
- IPointWiseDataselectionStrategy extends IParameterizable and declares Instances apply(Instances testdata, Instances traindata), which returns the selected training data.
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/ISetWiseDataselectionStrategy.java
r2 r41
- Same r41 change: license header added and Javadoc reformatted.
- ISetWiseDataselectionStrategy extends IParameterizable and declares void apply(Instances testdata, SetUniqueList<Instances> traindataSet); the selection modifies the candidate training sets in place, as the method returns void.
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PetersFilter.java
r2 r41
- Same r41 change: license header added and code reformatted; no functional change.
- PetersFilter is a pointwise selection filter "according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction"; the Javadoc itself states that the filter does not work because the paper has been withdrawn.
- Algorithm: the test and training instances are converted to double vectors without the class attribute; each training instance is assigned to its nearest test instance (Euclidean distance via MathArrays.distance); then, for each test instance, the closest of its assigned training instances with a distance greater than zero is selected. The selected training instances form the returned data set. There are no parameters.
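A compact sketch of the two-pass selection on plain double vectors (class attribute already removed); the class name and the Set<Integer> return type are illustrative, while the original works on Instances objects and returns the selected instances directly.

```java
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.math3.util.MathArrays;

public class PetersSketch {

    /** For each test vector, picks the closest of the training vectors whose nearest test vector it is. */
    public static Set<Integer> select(double[][] test, double[][] train) {
        List<List<Integer>> fans = new ArrayList<>();
        for (int i = 0; i < test.length; i++) {
            fans.add(new ArrayList<>());
        }
        // pass 1: assign every training vector to its nearest test vector
        for (int i = 0; i < train.length; i++) {
            int nearest = 0;
            double best = Double.MAX_VALUE;
            for (int j = 0; j < test.length; j++) {
                double d = MathArrays.distance(train[i], test[j]);
                if (d < best) {
                    best = d;
                    nearest = j;
                }
            }
            fans.get(nearest).add(i);
        }
        // pass 2: per test vector, keep its closest fan (ignoring exact duplicates with distance 0)
        Set<Integer> selected = new LinkedHashSet<>();
        for (int i = 0; i < test.length; i++) {
            int bestIndex = -1;
            double best = Double.MAX_VALUE;
            for (int j : fans.get(i)) {
                double d = MathArrays.distance(test[i], train[j]);
                if (d < best && d > 0.0) {
                    best = d;
                    bestIndex = j;
                }
            }
            if (bestIndex != -1) {
                selected.add(bestIndex);
            }
        }
        return selected;
    }
}
```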
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PointWiseEMClusterSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 14 28 import de.ugoe.cs.util.console.Console; 15 29 16 17 30 /** 18 31 * Use in Config: 19 32 * 20 * Specify number of clusters 21 * -N = Num Clusters 22 * <pointwiseselector name="PointWiseEMClusterSelection" param="-N 10"/> 23 * 24 * Try to determine the number of clusters: 25 * -I 10 = max iterations 26 * -X 5 = 5 folds for cross evaluation 27 * -max = max number of clusters 28 * <pointwiseselector name="PointWiseEMClusterSelection" param="-I 10 -X 5 -max 300"/> 33 * Specify number of clusters -N = Num Clusters <pointwiseselector 34 * name="PointWiseEMClusterSelection" param="-N 10"/> 29 35 * 30 * Don't forget to add: 31 * <preprocessor name="Normalization" param=""/> 36 * Try to determine the number of clusters: -I 10 = max iterations -X 5 = 5 folds for cross 37 * evaluation -max = max number of clusters <pointwiseselector name="PointWiseEMClusterSelection" 38 * param="-I 10 -X 5 -max 300"/> 39 * 40 * Don't forget to add: <preprocessor name="Normalization" param=""/> 32 41 */ 33 42 public class PointWiseEMClusterSelection implements IPointWiseDataselectionStrategy { 34 35 private String[] params;36 37 @Override38 public void setParameter(String parameters) {39 params = parameters.split(" ");40 }41 43 42 43 /** 44 * 1. Cluster the traindata 45 * 2. for each instance in the testdata find the assigned cluster 46 * 3. select only traindata from the clusters we found in our testdata 47 * 48 * @returns the selected training data 49 */ 50 @Override 51 public Instances apply(Instances testdata, Instances traindata) { 52 //final Attribute classAttribute = testdata.classAttribute(); 53 54 final List<Integer> selectedCluster = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 44 private String[] params; 55 45 56 // 1. copy train- and testdata 57 Instances train = new Instances(traindata); 58 Instances test = new Instances(testdata); 59 60 Instances selected = null; 61 62 try { 63 // remove class attribute from traindata 64 Remove filter = new Remove(); 65 filter.setAttributeIndices("" + (train.classIndex() + 1)); 66 filter.setInputFormat(train); 67 train = Filter.useFilter(train, filter); 68 69 Console.traceln(Level.INFO, String.format("starting clustering")); 70 71 // 3. cluster data 72 EM clusterer = new EM(); 73 clusterer.setOptions(params); 74 clusterer.buildClusterer(train); 75 int numClusters = clusterer.getNumClusters(); 76 if ( numClusters == -1) { 77 Console.traceln(Level.INFO, String.format("we have unlimited clusters")); 78 }else { 79 Console.traceln(Level.INFO, String.format("we have: "+numClusters+" clusters")); 80 } 81 82 83 // 4. classify testdata, save cluster int 84 85 // remove class attribute from testdata? 
86 Remove filter2 = new Remove(); 87 filter2.setAttributeIndices("" + (test.classIndex() + 1)); 88 filter2.setInputFormat(test); 89 test = Filter.useFilter(test, filter2); 90 91 int cnum; 92 for( int i=0; i < test.numInstances(); i++ ) { 93 cnum = ((EM)clusterer).clusterInstance(test.get(i)); 46 @Override 47 public void setParameter(String parameters) { 48 params = parameters.split(" "); 49 } 94 50 95 // we dont want doubles (maybe use a hashset instead of list?) 96 if ( !selectedCluster.contains(cnum) ) { 97 selectedCluster.add(cnum); 98 //Console.traceln(Level.INFO, String.format("assigned to cluster: "+cnum)); 99 } 100 } 101 102 Console.traceln(Level.INFO, String.format("our testdata is in: "+selectedCluster.size()+" different clusters")); 103 104 // 5. get cluster membership of our traindata 105 AddCluster cfilter = new AddCluster(); 106 cfilter.setClusterer(clusterer); 107 cfilter.setInputFormat(train); 108 Instances ctrain = Filter.useFilter(train, cfilter); 109 110 111 // 6. for all traindata get the cluster int, if it is in our list of testdata cluster int add the traindata 112 // of this cluster to our returned traindata 113 int cnumber; 114 selected = new Instances(traindata); 115 selected.delete(); 116 117 for ( int j=0; j < ctrain.numInstances(); j++ ) { 118 // get the cluster number from the attributes 119 cnumber = Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes()-1).replace("cluster", "")); 120 121 //Console.traceln(Level.INFO, String.format("instance "+j+" is in cluster: "+cnumber)); 122 if ( selectedCluster.contains(cnumber) ) { 123 // this only works if the index does not change 124 selected.add(traindata.get(j)); 125 // check for differences, just one attribute, we are pretty sure the index does not change 126 if ( traindata.get(j).value(3) != ctrain.get(j).value(3) ) { 127 Console.traceln(Level.WARNING, String.format("we have a difference between train an ctrain!")); 128 } 129 } 130 } 131 132 Console.traceln(Level.INFO, String.format("that leaves us with: "+selected.numInstances()+" traindata instances from "+traindata.numInstances())); 133 }catch( Exception e ) { 134 Console.traceln(Level.WARNING, String.format("ERROR")); 135 throw new RuntimeException("error in pointwise em", e); 136 } 137 138 return selected; 139 } 51 /** 52 * 1. Cluster the traindata 2. for each instance in the testdata find the assigned cluster 3. 53 * select only traindata from the clusters we found in our testdata 54 * 55 * @returns the selected training data 56 */ 57 @Override 58 public Instances apply(Instances testdata, Instances traindata) { 59 // final Attribute classAttribute = testdata.classAttribute(); 60 61 final List<Integer> selectedCluster = 62 SetUniqueList.setUniqueList(new LinkedList<Integer>()); 63 64 // 1. copy train- and testdata 65 Instances train = new Instances(traindata); 66 Instances test = new Instances(testdata); 67 68 Instances selected = null; 69 70 try { 71 // remove class attribute from traindata 72 Remove filter = new Remove(); 73 filter.setAttributeIndices("" + (train.classIndex() + 1)); 74 filter.setInputFormat(train); 75 train = Filter.useFilter(train, filter); 76 77 Console.traceln(Level.INFO, String.format("starting clustering")); 78 79 // 3. 
cluster data 80 EM clusterer = new EM(); 81 clusterer.setOptions(params); 82 clusterer.buildClusterer(train); 83 int numClusters = clusterer.getNumClusters(); 84 if (numClusters == -1) { 85 Console.traceln(Level.INFO, String.format("we have unlimited clusters")); 86 } 87 else { 88 Console.traceln(Level.INFO, String.format("we have: " + numClusters + " clusters")); 89 } 90 91 // 4. classify testdata, save cluster int 92 93 // remove class attribute from testdata? 94 Remove filter2 = new Remove(); 95 filter2.setAttributeIndices("" + (test.classIndex() + 1)); 96 filter2.setInputFormat(test); 97 test = Filter.useFilter(test, filter2); 98 99 int cnum; 100 for (int i = 0; i < test.numInstances(); i++) { 101 cnum = ((EM) clusterer).clusterInstance(test.get(i)); 102 103 // we dont want doubles (maybe use a hashset instead of list?) 104 if (!selectedCluster.contains(cnum)) { 105 selectedCluster.add(cnum); 106 // Console.traceln(Level.INFO, String.format("assigned to cluster: "+cnum)); 107 } 108 } 109 110 Console.traceln(Level.INFO, 111 String.format("our testdata is in: " + selectedCluster.size() + 112 " different clusters")); 113 114 // 5. get cluster membership of our traindata 115 AddCluster cfilter = new AddCluster(); 116 cfilter.setClusterer(clusterer); 117 cfilter.setInputFormat(train); 118 Instances ctrain = Filter.useFilter(train, cfilter); 119 120 // 6. for all traindata get the cluster int, if it is in our list of testdata cluster 121 // int add the traindata 122 // of this cluster to our returned traindata 123 int cnumber; 124 selected = new Instances(traindata); 125 selected.delete(); 126 127 for (int j = 0; j < ctrain.numInstances(); j++) { 128 // get the cluster number from the attributes 129 cnumber = 130 Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes() - 1) 131 .replace("cluster", "")); 132 133 // Console.traceln(Level.INFO, 134 // String.format("instance "+j+" is in cluster: "+cnumber)); 135 if (selectedCluster.contains(cnumber)) { 136 // this only works if the index does not change 137 selected.add(traindata.get(j)); 138 // check for differences, just one attribute, we are pretty sure the index does 139 // not change 140 if (traindata.get(j).value(3) != ctrain.get(j).value(3)) { 141 Console.traceln(Level.WARNING, String 142 .format("we have a difference between train an ctrain!")); 143 } 144 } 145 } 146 147 Console.traceln(Level.INFO, 148 String.format("that leaves us with: " + selected.numInstances() + 149 " traindata instances from " + traindata.numInstances())); 150 } 151 catch (Exception e) { 152 Console.traceln(Level.WARNING, String.format("ERROR")); 153 throw new RuntimeException("error in pointwise em", e); 154 } 155 156 return selected; 157 } 140 158 141 159 } -
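Editor's note: as the Javadoc above states, the parameter string is handed verbatim to Weka's EM clusterer, and the data should be normalized beforehand. A short sketch of the programmatic counterpart of the XML configuration, reusing the Instances loaded in the PetersFilter sketch above; the parameter values simply repeat the documented examples.

    PointWiseEMClusterSelection selector = new PointWiseEMClusterSelection();
    selector.setParameter("-N 10");                  // fixed number of clusters
    // selector.setParameter("-I 10 -X 5 -max 300"); // or let EM determine the number of clusters
    Instances selected = selector.apply(testdata, traindata);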
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SeparatabilitySelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 13 27 14 28 /** 15 * A setwise data selection strategy based on the separatability of the training data from the test data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on Defect Prediction. 16 * <br><br> 17 * This is calculated through the error of a logistic regression classifier that tries to separate the sets. 29 * A setwise data selection strategy based on the separatability of the training data from the test 30 * data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An 31 * Empirical Study on Defect Prediction. <br> 32 * <br> 33 * This is calculated through the error of a logistic regression classifier that tries to separate 34 * the sets. 35 * 18 36 * @author Steffen Herbold 19 37 */ 20 38 public class SeparatabilitySelection implements ISetWiseDataselectionStrategy { 21 39 22 /** 23 * size of the random sample that is drawn from both test data and training data 24 */ 25 private final int sampleSize = 500; 26 27 /** 28 * number of repetitions of the sample drawing 29 */ 30 private final int maxRep = 10; 31 32 /** 33 * number of neighbors that are selected 34 */ 35 private int neighbors = 10; 36 37 /** 38 * Sets the number of neighbors that are selected. 
39 */ 40 @Override 41 public void setParameter(String parameters) { 42 if( !"".equals(parameters) ) { 43 neighbors = Integer.parseInt(parameters); 44 } 45 } 40 /** 41 * size of the random sample that is drawn from both test data and training data 42 */ 43 private final int sampleSize = 500; 46 44 47 /** 48 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 49 */ 50 @Override 51 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 52 final Random rand = new Random(1); 53 54 // calculate distances between testdata and traindata 55 final double[] distances = new double[traindataSet.size()]; 56 57 int i=0; 58 for( Instances traindata : traindataSet ) { 59 double distance = 0.0; 60 for( int rep=0; rep<maxRep ; rep++ ) { 61 // sample instances 62 Instances sample = new Instances(testdata); 63 for( int j=0; j<sampleSize; j++ ) { 64 Instance inst = new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 65 inst.setDataset(sample); 66 inst.setClassValue(1.0); 67 sample.add(inst); 68 inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances()))); 69 inst.setDataset(sample); 70 inst.setClassValue(0.0); 71 sample.add(inst); 72 } 73 74 // calculate separation 75 Evaluation eval; 76 try { 77 eval = new Evaluation(sample); 78 eval.crossValidateModel(new Logistic(), sample, 5, rand); 79 } catch (Exception e) { 80 throw new RuntimeException("cross-validation during calculation of separatability failed", e); 81 } 82 distance += eval.pctCorrect()/100.0; 83 } 84 distances[i++] = 2*((distance/maxRep)-0.5); 85 } 86 87 // select closest neighbors 88 final double[] distancesCopy = Arrays.copyOf(distances, distances.length); 89 Arrays.sort(distancesCopy); 90 final double cutoffDistance = distancesCopy[neighbors]; 91 92 for( i=traindataSet.size()-1; i>=0 ; i-- ) { 93 if( distances[i]>cutoffDistance ) { 94 traindataSet.remove(i); 95 } 96 } 97 } 45 /** 46 * number of repetitions of the sample drawing 47 */ 48 private final int maxRep = 10; 49 50 /** 51 * number of neighbors that are selected 52 */ 53 private int neighbors = 10; 54 55 /** 56 * Sets the number of neighbors that are selected. 
57 */ 58 @Override 59 public void setParameter(String parameters) { 60 if (!"".equals(parameters)) { 61 neighbors = Integer.parseInt(parameters); 62 } 63 } 64 65 /** 66 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 67 * org.apache.commons.collections4.list.SetUniqueList) 68 */ 69 @Override 70 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 71 final Random rand = new Random(1); 72 73 // calculate distances between testdata and traindata 74 final double[] distances = new double[traindataSet.size()]; 75 76 int i = 0; 77 for (Instances traindata : traindataSet) { 78 double distance = 0.0; 79 for (int rep = 0; rep < maxRep; rep++) { 80 // sample instances 81 Instances sample = new Instances(testdata); 82 for (int j = 0; j < sampleSize; j++) { 83 Instance inst = 84 new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 85 inst.setDataset(sample); 86 inst.setClassValue(1.0); 87 sample.add(inst); 88 inst = 89 new DenseInstance( 90 traindata.instance(rand.nextInt(traindata.numInstances()))); 91 inst.setDataset(sample); 92 inst.setClassValue(0.0); 93 sample.add(inst); 94 } 95 96 // calculate separation 97 Evaluation eval; 98 try { 99 eval = new Evaluation(sample); 100 eval.crossValidateModel(new Logistic(), sample, 5, rand); 101 } 102 catch (Exception e) { 103 throw new RuntimeException( 104 "cross-validation during calculation of separatability failed", 105 e); 106 } 107 distance += eval.pctCorrect() / 100.0; 108 } 109 distances[i++] = 2 * ((distance / maxRep) - 0.5); 110 } 111 112 // select closest neighbors 113 final double[] distancesCopy = Arrays.copyOf(distances, distances.length); 114 Arrays.sort(distancesCopy); 115 final double cutoffDistance = distancesCopy[neighbors]; 116 117 for (i = traindataSet.size() - 1; i >= 0; i--) { 118 if (distances[i] > cutoffDistance) { 119 traindataSet.remove(i); 120 } 121 } 122 } 98 123 } -
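Editor's note: the separability measure implemented above boils down to a few lines: draw a balanced sample from both sets, label each instance with its origin, cross-validate a logistic regression on that sample, and use 2 * (accuracy - 0.5) as the distance between the sets. The following is a simplified sketch of that idea, not a line-by-line extract of the class (it starts from an empty copy of the test header and performs a single repetition).

    import java.util.Random;
    import weka.classifiers.Evaluation;
    import weka.classifiers.functions.Logistic;
    import weka.core.DenseInstance;
    import weka.core.Instance;
    import weka.core.Instances;

    class SeparabilitySketch {
        static double separability(Instances testdata, Instances traindata, Random rand)
            throws Exception
        {
            Instances sample = new Instances(testdata, 0);  // empty copy of the test data header
            for (int j = 0; j < 500; j++) {                 // sample size as in the class above
                Instance inst = new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances())));
                inst.setDataset(sample);
                inst.setClassValue(1.0);                    // origin: test data
                sample.add(inst);
                inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances())));
                inst.setDataset(sample);
                inst.setClassValue(0.0);                    // origin: training data
                sample.add(inst);
            }
            Evaluation eval = new Evaluation(sample);
            eval.crossValidateModel(new Logistic(), sample, 5, rand);  // 5 folds as above
            return 2 * (eval.pctCorrect() / 100.0 - 0.5);
        }
    }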
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMClusterSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 11 25 12 26 /** 13 * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect prediction 27 * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect 28 * prediction 29 * 14 30 * @author Steffen Herbold 15 31 */ 16 32 public class SetWiseEMClusterSelection extends AbstractCharacteristicSelection { 17 18 /** 19 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 20 */ 21 @Override 22 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 23 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 24 final Instance targetInstance = data.instance(0); 25 final List<Instance> candidateInstances = new LinkedList<Instance>(); 26 for( int i=1; i<data.numInstances(); i++ ) { 27 candidateInstances.add(data.instance(i)); 28 } 29 30 // cluster and select 31 try { 32 final EM emeans = new EM(); 33 boolean onlyTarget = true; 34 int targetCluster; 35 int maxNumClusters = candidateInstances.size(); 36 do { // while(onlyTarget) 37 emeans.setMaximumNumberOfClusters(maxNumClusters); 38 emeans.buildClusterer(data); 39 40 targetCluster = emeans.clusterInstance(targetInstance); 41 42 // check if cluster only contains target project 43 for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) { 44 onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster); 45 } 46 maxNumClusters = emeans.numberOfClusters()-1; 47 } while(onlyTarget); 48 49 int numRemoved = 0; 50 for( int i=0 ; i<candidateInstances.size() ; i++ ) { 51 if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) { 52 traindataSet.remove(i-numRemoved++); 53 } 54 } 55 } catch(Exception e) { 56 throw new RuntimeException("error applying setwise EM clustering training data selection", e); 57 } 58 } 33 34 /** 35 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 36 * org.apache.commons.collections4.list.SetUniqueList) 37 */ 38 @Override 39 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 40 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 41 final Instance targetInstance = data.instance(0); 42 final List<Instance> candidateInstances = new LinkedList<Instance>(); 43 for (int i = 1; i < data.numInstances(); i++) { 44 candidateInstances.add(data.instance(i)); 45 } 46 47 // cluster and select 48 try { 49 final EM emeans = new EM(); 50 boolean onlyTarget = true; 51 int targetCluster; 52 int maxNumClusters = candidateInstances.size(); 53 do { // while(onlyTarget) 54 emeans.setMaximumNumberOfClusters(maxNumClusters); 55 emeans.buildClusterer(data); 56 57 targetCluster = 
emeans.clusterInstance(targetInstance); 58 59 // check if cluster only contains target project 60 for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) { 61 onlyTarget &= 62 !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster); 63 } 64 maxNumClusters = emeans.numberOfClusters() - 1; 65 } 66 while (onlyTarget); 67 68 int numRemoved = 0; 69 for (int i = 0; i < candidateInstances.size(); i++) { 70 if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) { 71 traindataSet.remove(i - numRemoved++); 72 } 73 } 74 } 75 catch (Exception e) { 76 throw new RuntimeException( 77 "error applying setwise EM clustering training data selection", 78 e); 79 } 80 } 59 81 } -
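Editor's note: set-wise strategies such as this one operate on a SetUniqueList of candidate training sets and remove unsuitable projects in place. A minimal sketch of driving the clustering-based selection outside of the experiment runner; the project file names are assumptions, and the distributional characteristics would be configured via setParameter as in the XML configuration (omitted here).

    import java.util.LinkedList;
    import org.apache.commons.collections4.list.SetUniqueList;
    import weka.core.Instances;
    import weka.core.converters.ConverterUtils.DataSource;
    import de.ugoe.cs.cpdp.dataselection.SetWiseEMClusterSelection;

    public class SetWiseSelectionSketch {
        public static void main(String[] args) throws Exception {
            Instances testdata = DataSource.read("target-project.arff");
            testdata.setClassIndex(testdata.numAttributes() - 1);

            SetUniqueList<Instances> traindataSet =
                SetUniqueList.setUniqueList(new LinkedList<Instances>());
            for (String file : new String[] { "project-A.arff", "project-B.arff", "project-C.arff" }) {
                Instances traindata = DataSource.read(file);
                traindata.setClassIndex(traindata.numAttributes() - 1);
                traindataSet.add(traindata);
            }

            SetWiseEMClusterSelection selector = new SetWiseEMClusterSelection();
            selector.apply(testdata, traindataSet);
            // only the projects that EM clustered together with the target remain
            System.out.println(traindataSet.size() + " candidate projects remain");
        }
    }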
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 22 36 * Selects training data by clustering project context factors. 23 37 * 24 * The project context factors used for the clustering are configured in 25 * the XML param attribute, Example: 26 * <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" /> 38 * The project context factors used for the clustering are configured in the XML param attribute, 39 * Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" /> 27 40 */ 28 41 public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy { 29 30 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 31 32 @Override 33 public void setParameter(String parameters) { 34 if( parameters!=null ) { 35 project_context_factors = parameters.split(" "); 36 } 37 } 38 39 /** 40 * Uses the Weka EM-Clustering algorithm to cluster the projects 41 * by their project context factors. 42 * The project context factors are first normalized and then used for clustering. 43 * They can be configured in the configuration param. 44 * 45 * @param testdata 46 * @param traindataSet 47 */ 48 protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) { 49 // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf 50 final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); 51 52 final Instance targetInstance = data.instance(0); 53 final List<Instance> candidateInstances = new LinkedList<Instance>(); 54 for( int i=1; i<data.numInstances(); i++ ) { 55 candidateInstances.add(data.instance(i)); 56 } 57 58 // cluster and select 59 try { 60 final EM emeans = new EM(); 61 boolean onlyTarget = true; 62 int targetCluster; 63 int maxNumClusters = candidateInstances.size(); 64 65 do { // while(onlyTarget) 66 emeans.setMaximumNumberOfClusters(maxNumClusters); 67 emeans.buildClusterer(data); 68 69 targetCluster = emeans.clusterInstance(targetInstance); 70 71 // check if cluster only contains target project 72 for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) { 73 onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster); 74 } 75 maxNumClusters = emeans.numberOfClusters()-1; 76 77 //Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); 78 } while(onlyTarget); 79 80 Console.traceln(Level.INFO, "clusters: " + maxNumClusters); 81 Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); 82 int numRemoved = 0; 83 for( int i=0 ; i<candidateInstances.size() ; i++ ) { 84 if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) { 85 traindataSet.remove(i-numRemoved++); 86 } 87 } 88 Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); 89 } catch(Exception e) { 90 throw new 
RuntimeException("error applying setwise EM clustering training data selection", e); 91 } 92 } 93 94 @Override 95 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 96 // issuetracking und pl muss passen 97 /* 98 int s = traindataSet.size(); 99 Console.traceln(Level.INFO, "remove non matching PL and IssueTracking projects, size now: " + s); 100 this.removeWrongContext(testdata, traindataSet, "PL"); 101 this.removeWrongContext(testdata, traindataSet, "IssueTracking"); 102 s = traindataSet.size(); 103 Console.traceln(Level.INFO, "size after removal: " + s); 104 */ 105 // now cluster 106 this.cluster(testdata, traindataSet); 107 } 108 109 /** 110 * Returns test- and training data with only the project context factors 111 * which were chosen in the configuration. 112 * This is later used for clustering. 113 * 114 * @param testdata 115 * @param traindataSet 116 * @return 117 */ 118 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) { 119 // setup weka Instances for clustering 120 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 121 122 // we only want the project context factors 123 for( String pcf : this.project_context_factors ) { 124 atts.add(new Attribute(pcf)); 125 } 126 127 // set up the data 128 final Instances data = new Instances("project_context_factors", atts, 0); 129 double[] instanceValues = new double[atts.size()]; 130 131 // only project context factors + only one instance per project needed 132 int i = 0; 133 for( String pcf : this.project_context_factors ) { 134 instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); 135 //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]); 136 i++; 137 } 138 data.add(new DenseInstance(1.0, instanceValues)); 139 140 // now for the projects of the training stet 141 for( Instances traindata : traindataSet ) { 142 instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! 
143 i = 0; 144 for( String pcf : this.project_context_factors ) { 145 instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); 146 //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]); 147 i++; 148 } 149 150 data.add(new DenseInstance(1.0, instanceValues)); 151 } 152 153 return data; 154 } 155 156 /** 157 * Delete projects where the project context does not match the training project 158 * 159 * @param testdata 160 * @param traindataSet 161 * @param attribute 162 */ 163 protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet, String attribute) { 164 Set<Instances> remove = new HashSet<Instances>(); 165 for( Instances traindata : traindataSet ) { 166 if( traindata.firstInstance().value(traindata.attribute(attribute)) != testdata.firstInstance().value(testdata.attribute(attribute)) ) { 167 remove.add(traindata); 168 //Console.traceln(Level.WARNING, "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 169 } 170 } 171 172 // now delete the projects from set 173 for( Instances i : remove ) { 174 traindataSet.remove(i); 175 //Console.traceln(Level.INFO, "removing training project from set"); 176 } 177 } 178 179 /** 180 * Normalizes the data before it gets used for clustering 181 * 182 * @param testdata 183 * @param traindataSet 184 * @return 185 */ 186 protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) { 187 Instances data = this.getContextFactors(testdata, traindataSet); 188 try { 189 final Normalize normalizer = new Normalize(); 190 normalizer.setInputFormat(data); 191 data = Filter.useFilter(data, normalizer); 192 } catch (Exception e) { 193 throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e); 194 } 195 return data; 196 } 42 43 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 44 45 @Override 46 public void setParameter(String parameters) { 47 if (parameters != null) { 48 project_context_factors = parameters.split(" "); 49 } 50 } 51 52 /** 53 * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context 54 * factors. The project context factors are first normalized and then used for clustering. They 55 * can be configured in the configuration param. 
56 * 57 * @param testdata 58 * @param traindataSet 59 */ 60 protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) { 61 // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf 62 final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); 63 64 final Instance targetInstance = data.instance(0); 65 final List<Instance> candidateInstances = new LinkedList<Instance>(); 66 for (int i = 1; i < data.numInstances(); i++) { 67 candidateInstances.add(data.instance(i)); 68 } 69 70 // cluster and select 71 try { 72 final EM emeans = new EM(); 73 boolean onlyTarget = true; 74 int targetCluster; 75 int maxNumClusters = candidateInstances.size(); 76 77 do { // while(onlyTarget) 78 emeans.setMaximumNumberOfClusters(maxNumClusters); 79 emeans.buildClusterer(data); 80 81 targetCluster = emeans.clusterInstance(targetInstance); 82 83 // check if cluster only contains target project 84 for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) { 85 onlyTarget &= 86 !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster); 87 } 88 maxNumClusters = emeans.numberOfClusters() - 1; 89 90 // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); 91 } 92 while (onlyTarget); 93 94 Console.traceln(Level.INFO, "clusters: " + maxNumClusters); 95 Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); 96 int numRemoved = 0; 97 for (int i = 0; i < candidateInstances.size(); i++) { 98 if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) { 99 traindataSet.remove(i - numRemoved++); 100 } 101 } 102 Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); 103 } 104 catch (Exception e) { 105 throw new RuntimeException( 106 "error applying setwise EM clustering training data selection", 107 e); 108 } 109 } 110 111 @Override 112 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 113 // issuetracking und pl muss passen 114 /* 115 * int s = traindataSet.size(); Console.traceln(Level.INFO, 116 * "remove non matching PL and IssueTracking projects, size now: " + s); 117 * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata, 118 * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO, 119 * "size after removal: " + s); 120 */ 121 // now cluster 122 this.cluster(testdata, traindataSet); 123 } 124 125 /** 126 * Returns test- and training data with only the project context factors which were chosen in 127 * the configuration. This is later used for clustering. 
128 * 129 * @param testdata 130 * @param traindataSet 131 * @return 132 */ 133 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) 134 { 135 // setup weka Instances for clustering 136 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 137 138 // we only want the project context factors 139 for (String pcf : this.project_context_factors) { 140 atts.add(new Attribute(pcf)); 141 } 142 143 // set up the data 144 final Instances data = new Instances("project_context_factors", atts, 0); 145 double[] instanceValues = new double[atts.size()]; 146 147 // only project context factors + only one instance per project needed 148 int i = 0; 149 for (String pcf : this.project_context_factors) { 150 instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); 151 // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + 152 // instanceValues[i]); 153 i++; 154 } 155 data.add(new DenseInstance(1.0, instanceValues)); 156 157 // now for the projects of the training stet 158 for (Instances traindata : traindataSet) { 159 instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! 160 i = 0; 161 for (String pcf : this.project_context_factors) { 162 instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); 163 // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + 164 // instanceValues[i]); 165 i++; 166 } 167 168 data.add(new DenseInstance(1.0, instanceValues)); 169 } 170 171 return data; 172 } 173 174 /** 175 * Delete projects where the project context does not match the training project 176 * 177 * @param testdata 178 * @param traindataSet 179 * @param attribute 180 */ 181 protected void removeWrongContext(Instances testdata, 182 SetUniqueList<Instances> traindataSet, 183 String attribute) 184 { 185 Set<Instances> remove = new HashSet<Instances>(); 186 for (Instances traindata : traindataSet) { 187 if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata 188 .firstInstance().value(testdata.attribute(attribute))) 189 { 190 remove.add(traindata); 191 // Console.traceln(Level.WARNING, 192 // "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 193 } 194 } 195 196 // now delete the projects from set 197 for (Instances i : remove) { 198 traindataSet.remove(i); 199 // Console.traceln(Level.INFO, "removing training project from set"); 200 } 201 } 202 203 /** 204 * Normalizes the data before it gets used for clustering 205 * 206 * @param testdata 207 * @param traindataSet 208 * @return 209 */ 210 protected Instances normalizedCharacteristicInstances(Instances testdata, 211 SetUniqueList<Instances> traindataSet) 212 { 213 Instances data = this.getContextFactors(testdata, traindataSet); 214 try { 215 final Normalize normalizer = new Normalize(); 216 normalizer.setInputFormat(data); 217 data = Filter.useFilter(data, normalizer); 218 } 219 catch (Exception e) { 220 throw new RuntimeException( 221 "Unexpected exception during normalization of distributional characteristics.", 222 e); 223 } 224 return data; 225 } 197 226 } -
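Editor's note: a brief sketch of the programmatic counterpart of the XML configuration shown in the Javadoc above, reusing testdata and traindataSet as prepared in the set-wise sketch earlier in this changeset. The factor names are taken from that example and must exist as attributes in both the test and the candidate training data.

    SetWiseEMContextSelection selector = new SetWiseEMContextSelection();
    selector.setParameter("AFS TND TNC");  // project context factors used for clustering
    selector.apply(testdata, traindataSet);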
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseKNNSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 10 24 11 25 /** 12 * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for cross-project defect prediction 26 * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for 27 * cross-project defect prediction 28 * 13 29 * @author Steffen Herbold 14 30 */ 15 31 public class SetWiseKNNSelection extends AbstractCharacteristicSelection { 16 17 /**18 * number of neighbors selected19 */20 private int k = 1;21 22 /**23 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)24 */25 @Override26 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {27 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);28 29 final Set<Integer> selected = new HashSet<Integer>();30 for( int i=0 ; i<k ; i++ ) {31 int closestIndex = getClosest(data);32 33 selected.add(closestIndex);34 data.delete(closestIndex);35 }36 37 for( int i=traindataSet.size()-1; i>=0 ; i-- ) {38 if( selected.contains(i) ) {39 traindataSet.remove(i);40 }41 }42 }43 44 /**45 * Helper method that determines the index of the instance with the smallest distance to the first instance (index 0).46 * @param data data set47 * @return index of the closest instance48 */49 private int getClosest(Instances data) {50 double closestDistance = Double.MAX_VALUE;51 int closestIndex = 1;52 for( int i=1 ; i<data.numInstances() ; i++ ) {53 double distance = MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i).toDoubleArray());54 if( distance < closestDistance) {55 closestDistance = distance;56 closestIndex = i;57 }58 }59 return closestIndex;60 }61 32 62 /** 63 * Sets the number of neighbors followed by the distributional characteristics, the values are separated by blanks. 
64 * @see AbstractCharacteristicSelection#setParameter(String) 65 */ 66 @Override 67 public void setParameter(String parameters) { 68 if( !"".equals(parameters) ) { 69 final String[] split = parameters.split(" "); 70 k = Integer.parseInt(split[0]); 71 String str = ""; 72 for( int i=1 ; i<split.length; i++ ) { 73 str += split[i]; 74 if( i<split.length-1 ) { 75 str += " "; 76 } 77 } 78 super.setParameter(str); 79 } 80 } 33 /** 34 * number of neighbors selected 35 */ 36 private int k = 1; 37 38 /** 39 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 40 * org.apache.commons.collections4.list.SetUniqueList) 41 */ 42 @Override 43 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 44 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 45 46 final Set<Integer> selected = new HashSet<Integer>(); 47 for (int i = 0; i < k; i++) { 48 int closestIndex = getClosest(data); 49 50 selected.add(closestIndex); 51 data.delete(closestIndex); 52 } 53 54 for (int i = traindataSet.size() - 1; i >= 0; i--) { 55 if (selected.contains(i)) { 56 traindataSet.remove(i); 57 } 58 } 59 } 60 61 /** 62 * Helper method that determines the index of the instance with the smallest distance to the 63 * first instance (index 0). 64 * 65 * @param data 66 * data set 67 * @return index of the closest instance 68 */ 69 private int getClosest(Instances data) { 70 double closestDistance = Double.MAX_VALUE; 71 int closestIndex = 1; 72 for (int i = 1; i < data.numInstances(); i++) { 73 double distance = 74 MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i) 75 .toDoubleArray()); 76 if (distance < closestDistance) { 77 closestDistance = distance; 78 closestIndex = i; 79 } 80 } 81 return closestIndex; 82 } 83 84 /** 85 * Sets the number of neighbors followed by the distributional characteristics, the values are 86 * separated by blanks. 87 * 88 * @see AbstractCharacteristicSelection#setParameter(String) 89 */ 90 @Override 91 public void setParameter(String parameters) { 92 if (!"".equals(parameters)) { 93 final String[] split = parameters.split(" "); 94 k = Integer.parseInt(split[0]); 95 String str = ""; 96 for (int i = 1; i < split.length; i++) { 97 str += split[i]; 98 if (i < split.length - 1) { 99 str += " "; 100 } 101 } 102 super.setParameter(str); 103 } 104 } 81 105 } -
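Editor's note: a minimal sketch of the k-NN variant, again reusing the prepared set-wise data. Only the number of neighbours is passed here; per the Javadoc it may be followed by blank-separated names of distributional characteristics, which are handled by AbstractCharacteristicSelection (not shown in this excerpt).

    SetWiseKNNSelection selector = new SetWiseKNNSelection();
    selector.setParameter("3");  // keep the 3 closest candidate projects
    selector.apply(testdata, traindataSet);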
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/TestAsTraining.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 7 21 /** 8 22 * Uses the test data as training data. 23 * 9 24 * @author Steffen Herbold 10 * 25 * 11 26 */ 12 27 public class TestAsTraining implements ISetWiseDataselectionStrategy { 13 28 14 /**15 * no parameters16 */17 @Override18 public void setParameter(String parameters) {19 // dummy20 }29 /** 30 * no parameters 31 */ 32 @Override 33 public void setParameter(String parameters) { 34 // dummy 35 } 21 36 22 /**(non-Javadoc) 23 * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 24 */ 25 @Override 26 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 27 traindataSet.clear(); 28 traindataSet.add(new Instances(testdata)); 29 } 37 /** 38 * (non-Javadoc) 39 * 40 * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, 41 * org.apache.commons.collections4.list.SetUniqueList) 42 */ 43 @Override 44 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 45 traindataSet.clear(); 46 traindataSet.add(new Instances(testdata)); 47 } 30 48 31 49 } -
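Editor's note: TestAsTraining is the degenerate case of a set-wise strategy: it empties the candidate list and inserts a copy of the test data, which effectively turns the experiment into a within-project baseline. A two-line sketch of the observable effect, reusing the set-wise data from above.

    TestAsTraining selector = new TestAsTraining();
    selector.apply(testdata, traindataSet);
    // traindataSet now holds exactly one entry: a copy of testdata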
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/TurhanFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 13 27 14 28 /** 15 * Filter according to B. Turhan, T. Menzies, A. Bener, and J. Die Stefano: On the relative value of cross-company and within company defect prediction 29 * Filter according to B. Turhan, T. Menzies, A. Bener, and J. Die Stefano: On the relative value of 30 * cross-company and within company defect prediction 31 * 16 32 * @author Steffen Herbold 17 33 */ 18 34 public class TurhanFilter implements IPointWiseDataselectionStrategy { 19 35 20 /** 21 * number of neighbors that are selected 22 */ 23 private int k = 10; 24 25 /** 26 * Sets the number of neighbors. 27 * @param parameters number of neighbors 28 */ 29 @Override 30 public void setParameter(String parameters) { 31 k = Integer.parseInt(parameters); 32 } 36 /** 37 * number of neighbors that are selected 38 */ 39 private int k = 10; 33 40 34 /** 35 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, weka.core.Instances) 36 */ 37 @Override 38 public Instances apply(Instances testdata, Instances traindata) { 39 final Attribute classAttribute = testdata.classAttribute(); 40 41 final List<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 42 43 final double[][] trainDoubles = new double[traindata.numInstances()][testdata.numAttributes()]; 44 45 for( int i=0; i<traindata.numInstances() ; i++ ) { 46 Instance instance = traindata.instance(i); 47 int tmp = 0; 48 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 49 if( testdata.attribute(j)!=classAttribute ) { 50 trainDoubles[i][tmp++] = instance.value(j); 51 } 52 } 53 } 54 55 for( int i=0; i<testdata.numInstances() ; i++ ) { 56 Instance testIntance = testdata.instance(i); 57 double[] targetVector = new double[testdata.numAttributes()-1]; 58 int tmp = 0; 59 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 60 if( testdata.attribute(j)!=classAttribute ) { 61 targetVector[tmp++] = testIntance.value(j); 62 } 63 } 64 65 double farthestClosestDistance = Double.MAX_VALUE; 66 int farthestClosestIndex = 0; 67 double[] closestDistances = new double[k]; 68 for( int m=0 ; m<closestDistances.length ; m++ ) { 69 closestDistances[m] = Double.MAX_VALUE; 70 } 71 int[] closestIndex = new int[k]; 72 73 for( int n=0; n<traindata.numInstances() ; n++ ) { 74 double distance = MathArrays.distance(targetVector, trainDoubles[n]); 75 76 if( distance<farthestClosestDistance ) { 77 closestIndex[farthestClosestIndex] = n; 78 closestDistances[farthestClosestIndex] = distance; 79 80 farthestClosestIndex = ArrayTools.findMax(closestDistances); 81 farthestClosestDistance = closestDistances[farthestClosestIndex]; 82 } 83 } 84 for( int index : closestIndex ) { 85 selectedIndex.add(index); 86 } 87 } 88 89 final Instances selected = new Instances(testdata); 90 selected.delete(); 91 for( Integer i : selectedIndex) { 92 
selected.add(traindata.instance(i)); 93 } 94 return selected; 95 } 41 /** 42 * Sets the number of neighbors. 43 * 44 * @param parameters 45 * number of neighbors 46 */ 47 @Override 48 public void setParameter(String parameters) { 49 k = Integer.parseInt(parameters); 50 } 51 52 /** 53 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, 54 * weka.core.Instances) 55 */ 56 @Override 57 public Instances apply(Instances testdata, Instances traindata) { 58 final Attribute classAttribute = testdata.classAttribute(); 59 60 final List<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 61 62 final double[][] trainDoubles = 63 new double[traindata.numInstances()][testdata.numAttributes()]; 64 65 for (int i = 0; i < traindata.numInstances(); i++) { 66 Instance instance = traindata.instance(i); 67 int tmp = 0; 68 for (int j = 0; j < testdata.numAttributes(); j++) { 69 if (testdata.attribute(j) != classAttribute) { 70 trainDoubles[i][tmp++] = instance.value(j); 71 } 72 } 73 } 74 75 for (int i = 0; i < testdata.numInstances(); i++) { 76 Instance testIntance = testdata.instance(i); 77 double[] targetVector = new double[testdata.numAttributes() - 1]; 78 int tmp = 0; 79 for (int j = 0; j < testdata.numAttributes(); j++) { 80 if (testdata.attribute(j) != classAttribute) { 81 targetVector[tmp++] = testIntance.value(j); 82 } 83 } 84 85 double farthestClosestDistance = Double.MAX_VALUE; 86 int farthestClosestIndex = 0; 87 double[] closestDistances = new double[k]; 88 for (int m = 0; m < closestDistances.length; m++) { 89 closestDistances[m] = Double.MAX_VALUE; 90 } 91 int[] closestIndex = new int[k]; 92 93 for (int n = 0; n < traindata.numInstances(); n++) { 94 double distance = MathArrays.distance(targetVector, trainDoubles[n]); 95 96 if (distance < farthestClosestDistance) { 97 closestIndex[farthestClosestIndex] = n; 98 closestDistances[farthestClosestIndex] = distance; 99 100 farthestClosestIndex = ArrayTools.findMax(closestDistances); 101 farthestClosestDistance = closestDistances[farthestClosestIndex]; 102 } 103 } 104 for (int index : closestIndex) { 105 selectedIndex.add(index); 106 } 107 } 108 109 final Instances selected = new Instances(testdata); 110 selected.delete(); 111 for (Integer i : selectedIndex) { 112 selected.add(traindata.instance(i)); 113 } 114 return selected; 115 } 96 116 97 117 } -
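Editor's note: a minimal sketch of the Turhan filter, reusing the point-wise Instances from the PetersFilter sketch above. The parameter is the number of nearest neighbours kept per test instance; 10 is also the default in the class.

    TurhanFilter filter = new TurhanFilter();
    filter.setParameter("10");
    Instances selected = filter.apply(testdata, traindata);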
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/ARFFxResourceTool.java
r36 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 11 25 * 12 26 * @author Philip Makedonski, Fabian Trautsch 13 * 27 * 14 28 */ 15 29 public class ARFFxResourceTool extends ResourceTool { 16 17 /** 18 * Initializes the Tool Factory, from which the models can be loaded and 19 * inizializes the validator. 20 */ 21 public ARFFxResourceTool(){ 22 super(ARFFxResourceTool.class.getName()); 23 ARFFxPackageImpl.init(); 24 25 // Commented, because simulation has problems with this 26 initializeValidator(); 27 } 28 29 /** 30 * Inizializes the model validator 31 */ 32 @Override 33 protected void initializeValidator(){ 34 super.initializeValidator(); 35 EObjectValidator validator = new EObjectValidator(); 36 EValidator.Registry.INSTANCE.put(ARFFxPackage.eINSTANCE, validator); 37 } 38 30 31 /** 32 * Initializes the Tool Factory, from which the models can be loaded and inizializes the 33 * validator. 34 */ 35 public ARFFxResourceTool() { 36 super(ARFFxResourceTool.class.getName()); 37 ARFFxPackageImpl.init(); 38 39 // Commented, because simulation has problems with this 40 initializeValidator(); 41 } 42 43 /** 44 * Inizializes the model validator 45 */ 46 @Override 47 protected void initializeValidator() { 48 super.initializeValidator(); 49 EObjectValidator validator = new EObjectValidator(); 50 EValidator.Registry.INSTANCE.put(ARFFxPackage.eINSTANCE, validator); 51 } 39 52 40 53 } -
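Editor's note: the tool inherits its load and store operations from ResourceTool; the fragment below mirrors the calls used by DECENTEpsilonModelHandler further down in this changeset. The file names are assumptions, and the import of the generated ARFFxPackage metamodel class is omitted because its package path is not shown in this excerpt.

    // hypothetical XMI-to-binary round trip of an ARFFx model
    ARFFxResourceTool tool = new ARFFxResourceTool();
    Resource resource = tool.loadResourceFromXMI("model.arffx", "arffx", ARFFxPackage.eINSTANCE);
    tool.storeBinaryResourceContents(resource.getContents(), "model.arffxbin", "arffxbin");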
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/DECENTEpsilonModelHandler.java
r36 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 20 34 * 21 35 * @author Philip Makedonski, Fabian Trautsch 22 * 36 * 23 37 */ 24 38 25 39 public class DECENTEpsilonModelHandler { 26 private HashMap<String, Object> metaModelCache = new HashMap<>(); 27 private boolean useDECENTBinary = false; 28 private boolean useARFFxBinary = false; 29 30 public static String metaPath = "./decent/models/"; 31 32 /** 33 * Returns the decent model as IModel instance 34 * 35 * @param decentModelLocation location of the decent model file 36 * @param read indicates if the model should be read from 37 * @param write indicates if data should be written in the model 38 * @return EmFModel (IModel) instance from the decent model, which was loaded 39 * @throws Exception 40 */ 41 public IModel getDECENTModel(String decentModelLocation, boolean read, boolean write) throws Exception { 42 43 EmfModel model; 44 45 if (isUseDECENTBinary()) { 46 unregisterMetaModels(""); 47 if (!read) { 48 new File(decentModelLocation).delete(); 49 new File(decentModelLocation+"bin").delete(); 50 } 51 DECENTResourceTool tool = new DECENTResourceTool(); 52 if (new File(decentModelLocation).exists() && !new File(decentModelLocation+"bin").exists()) { 53 Resource resource = tool.loadResourceFromXMI(decentModelLocation,"decent", DECENTPackage.eINSTANCE); 54 tool.storeBinaryResourceContents(resource.getContents(), decentModelLocation+"bin", "decentbin"); 55 } 56 57 Resource resourceBin = tool.loadResourceFromBinary(decentModelLocation+"bin","decentbin", DECENTPackage.eINSTANCE); 58 //alternative pattern 59 // model = createInMemoryEmfModel("DECENT", resourceLocation, "../DECENT.Meta/model/DECENTv3.ecore", read, write, resourceBin, DECENTPackage.eINSTANCE); 60 // restoreMetaModels(); 61 62 //NOTE: Adding the package is essential as otherwise epsilon breaks 63 model = new InMemoryEmfModel("DECENT", resourceBin, DECENTPackage.eINSTANCE); 64 model.setStoredOnDisposal(write); 65 model.setReadOnLoad(read); 66 model.setCachingEnabled(true); 67 restoreMetaModels(); 68 } else { 69 model = createEmfModel("DECENT", decentModelLocation, metaPath+"DECENTv3.ecore", read, write); 70 } 71 72 return model; 73 } 74 75 /** 76 * Converts the decent model to a binary form 77 * 78 * @param location of the decent model file 79 */ 80 public void convertDECENTModelToBinary(String location) { 81 unregisterMetaModels(""); 82 DECENTResourceTool tool = new DECENTResourceTool(); 83 Resource resource = tool.loadResourceFromXMI(location+"/model.decent","decent", DECENTPackage.eINSTANCE); 84 tool.storeBinaryResourceContents(resource.getContents(), location+"/model.decent"+"bin", "decentbin"); 85 restoreMetaModels(); 86 } 87 88 /** 89 * Converts the decent model to a xmi form 90 * 91 * @param location of the decent model file 92 */ 93 94 public void convertDECENTModelToXMI(String location) { 95 unregisterMetaModels(""); 96 
DECENTResourceTool tool = new DECENTResourceTool(); 97 Resource resource = tool.loadResourceFromBinary(location+"/model.decentbin","decentbin", DECENTPackage.eINSTANCE); 98 restoreMetaModels(); 99 tool.storeResourceContents(resource.getContents(), location+"/model.decent", "decent"); 100 } 101 102 /** 103 * Returns the arffx model as IModel instance 104 * 105 * @param arffxModelLocation location of the arffx model file 106 * @param read indicates if the model should be read from 107 * @param write indicates if data should be written in the model 108 * @return EmFModel (IModel) instance from the arffx model, which was loaded 109 * @throws Exception 110 */ 111 112 public IModel getARFFxModel(String arffxModelLocation, boolean read, boolean write) throws Exception { 113 114 EmfModel model; 115 116 if (isUseARFFxBinary()) { 117 unregisterMetaModels(""); 118 if (!read) { 119 new File(arffxModelLocation).delete(); 120 new File(arffxModelLocation+"bin").delete(); 121 } 122 ARFFxResourceTool tool = new ARFFxResourceTool(); 123 if (new File(arffxModelLocation).exists() && !new File(arffxModelLocation+"bin").exists()) { 124 Resource resource = tool.loadResourceFromXMI(arffxModelLocation,"arffx", ARFFxPackage.eINSTANCE); 125 tool.storeBinaryResourceContents(resource.getContents(), arffxModelLocation+"bin", "arffxbin"); 126 } 127 128 Resource resourceBin = tool.loadResourceFromBinary(arffxModelLocation+"bin","arffxbin", ARFFxPackage.eINSTANCE); 129 //alternative pattern 130 // model = createInMemoryEmfModel("DECENT", resourceLocation, "../DECENT.Meta/model/DECENTv3.ecore", read, write, resourceBin, DECENTPackage.eINSTANCE); 131 // restoreMetaModels(); 132 133 //NOTE: Adding the package is essential as otherwise epsilon breaks 134 model = new InMemoryEmfModel("ARFFx", resourceBin, ARFFxPackage.eINSTANCE); 135 // model.getModelImpl().getURI().toFileString() 136 model.setStoredOnDisposal(write); 137 model.setReadOnLoad(read); 138 model.setCachingEnabled(true); 139 restoreMetaModels(); 140 } else { 141 model = createEmfModel("ARFFx", arffxModelLocation, metaPath+"ARFFx.ecore", read, write); 142 } 143 144 return model; 145 } 146 147 148 /** 149 * Converts an arffx model to a binary version 150 * 151 * @param location of the arffx model 152 */ 153 public void convertARFFxModelToBinary(String location) { 154 unregisterMetaModels(""); 155 ARFFxResourceTool tool = new ARFFxResourceTool(); 156 Resource resource = tool.loadResourceFromXMI(location+"/model.arffx","arffx", ARFFxPackage.eINSTANCE); 157 tool.storeBinaryResourceContents(resource.getContents(), location+"/model.arffx"+"bin", "arffxbin"); 158 restoreMetaModels(); 159 } 160 161 /** 162 * Converts an arffx model to xmi 163 * 164 * @param location of the arffx model 165 */ 166 167 public void convertARFFxModelToXMI(String location) { 168 unregisterMetaModels(""); 169 ARFFxResourceTool tool = new ARFFxResourceTool(); 170 Resource resource = tool.loadResourceFromBinary(location+"/model.arffxbin","arffxbin", DECENTPackage.eINSTANCE); 171 restoreMetaModels(); 172 tool.storeResourceContents(resource.getContents(), location+"/model.arffx", "arffx"); 173 } 174 175 176 /** 177 * Returns the log model as IModel instance 178 * 179 * @param logModelLocation location of the log model file 180 * @param read indicates if the model should be read from 181 * @param write indicates if data should be written in the model 182 * @return EmFModel (IModel) instance from the log model, which was loaded 183 * @throws Exception 184 */ 185 186 public IModel getLOGModel(String 
logModelLocation, boolean read, boolean write) throws Exception { 187 if (!new File(logModelLocation).exists()) { 188 read = false; 189 } 190 IModel model = createEmfModel("LOG", logModelLocation, metaPath +"LOG.ecore", read, write); 191 System.setProperty("epsilon.logFileAvailable", "true"); 192 return model; 193 } 194 195 /** 196 * Creates an EMF Model 197 * 198 * @param name of the emf model 199 * @param model name of the model 200 * @param metamodel name of the metamodel 201 * @param readOnLoad indicates if the model should be read on load 202 * @param storeOnDisposal indicates if the model should be stored on disposal 203 * @return 204 * @throws EolModelLoadingException 205 * @throws URISyntaxException 206 */ 207 208 @SuppressWarnings("deprecation") 209 protected EmfModel createEmfModel(String name, String model, 210 String metamodel, boolean readOnLoad, boolean storeOnDisposal) 211 throws EolModelLoadingException, URISyntaxException { 212 EmfModel emfModel = new EmfModel(); 213 StringProperties properties = new StringProperties(); 214 properties.put(EmfModel.PROPERTY_NAME, name); 215 properties.put(EmfModel.PROPERTY_ALIASES, name); 216 properties.put(EmfModel.PROPERTY_FILE_BASED_METAMODEL_URI, 217 "file:/" + getFile(metamodel).getAbsolutePath()); 218 properties.put(EmfModel.PROPERTY_MODEL_URI, 219 "file:/" + getFile(model).getAbsolutePath()); 220 properties.put(EmfModel.PROPERTY_IS_METAMODEL_FILE_BASED, "true"); 221 properties.put(EmfModel.PROPERTY_READONLOAD, readOnLoad + ""); 222 properties.put(EmfModel.PROPERTY_CACHED, "true"); 223 properties.put(EmfModel.PROPERTY_STOREONDISPOSAL, 224 storeOnDisposal + ""); 225 emfModel.load(properties, ""); 226 //System.out.println(emfModel.allContents()); 227 return emfModel; 228 } 229 230 /** 231 * Returns a new File instance from the given filename 232 * 233 * @param fileName of the file 234 * @return 235 * @throws URISyntaxException 236 */ 237 public File getFile(String fileName) throws URISyntaxException {; 238 return new File(fileName); 239 } 240 241 /** 242 * Restores the metamodels, so that they are registered in the 243 * EPackage registry 244 */ 245 private void restoreMetaModels() { 246 for (String key : metaModelCache .keySet()) { 247 EPackage.Registry.INSTANCE.put(key, metaModelCache.get(key)); 248 }; 249 } 250 251 /** 252 * Unregister the metamodels from the EPackage registry 253 * 254 * @param filter for filtering out certain instances 255 */ 256 private void unregisterMetaModels(String filter) { 257 for (String key : EPackage.Registry.INSTANCE.keySet()) { 258 if (key.contains(filter)) { 259 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 260 } 261 }; 262 for (String key : metaModelCache .keySet()) { 263 EPackage.Registry.INSTANCE.remove(key); 264 }; 265 } 266 267 /** 268 * Returns true if decent binary model is used 269 * @return 270 */ 271 272 public boolean isUseDECENTBinary() { 273 return useDECENTBinary; 274 } 275 276 /** 277 * Sets the boolean which indicates, if the decent binary 278 * model is used 279 * @param useDECENTBinary 280 */ 281 public void setUseDECENTBinary(boolean useDECENTBinary) { 282 this.useDECENTBinary = useDECENTBinary; 283 } 284 285 /** 286 * Returns true if arffx binary model is used 287 * @return 288 */ 289 public boolean isUseARFFxBinary() { 290 return useARFFxBinary; 291 } 292 293 /** 294 * Sets the boolean which indicates, if the arffx binary 295 * model is used 296 * @param useARFFxBinary 297 */ 298 299 public void setUseARFFxBinary(boolean useARFFxBinary) { 300 this.useARFFxBinary = 
useARFFxBinary; 301 } 302 303 40 private HashMap<String, Object> metaModelCache = new HashMap<>(); 41 private boolean useDECENTBinary = false; 42 private boolean useARFFxBinary = false; 43 44 public static String metaPath = "./decent/models/"; 45 46 /** 47 * Returns the decent model as IModel instance 48 * 49 * @param decentModelLocation 50 * location of the decent model file 51 * @param read 52 * indicates if the model should be read from 53 * @param write 54 * indicates if data should be written in the model 55 * @return EmFModel (IModel) instance from the decent model, which was loaded 56 * @throws Exception 57 */ 58 public IModel getDECENTModel(String decentModelLocation, boolean read, boolean write) 59 throws Exception 60 { 61 62 EmfModel model; 63 64 if (isUseDECENTBinary()) { 65 unregisterMetaModels(""); 66 if (!read) { 67 new File(decentModelLocation).delete(); 68 new File(decentModelLocation + "bin").delete(); 69 } 70 DECENTResourceTool tool = new DECENTResourceTool(); 71 if (new File(decentModelLocation).exists() && 72 !new File(decentModelLocation + "bin").exists()) 73 { 74 Resource resource = 75 tool.loadResourceFromXMI(decentModelLocation, "decent", DECENTPackage.eINSTANCE); 76 tool.storeBinaryResourceContents(resource.getContents(), decentModelLocation + 77 "bin", "decentbin"); 78 } 79 80 Resource resourceBin = 81 tool.loadResourceFromBinary(decentModelLocation + "bin", "decentbin", 82 DECENTPackage.eINSTANCE); 83 // alternative pattern 84 // model = createInMemoryEmfModel("DECENT", resourceLocation, 85 // "../DECENT.Meta/model/DECENTv3.ecore", read, write, resourceBin, 86 // DECENTPackage.eINSTANCE); 87 // restoreMetaModels(); 88 89 // NOTE: Adding the package is essential as otherwise epsilon breaks 90 model = new InMemoryEmfModel("DECENT", resourceBin, DECENTPackage.eINSTANCE); 91 model.setStoredOnDisposal(write); 92 model.setReadOnLoad(read); 93 model.setCachingEnabled(true); 94 restoreMetaModels(); 95 } 96 else { 97 model = 98 createEmfModel("DECENT", decentModelLocation, metaPath + "DECENTv3.ecore", read, 99 write); 100 } 101 102 return model; 103 } 104 105 /** 106 * Converts the decent model to a binary form 107 * 108 * @param location 109 * of the decent model file 110 */ 111 public void convertDECENTModelToBinary(String location) { 112 unregisterMetaModels(""); 113 DECENTResourceTool tool = new DECENTResourceTool(); 114 Resource resource = 115 tool.loadResourceFromXMI(location + "/model.decent", "decent", DECENTPackage.eINSTANCE); 116 tool.storeBinaryResourceContents(resource.getContents(), 117 location + "/model.decent" + "bin", "decentbin"); 118 restoreMetaModels(); 119 } 120 121 /** 122 * Converts the decent model to a xmi form 123 * 124 * @param location 125 * of the decent model file 126 */ 127 128 public void convertDECENTModelToXMI(String location) { 129 unregisterMetaModels(""); 130 DECENTResourceTool tool = new DECENTResourceTool(); 131 Resource resource = 132 tool.loadResourceFromBinary(location + "/model.decentbin", "decentbin", 133 DECENTPackage.eINSTANCE); 134 restoreMetaModels(); 135 tool.storeResourceContents(resource.getContents(), location + "/model.decent", "decent"); 136 } 137 138 /** 139 * Returns the arffx model as IModel instance 140 * 141 * @param arffxModelLocation 142 * location of the arffx model file 143 * @param read 144 * indicates if the model should be read from 145 * @param write 146 * indicates if data should be written in the model 147 * @return EmFModel (IModel) instance from the arffx model, which was loaded 148 * @throws Exception 
149 */ 150 151 public IModel getARFFxModel(String arffxModelLocation, boolean read, boolean write) 152 throws Exception 153 { 154 155 EmfModel model; 156 157 if (isUseARFFxBinary()) { 158 unregisterMetaModels(""); 159 if (!read) { 160 new File(arffxModelLocation).delete(); 161 new File(arffxModelLocation + "bin").delete(); 162 } 163 ARFFxResourceTool tool = new ARFFxResourceTool(); 164 if (new File(arffxModelLocation).exists() && 165 !new File(arffxModelLocation + "bin").exists()) 166 { 167 Resource resource = 168 tool.loadResourceFromXMI(arffxModelLocation, "arffx", ARFFxPackage.eINSTANCE); 169 tool.storeBinaryResourceContents(resource.getContents(), 170 arffxModelLocation + "bin", "arffxbin"); 171 } 172 173 Resource resourceBin = 174 tool.loadResourceFromBinary(arffxModelLocation + "bin", "arffxbin", 175 ARFFxPackage.eINSTANCE); 176 // alternative pattern 177 // model = createInMemoryEmfModel("DECENT", resourceLocation, 178 // "../DECENT.Meta/model/DECENTv3.ecore", read, write, resourceBin, 179 // DECENTPackage.eINSTANCE); 180 // restoreMetaModels(); 181 182 // NOTE: Adding the package is essential as otherwise epsilon breaks 183 model = new InMemoryEmfModel("ARFFx", resourceBin, ARFFxPackage.eINSTANCE); 184 // model.getModelImpl().getURI().toFileString() 185 model.setStoredOnDisposal(write); 186 model.setReadOnLoad(read); 187 model.setCachingEnabled(true); 188 restoreMetaModels(); 189 } 190 else { 191 model = 192 createEmfModel("ARFFx", arffxModelLocation, metaPath + "ARFFx.ecore", read, write); 193 } 194 195 return model; 196 } 197 198 /** 199 * Converts an arffx model to a binary version 200 * 201 * @param location 202 * of the arffx model 203 */ 204 public void convertARFFxModelToBinary(String location) { 205 unregisterMetaModels(""); 206 ARFFxResourceTool tool = new ARFFxResourceTool(); 207 Resource resource = 208 tool.loadResourceFromXMI(location + "/model.arffx", "arffx", ARFFxPackage.eINSTANCE); 209 tool.storeBinaryResourceContents(resource.getContents(), location + "/model.arffx" + "bin", 210 "arffxbin"); 211 restoreMetaModels(); 212 } 213 214 /** 215 * Converts an arffx model to xmi 216 * 217 * @param location 218 * of the arffx model 219 */ 220 221 public void convertARFFxModelToXMI(String location) { 222 unregisterMetaModels(""); 223 ARFFxResourceTool tool = new ARFFxResourceTool(); 224 Resource resource = 225 tool.loadResourceFromBinary(location + "/model.arffxbin", "arffxbin", 226 DECENTPackage.eINSTANCE); 227 restoreMetaModels(); 228 tool.storeResourceContents(resource.getContents(), location + "/model.arffx", "arffx"); 229 } 230 231 /** 232 * Returns the log model as IModel instance 233 * 234 * @param logModelLocation 235 * location of the log model file 236 * @param read 237 * indicates if the model should be read from 238 * @param write 239 * indicates if data should be written in the model 240 * @return EmFModel (IModel) instance from the log model, which was loaded 241 * @throws Exception 242 */ 243 244 public IModel getLOGModel(String logModelLocation, boolean read, boolean write) 245 throws Exception 246 { 247 if (!new File(logModelLocation).exists()) { 248 read = false; 249 } 250 IModel model = createEmfModel("LOG", logModelLocation, metaPath + "LOG.ecore", read, write); 251 System.setProperty("epsilon.logFileAvailable", "true"); 252 return model; 253 } 254 255 /** 256 * Creates an EMF Model 257 * 258 * @param name 259 * of the emf model 260 * @param model 261 * name of the model 262 * @param metamodel 263 * name of the metamodel 264 * @param readOnLoad 265 * 
indicates if the model should be read on load 266 * @param storeOnDisposal 267 * indicates if the model should be stored on disposal 268 * @return 269 * @throws EolModelLoadingException 270 * @throws URISyntaxException 271 */ 272 273 @SuppressWarnings("deprecation") 274 protected EmfModel createEmfModel(String name, 275 String model, 276 String metamodel, 277 boolean readOnLoad, 278 boolean storeOnDisposal) throws EolModelLoadingException, 279 URISyntaxException 280 { 281 EmfModel emfModel = new EmfModel(); 282 StringProperties properties = new StringProperties(); 283 properties.put(EmfModel.PROPERTY_NAME, name); 284 properties.put(EmfModel.PROPERTY_ALIASES, name); 285 properties.put(EmfModel.PROPERTY_FILE_BASED_METAMODEL_URI, "file:/" + 286 getFile(metamodel).getAbsolutePath()); 287 properties.put(EmfModel.PROPERTY_MODEL_URI, "file:/" + getFile(model).getAbsolutePath()); 288 properties.put(EmfModel.PROPERTY_IS_METAMODEL_FILE_BASED, "true"); 289 properties.put(EmfModel.PROPERTY_READONLOAD, readOnLoad + ""); 290 properties.put(EmfModel.PROPERTY_CACHED, "true"); 291 properties.put(EmfModel.PROPERTY_STOREONDISPOSAL, storeOnDisposal + ""); 292 emfModel.load(properties, ""); 293 // System.out.println(emfModel.allContents()); 294 return emfModel; 295 } 296 297 /** 298 * Returns a new File instance from the given filename 299 * 300 * @param fileName 301 * of the file 302 * @return 303 * @throws URISyntaxException 304 */ 305 public File getFile(String fileName) throws URISyntaxException { 306 ; 307 return new File(fileName); 308 } 309 310 /** 311 * Restores the metamodels, so that they are registered in the EPackage registry 312 */ 313 private void restoreMetaModels() { 314 for (String key : metaModelCache.keySet()) { 315 EPackage.Registry.INSTANCE.put(key, metaModelCache.get(key)); 316 }; 317 } 318 319 /** 320 * Unregister the metamodels from the EPackage registry 321 * 322 * @param filter 323 * for filtering out certain instances 324 */ 325 private void unregisterMetaModels(String filter) { 326 for (String key : EPackage.Registry.INSTANCE.keySet()) { 327 if (key.contains(filter)) { 328 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 329 } 330 }; 331 for (String key : metaModelCache.keySet()) { 332 EPackage.Registry.INSTANCE.remove(key); 333 }; 334 } 335 336 /** 337 * Returns true if decent binary model is used 338 * 339 * @return 340 */ 341 342 public boolean isUseDECENTBinary() { 343 return useDECENTBinary; 344 } 345 346 /** 347 * Sets the boolean which indicates, if the decent binary model is used 348 * 349 * @param useDECENTBinary 350 */ 351 public void setUseDECENTBinary(boolean useDECENTBinary) { 352 this.useDECENTBinary = useDECENTBinary; 353 } 354 355 /** 356 * Returns true if arffx binary model is used 357 * 358 * @return 359 */ 360 public boolean isUseARFFxBinary() { 361 return useARFFxBinary; 362 } 363 364 /** 365 * Sets the boolean which indicates, if the arffx binary model is used 366 * 367 * @param useARFFxBinary 368 */ 369 370 public void setUseARFFxBinary(boolean useARFFxBinary) { 371 this.useARFFxBinary = useARFFxBinary; 372 } 373 304 374 } -
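For orientation, a minimal usage sketch of the model handler above; the no-argument constructor and the model path are assumptions for illustration only, and getDECENTModel declares throws Exception, so a real caller must handle that:

    DECENTEpsilonModelHandler handler = new DECENTEpsilonModelHandler();
    handler.setUseDECENTBinary(true);   // prefer the binary resource path shown above
    // read = true, write = false: load the model read-only
    IModel decent = handler.getDECENTModel("./decent/models/model.decent", true, false);
    // ... run Epsilon (EOL/ETL) scripts against the model here ...
    decent.dispose();                   // contents are stored on disposal only if write == true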
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/DECENTResourceTool.java
r36 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 12 26 * 13 27 * @author Philip Makedonski, Fabian Trautsch 14 * 28 * 15 29 */ 16 30 public class DECENTResourceTool extends ResourceTool { 17 18 /** 19 * Initializes the Tool Factory, from which the models can be loaded and 20 * inizializes the validator. 21 */ 22 public DECENTResourceTool(){ 23 super(DECENTResourceTool.class.getName()); 24 DECENTPackageImpl.init(); 25 this.resourceFactory = new DECENTResourceFactoryImpl(); 26 initializeValidator(); 27 } 28 29 /** 30 * Inizializes the model validator 31 */ 32 @Override 33 protected void initializeValidator(){ 34 super.initializeValidator(); 35 EObjectValidator validator = new EObjectValidator(); 36 EValidator.Registry.INSTANCE.put(DECENTPackage.eINSTANCE, validator); 37 } 38 39 31 32 /** 33 * Initializes the Tool Factory, from which the models can be loaded and inizializes the 34 * validator. 35 */ 36 public DECENTResourceTool() { 37 super(DECENTResourceTool.class.getName()); 38 DECENTPackageImpl.init(); 39 this.resourceFactory = new DECENTResourceFactoryImpl(); 40 initializeValidator(); 41 } 42 43 /** 44 * Inizializes the model validator 45 */ 46 @Override 47 protected void initializeValidator() { 48 super.initializeValidator(); 49 EObjectValidator validator = new EObjectValidator(); 50 EValidator.Registry.INSTANCE.put(DECENTPackage.eINSTANCE, validator); 51 } 40 52 41 53 } -
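A short sketch of driving the tool directly to turn a DECENT XMI model into its binary form; the file names are placeholders, the method signatures are those of ResourceTool below:

    DECENTResourceTool tool = new DECENTResourceTool();
    Resource resource = tool.loadResourceFromXMI("model.decent", "decent", DECENTPackage.eINSTANCE);
    tool.validateResource(resource);    // optional OCL/EObject validation
    tool.storeBinaryResourceContents(resource.getContents(), "model.decentbin", "decentbin");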
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/FileWatcher.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 8 22 * 9 23 * @author Philip Makedonski 10 * 24 * 11 25 */ 12 26 public abstract class FileWatcher extends TimerTask { 13 // Last timestamp 14 private long timeStamp; 15 16 // File to watch 17 private File file; 27 // Last timestamp 28 private long timeStamp; 18 29 19 /** 20 * Constructor 21 * @param file 22 */ 23 public FileWatcher(File file) { 24 this.file = file; 25 this.timeStamp = file.lastModified(); 26 } 30 // File to watch 31 private File file; 27 32 28 /** 29 * Watches a file and executes the onChange Method 30 * if a file is changed 31 */ 32 public final void run() { 33 long timeStamp = file.lastModified(); 33 /** 34 * Constructor 35 * 36 * @param file 37 */ 38 public FileWatcher(File file) { 39 this.file = file; 40 this.timeStamp = file.lastModified(); 41 } 34 42 35 if (this.timeStamp != timeStamp) { 36 this.timeStamp = timeStamp; 37 onChange(file); 38 } 39 } 43 /** 44 * Watches a file and executes the onChange Method if a file is changed 45 */ 46 public final void run() { 47 long timeStamp = file.lastModified(); 40 48 41 protected abstract void onChange(File file); 49 if (this.timeStamp != timeStamp) { 50 this.timeStamp = timeStamp; 51 onChange(file); 52 } 53 } 54 55 protected abstract void onChange(File file); 42 56 } -
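FileWatcher is abstract, so it is used by subclassing it and scheduling it on a java.util.Timer; a minimal sketch with a placeholder file name:

    Timer timer = new Timer(true);      // daemon timer
    timer.schedule(new FileWatcher(new File("watched.log")) {
        @Override
        protected void onChange(File file) {
            System.out.println("changed: " + file.getAbsolutePath());
        }
    }, 0, 1000);                        // compare lastModified() once per second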
trunk/CrossPare/src/de/ugoe/cs/cpdp/decentApp/ResourceTool.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.decentApp; 2 16 … … 32 46 * 33 47 * @author Philip Makedonski 34 * 48 * 35 49 */ 36 50 public class ResourceTool { 37 51 38 protected ResourceFactoryImpl resourceFactory = new XMIResourceFactoryImpl(); 39 40 /** 41 * Constructor 42 * @param loggedClass 43 */ 44 public ResourceTool(String loggedClass) { 45 System.setProperty("org.slf4j.simpleLogger.logFile","validation.log"); 46 System.setProperty("org.slf4j.simpleLogger.logFile","System.out"); 47 } 48 49 /** 50 * Initializes the validator 51 */ 52 protected void initializeValidator() { 53 // OCL.initialize(null); 54 String oclDelegateURI = OCLConstants.OCL_DELEGATE_URI+"/Pivot"; 55 56 EOperation.Internal.InvocationDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, 57 new OCLInvocationDelegateFactory(oclDelegateURI)); 58 EStructuralFeature.Internal.SettingDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, 59 new OCLSettingDelegateFactory(oclDelegateURI)); 60 EValidator.ValidationDelegate.Registry.INSTANCE.put(oclDelegateURI, 61 new OCLValidationDelegateFactory(oclDelegateURI)); 62 63 // EStructuralFeature.Internal.SettingDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, 64 // new OCLSettingDelegateFactory.Global()); 65 // QueryDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, new OCLQueryDelegateFactory.Global()); 66 67 } 68 69 /** 70 * Validates the ressource 71 * @param resource to validate 72 */ 73 public void validateResource(Resource resource) { 74 BasicDiagnostic diagnostics = new BasicDiagnostic(); 75 boolean valid = true; 76 for (EObject eo : resource.getContents()) 77 { 78 Map<Object, Object> context = new HashMap<Object, Object>(); 79 boolean validationResult = Diagnostician.INSTANCE.validate(eo, diagnostics, context); 80 showDiagnostics(diagnostics, ""); 81 valid &= validationResult; 82 } 83 84 if (!valid){ 85 System.out.println("Problem with validation!"); 86 } 87 } 88 89 /** 90 * Output method for showing diagnostics for different ressources 91 * @param diagnostics 92 * @param indent 93 */ 94 protected void showDiagnostics(Diagnostic diagnostics, String indent) { 95 indent+=" "; 96 for (Diagnostic d : diagnostics.getChildren()){ 97 System.out.println(indent+d.getSource()); 98 System.out.println(indent+" "+d.getMessage()); 99 showDiagnostics(d,indent); 100 } 101 } 102 103 104 /** 105 * Loads a ressource from XMI 106 * @param inputPath path to the xmi 107 * @param extension of the ressource to load 108 * @param p the given EPackage 109 * @return 110 */ 111 //TODO: workarounds copied from respective methods without EPackage parameter 112 @SuppressWarnings({ "rawtypes", "unchecked" }) 113 public Resource loadResourceFromXMI(String inputPath, String extension, EPackage p) { 114 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 115 Map<String, Object> m = reg.getExtensionToFactoryMap(); 116 m.put(extension, resourceFactory); 
117 ResourceSet resSetIn = new ResourceSetImpl(); 118 //critical part 119 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 120 121 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 122 try { 123 Map options = new HashMap<>(); 124 options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 125 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 126 inputResource.load(options); 127 } catch (IOException e) { 128 e.printStackTrace(); 129 } 130 return inputResource; 131 } 132 133 /** 134 * Loads a ressource from XMI 135 * @param inputPath path to the xmi 136 * @param extension of the ressource to load 137 * @return 138 */ 139 140 @SuppressWarnings({ "rawtypes", "unchecked" }) 141 public Resource loadResourceFromXMI(String inputPath, String extension) { 142 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 143 Map<String, Object> m = reg.getExtensionToFactoryMap(); 144 m.put(extension, resourceFactory); 145 ResourceSet resSetIn = new ResourceSetImpl(); 146 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 147 try { 148 Map options = new HashMap<>(); 149 options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 150 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 151 inputResource.load(options); 152 } catch (IOException e) { 153 e.printStackTrace(); 154 } 155 return inputResource; 156 } 157 158 /** 159 * Gets a ressource from a binary form 160 * @param inputPath path to the binary 161 * @param extension of the model to load 162 * @param p EPackage to put the loaded ressource in 163 * @return 164 */ 165 public Resource getResourceFromBinary(String inputPath, String extension, EPackage p) { 166 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 167 Map<String, Object> m = reg.getExtensionToFactoryMap(); 168 m.put(extension, new Resource.Factory() { 169 170 @Override 171 public Resource createResource(URI uri) { 172 return new BinaryResourceImpl(uri); 173 } 174 175 }); 176 177 ResourceSet resSetIn = new ResourceSetImpl(); 178 //critical part 179 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 180 181 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 182 return inputResource; 183 } 184 185 186 /** 187 * Loads a ressource from a binary form 188 * @param inputPath path to the binary 189 * @param extension of the model to load 190 * @param p EPackage to put the loaded ressource in 191 * @return 192 */ 193 //TODO: workarounds copied from respective methods without EPackage parameter 194 @SuppressWarnings({ "rawtypes" }) 195 public Resource loadResourceFromBinary(String inputPath, String extension, EPackage p) { 196 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 197 Map<String, Object> m = reg.getExtensionToFactoryMap(); 198 m.put(extension, new Resource.Factory() { 199 200 @Override 201 public Resource createResource(URI uri) { 202 return new BinaryResourceImpl(uri); 203 } 204 205 }); 206 207 ResourceSet resSetIn = new ResourceSetImpl(); 208 //critical part 209 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 210 211 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 212 if (new File(inputPath).exists()) { 213 214 try { 215 Map options = new HashMap<>(); 216 // options.put(BinaryResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 217 // 
options.put(BinaryResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 218 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 219 inputResource.load(options); 220 } catch (IOException e) { 221 e.printStackTrace(); 222 } 223 } 224 return inputResource; 225 } 226 227 /** 228 * Loads a ressource from a binary form 229 * 230 * @param inputPath path to the binary 231 * @param extension of the model to load 232 * @return 233 */ 234 @SuppressWarnings({ "rawtypes" }) 235 public Resource loadResourceFromBinary(String inputPath, String extension) { 236 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 237 Map<String, Object> m = reg.getExtensionToFactoryMap(); 238 m.put(extension, new Resource.Factory() { 239 240 @Override 241 public Resource createResource(URI uri) { 242 return new BinaryResourceImpl(uri); 243 } 244 245 }); 246 247 ResourceSet resSetIn = new ResourceSetImpl(); 248 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 249 try { 250 Map options = new HashMap<>(); 251 // options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 252 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 253 inputResource.load(options); 254 } catch (IOException e) { 255 e.printStackTrace(); 256 } 257 return inputResource; 258 } 259 260 /** 261 * Stores the binary resource contents to a given path 262 * 263 * @param contents EList of different EObjects to store 264 * @param outputPath path to store to 265 * @param extension of the model to store 266 */ 267 @SuppressWarnings({ "rawtypes" }) 268 public void storeBinaryResourceContents(EList<EObject> contents, String outputPath, String extension) { 269 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 270 Map<String, Object> m = reg.getExtensionToFactoryMap(); 271 m.put(extension, new Resource.Factory() { 272 273 @Override 274 public Resource createResource(URI uri) { 275 return new BinaryResourceImpl(uri); 276 } 277 278 }); 279 280 ResourceSet resSet = new ResourceSetImpl(); 281 Resource outputResource = resSet.createResource(URI.createURI(outputPath)); 282 outputResource.getContents().addAll(contents); 283 try { 284 Map options = new HashMap<>(); 285 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 286 outputResource.save(options); 287 } catch (IOException e) { 288 e.printStackTrace(); 289 } 290 } 291 292 /** 293 * Stores the resource contents to a given path 294 * 295 * @param contents EList of different EObjects to store 296 * @param outputPath path to store to 297 * @param extension of the model to store 298 */ 299 @SuppressWarnings({ "unchecked", "rawtypes" }) 300 public void storeResourceContents(EList<EObject> contents, String outputPath, String extension) { 301 //TODO: duplicated from loadResourceFromXMI => move to a more appropriate location 302 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 303 Map<String, Object> m = reg.getExtensionToFactoryMap(); 304 m.put(extension, resourceFactory); 305 306 ResourceSet resSet = new ResourceSetImpl(); 307 Resource outputResource = resSet.createResource(URI.createURI(outputPath)); 308 outputResource.getContents().addAll(contents); 309 try { 310 Map options = new HashMap<>(); 311 options.put(XMIResourceImpl.OPTION_ENCODING, "UTF-8"); 312 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 
XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 313 outputResource.save(options); 314 } catch (IOException e) { 315 e.printStackTrace(); 316 } 317 } 318 52 protected ResourceFactoryImpl resourceFactory = new XMIResourceFactoryImpl(); 53 54 /** 55 * Constructor 56 * 57 * @param loggedClass 58 */ 59 public ResourceTool(String loggedClass) { 60 System.setProperty("org.slf4j.simpleLogger.logFile", "validation.log"); 61 System.setProperty("org.slf4j.simpleLogger.logFile", "System.out"); 62 } 63 64 /** 65 * Initializes the validator 66 */ 67 protected void initializeValidator() { 68 // OCL.initialize(null); 69 String oclDelegateURI = OCLConstants.OCL_DELEGATE_URI + "/Pivot"; 70 71 EOperation.Internal.InvocationDelegate.Factory.Registry.INSTANCE 72 .put(oclDelegateURI, new OCLInvocationDelegateFactory(oclDelegateURI)); 73 EStructuralFeature.Internal.SettingDelegate.Factory.Registry.INSTANCE 74 .put(oclDelegateURI, new OCLSettingDelegateFactory(oclDelegateURI)); 75 EValidator.ValidationDelegate.Registry.INSTANCE 76 .put(oclDelegateURI, new OCLValidationDelegateFactory(oclDelegateURI)); 77 78 // EStructuralFeature.Internal.SettingDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, 79 // new OCLSettingDelegateFactory.Global()); 80 // QueryDelegate.Factory.Registry.INSTANCE.put(oclDelegateURI, new 81 // OCLQueryDelegateFactory.Global()); 82 83 } 84 85 /** 86 * Validates the ressource 87 * 88 * @param resource 89 * to validate 90 */ 91 public void validateResource(Resource resource) { 92 BasicDiagnostic diagnostics = new BasicDiagnostic(); 93 boolean valid = true; 94 for (EObject eo : resource.getContents()) { 95 Map<Object, Object> context = new HashMap<Object, Object>(); 96 boolean validationResult = Diagnostician.INSTANCE.validate(eo, diagnostics, context); 97 showDiagnostics(diagnostics, ""); 98 valid &= validationResult; 99 } 100 101 if (!valid) { 102 System.out.println("Problem with validation!"); 103 } 104 } 105 106 /** 107 * Output method for showing diagnostics for different ressources 108 * 109 * @param diagnostics 110 * @param indent 111 */ 112 protected void showDiagnostics(Diagnostic diagnostics, String indent) { 113 indent += " "; 114 for (Diagnostic d : diagnostics.getChildren()) { 115 System.out.println(indent + d.getSource()); 116 System.out.println(indent + " " + d.getMessage()); 117 showDiagnostics(d, indent); 118 } 119 } 120 121 /** 122 * Loads a ressource from XMI 123 * 124 * @param inputPath 125 * path to the xmi 126 * @param extension 127 * of the ressource to load 128 * @param p 129 * the given EPackage 130 * @return 131 */ 132 // TODO: workarounds copied from respective methods without EPackage parameter 133 @SuppressWarnings( 134 { "rawtypes", "unchecked" }) 135 public Resource loadResourceFromXMI(String inputPath, String extension, EPackage p) { 136 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 137 Map<String, Object> m = reg.getExtensionToFactoryMap(); 138 m.put(extension, resourceFactory); 139 ResourceSet resSetIn = new ResourceSetImpl(); 140 // critical part 141 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 142 143 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 144 try { 145 Map options = new HashMap<>(); 146 options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 147 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 148 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 149 inputResource.load(options); 150 } 151 catch (IOException e) { 152 e.printStackTrace(); 153 } 154 
return inputResource; 155 } 156 157 /** 158 * Loads a ressource from XMI 159 * 160 * @param inputPath 161 * path to the xmi 162 * @param extension 163 * of the ressource to load 164 * @return 165 */ 166 167 @SuppressWarnings( 168 { "rawtypes", "unchecked" }) 169 public Resource loadResourceFromXMI(String inputPath, String extension) { 170 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 171 Map<String, Object> m = reg.getExtensionToFactoryMap(); 172 m.put(extension, resourceFactory); 173 ResourceSet resSetIn = new ResourceSetImpl(); 174 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 175 try { 176 Map options = new HashMap<>(); 177 options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 178 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 179 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 180 inputResource.load(options); 181 } 182 catch (IOException e) { 183 e.printStackTrace(); 184 } 185 return inputResource; 186 } 187 188 /** 189 * Gets a ressource from a binary form 190 * 191 * @param inputPath 192 * path to the binary 193 * @param extension 194 * of the model to load 195 * @param p 196 * EPackage to put the loaded ressource in 197 * @return 198 */ 199 public Resource getResourceFromBinary(String inputPath, String extension, EPackage p) { 200 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 201 Map<String, Object> m = reg.getExtensionToFactoryMap(); 202 m.put(extension, new Resource.Factory() { 203 204 @Override 205 public Resource createResource(URI uri) { 206 return new BinaryResourceImpl(uri); 207 } 208 209 }); 210 211 ResourceSet resSetIn = new ResourceSetImpl(); 212 // critical part 213 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 214 215 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 216 return inputResource; 217 } 218 219 /** 220 * Loads a ressource from a binary form 221 * 222 * @param inputPath 223 * path to the binary 224 * @param extension 225 * of the model to load 226 * @param p 227 * EPackage to put the loaded ressource in 228 * @return 229 */ 230 // TODO: workarounds copied from respective methods without EPackage parameter 231 @SuppressWarnings( 232 { "rawtypes" }) 233 public Resource loadResourceFromBinary(String inputPath, String extension, EPackage p) { 234 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 235 Map<String, Object> m = reg.getExtensionToFactoryMap(); 236 m.put(extension, new Resource.Factory() { 237 238 @Override 239 public Resource createResource(URI uri) { 240 return new BinaryResourceImpl(uri); 241 } 242 243 }); 244 245 ResourceSet resSetIn = new ResourceSetImpl(); 246 // critical part 247 resSetIn.getPackageRegistry().put(p.getNsURI(), p); 248 249 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 250 if (new File(inputPath).exists()) { 251 252 try { 253 Map options = new HashMap<>(); 254 // options.put(BinaryResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 255 // options.put(BinaryResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 256 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 257 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 258 inputResource.load(options); 259 } 260 catch (IOException e) { 261 e.printStackTrace(); 262 } 263 } 264 return inputResource; 265 } 266 267 /** 268 * Loads a ressource from a binary form 269 * 270 * @param inputPath 271 * path to the binary 272 * @param extension 273 * of the model to load 274 
* @return 275 */ 276 @SuppressWarnings( 277 { "rawtypes" }) 278 public Resource loadResourceFromBinary(String inputPath, String extension) { 279 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 280 Map<String, Object> m = reg.getExtensionToFactoryMap(); 281 m.put(extension, new Resource.Factory() { 282 283 @Override 284 public Resource createResource(URI uri) { 285 return new BinaryResourceImpl(uri); 286 } 287 288 }); 289 290 ResourceSet resSetIn = new ResourceSetImpl(); 291 Resource inputResource = resSetIn.createResource(URI.createURI(inputPath)); 292 try { 293 Map options = new HashMap<>(); 294 // options.put(XMIResourceImpl.OPTION_DEFER_IDREF_RESOLUTION, Boolean.TRUE); 295 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 296 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 297 inputResource.load(options); 298 } 299 catch (IOException e) { 300 e.printStackTrace(); 301 } 302 return inputResource; 303 } 304 305 /** 306 * Stores the binary resource contents to a given path 307 * 308 * @param contents 309 * EList of different EObjects to store 310 * @param outputPath 311 * path to store to 312 * @param extension 313 * of the model to store 314 */ 315 @SuppressWarnings( 316 { "rawtypes" }) 317 public void storeBinaryResourceContents(EList<EObject> contents, 318 String outputPath, 319 String extension) 320 { 321 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 322 Map<String, Object> m = reg.getExtensionToFactoryMap(); 323 m.put(extension, new Resource.Factory() { 324 325 @Override 326 public Resource createResource(URI uri) { 327 return new BinaryResourceImpl(uri); 328 } 329 330 }); 331 332 ResourceSet resSet = new ResourceSetImpl(); 333 Resource outputResource = resSet.createResource(URI.createURI(outputPath)); 334 outputResource.getContents().addAll(contents); 335 try { 336 Map options = new HashMap<>(); 337 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 338 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 339 outputResource.save(options); 340 } 341 catch (IOException e) { 342 e.printStackTrace(); 343 } 344 } 345 346 /** 347 * Stores the resource contents to a given path 348 * 349 * @param contents 350 * EList of different EObjects to store 351 * @param outputPath 352 * path to store to 353 * @param extension 354 * of the model to store 355 */ 356 @SuppressWarnings( 357 { "unchecked", "rawtypes" }) 358 public void storeResourceContents(EList<EObject> contents, String outputPath, String extension) 359 { 360 // TODO: duplicated from loadResourceFromXMI => move to a more appropriate location 361 Resource.Factory.Registry reg = Resource.Factory.Registry.INSTANCE; 362 Map<String, Object> m = reg.getExtensionToFactoryMap(); 363 m.put(extension, resourceFactory); 364 365 ResourceSet resSet = new ResourceSetImpl(); 366 Resource outputResource = resSet.createResource(URI.createURI(outputPath)); 367 outputResource.getContents().addAll(contents); 368 try { 369 Map options = new HashMap<>(); 370 options.put(XMIResourceImpl.OPTION_ENCODING, "UTF-8"); 371 // options.put(XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF, 372 // XMIResourceImpl.OPTION_PROCESS_DANGLING_HREF_DISCARD); 373 outputResource.save(options); 374 } 375 catch (IOException e) { 376 e.printStackTrace(); 377 } 378 } 319 379 320 380 } -
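The load/store pairs above can be combined to convert a model back from binary to XMI, which is what DECENTEpsilonModelHandler.convertDECENTModelToXMI does; a sketch with placeholder file names:

    ResourceTool tool = new DECENTResourceTool();   // any subclass providing a suitable resourceFactory
    Resource binary = tool.loadResourceFromBinary("model.decentbin", "decentbin", DECENTPackage.eINSTANCE);
    tool.storeResourceContents(binary.getContents(), "model.decent", "decent");   // written as UTF-8 XMI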
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/AbstractWekaEvaluation.java
r35 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.eval; 2 16 … … 17 31 18 32 /** 19 * Base class for the evaluation of results of classifiers compatible with the {@link Classifier} interface.20 * For each classifier, the following metrics are calculated:33 * Base class for the evaluation of results of classifiers compatible with the {@link Classifier} 34 * interface. For each classifier, the following metrics are calculated: 21 35 * <ul> 22 * <li>succHe: Success with recall>0.7, precision>0.5</li> 23 * <li>succZi: Success with recall>0.7, precision>0.7</li> 24 * <li>succG75: Success with gscore>0.75</li> 25 * <li>succG60: Success with gscore>0.6</li> 26 * <li>error</li> 27 * <li>recall</li> 28 * <li>precision</li> 29 * <li>fscore</li> 30 * <li>gscore</li> 31 * <li>AUC</li> 32 * <li>AUCEC (weighted by LOC, if applicable; 0.0 if LOC not available)</li> 33 * <li>tpr: true positive rate</li> 34 * <li>tnr: true negative rate</li> 35 * <li>tp: true positives</li> 36 * <li>fp: false positives</li> 37 * <li>tn: true negatives</li> 38 * <li>fn: false negatives</li> 39 * <li>errortrain: training error</li> 40 * <li>recalltrain: training recall</li> 41 * <li>precisiontrain: training precision</li> 42 * <li>succHetrain: training success with recall>0.7 and precision>0.5 43 * </ul> 36 * <li>succHe: Success with recall>0.7, precision>0.5</li> 37 * <li>succZi: Success with recall>0.7, precision>0.7</li> 38 * <li>succG75: Success with gscore>0.75</li> 39 * <li>succG60: Success with gscore>0.6</li> 40 * <li>error</li> 41 * <li>recall</li> 42 * <li>precision</li> 43 * <li>fscore</li> 44 * <li>gscore</li> 45 * <li>AUC</li> 46 * <li>AUCEC (weighted by LOC, if applicable; 0.0 if LOC not available)</li> 47 * <li>tpr: true positive rate</li> 48 * <li>tnr: true negative rate</li> 49 * <li>tp: true positives</li> 50 * <li>fp: false positives</li> 51 * <li>tn: true negatives</li> 52 * <li>fn: false negatives</li> 53 * <li>errortrain: training error</li> 54 * <li>recalltrain: training recall</li> 55 * <li>precisiontrain: training precision</li> 56 * <li>succHetrain: training success with recall>0.7 and precision>0.5 57 * </ul> 58 * 44 59 * @author Steffen Herbold 45 60 */ 46 61 public abstract class AbstractWekaEvaluation implements IEvaluationStrategy { 47 62 48 /** 49 * writer for the evaluation results 50 */ 51 private PrintWriter output = new PrintWriter(System.out); 52 53 private boolean outputIsSystemOut = true; 54 55 /** 56 * Creates the weka evaluator. Allows the creation of the evaluator in different ways, e.g., for cross-validation 57 * or evaluation on the test data. 
58 * @param testdata test data 59 * @param classifier classifier used 60 * @return evaluator 61 */ 62 protected abstract Evaluation createEvaluator(Instances testdata, Classifier classifier); 63 64 /* 65 * (non-Javadoc) 66 * @see de.ugoe.cs.cpdp.eval.EvaluationStrategy#apply(weka.core.Instances, weka.core.Instances, java.util.List, boolean) 67 */ 68 @Override 69 public void apply(Instances testdata, Instances traindata, List<ITrainer> trainers, 70 boolean writeHeader) { 71 final List<Classifier> classifiers = new LinkedList<Classifier>(); 72 for( ITrainer trainer : trainers ) { 73 if( trainer instanceof IWekaCompatibleTrainer ) { 74 classifiers.add(((IWekaCompatibleTrainer) trainer).getClassifier()); 75 } else { 76 throw new RuntimeException("The selected evaluator only support Weka classifiers"); 77 } 78 } 79 80 if( writeHeader ) { 81 output.append("version,size_test,size_training"); 82 for( ITrainer trainer : trainers ) { 83 output.append(",succHe_" + ((IWekaCompatibleTrainer) trainer).getName()); 84 output.append(",succZi_" + ((IWekaCompatibleTrainer) trainer).getName()); 85 output.append(",succG75_" + ((IWekaCompatibleTrainer) trainer).getName()); 86 output.append(",succG60_" + ((IWekaCompatibleTrainer) trainer).getName()); 87 output.append(",error_" + ((IWekaCompatibleTrainer) trainer).getName()); 88 output.append(",recall_" + ((IWekaCompatibleTrainer) trainer).getName()); 89 output.append(",precision_" + ((IWekaCompatibleTrainer) trainer).getName()); 90 output.append(",fscore_" + ((IWekaCompatibleTrainer) trainer).getName()); 91 output.append(",gscore_" + ((IWekaCompatibleTrainer) trainer).getName()); 92 output.append(",mcc_" + ((IWekaCompatibleTrainer) trainer).getName()); 93 output.append(",auc_" + ((IWekaCompatibleTrainer) trainer).getName()); 94 output.append(",aucec_" + ((IWekaCompatibleTrainer) trainer).getName()); 95 output.append(",tpr_" + ((IWekaCompatibleTrainer) trainer).getName()); 96 output.append(",tnr_" + ((IWekaCompatibleTrainer) trainer).getName()); 97 output.append(",tp_" + ((IWekaCompatibleTrainer) trainer).getName()); 98 output.append(",fn_" + ((IWekaCompatibleTrainer) trainer).getName()); 99 output.append(",tn_" + ((IWekaCompatibleTrainer) trainer).getName()); 100 output.append(",fp_" + ((IWekaCompatibleTrainer) trainer).getName()); 101 output.append(",trainerror_" + ((IWekaCompatibleTrainer) trainer).getName()); 102 output.append(",trainrecall_" + ((IWekaCompatibleTrainer) trainer).getName()); 103 output.append(",trainprecision_" + ((IWekaCompatibleTrainer) trainer).getName()); 104 output.append(",trainsuccHe_" + ((IWekaCompatibleTrainer) trainer).getName()); 105 } 106 output.append(StringTools.ENDLINE); 107 } 108 109 output.append(testdata.relationName()); 110 output.append("," + testdata.numInstances()); 111 output.append("," + traindata.numInstances()); 112 113 Evaluation eval = null; 114 Evaluation evalTrain = null; 115 for( Classifier classifier : classifiers ) { 116 eval = createEvaluator(testdata, classifier); 117 evalTrain = createEvaluator(traindata, classifier); 118 119 double pf = eval.numFalsePositives(1)/(eval.numFalsePositives(1)+eval.numTrueNegatives(1)); 120 double gmeasure = 2*eval.recall(1)*(1.0-pf)/(eval.recall(1)+(1.0-pf)); 121 double mcc = 
(eval.numTruePositives(1)*eval.numTrueNegatives(1)-eval.numFalsePositives(1)*eval.numFalseNegatives(1))/Math.sqrt((eval.numTruePositives(1)+eval.numFalsePositives(1))*(eval.numTruePositives(1)+eval.numFalseNegatives(1))*(eval.numTrueNegatives(1)+eval.numFalsePositives(1))*(eval.numTrueNegatives(1)+eval.numFalseNegatives(1))); 122 double aucec = calculateReviewEffort(testdata, classifier); 123 124 if( eval.recall(1)>=0.7 && eval.precision(1) >= 0.5 ) { 125 output.append(",1"); 126 } else { 127 output.append(",0"); 128 } 129 130 if( eval.recall(1)>=0.7 && eval.precision(1) >= 0.7 ) { 131 output.append(",1"); 132 } else { 133 output.append(",0"); 134 } 135 136 if( gmeasure>0.75 ) { 137 output.append(",1"); 138 } else { 139 output.append(",0"); 140 } 141 142 if( gmeasure>0.6 ) { 143 output.append(",1"); 144 } else { 145 output.append(",0"); 146 } 147 148 output.append("," + eval.errorRate()); 149 output.append("," + eval.recall(1)); 150 output.append("," + eval.precision(1)); 151 output.append("," + eval.fMeasure(1)); 152 output.append("," + gmeasure); 153 output.append("," + mcc); 154 output.append("," + eval.areaUnderROC(1)); 155 output.append("," + aucec); 156 output.append("," + eval.truePositiveRate(1)); 157 output.append("," + eval.trueNegativeRate(1)); 158 output.append("," + eval.numTruePositives(1)); 159 output.append("," + eval.numFalseNegatives(1)); 160 output.append("," + eval.numTrueNegatives(1)); 161 output.append("," + eval.numFalsePositives(1)); 162 output.append("," + evalTrain.errorRate()); 163 output.append("," + evalTrain.recall(1)); 164 output.append("," + evalTrain.precision(1)); 165 if( evalTrain.recall(1)>=0.7 && evalTrain.precision(1) >= 0.5 ) { 166 output.append(",1"); 167 } else { 168 output.append(",0"); 169 } 170 } 171 172 output.append(StringTools.ENDLINE); 173 output.flush(); 174 } 175 176 private double calculateReviewEffort(Instances testdata, Classifier classifier) { 177 178 final Attribute loc = testdata.attribute("loc"); 179 if( loc==null ) { 180 return 0.0; 181 } 182 183 final List<Integer> bugPredicted = new ArrayList<>(); 184 final List<Integer> nobugPredicted = new ArrayList<>(); 185 double totalLoc = 0.0d; 186 int totalBugs = 0; 187 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 188 try { 189 if( Double.compare(classifier.classifyInstance(testdata.instance(i)),0.0d)==0 ) { 190 nobugPredicted.add(i); 191 } else { 192 bugPredicted.add(i); 193 } 194 } catch (Exception e) { 195 throw new RuntimeException("unexpected error during the evaluation of the review effort", e); 196 } 197 if(Double.compare(testdata.instance(i).classValue(),1.0d)==0) { 198 totalBugs++; 199 } 200 totalLoc += testdata.instance(i).value(loc); 201 } 202 203 final List<Double> reviewLoc = new ArrayList<>(testdata.numInstances()); 204 final List<Double> bugsFound = new ArrayList<>(testdata.numInstances()); 205 206 double currentBugsFound = 0; 207 208 while( !bugPredicted.isEmpty() ) { 209 double minLoc = Double.MAX_VALUE; 210 int minIndex = -1; 211 for( int i=0 ; i<bugPredicted.size() ; i++ ) { 212 double currentLoc = testdata.instance(bugPredicted.get(i)).value(loc); 213 if( currentLoc<minLoc ) { 214 minIndex = i; 215 minLoc = currentLoc; 216 } 217 } 218 if( minIndex!=-1 ) { 219 reviewLoc.add(minLoc/totalLoc); 220 221 currentBugsFound += testdata.instance(bugPredicted.get(minIndex)).classValue(); 222 bugsFound.add(currentBugsFound); 223 224 bugPredicted.remove(minIndex); 225 } else { 226 throw new RuntimeException("Shouldn't happen!"); 227 } 228 } 229 230 while( 
!nobugPredicted.isEmpty() ) { 231 double minLoc = Double.MAX_VALUE; 232 int minIndex = -1; 233 for( int i=0 ; i<nobugPredicted.size() ; i++ ) { 234 double currentLoc = testdata.instance(nobugPredicted.get(i)).value(loc); 235 if( currentLoc<minLoc ) { 236 minIndex = i; 237 minLoc = currentLoc; 238 } 239 } 240 if( minIndex!=-1 ) { 241 reviewLoc.add(minLoc/totalLoc); 242 243 currentBugsFound += testdata.instance(nobugPredicted.get(minIndex)).classValue(); 244 bugsFound.add(currentBugsFound); 245 nobugPredicted.remove(minIndex); 246 } else { 247 throw new RuntimeException("Shouldn't happen!"); 248 } 249 } 250 251 double auc = 0.0; 252 for( int i=0 ; i<bugsFound.size() ; i++ ) { 253 auc += reviewLoc.get(i)*bugsFound.get(i)/totalBugs; 254 } 255 256 return auc; 257 } 258 259 /* 260 * (non-Javadoc) 261 * @see de.ugoe.cs.cpdp.Parameterizable#setParameter(java.lang.String) 262 */ 263 @Override 264 public void setParameter(String parameters) { 265 if( output!=null && !outputIsSystemOut ) { 266 output.close(); 267 } 268 if( "system.out".equals(parameters) || "".equals(parameters) ) { 269 output = new PrintWriter(System.out); 270 outputIsSystemOut = true; 271 } else { 272 try { 273 output = new PrintWriter(new FileOutputStream(parameters)); 274 outputIsSystemOut = false; 275 } catch (FileNotFoundException e) { 276 throw new RuntimeException(e); 277 } 278 } 279 } 63 /** 64 * writer for the evaluation results 65 */ 66 private PrintWriter output = new PrintWriter(System.out); 67 68 private boolean outputIsSystemOut = true; 69 70 /** 71 * Creates the weka evaluator. Allows the creation of the evaluator in different ways, e.g., for 72 * cross-validation or evaluation on the test data. 73 * 74 * @param testdata 75 * test data 76 * @param classifier 77 * classifier used 78 * @return evaluator 79 */ 80 protected abstract Evaluation createEvaluator(Instances testdata, Classifier classifier); 81 82 /* 83 * (non-Javadoc) 84 * 85 * @see de.ugoe.cs.cpdp.eval.EvaluationStrategy#apply(weka.core.Instances, weka.core.Instances, 86 * java.util.List, boolean) 87 */ 88 @Override 89 public void apply(Instances testdata, 90 Instances traindata, 91 List<ITrainer> trainers, 92 boolean writeHeader) 93 { 94 final List<Classifier> classifiers = new LinkedList<Classifier>(); 95 for (ITrainer trainer : trainers) { 96 if (trainer instanceof IWekaCompatibleTrainer) { 97 classifiers.add(((IWekaCompatibleTrainer) trainer).getClassifier()); 98 } 99 else { 100 throw new RuntimeException("The selected evaluator only support Weka classifiers"); 101 } 102 } 103 104 if (writeHeader) { 105 output.append("version,size_test,size_training"); 106 for (ITrainer trainer : trainers) { 107 output.append(",succHe_" + ((IWekaCompatibleTrainer) trainer).getName()); 108 output.append(",succZi_" + ((IWekaCompatibleTrainer) trainer).getName()); 109 output.append(",succG75_" + ((IWekaCompatibleTrainer) trainer).getName()); 110 output.append(",succG60_" + ((IWekaCompatibleTrainer) trainer).getName()); 111 output.append(",error_" + ((IWekaCompatibleTrainer) trainer).getName()); 112 output.append(",recall_" + ((IWekaCompatibleTrainer) trainer).getName()); 113 output.append(",precision_" + ((IWekaCompatibleTrainer) trainer).getName()); 114 output.append(",fscore_" + ((IWekaCompatibleTrainer) trainer).getName()); 115 output.append(",gscore_" + ((IWekaCompatibleTrainer) trainer).getName()); 116 output.append(",mcc_" + ((IWekaCompatibleTrainer) trainer).getName()); 117 output.append(",auc_" + ((IWekaCompatibleTrainer) trainer).getName()); 118 
output.append(",aucec_" + ((IWekaCompatibleTrainer) trainer).getName()); 119 output.append(",tpr_" + ((IWekaCompatibleTrainer) trainer).getName()); 120 output.append(",tnr_" + ((IWekaCompatibleTrainer) trainer).getName()); 121 output.append(",tp_" + ((IWekaCompatibleTrainer) trainer).getName()); 122 output.append(",fn_" + ((IWekaCompatibleTrainer) trainer).getName()); 123 output.append(",tn_" + ((IWekaCompatibleTrainer) trainer).getName()); 124 output.append(",fp_" + ((IWekaCompatibleTrainer) trainer).getName()); 125 output.append(",trainerror_" + ((IWekaCompatibleTrainer) trainer).getName()); 126 output.append(",trainrecall_" + ((IWekaCompatibleTrainer) trainer).getName()); 127 output.append(",trainprecision_" + ((IWekaCompatibleTrainer) trainer).getName()); 128 output.append(",trainsuccHe_" + ((IWekaCompatibleTrainer) trainer).getName()); 129 } 130 output.append(StringTools.ENDLINE); 131 } 132 133 output.append(testdata.relationName()); 134 output.append("," + testdata.numInstances()); 135 output.append("," + traindata.numInstances()); 136 137 Evaluation eval = null; 138 Evaluation evalTrain = null; 139 for (Classifier classifier : classifiers) { 140 eval = createEvaluator(testdata, classifier); 141 evalTrain = createEvaluator(traindata, classifier); 142 143 double pf = 144 eval.numFalsePositives(1) / (eval.numFalsePositives(1) + eval.numTrueNegatives(1)); 145 double gmeasure = 2 * eval.recall(1) * (1.0 - pf) / (eval.recall(1) + (1.0 - pf)); 146 double mcc = 147 (eval.numTruePositives(1) * eval.numTrueNegatives(1) - eval.numFalsePositives(1) * 148 eval.numFalseNegatives(1)) / 149 Math.sqrt((eval.numTruePositives(1) + eval.numFalsePositives(1)) * 150 (eval.numTruePositives(1) + eval.numFalseNegatives(1)) * 151 (eval.numTrueNegatives(1) + eval.numFalsePositives(1)) * 152 (eval.numTrueNegatives(1) + eval.numFalseNegatives(1))); 153 double aucec = calculateReviewEffort(testdata, classifier); 154 155 if (eval.recall(1) >= 0.7 && eval.precision(1) >= 0.5) { 156 output.append(",1"); 157 } 158 else { 159 output.append(",0"); 160 } 161 162 if (eval.recall(1) >= 0.7 && eval.precision(1) >= 0.7) { 163 output.append(",1"); 164 } 165 else { 166 output.append(",0"); 167 } 168 169 if (gmeasure > 0.75) { 170 output.append(",1"); 171 } 172 else { 173 output.append(",0"); 174 } 175 176 if (gmeasure > 0.6) { 177 output.append(",1"); 178 } 179 else { 180 output.append(",0"); 181 } 182 183 output.append("," + eval.errorRate()); 184 output.append("," + eval.recall(1)); 185 output.append("," + eval.precision(1)); 186 output.append("," + eval.fMeasure(1)); 187 output.append("," + gmeasure); 188 output.append("," + mcc); 189 output.append("," + eval.areaUnderROC(1)); 190 output.append("," + aucec); 191 output.append("," + eval.truePositiveRate(1)); 192 output.append("," + eval.trueNegativeRate(1)); 193 output.append("," + eval.numTruePositives(1)); 194 output.append("," + eval.numFalseNegatives(1)); 195 output.append("," + eval.numTrueNegatives(1)); 196 output.append("," + eval.numFalsePositives(1)); 197 output.append("," + evalTrain.errorRate()); 198 output.append("," + evalTrain.recall(1)); 199 output.append("," + evalTrain.precision(1)); 200 if (evalTrain.recall(1) >= 0.7 && evalTrain.precision(1) >= 0.5) { 201 output.append(",1"); 202 } 203 else { 204 output.append(",0"); 205 } 206 } 207 208 output.append(StringTools.ENDLINE); 209 output.flush(); 210 } 211 212 private double calculateReviewEffort(Instances testdata, Classifier classifier) { 213 214 final Attribute loc = testdata.attribute("loc"); 215 if (loc 
== null) { 216 return 0.0; 217 } 218 219 final List<Integer> bugPredicted = new ArrayList<>(); 220 final List<Integer> nobugPredicted = new ArrayList<>(); 221 double totalLoc = 0.0d; 222 int totalBugs = 0; 223 for (int i = 0; i < testdata.numInstances(); i++) { 224 try { 225 if (Double.compare(classifier.classifyInstance(testdata.instance(i)), 0.0d) == 0) { 226 nobugPredicted.add(i); 227 } 228 else { 229 bugPredicted.add(i); 230 } 231 } 232 catch (Exception e) { 233 throw new RuntimeException( 234 "unexpected error during the evaluation of the review effort", 235 e); 236 } 237 if (Double.compare(testdata.instance(i).classValue(), 1.0d) == 0) { 238 totalBugs++; 239 } 240 totalLoc += testdata.instance(i).value(loc); 241 } 242 243 final List<Double> reviewLoc = new ArrayList<>(testdata.numInstances()); 244 final List<Double> bugsFound = new ArrayList<>(testdata.numInstances()); 245 246 double currentBugsFound = 0; 247 248 while (!bugPredicted.isEmpty()) { 249 double minLoc = Double.MAX_VALUE; 250 int minIndex = -1; 251 for (int i = 0; i < bugPredicted.size(); i++) { 252 double currentLoc = testdata.instance(bugPredicted.get(i)).value(loc); 253 if (currentLoc < minLoc) { 254 minIndex = i; 255 minLoc = currentLoc; 256 } 257 } 258 if (minIndex != -1) { 259 reviewLoc.add(minLoc / totalLoc); 260 261 currentBugsFound += testdata.instance(bugPredicted.get(minIndex)).classValue(); 262 bugsFound.add(currentBugsFound); 263 264 bugPredicted.remove(minIndex); 265 } 266 else { 267 throw new RuntimeException("Shouldn't happen!"); 268 } 269 } 270 271 while (!nobugPredicted.isEmpty()) { 272 double minLoc = Double.MAX_VALUE; 273 int minIndex = -1; 274 for (int i = 0; i < nobugPredicted.size(); i++) { 275 double currentLoc = testdata.instance(nobugPredicted.get(i)).value(loc); 276 if (currentLoc < minLoc) { 277 minIndex = i; 278 minLoc = currentLoc; 279 } 280 } 281 if (minIndex != -1) { 282 reviewLoc.add(minLoc / totalLoc); 283 284 currentBugsFound += testdata.instance(nobugPredicted.get(minIndex)).classValue(); 285 bugsFound.add(currentBugsFound); 286 nobugPredicted.remove(minIndex); 287 } 288 else { 289 throw new RuntimeException("Shouldn't happen!"); 290 } 291 } 292 293 double auc = 0.0; 294 for (int i = 0; i < bugsFound.size(); i++) { 295 auc += reviewLoc.get(i) * bugsFound.get(i) / totalBugs; 296 } 297 298 return auc; 299 } 300 301 /* 302 * (non-Javadoc) 303 * 304 * @see de.ugoe.cs.cpdp.Parameterizable#setParameter(java.lang.String) 305 */ 306 @Override 307 public void setParameter(String parameters) { 308 if (output != null && !outputIsSystemOut) { 309 output.close(); 310 } 311 if ("system.out".equals(parameters) || "".equals(parameters)) { 312 output = new PrintWriter(System.out); 313 outputIsSystemOut = true; 314 } 315 else { 316 try { 317 output = new PrintWriter(new FileOutputStream(parameters)); 318 outputIsSystemOut = false; 319 } 320 catch (FileNotFoundException e) { 321 throw new RuntimeException(e); 322 } 323 } 324 } 280 325 } -
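The derived metrics computed in apply() follow the usual confusion-matrix definitions; restated as small helpers for readability (this mirrors the expressions in the diff and is not part of the changeset):

    // pf  = fp / (fp + tn)                                 probability of false alarm
    // g   = 2 * recall * (1 - pf) / (recall + (1 - pf))    g-measure behind succG75/succG60
    // mcc = (tp*tn - fp*fn) / sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn))
    static double gmeasure(double recall, double pf) {
        return 2.0 * recall * (1.0 - pf) / (recall + (1.0 - pf));
    }

    static double mcc(double tp, double fp, double tn, double fn) {
        return (tp * tn - fp * fn)
            / Math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
    }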
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/CVWekaEvaluation.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.eval; 2 16 … … 12 26 /** 13 27 * Implements the {@link AbstractWekaEvaluation} for 10-fold cross validation. 28 * 14 29 * @author Steffen Herbold 15 30 */ 16 31 public class CVWekaEvaluation extends AbstractWekaEvaluation { 17 18 /** 19 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, weka.classifiers.Classifier) 20 */ 21 @Override 22 protected Evaluation createEvaluator(Instances testdata, Classifier classifier) { 23 PrintStream errStr = System.err; 24 System.setErr(new PrintStream(new NullOutputStream())); 25 try { 26 final Evaluation eval = new Evaluation(testdata); 27 eval.crossValidateModel(classifier, testdata, 10, new Random(1)); 28 return eval; 29 } catch (Exception e) { 30 throw new RuntimeException(e); 31 } finally { 32 System.setErr(errStr); 33 } 34 } 32 33 /** 34 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, 35 * weka.classifiers.Classifier) 36 */ 37 @Override 38 protected Evaluation createEvaluator(Instances testdata, Classifier classifier) { 39 PrintStream errStr = System.err; 40 System.setErr(new PrintStream(new NullOutputStream())); 41 try { 42 final Evaluation eval = new Evaluation(testdata); 43 eval.crossValidateModel(classifier, testdata, 10, new Random(1)); 44 return eval; 45 } 46 catch (Exception e) { 47 throw new RuntimeException(e); 48 } 49 finally { 50 System.setErr(errStr); 51 } 52 } 35 53 36 54 } -
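Stripped of the System.err silencing, the evaluator reduces to the standard Weka cross-validation call:

    Evaluation eval = new Evaluation(testdata);
    eval.crossValidateModel(classifier, testdata, 10, new Random(1));   // 10 folds, fixed seed
    double recall = eval.recall(1);          // per-class metrics are then read as usual
    double auc = eval.areaUnderROC(1);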
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/IEvaluationStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.eval; 2 16 … … 9 23 10 24 /** 11 * Interface for evaluation strategies to evaluate the performance of classifiers. 25 * Interface for evaluation strategies to evaluate the performance of classifiers. 26 * 12 27 * @author Steffen Herbold 13 28 */ 14 29 public interface IEvaluationStrategy extends IParameterizable { 15 30 16 /** 17 * Applies the evaluation strategy. 18 * @param testdata test data for the evaluation 19 * @param traindata training data used 20 * @param trainers list of training algorithms used to train the classifiers 21 * @param writeHeader if true, a header line for the results file is written (may not be applicable) 22 */ 23 void apply(Instances testdata, Instances traindata, List<ITrainer> trainers, boolean writeHeader); 31 /** 32 * Applies the evaluation strategy. 33 * 34 * @param testdata 35 * test data for the evaluation 36 * @param traindata 37 * training data used 38 * @param trainers 39 * list of training algorithms used to train the classifiers 40 * @param writeHeader 41 * if true, a header line for the results file is written (may not be applicable) 42 */ 43 void apply(Instances testdata, Instances traindata, List<ITrainer> trainers, boolean writeHeader); 24 44 } -
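To illustrate the contract, a minimal (purely illustrative) implementation that only reports data set sizes:

    public class SizeOnlyEvaluation implements IEvaluationStrategy {

        @Override
        public void apply(Instances testdata, Instances traindata,
                          List<ITrainer> trainers, boolean writeHeader)
        {
            if (writeHeader) {
                System.out.println("version,size_test,size_training");
            }
            System.out.println(testdata.relationName() + "," + testdata.numInstances() +
                "," + traindata.numInstances());
        }

        @Override
        public void setParameter(String parameters) {
            // this sketch has no parameters
        }
    }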
trunk/CrossPare/src/de/ugoe/cs/cpdp/eval/NormalWekaEvaluation.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.eval; 2 16 … … 7 21 /** 8 22 * Implements the {@link AbstractWekaEvaluation} for evaluation on the test data. 23 * 9 24 * @author Steffen Herbold 10 * 25 * 11 26 */ 12 27 public class NormalWekaEvaluation extends AbstractWekaEvaluation { 13 28 14 /** 15 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, weka.classifiers.Classifier) 16 */ 17 @Override 18 protected Evaluation createEvaluator(Instances testdata, Classifier classifier) { 19 try { 20 final Evaluation eval = new Evaluation(testdata); 21 eval.evaluateModel(classifier, testdata); 22 return eval; 23 } catch (Exception e) { 24 throw new RuntimeException(e); 25 } 26 } 29 /** 30 * @see de.ugoe.cs.cpdp.eval.AbstractWekaEvaluation#createEvaluator(weka.core.Instances, 31 * weka.classifiers.Classifier) 32 */ 33 @Override 34 protected Evaluation createEvaluator(Instances testdata, Classifier classifier) { 35 try { 36 final Evaluation eval = new Evaluation(testdata); 37 eval.evaluateModel(classifier, testdata); 38 return eval; 39 } 40 catch (Exception e) { 41 throw new RuntimeException(e); 42 } 43 } 27 44 } -
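For comparison with the cross-validation variant, the core of this evaluator is the plain evaluation of an already trained classifier on the test data:

    Evaluation eval = new Evaluation(testdata);
    eval.evaluateModel(classifier, testdata);   // no retraining, no folds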
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/ClassifierCreationExperiment.java
r33 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.execution; 2 16 … … 19 33 20 34 /** 21 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. The steps22 * of this ClassifierCreationExperiment are as follows:35 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. 36 * The steps of this ClassifierCreationExperiment are as follows: 23 37 * <ul> 24 * <li>load the data from the provided data path</li> 25 * <li>check if given resultsdir exists, if not create one</li> 26 * <li>execute the following steps for each data set: 27 * <ul> 28 * <li>load the dataset</li> 29 * <li>set testdata == traindata</li> 30 * <li>preprocess the data</li> 31 * <li>postprocess the data</li> 32 * <li>for each configured trainer do the following:</li> 33 * <ul> 34 * <li>if the classifier should be saved, train it with the dataset</li> 35 * <li>save it in the results dir</li> 36 * <li>For each configured evaluator: Do the evaluation and save results</li> 37 * </ul> 38 * </ul> 38 * <li>load the data from the provided data path</li> 39 * <li>check if given resultsdir exists, if not create one</li> 40 * <li>execute the following steps for each data set: 41 * <ul> 42 * <li>load the dataset</li> 43 * <li>set testdata == traindata</li> 44 * <li>preprocess the data</li> 45 * <li>postprocess the data</li> 46 * <li>for each configured trainer do the following:</li> 47 * <ul> 48 * <li>if the classifier should be saved, train it with the dataset</li> 49 * <li>save it in the results dir</li> 50 * <li>For each configured evaluator: Do the evaluation and save results</li> 39 51 * </ul> 40 * 41 * Note that this class implements {@link IExectuionStrategy}, i.e., each experiment can be started 52 * </ul> 53 * </ul> 54 * 55 * Note that this class implements {@link IExectuionStrategy}, i.e., each experiment can be started 42 56 * in its own thread. 43 57 * … … 46 60 public class ClassifierCreationExperiment implements IExecutionStrategy { 47 61 48 /** 49 * configuration of the experiment 50 */ 51 private final ExperimentConfiguration config; 52 53 /** 54 * Constructor. Creates a new experiment based on a configuration. 55 * @param config configuration of the experiment 56 */ 57 public ClassifierCreationExperiment(ExperimentConfiguration config) { 58 this.config = config; 59 } 60 61 /** 62 * Executes the experiment with the steps as described in the class comment. 
63 * @see Runnable#run() 64 */ 65 @Override 66 public void run() { 67 final List<SoftwareVersion> versions = new LinkedList<>(); 68 69 boolean writeHeader = true; 70 71 for(IVersionLoader loader : config.getLoaders()) { 72 versions.addAll(loader.load()); 73 } 74 62 /** 63 * configuration of the experiment 64 */ 65 private final ExperimentConfiguration config; 75 66 76 File resultsDir = new File(config.getResultsPath()); 77 if (!resultsDir.exists()) { 78 resultsDir.mkdir(); 79 } 80 81 82 int versionCount = 1; 83 for( SoftwareVersion testVersion : versions ) { 84 85 // At first: traindata == testdata 86 Instances testdata = testVersion.getInstances(); 87 Instances traindata = new Instances(testdata); 88 89 // Give the dataset a new name 90 testdata.setRelationName(testVersion.getProject()); 91 92 for( IProcessesingStrategy processor : config.getPreProcessors() ) { 93 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying preprocessor %s", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject(), processor.getClass().getName())); 94 processor.apply(testdata, traindata); 95 } 96 97 for( IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors() ) { 98 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject(), dataselector.getClass().getName())); 99 traindata = dataselector.apply(testdata, traindata); 100 } 101 102 for( IProcessesingStrategy processor : config.getPostProcessors() ) { 103 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject(), processor.getClass().getName())); 104 processor.apply(testdata, traindata); 105 } 106 107 108 109 110 // Trainerlist for evaluation later on 111 List<ITrainer> allTrainers = new LinkedList<>(); 112 113 for( ITrainingStrategy trainer : config.getTrainers() ) { 67 /** 68 * Constructor. Creates a new experiment based on a configuration. 69 * 70 * @param config 71 * configuration of the experiment 72 */ 73 public ClassifierCreationExperiment(ExperimentConfiguration config) { 74 this.config = config; 75 } 114 76 115 // Add trainer to list for evaluation 116 allTrainers.add(trainer); 117 118 // Train classifier 119 trainer.apply(traindata); 120 121 if(config.getSaveClassifier()) { 122 // If classifier should be saved, train him and save him 123 // be careful with typecasting here! 124 IWekaCompatibleTrainer trainerToSave = (IWekaCompatibleTrainer) trainer; 125 //Console.println(trainerToSave.getClassifier().toString()); 126 try { 127 weka.core.SerializationHelper.write(resultsDir.getAbsolutePath()+"/"+trainer.getName()+"-"+testVersion.getProject(), trainerToSave.getClassifier()); 128 } catch (Exception e) { 129 e.printStackTrace(); 130 } 131 132 } 133 } 134 135 136 137 for( IEvaluationStrategy evaluator : config.getEvaluators() ) { 138 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying evaluator %s", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject(), evaluator.getClass().getName())); 77 /** 78 * Executes the experiment with the steps as described in the class comment. 
79 * 80 * @see Runnable#run() 81 */ 82 @Override 83 public void run() { 84 final List<SoftwareVersion> versions = new LinkedList<>(); 139 85 140 if( writeHeader ) { 141 evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName() + ".csv"); 142 } 143 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 144 writeHeader = false; 145 } 146 147 versionCount++; 148 149 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", config.getExperimentName(), versionCount, versions.size(), testVersion.getProject())); 150 151 } 152 153 } 154 86 boolean writeHeader = true; 87 88 for (IVersionLoader loader : config.getLoaders()) { 89 versions.addAll(loader.load()); 90 } 91 92 File resultsDir = new File(config.getResultsPath()); 93 if (!resultsDir.exists()) { 94 resultsDir.mkdir(); 95 } 96 97 int versionCount = 1; 98 for (SoftwareVersion testVersion : versions) { 99 100 // At first: traindata == testdata 101 Instances testdata = testVersion.getInstances(); 102 Instances traindata = new Instances(testdata); 103 104 // Give the dataset a new name 105 testdata.setRelationName(testVersion.getProject()); 106 107 for (IProcessesingStrategy processor : config.getPreProcessors()) { 108 Console.traceln(Level.FINE, String 109 .format("[%s] [%02d/%02d] %s: applying preprocessor %s", 110 config.getExperimentName(), versionCount, versions.size(), 111 testVersion.getProject(), processor.getClass().getName())); 112 processor.apply(testdata, traindata); 113 } 114 115 for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) { 116 Console.traceln(Level.FINE, String 117 .format("[%s] [%02d/%02d] %s: applying pointwise selection %s", 118 config.getExperimentName(), versionCount, versions.size(), 119 testVersion.getProject(), dataselector.getClass().getName())); 120 traindata = dataselector.apply(testdata, traindata); 121 } 122 123 for (IProcessesingStrategy processor : config.getPostProcessors()) { 124 Console.traceln(Level.FINE, String 125 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 126 config.getExperimentName(), versionCount, versions.size(), 127 testVersion.getProject(), processor.getClass().getName())); 128 processor.apply(testdata, traindata); 129 } 130 131 // Trainerlist for evaluation later on 132 List<ITrainer> allTrainers = new LinkedList<>(); 133 134 for (ITrainingStrategy trainer : config.getTrainers()) { 135 136 // Add trainer to list for evaluation 137 allTrainers.add(trainer); 138 139 // Train classifier 140 trainer.apply(traindata); 141 142 if (config.getSaveClassifier()) { 143 // If classifier should be saved, train him and save him 144 // be careful with typecasting here! 
145 IWekaCompatibleTrainer trainerToSave = (IWekaCompatibleTrainer) trainer; 146 // Console.println(trainerToSave.getClassifier().toString()); 147 try { 148 weka.core.SerializationHelper.write(resultsDir.getAbsolutePath() + "/" + 149 trainer.getName() + "-" + 150 testVersion.getProject(), 151 trainerToSave.getClassifier()); 152 } 153 catch (Exception e) { 154 e.printStackTrace(); 155 } 156 157 } 158 } 159 160 for (IEvaluationStrategy evaluator : config.getEvaluators()) { 161 Console.traceln(Level.FINE, String 162 .format("[%s] [%02d/%02d] %s: applying evaluator %s", 163 config.getExperimentName(), versionCount, versions.size(), 164 testVersion.getProject(), evaluator.getClass().getName())); 165 166 if (writeHeader) { 167 evaluator.setParameter(config.getResultsPath() + "/" + 168 config.getExperimentName() + ".csv"); 169 } 170 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 171 writeHeader = false; 172 } 173 174 versionCount++; 175 176 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", 177 config.getExperimentName(), versionCount, 178 versions.size(), testVersion.getProject())); 179 180 } 181 182 } 183 155 184 } -
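Beyond the reformatting, the reusable piece of this class is how trained classifiers are persisted: the configured trainers are cast to IWekaCompatibleTrainer and their Weka classifiers are written with weka.core.SerializationHelper under <resultsDir>/<trainerName>-<project>. A minimal sketch of that save-and-restore round trip, with a made-up file name and NaiveBayes standing in for the configured trainer:

import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayes;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils.DataSource;

public class SaveClassifierSketch {
    public static void main(String[] args) throws Exception {
        // train.arff is a placeholder training set
        Instances traindata = new DataSource("train.arff").getDataSet();
        traindata.setClassIndex(traindata.numAttributes() - 1);

        Classifier classifier = new NaiveBayes();
        classifier.buildClassifier(traindata);

        // write the model, as done in the saveClassifier branch of the experiment
        SerializationHelper.write("NaiveBayes-sample.model", classifier);

        // read it back later for prediction
        Classifier restored = (Classifier) SerializationHelper.read("NaiveBayes-sample.model");
        System.out.println(restored);
    }
}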
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/CrossProjectExperiment.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.execution; 2 16 … … 25 39 26 40 /** 27 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. The steps of an experiment are as follows: 41 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. 42 * The steps of an experiment are as follows: 28 43 * <ul> 29 * <li>load the data from the provided data path</li> 30 * <li>filter the data sets according to the provided version filters</li> 31 * <li>execute the following steps for each data sets as test data that is not ignored through the test version filter: 32 * <ul> 33 * <li>filter the data sets to setup the candidate training data: 34 * <ul> 35 * <li>remove all data sets from the same project</li> 36 * <li>filter all data sets according to the training data filter 37 * </ul></li> 38 * <li>apply the setwise preprocessors</li> 39 * <li>apply the setwise data selection algorithms</li> 40 * <li>apply the setwise postprocessors</li> 41 * <li>train the setwise training classifiers</li> 42 * <li>unify all remaining training data into one data set</li> 43 * <li>apply the preprocessors</li> 44 * <li>apply the pointwise data selection algorithms</li> 45 * <li>apply the postprocessors</li> 46 * <li>train the normal classifiers</li> 47 * <li>evaluate the results for all trained classifiers on the training data</li> 48 * </ul></li> 44 * <li>load the data from the provided data path</li> 45 * <li>filter the data sets according to the provided version filters</li> 46 * <li>execute the following steps for each data sets as test data that is not ignored through the 47 * test version filter: 48 * <ul> 49 * <li>filter the data sets to setup the candidate training data: 50 * <ul> 51 * <li>remove all data sets from the same project</li> 52 * <li>filter all data sets according to the training data filter 53 * </ul> 54 * </li> 55 * <li>apply the setwise preprocessors</li> 56 * <li>apply the setwise data selection algorithms</li> 57 * <li>apply the setwise postprocessors</li> 58 * <li>train the setwise training classifiers</li> 59 * <li>unify all remaining training data into one data set</li> 60 * <li>apply the preprocessors</li> 61 * <li>apply the pointwise data selection algorithms</li> 62 * <li>apply the postprocessors</li> 63 * <li>train the normal classifiers</li> 64 * <li>evaluate the results for all trained classifiers on the training data</li> 65 * </ul> 66 * </li> 49 67 * </ul> 50 68 * 51 * Note that this class implements {@link Runnable}, i.e., each experiment can be started in its own thread. 69 * Note that this class implements {@link Runnable}, i.e., each experiment can be started in its own 70 * thread. 
71 * 52 72 * @author Steffen Herbold 53 73 */ 54 74 public class CrossProjectExperiment implements IExecutionStrategy { 55 75 56 /** 57 * configuration of the experiment 58 */ 59 private final ExperimentConfiguration config; 60 61 /** 62 * Constructor. Creates a new experiment based on a configuration. 63 * @param config configuration of the experiment 64 */ 65 public CrossProjectExperiment(ExperimentConfiguration config) { 66 this.config = config; 67 } 68 69 /** 70 * Executes the experiment with the steps as described in the class comment. 71 * @see Runnable#run() 72 */ 73 @Override 74 public void run() { 75 final List<SoftwareVersion> versions = new LinkedList<>(); 76 77 for(IVersionLoader loader : config.getLoaders()) { 78 versions.addAll(loader.load()); 79 } 80 81 for( IVersionFilter filter : config.getVersionFilters() ) { 82 filter.apply(versions); 83 } 84 boolean writeHeader = true; 85 int versionCount = 1; 86 int testVersionCount = 0; 87 88 for( SoftwareVersion testVersion : versions ) { 89 if( isVersion(testVersion, config.getTestVersionFilters()) ) { 90 testVersionCount++; 91 } 92 } 93 94 // sort versions 95 Collections.sort(versions); 96 97 for( SoftwareVersion testVersion : versions ) { 98 if( isVersion(testVersion, config.getTestVersionFilters()) ) { 99 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion())); 100 101 // Setup testdata and training data 102 Instances testdata = testVersion.getInstances(); 103 String testProject = testVersion.getProject(); 104 SetUniqueList<Instances> traindataSet = SetUniqueList.setUniqueList(new LinkedList<Instances>()); 105 for( SoftwareVersion trainingVersion : versions ) { 106 if( isVersion(trainingVersion, config.getTrainingVersionFilters()) ) { 107 if( trainingVersion!=testVersion ) { 108 if( !trainingVersion.getProject().equals(testProject) ) { 109 traindataSet.add(trainingVersion.getInstances()); 110 } 111 } 112 } 113 } 114 115 for( ISetWiseProcessingStrategy processor : config.getSetWisePreprocessors() ) { 116 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise preprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 117 processor.apply(testdata, traindataSet); 118 } 119 for( ISetWiseDataselectionStrategy dataselector : config.getSetWiseSelectors() ) { 120 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise selection %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), dataselector.getClass().getName())); 121 dataselector.apply(testdata, traindataSet); 122 } 123 for( ISetWiseProcessingStrategy processor : config.getSetWisePostprocessors() ) { 124 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 125 processor.apply(testdata, traindataSet); 126 } 127 for( ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers() ) { 128 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise trainer %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), setwiseTrainer.getName())); 129 setwiseTrainer.apply(traindataSet); 130 } 131 Instances traindata = makeSingleTrainingSet(traindataSet); 132 for( IProcessesingStrategy processor : 
config.getPreProcessors() ) { 133 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying preprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 134 processor.apply(testdata, traindata); 135 } 136 for( IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors() ) { 137 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), dataselector.getClass().getName())); 138 traindata = dataselector.apply(testdata, traindata); 139 } 140 for( IProcessesingStrategy processor : config.getPostProcessors() ) { 141 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 142 processor.apply(testdata, traindata); 143 } 144 for( ITrainingStrategy trainer : config.getTrainers() ) { 145 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying trainer %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), trainer.getName())); 146 trainer.apply(traindata); 147 } 148 File resultsDir = new File(config.getResultsPath()); 149 if (!resultsDir.exists()) { 150 resultsDir.mkdir(); 151 } 152 for( IEvaluationStrategy evaluator : config.getEvaluators() ) { 153 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying evaluator %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), evaluator.getClass().getName())); 154 List<ITrainer> allTrainers = new LinkedList<>(); 155 for( ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers() ) { 156 allTrainers.add(setwiseTrainer); 157 } 158 for( ITrainingStrategy trainer : config.getTrainers() ) { 159 allTrainers.add(trainer); 160 } 161 if( writeHeader ) { 162 evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName() + ".csv"); 163 } 164 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 165 writeHeader = false; 166 } 167 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion())); 168 versionCount++; 169 } 170 } 171 } 172 173 /** 174 * Helper method that checks if a version passes all filters. 175 * @param version version that is checked 176 * @param filters list of the filters 177 * @return true, if the version passes all filters, false otherwise 178 */ 179 private boolean isVersion(SoftwareVersion version, List<IVersionFilter> filters) { 180 boolean result = true; 181 for( IVersionFilter filter : filters) { 182 result &= !filter.apply(version); 183 } 184 return result; 185 } 186 187 /** 188 * Helper method that combines a set of Weka {@link Instances} sets into a single {@link Instances} set. 
189 * @param traindataSet set of {@link Instances} to be combines 190 * @return single {@link Instances} set 191 */ 192 public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) { 193 Instances traindataFull = null; 194 for( Instances traindata : traindataSet) { 195 if( traindataFull==null ) { 196 traindataFull = new Instances(traindata); 197 } else { 198 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 199 traindataFull.add(traindata.instance(i)); 200 } 201 } 202 } 203 return traindataFull; 204 } 76 /** 77 * configuration of the experiment 78 */ 79 private final ExperimentConfiguration config; 80 81 /** 82 * Constructor. Creates a new experiment based on a configuration. 83 * 84 * @param config 85 * configuration of the experiment 86 */ 87 public CrossProjectExperiment(ExperimentConfiguration config) { 88 this.config = config; 89 } 90 91 /** 92 * Executes the experiment with the steps as described in the class comment. 93 * 94 * @see Runnable#run() 95 */ 96 @Override 97 public void run() { 98 final List<SoftwareVersion> versions = new LinkedList<>(); 99 100 for (IVersionLoader loader : config.getLoaders()) { 101 versions.addAll(loader.load()); 102 } 103 104 for (IVersionFilter filter : config.getVersionFilters()) { 105 filter.apply(versions); 106 } 107 boolean writeHeader = true; 108 int versionCount = 1; 109 int testVersionCount = 0; 110 111 for (SoftwareVersion testVersion : versions) { 112 if (isVersion(testVersion, config.getTestVersionFilters())) { 113 testVersionCount++; 114 } 115 } 116 117 // sort versions 118 Collections.sort(versions); 119 120 for (SoftwareVersion testVersion : versions) { 121 if (isVersion(testVersion, config.getTestVersionFilters())) { 122 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting", 123 config.getExperimentName(), versionCount, 124 testVersionCount, 125 testVersion.getVersion())); 126 127 // Setup testdata and training data 128 Instances testdata = testVersion.getInstances(); 129 String testProject = testVersion.getProject(); 130 SetUniqueList<Instances> traindataSet = 131 SetUniqueList.setUniqueList(new LinkedList<Instances>()); 132 for (SoftwareVersion trainingVersion : versions) { 133 if (isVersion(trainingVersion, config.getTrainingVersionFilters())) { 134 if (trainingVersion != testVersion) { 135 if (!trainingVersion.getProject().equals(testProject)) { 136 traindataSet.add(trainingVersion.getInstances()); 137 } 138 } 139 } 140 } 141 142 for (ISetWiseProcessingStrategy processor : config.getSetWisePreprocessors()) { 143 Console.traceln(Level.FINE, String 144 .format("[%s] [%02d/%02d] %s: applying setwise preprocessor %s", 145 config.getExperimentName(), versionCount, testVersionCount, 146 testVersion.getVersion(), processor.getClass().getName())); 147 processor.apply(testdata, traindataSet); 148 } 149 for (ISetWiseDataselectionStrategy dataselector : config.getSetWiseSelectors()) { 150 Console.traceln(Level.FINE, String 151 .format("[%s] [%02d/%02d] %s: applying setwise selection %s", 152 config.getExperimentName(), versionCount, testVersionCount, 153 testVersion.getVersion(), dataselector.getClass().getName())); 154 dataselector.apply(testdata, traindataSet); 155 } 156 for (ISetWiseProcessingStrategy processor : config.getSetWisePostprocessors()) { 157 Console.traceln(Level.FINE, String 158 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 159 config.getExperimentName(), versionCount, testVersionCount, 160 testVersion.getVersion(), processor.getClass().getName())); 161 
processor.apply(testdata, traindataSet); 162 } 163 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { 164 Console.traceln(Level.FINE, String 165 .format("[%s] [%02d/%02d] %s: applying setwise trainer %s", 166 config.getExperimentName(), versionCount, testVersionCount, 167 testVersion.getVersion(), setwiseTrainer.getName())); 168 setwiseTrainer.apply(traindataSet); 169 } 170 Instances traindata = makeSingleTrainingSet(traindataSet); 171 for (IProcessesingStrategy processor : config.getPreProcessors()) { 172 Console.traceln(Level.FINE, String 173 .format("[%s] [%02d/%02d] %s: applying preprocessor %s", 174 config.getExperimentName(), versionCount, testVersionCount, 175 testVersion.getVersion(), processor.getClass().getName())); 176 processor.apply(testdata, traindata); 177 } 178 for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) 179 { 180 Console.traceln(Level.FINE, String 181 .format("[%s] [%02d/%02d] %s: applying pointwise selection %s", 182 config.getExperimentName(), versionCount, testVersionCount, 183 testVersion.getVersion(), dataselector.getClass().getName())); 184 traindata = dataselector.apply(testdata, traindata); 185 } 186 for (IProcessesingStrategy processor : config.getPostProcessors()) { 187 Console.traceln(Level.FINE, String 188 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 189 config.getExperimentName(), versionCount, testVersionCount, 190 testVersion.getVersion(), processor.getClass().getName())); 191 processor.apply(testdata, traindata); 192 } 193 for (ITrainingStrategy trainer : config.getTrainers()) { 194 Console.traceln(Level.FINE, String 195 .format("[%s] [%02d/%02d] %s: applying trainer %s", 196 config.getExperimentName(), versionCount, testVersionCount, 197 testVersion.getVersion(), trainer.getName())); 198 trainer.apply(traindata); 199 } 200 File resultsDir = new File(config.getResultsPath()); 201 if (!resultsDir.exists()) { 202 resultsDir.mkdir(); 203 } 204 for (IEvaluationStrategy evaluator : config.getEvaluators()) { 205 Console.traceln(Level.FINE, String 206 .format("[%s] [%02d/%02d] %s: applying evaluator %s", 207 config.getExperimentName(), versionCount, testVersionCount, 208 testVersion.getVersion(), evaluator.getClass().getName())); 209 List<ITrainer> allTrainers = new LinkedList<>(); 210 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { 211 allTrainers.add(setwiseTrainer); 212 } 213 for (ITrainingStrategy trainer : config.getTrainers()) { 214 allTrainers.add(trainer); 215 } 216 if (writeHeader) { 217 evaluator.setParameter(config.getResultsPath() + "/" + 218 config.getExperimentName() + ".csv"); 219 } 220 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 221 writeHeader = false; 222 } 223 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", 224 config.getExperimentName(), versionCount, 225 testVersionCount, 226 testVersion.getVersion())); 227 versionCount++; 228 } 229 } 230 } 231 232 /** 233 * Helper method that checks if a version passes all filters. 
234 * 235 * @param version 236 * version that is checked 237 * @param filters 238 * list of the filters 239 * @return true, if the version passes all filters, false otherwise 240 */ 241 private boolean isVersion(SoftwareVersion version, List<IVersionFilter> filters) { 242 boolean result = true; 243 for (IVersionFilter filter : filters) { 244 result &= !filter.apply(version); 245 } 246 return result; 247 } 248 249 /** 250 * Helper method that combines a set of Weka {@link Instances} sets into a single 251 * {@link Instances} set. 252 * 253 * @param traindataSet 254 * set of {@link Instances} to be combines 255 * @return single {@link Instances} set 256 */ 257 public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) { 258 Instances traindataFull = null; 259 for (Instances traindata : traindataSet) { 260 if (traindataFull == null) { 261 traindataFull = new Instances(traindata); 262 } 263 else { 264 for (int i = 0; i < traindata.numInstances(); i++) { 265 traindataFull.add(traindata.instance(i)); 266 } 267 } 268 } 269 return traindataFull; 270 } 205 271 } -
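makeSingleTrainingSet is the step that unifies the remaining candidate sets into one Weka data set: the header is copied from the first set and all instances of the other sets are appended. The same idea with a plain list instead of SetUniqueList, using placeholder ARFF file names (all sets must share one attribute structure):

import java.util.Arrays;
import java.util.List;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class CombineTrainingDataSketch {
    public static void main(String[] args) throws Exception {
        // placeholder file names for two training versions
        List<String> files = Arrays.asList("projectA.arff", "projectB.arff");

        Instances combined = null;
        for (String file : files) {
            Instances part = new DataSource(file).getDataSet();
            if (combined == null) {
                combined = new Instances(part); // the copy keeps the attribute header
            }
            else {
                for (int i = 0; i < part.numInstances(); i++) {
                    combined.add(part.instance(i)); // same loop as makeSingleTrainingSet
                }
            }
        }
        System.out.println("combined size: " + (combined == null ? 0 : combined.numInstances()));
    }
}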
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/IExecutionStrategy.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.execution; 2 16 … … 4 18 5 19 /** 6 * Interface that must be implemented from the different experiments 7 * (e.g.ClassifierCreationExeperiment) to be runnable by {@link Runner} 20 * Interface that must be implemented from the different experiments (e.g. 21 * ClassifierCreationExeperiment) to be runnable by {@link Runner} 8 22 * 9 23 * @author Fabian Trautsch 10 * 24 * 11 25 */ 12 public interface IExecutionStrategy extends Runnable { 26 public interface IExecutionStrategy extends Runnable { 13 27 14 28 } -
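Since the interface only gives a name to Runnable, any execution strategy can be handed to a Thread or an executor, which is what allows each configured experiment to run in its own thread. A trivial sketch (the lambda stands in for a fully configured experiment such as CrossProjectExperiment):

public class RunExperimentSketch {
    public static void main(String[] args) throws InterruptedException {
        // stands in for an IExecutionStrategy instance
        Runnable experiment = () -> System.out.println("experiment running");

        Thread thread = new Thread(experiment);
        thread.start();
        thread.join(); // wait until the experiment has finished
    }
}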
trunk/CrossPare/src/de/ugoe/cs/cpdp/execution/RelaxedCrossProjectExperiment.java
r39 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.execution; 2 16 … … 25 39 26 40 /** 27 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. The steps of an experiment are as follows: 41 * Class responsible for executing an experiment according to an {@link ExperimentConfiguration}. 42 * The steps of an experiment are as follows: 28 43 * <ul> 29 * <li>load the data from the provided data path</li> 30 * <li>filter the data sets according to the provided version filters</li> 31 * <li>execute the following steps for each data sets as test data that is not ignored through the test version filter: 32 * <ul> 33 * <li>filter the data sets to setup the candidate training data: 34 * <ul> 35 * <li>filter all data sets according to the training data filter 36 * </ul></li> 37 * <li>apply the setwise preprocessors</li> 38 * <li>apply the setwise data selection algorithms</li> 39 * <li>apply the setwise postprocessors</li> 40 * <li>train the setwise training classifiers</li> 41 * <li>unify all remaining training data into one data set</li> 42 * <li>apply the preprocessors</li> 43 * <li>apply the pointwise data selection algorithms</li> 44 * <li>apply the postprocessors</li> 45 * <li>train the normal classifiers</li> 46 * <li>evaluate the results for all trained classifiers on the training data</li> 47 * </ul></li> 44 * <li>load the data from the provided data path</li> 45 * <li>filter the data sets according to the provided version filters</li> 46 * <li>execute the following steps for each data sets as test data that is not ignored through the 47 * test version filter: 48 * <ul> 49 * <li>filter the data sets to setup the candidate training data: 50 * <ul> 51 * <li>filter all data sets according to the training data filter 52 * </ul> 53 * </li> 54 * <li>apply the setwise preprocessors</li> 55 * <li>apply the setwise data selection algorithms</li> 56 * <li>apply the setwise postprocessors</li> 57 * <li>train the setwise training classifiers</li> 58 * <li>unify all remaining training data into one data set</li> 59 * <li>apply the preprocessors</li> 60 * <li>apply the pointwise data selection algorithms</li> 61 * <li>apply the postprocessors</li> 62 * <li>train the normal classifiers</li> 63 * <li>evaluate the results for all trained classifiers on the training data</li> 64 * </ul> 65 * </li> 48 66 * </ul> 49 67 * 50 * Note that this class implements {@link Runnable}, i.e., each experiment can be started in its own thread. 68 * Note that this class implements {@link Runnable}, i.e., each experiment can be started in its own 69 * thread. 70 * 51 71 * @author Steffen Herbold 52 72 */ 53 73 public class RelaxedCrossProjectExperiment implements IExecutionStrategy { 54 74 55 /** 56 * configuration of the experiment 57 */ 58 private final ExperimentConfiguration config; 59 60 /** 61 * Constructor. 
Creates a new experiment based on a configuration. 62 * @param config configuration of the experiment 63 */ 64 public RelaxedCrossProjectExperiment(ExperimentConfiguration config) { 65 this.config = config; 66 } 67 68 /** 69 * Executes the experiment with the steps as described in the class comment. 70 * @see Runnable#run() 71 */ 72 @Override 73 public void run() { 74 final List<SoftwareVersion> versions = new LinkedList<>(); 75 76 for(IVersionLoader loader : config.getLoaders()) { 77 versions.addAll(loader.load()); 78 } 79 80 for( IVersionFilter filter : config.getVersionFilters() ) { 81 filter.apply(versions); 82 } 83 boolean writeHeader = true; 84 int versionCount = 1; 85 int testVersionCount = 0; 86 87 for( SoftwareVersion testVersion : versions ) { 88 if( isVersion(testVersion, config.getTestVersionFilters()) ) { 89 testVersionCount++; 90 } 91 } 92 93 // sort versions 94 Collections.sort(versions); 95 96 for( SoftwareVersion testVersion : versions ) { 97 if( isVersion(testVersion, config.getTestVersionFilters()) ) { 98 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion())); 99 100 // Setup testdata and training data 101 Instances testdata = testVersion.getInstances(); 102 String testProject = testVersion.getProject(); 103 SetUniqueList<Instances> traindataSet = SetUniqueList.setUniqueList(new LinkedList<Instances>()); 104 for( SoftwareVersion trainingVersion : versions ) { 105 if( isVersion(trainingVersion, config.getTrainingVersionFilters()) ) { 106 if( trainingVersion!=testVersion ) { 107 if( trainingVersion.getProject().equals(testProject) ) { 108 if( trainingVersion.compareTo(testVersion)<0 ) { 109 // only add if older 110 traindataSet.add(trainingVersion.getInstances()); 111 } 112 } else { 113 traindataSet.add(trainingVersion.getInstances()); 114 } 115 } 116 } 117 } 118 119 for( ISetWiseProcessingStrategy processor : config.getSetWisePreprocessors() ) { 120 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise preprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 121 processor.apply(testdata, traindataSet); 122 } 123 for( ISetWiseDataselectionStrategy dataselector : config.getSetWiseSelectors() ) { 124 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise selection %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), dataselector.getClass().getName())); 125 dataselector.apply(testdata, traindataSet); 126 } 127 for( ISetWiseProcessingStrategy processor : config.getSetWisePostprocessors() ) { 128 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 129 processor.apply(testdata, traindataSet); 130 } 131 for( ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers() ) { 132 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise trainer %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), setwiseTrainer.getName())); 133 setwiseTrainer.apply(traindataSet); 134 } 135 Instances traindata = makeSingleTrainingSet(traindataSet); 136 for( IProcessesingStrategy processor : config.getPreProcessors() ) { 137 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: 
applying preprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 138 processor.apply(testdata, traindata); 139 } 140 for( IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors() ) { 141 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying pointwise selection %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), dataselector.getClass().getName())); 142 traindata = dataselector.apply(testdata, traindata); 143 } 144 for( IProcessesingStrategy processor : config.getPostProcessors() ) { 145 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), processor.getClass().getName())); 146 processor.apply(testdata, traindata); 147 } 148 for( ITrainingStrategy trainer : config.getTrainers() ) { 149 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying trainer %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), trainer.getName())); 150 trainer.apply(traindata); 151 } 152 File resultsDir = new File(config.getResultsPath()); 153 if (!resultsDir.exists()) { 154 resultsDir.mkdir(); 155 } 156 for( IEvaluationStrategy evaluator : config.getEvaluators() ) { 157 Console.traceln(Level.FINE, String.format("[%s] [%02d/%02d] %s: applying evaluator %s", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion(), evaluator.getClass().getName())); 158 List<ITrainer> allTrainers = new LinkedList<>(); 159 for( ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers() ) { 160 allTrainers.add(setwiseTrainer); 161 } 162 for( ITrainingStrategy trainer : config.getTrainers() ) { 163 allTrainers.add(trainer); 164 } 165 if( writeHeader ) { 166 evaluator.setParameter(config.getResultsPath() + "/" + config.getExperimentName() + ".csv"); 167 } 168 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 169 writeHeader = false; 170 } 171 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", config.getExperimentName(), versionCount, testVersionCount, testVersion.getVersion())); 172 versionCount++; 173 } 174 } 175 } 176 177 /** 178 * Helper method that checks if a version passes all filters. 179 * @param version version that is checked 180 * @param filters list of the filters 181 * @return true, if the version passes all filters, false otherwise 182 */ 183 private boolean isVersion(SoftwareVersion version, List<IVersionFilter> filters) { 184 boolean result = true; 185 for( IVersionFilter filter : filters) { 186 result &= !filter.apply(version); 187 } 188 return result; 189 } 190 191 /** 192 * Helper method that combines a set of Weka {@link Instances} sets into a single {@link Instances} set. 
193 * @param traindataSet set of {@link Instances} to be combines 194 * @return single {@link Instances} set 195 */ 196 public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) { 197 Instances traindataFull = null; 198 for( Instances traindata : traindataSet) { 199 if( traindataFull==null ) { 200 traindataFull = new Instances(traindata); 201 } else { 202 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 203 traindataFull.add(traindata.instance(i)); 204 } 205 } 206 } 207 return traindataFull; 208 } 75 /** 76 * configuration of the experiment 77 */ 78 private final ExperimentConfiguration config; 79 80 /** 81 * Constructor. Creates a new experiment based on a configuration. 82 * 83 * @param config 84 * configuration of the experiment 85 */ 86 public RelaxedCrossProjectExperiment(ExperimentConfiguration config) { 87 this.config = config; 88 } 89 90 /** 91 * Executes the experiment with the steps as described in the class comment. 92 * 93 * @see Runnable#run() 94 */ 95 @Override 96 public void run() { 97 final List<SoftwareVersion> versions = new LinkedList<>(); 98 99 for (IVersionLoader loader : config.getLoaders()) { 100 versions.addAll(loader.load()); 101 } 102 103 for (IVersionFilter filter : config.getVersionFilters()) { 104 filter.apply(versions); 105 } 106 boolean writeHeader = true; 107 int versionCount = 1; 108 int testVersionCount = 0; 109 110 for (SoftwareVersion testVersion : versions) { 111 if (isVersion(testVersion, config.getTestVersionFilters())) { 112 testVersionCount++; 113 } 114 } 115 116 // sort versions 117 Collections.sort(versions); 118 119 for (SoftwareVersion testVersion : versions) { 120 if (isVersion(testVersion, config.getTestVersionFilters())) { 121 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: starting", 122 config.getExperimentName(), versionCount, 123 testVersionCount, 124 testVersion.getVersion())); 125 126 // Setup testdata and training data 127 Instances testdata = testVersion.getInstances(); 128 String testProject = testVersion.getProject(); 129 SetUniqueList<Instances> traindataSet = 130 SetUniqueList.setUniqueList(new LinkedList<Instances>()); 131 for (SoftwareVersion trainingVersion : versions) { 132 if (isVersion(trainingVersion, config.getTrainingVersionFilters())) { 133 if (trainingVersion != testVersion) { 134 if (trainingVersion.getProject().equals(testProject)) { 135 if (trainingVersion.compareTo(testVersion) < 0) { 136 // only add if older 137 traindataSet.add(trainingVersion.getInstances()); 138 } 139 } 140 else { 141 traindataSet.add(trainingVersion.getInstances()); 142 } 143 } 144 } 145 } 146 147 for (ISetWiseProcessingStrategy processor : config.getSetWisePreprocessors()) { 148 Console.traceln(Level.FINE, String 149 .format("[%s] [%02d/%02d] %s: applying setwise preprocessor %s", 150 config.getExperimentName(), versionCount, testVersionCount, 151 testVersion.getVersion(), processor.getClass().getName())); 152 processor.apply(testdata, traindataSet); 153 } 154 for (ISetWiseDataselectionStrategy dataselector : config.getSetWiseSelectors()) { 155 Console.traceln(Level.FINE, String 156 .format("[%s] [%02d/%02d] %s: applying setwise selection %s", 157 config.getExperimentName(), versionCount, testVersionCount, 158 testVersion.getVersion(), dataselector.getClass().getName())); 159 dataselector.apply(testdata, traindataSet); 160 } 161 for (ISetWiseProcessingStrategy processor : config.getSetWisePostprocessors()) { 162 Console.traceln(Level.FINE, String 163 .format("[%s] [%02d/%02d] %s: applying setwise 
postprocessor %s", 164 config.getExperimentName(), versionCount, testVersionCount, 165 testVersion.getVersion(), processor.getClass().getName())); 166 processor.apply(testdata, traindataSet); 167 } 168 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { 169 Console.traceln(Level.FINE, String 170 .format("[%s] [%02d/%02d] %s: applying setwise trainer %s", 171 config.getExperimentName(), versionCount, testVersionCount, 172 testVersion.getVersion(), setwiseTrainer.getName())); 173 setwiseTrainer.apply(traindataSet); 174 } 175 Instances traindata = makeSingleTrainingSet(traindataSet); 176 for (IProcessesingStrategy processor : config.getPreProcessors()) { 177 Console.traceln(Level.FINE, String 178 .format("[%s] [%02d/%02d] %s: applying preprocessor %s", 179 config.getExperimentName(), versionCount, testVersionCount, 180 testVersion.getVersion(), processor.getClass().getName())); 181 processor.apply(testdata, traindata); 182 } 183 for (IPointWiseDataselectionStrategy dataselector : config.getPointWiseSelectors()) 184 { 185 Console.traceln(Level.FINE, String 186 .format("[%s] [%02d/%02d] %s: applying pointwise selection %s", 187 config.getExperimentName(), versionCount, testVersionCount, 188 testVersion.getVersion(), dataselector.getClass().getName())); 189 traindata = dataselector.apply(testdata, traindata); 190 } 191 for (IProcessesingStrategy processor : config.getPostProcessors()) { 192 Console.traceln(Level.FINE, String 193 .format("[%s] [%02d/%02d] %s: applying setwise postprocessor %s", 194 config.getExperimentName(), versionCount, testVersionCount, 195 testVersion.getVersion(), processor.getClass().getName())); 196 processor.apply(testdata, traindata); 197 } 198 for (ITrainingStrategy trainer : config.getTrainers()) { 199 Console.traceln(Level.FINE, String 200 .format("[%s] [%02d/%02d] %s: applying trainer %s", 201 config.getExperimentName(), versionCount, testVersionCount, 202 testVersion.getVersion(), trainer.getName())); 203 trainer.apply(traindata); 204 } 205 File resultsDir = new File(config.getResultsPath()); 206 if (!resultsDir.exists()) { 207 resultsDir.mkdir(); 208 } 209 for (IEvaluationStrategy evaluator : config.getEvaluators()) { 210 Console.traceln(Level.FINE, String 211 .format("[%s] [%02d/%02d] %s: applying evaluator %s", 212 config.getExperimentName(), versionCount, testVersionCount, 213 testVersion.getVersion(), evaluator.getClass().getName())); 214 List<ITrainer> allTrainers = new LinkedList<>(); 215 for (ISetWiseTrainingStrategy setwiseTrainer : config.getSetWiseTrainers()) { 216 allTrainers.add(setwiseTrainer); 217 } 218 for (ITrainingStrategy trainer : config.getTrainers()) { 219 allTrainers.add(trainer); 220 } 221 if (writeHeader) { 222 evaluator.setParameter(config.getResultsPath() + "/" + 223 config.getExperimentName() + ".csv"); 224 } 225 evaluator.apply(testdata, traindata, allTrainers, writeHeader); 226 writeHeader = false; 227 } 228 Console.traceln(Level.INFO, String.format("[%s] [%02d/%02d] %s: finished", 229 config.getExperimentName(), versionCount, 230 testVersionCount, 231 testVersion.getVersion())); 232 versionCount++; 233 } 234 } 235 } 236 237 /** 238 * Helper method that checks if a version passes all filters. 
239 * 240 * @param version 241 * version that is checked 242 * @param filters 243 * list of the filters 244 * @return true, if the version passes all filters, false otherwise 245 */ 246 private boolean isVersion(SoftwareVersion version, List<IVersionFilter> filters) { 247 boolean result = true; 248 for (IVersionFilter filter : filters) { 249 result &= !filter.apply(version); 250 } 251 return result; 252 } 253 254 /** 255 * Helper method that combines a set of Weka {@link Instances} sets into a single 256 * {@link Instances} set. 257 * 258 * @param traindataSet 259 * set of {@link Instances} to be combines 260 * @return single {@link Instances} set 261 */ 262 public static Instances makeSingleTrainingSet(SetUniqueList<Instances> traindataSet) { 263 Instances traindataFull = null; 264 for (Instances traindata : traindataSet) { 265 if (traindataFull == null) { 266 traindataFull = new Instances(traindata); 267 } 268 else { 269 for (int i = 0; i < traindata.numInstances(); i++) { 270 traindataFull.add(traindata.instance(i)); 271 } 272 } 273 } 274 return traindataFull; 275 } 209 276 } -
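Compared to CrossProjectExperiment, the substantive difference is the candidate selection: data from the same project is no longer excluded outright but admitted when the training version is older than the test version. A reduced sketch of that rule, with SoftwareVersion shrunk to a project name plus a comparable version string and made-up project data:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class RelaxedCandidateRuleSketch {

    static class Version implements Comparable<Version> {
        final String project;
        final String version;

        Version(String project, String version) {
            this.project = project;
            this.version = version;
        }

        @Override
        public int compareTo(Version other) {
            return version.compareTo(other.version);
        }

        @Override
        public String toString() {
            return project + "-" + version;
        }
    }

    public static void main(String[] args) {
        Version test = new Version("ant", "1.6");
        List<Version> all = Arrays.asList(new Version("ant", "1.5"), new Version("ant", "1.7"),
                                          new Version("camel", "1.4"));

        List<Version> candidates = new ArrayList<>();
        for (Version candidate : all) {
            if (candidate == test) {
                continue;
            }
            if (candidate.project.equals(test.project)) {
                if (candidate.compareTo(test) < 0) {
                    candidates.add(candidate); // same project: only older versions
                }
            }
            else {
                candidates.add(candidate); // other projects: always allowed
            }
        }
        System.out.println(candidates); // prints [ant-1.5, camel-1.4]
    }
}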
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/ARFFFolderLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 9 23 public class ARFFFolderLoader extends AbstractFolderLoader { 10 24 11 /* 12 * (non-Javadoc) 13 * 14 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 15 */ 16 @Override 17 protected SingleVersionLoader getSingleLoader() { 18 return new ARFFLoader(); 19 } 25 /* 26 * (non-Javadoc) 27 * 28 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 29 */ 30 @Override 31 protected SingleVersionLoader getSingleLoader() { 32 return new ARFFLoader(); 33 } 20 34 } -
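ARFFFolderLoader only wires the folder-level loading to the single-file ARFFLoader below. A hand-rolled sketch of what that wiring amounts to, iterating over a placeholder folder data/ and delegating to ARFFLoader for every file that passes its name filter (this mirrors the idea of AbstractFolderLoader, it is not its actual code):

import java.io.File;

import de.ugoe.cs.cpdp.loader.ARFFLoader;
import weka.core.Instances;

public class FolderLoadSketch {
    public static void main(String[] args) {
        File folder = new File("data"); // placeholder folder with one ARFF file per version
        ARFFLoader loader = new ARFFLoader();

        File[] files = folder.listFiles();
        if (files == null) {
            return; // not a directory
        }
        for (File file : files) {
            if (loader.filenameFilter(file.getName())) { // accepts *.arff only
                Instances data = loader.load(file);
                System.out.println(file.getName() + ": " + data.numInstances() + " instances");
            }
        }
    }
}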
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/ARFFLoader.java
r6 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 15 29 public class ARFFLoader implements SingleVersionLoader { 16 30 17 /* 18 * (non-Javadoc) 19 * 20 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 21 */ 22 @Override 23 public Instances load(File file) { 24 BufferedReader reader; 25 Instances data; 26 try { 27 reader = new BufferedReader(new FileReader(file)); 28 data = new Instances(reader); 29 reader.close(); 30 } catch (IOException e) { 31 throw new RuntimeException("error reading file: " + file.getName(), e); 32 } 31 /* 32 * (non-Javadoc) 33 * 34 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 35 */ 36 @Override 37 public Instances load(File file) { 38 BufferedReader reader; 39 Instances data; 40 try { 41 reader = new BufferedReader(new FileReader(file)); 42 data = new Instances(reader); 43 reader.close(); 44 } 45 catch (IOException e) { 46 throw new RuntimeException("error reading file: " + file.getName(), e); 47 } 33 48 34 // setting class attribute35 data.setClassIndex(data.numAttributes() - 1);49 // setting class attribute 50 data.setClassIndex(data.numAttributes() - 1); 36 51 37 return data;38 }52 return data; 53 } 39 54 40 /* 41 * (non-Javadoc) 42 * 43 * @see 44 * de.ugoe.cs.cpdp.loader.SingleVersionLoader#filenameFilter(java.lang.String 45 * ) 46 */ 47 @Override 48 public boolean filenameFilter(String filename) { 49 return filename.endsWith(".arff"); 50 } 55 /* 56 * (non-Javadoc) 57 * 58 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#filenameFilter(java.lang.String ) 59 */ 60 @Override 61 public boolean filenameFilter(String filename) { 62 return filename.endsWith(".arff"); 63 } 51 64 52 65 } -
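The loader relies on two Weka idioms: Instances can be constructed directly from a Reader over an ARFF file, and the class attribute is not set automatically, so the loader declares the last attribute as the class. The same two steps in isolation, with data.arff as a placeholder path:

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import weka.core.Instances;

public class ArffReadSketch {
    public static void main(String[] args) {
        Instances data;
        try (BufferedReader reader = new BufferedReader(new FileReader("data.arff"))) {
            data = new Instances(reader); // parses the ARFF header and body
        }
        catch (IOException e) {
            throw new RuntimeException("error reading file: data.arff", e);
        }
        data.setClassIndex(data.numAttributes() - 1); // the class label is the last attribute
        System.out.println(data.numInstances() + " instances, class = " +
            data.classAttribute().name());
    }
}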
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIChangeFolderLoader.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 3 17 public class AUDIChangeFolderLoader extends AbstractFolderLoader { 4 18 5 /* 6 * (non-Javadoc) 7 * 8 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 9 */ 10 @Override 11 protected SingleVersionLoader getSingleLoader() { 12 return new AUDIChangeLoader(); 13 } 19 /* 20 * (non-Javadoc) 21 * 22 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 23 */ 24 @Override 25 protected SingleVersionLoader getSingleLoader() { 26 return new AUDIChangeLoader(); 27 } 14 28 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIChangeLoader.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 15 29 /** 16 30 * TODO 31 * 17 32 * @author sherbold 18 * 33 * 19 34 */ 20 35 class AUDIChangeLoader implements SingleVersionLoader { 21 22 private class EntityRevisionPair implements Comparable<EntityRevisionPair> { 23 private final String entity; 24 private final int revision; 25 26 public EntityRevisionPair(String entity, int revision) { 27 this.entity = entity; 28 this.revision = revision; 29 } 30 31 @Override 32 public boolean equals(Object other) { 33 if( !(other instanceof EntityRevisionPair) ) { 34 return false; 35 } else { 36 return compareTo((EntityRevisionPair) other)==0; 37 } 38 } 39 40 @Override 41 public int hashCode() { 42 return entity.hashCode()+revision; 43 } 44 45 @Override 46 public int compareTo(EntityRevisionPair other) { 47 int strCmp = this.entity.compareTo(other.entity); 48 if( strCmp!=0 ) { 49 return strCmp; 50 } 51 return Integer.compare(revision, other.revision); 52 } 53 54 @Override 55 public String toString() { 56 return entity+"@"+revision; 57 } 58 } 59 60 @Override 61 public Instances load(File file) { 62 final String[] lines; 63 String[] lineSplit; 64 String[] lineSplitBug; 65 66 try { 67 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 68 } catch (IOException e) { 69 throw new RuntimeException(e); 70 } 71 72 // information about bugs are in another file 73 String path = file.getAbsolutePath(); 74 path = path.substring(0, path.length()-14) + "repro.csv"; 75 final String[] linesBug; 76 try { 77 linesBug = FileTools.getLinesFromFile(path); 78 } catch (IOException e) { 79 throw new RuntimeException(e); 80 } 81 82 int revisionIndex=-1; 83 int bugIndex=-1; 84 lineSplitBug = linesBug[0].split(";"); 85 for( int j=0; j<lineSplitBug.length ; j++ ) { 86 if( lineSplitBug[j].equals("svnrev") ) { 87 revisionIndex=j; 88 } 89 if( lineSplitBug[j].equals("num_bugs_trace") ) { 90 bugIndex=j; 91 } 92 } 93 if( revisionIndex<0 ) { 94 throw new RuntimeException("could not find SVN revisions"); 95 } 96 if( bugIndex<0 ) { 97 throw new RuntimeException("could not find bug information"); 98 } 99 100 int metricsStartIndex=-1; 101 int metricsEndIndex=-1; 102 lineSplit = lines[0].split(";"); 103 for( int j=0; j<lineSplit.length ; j++ ) { 104 if( lineSplit[j].equals("lm_LOC") ) { 105 metricsStartIndex=j; 106 } 107 if( lineSplit[j].equals("h_E") ) { 108 metricsEndIndex=j; 109 } 110 } 111 if( metricsStartIndex<0 ) { 112 throw new RuntimeException("could not find first metric, i.e., lm_LOC"); 113 } 114 if( metricsEndIndex<0 ) { 115 throw new RuntimeException("could not find last metric, i.e., h_E"); 116 } 117 int numMetrics = metricsEndIndex-metricsStartIndex+1; 118 119 // create sets of all filenames and revisions 120 SortedMap<EntityRevisionPair, Integer> entityRevisionPairs = new TreeMap<>(); 121 for( int i=1; i<linesBug.length ; i++ ) { 122 lineSplitBug = linesBug[i].split(";"); 123 
entityRevisionPairs.put(new EntityRevisionPair(lineSplitBug[0], Integer.parseInt(lineSplitBug[revisionIndex])), i); 124 } 125 126 127 // prepare weka instances 128 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 129 lineSplit = lines[0].split(";"); 130 for (int j = metricsStartIndex; j<=metricsEndIndex; j++) { 131 atts.add(new Attribute(lineSplit[j]+"_delta")); 132 } 133 for (int j = metricsStartIndex; j<=metricsEndIndex; j++) { 134 atts.add(new Attribute(lineSplit[j]+"_abs")); 135 } 136 final ArrayList<String> classAttVals = new ArrayList<String>(); 137 classAttVals.add("0"); 138 classAttVals.add("1"); 139 final Attribute classAtt = new Attribute("bug", classAttVals); 140 atts.add(classAtt); 141 142 final Instances data = new Instances(file.getName(), atts, 0); 143 data.setClass(classAtt); 144 145 // create data 146 String lastFile = null; 147 double[] lastValues = null; 148 int lastNumBugs = 0; 149 for( Entry<EntityRevisionPair, Integer> entry : entityRevisionPairs.entrySet() ) { 150 try { 151 // first get values 152 lineSplit = lines[entry.getValue()].split(";"); 153 lineSplitBug = linesBug[entry.getValue()].split(";"); 154 int i=0; 155 double[] values = new double[numMetrics]; 156 for(int j=metricsStartIndex ; j<=metricsEndIndex ; j++ ) { 157 values[i] = Double.parseDouble(lineSplit[j]); 158 i++; 159 } 160 int numBugs = Integer.parseInt(lineSplitBug[bugIndex]); 161 162 // then check if an entity must be created 163 if( entry.getKey().entity.equals(lastFile)) { 164 // create new instance 165 double[] instanceValues = new double[2*numMetrics+1]; 166 for( int j=0; j<numMetrics; j++ ) { 167 instanceValues[j] = values[j]-lastValues[j]; 168 instanceValues[j+numMetrics]= values[j]; 169 } 170 // check if any value>0 171 boolean changeOccured = false; 172 for( int j=0; j<numMetrics; j++ ) { 173 if( instanceValues[j]>0 ) { 174 changeOccured = true; 175 } 176 } 177 if( changeOccured ) { 178 instanceValues[instanceValues.length-1] = numBugs<=lastNumBugs ? 
0 : 1; 179 data.add(new DenseInstance(1.0, instanceValues)); 180 } 181 } 182 lastFile = entry.getKey().entity; 183 lastValues = values; 184 lastNumBugs = numBugs; 185 } catch(IllegalArgumentException e) { 186 System.err.println("error in line " + entry.getValue() + ": " + e.getMessage()); 187 System.err.println("metrics line: " + lines[entry.getValue()]); 188 System.err.println("bugs line: " + linesBug[entry.getValue()]); 189 System.err.println("line is ignored"); 190 } 191 } 192 193 return data; 194 } 195 196 /* 197 * (non-Javadoc) 198 * 199 * @see 200 * de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( 201 * java.io.File) 202 */ 203 204 public Instances load(File file, String dummy) { 205 final String[] lines; 206 try { 207 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 208 } catch (IOException e) { 209 throw new RuntimeException(e); 210 } 211 212 // information about bugs are in another file 213 String path = file.getAbsolutePath(); 214 path = path.substring(0, path.length()-14) + "repro.csv"; 215 final String[] linesBug; 216 try { 217 linesBug = FileTools.getLinesFromFile(path); 218 } catch (IOException e) { 219 throw new RuntimeException(e); 220 } 221 222 // configure Instances 223 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 224 225 String[] lineSplit = lines[0].split(";"); 226 // ignore first three/four and last two columns 227 int offset; 228 if( lineSplit[3].equals("project_rev") ) { 229 offset = 4; 230 } else { 231 offset = 3; 232 } 233 for (int j = 0; j < lineSplit.length - (offset+2); j++) { 234 atts.add(new Attribute(lineSplit[j + offset])); 235 } 236 final ArrayList<String> classAttVals = new ArrayList<String>(); 237 classAttVals.add("0"); 238 classAttVals.add("1"); 239 final Attribute classAtt = new Attribute("bug", classAttVals); 240 atts.add(classAtt); 241 242 final Instances data = new Instances(file.getName(), atts, 0); 243 data.setClass(classAtt); 244 245 // fetch data 246 for (int i = 1; i < lines.length; i++) { 247 boolean validInstance = true; 248 lineSplit = lines[i].split(";"); 249 String[] lineSplitBug = linesBug[i].split(";"); 250 double[] values = new double[data.numAttributes()]; 251 for (int j = 0; validInstance && j < values.length-1; j++) { 252 if( lineSplit[j + offset].trim().isEmpty() ) { 253 validInstance = false; 254 } else { 255 values[j] = Double.parseDouble(lineSplit[j + offset].trim()); 256 } 257 } 258 if( offset==3 ) { 259 values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1; 260 } else { 261 values[values.length - 1] = lineSplitBug[8].equals("0") ? 
0 : 1; 262 } 263 264 if( validInstance ) { 265 data.add(new DenseInstance(1.0, values)); 266 } else { 267 System.out.println("instance " + i + " is invalid"); 268 } 269 } 270 return data; 271 } 272 273 /* 274 * (non-Javadoc) 275 * 276 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 277 * filenameFilter(java.lang.String) 278 */ 279 @Override 280 public boolean filenameFilter(String filename) { 281 return filename.endsWith("src.csv"); 282 } 36 37 private class EntityRevisionPair implements Comparable<EntityRevisionPair> { 38 private final String entity; 39 private final int revision; 40 41 public EntityRevisionPair(String entity, int revision) { 42 this.entity = entity; 43 this.revision = revision; 44 } 45 46 @Override 47 public boolean equals(Object other) { 48 if (!(other instanceof EntityRevisionPair)) { 49 return false; 50 } 51 else { 52 return compareTo((EntityRevisionPair) other) == 0; 53 } 54 } 55 56 @Override 57 public int hashCode() { 58 return entity.hashCode() + revision; 59 } 60 61 @Override 62 public int compareTo(EntityRevisionPair other) { 63 int strCmp = this.entity.compareTo(other.entity); 64 if (strCmp != 0) { 65 return strCmp; 66 } 67 return Integer.compare(revision, other.revision); 68 } 69 70 @Override 71 public String toString() { 72 return entity + "@" + revision; 73 } 74 } 75 76 @Override 77 public Instances load(File file) { 78 final String[] lines; 79 String[] lineSplit; 80 String[] lineSplitBug; 81 82 try { 83 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 84 } 85 catch (IOException e) { 86 throw new RuntimeException(e); 87 } 88 89 // information about bugs are in another file 90 String path = file.getAbsolutePath(); 91 path = path.substring(0, path.length() - 14) + "repro.csv"; 92 final String[] linesBug; 93 try { 94 linesBug = FileTools.getLinesFromFile(path); 95 } 96 catch (IOException e) { 97 throw new RuntimeException(e); 98 } 99 100 int revisionIndex = -1; 101 int bugIndex = -1; 102 lineSplitBug = linesBug[0].split(";"); 103 for (int j = 0; j < lineSplitBug.length; j++) { 104 if (lineSplitBug[j].equals("svnrev")) { 105 revisionIndex = j; 106 } 107 if (lineSplitBug[j].equals("num_bugs_trace")) { 108 bugIndex = j; 109 } 110 } 111 if (revisionIndex < 0) { 112 throw new RuntimeException("could not find SVN revisions"); 113 } 114 if (bugIndex < 0) { 115 throw new RuntimeException("could not find bug information"); 116 } 117 118 int metricsStartIndex = -1; 119 int metricsEndIndex = -1; 120 lineSplit = lines[0].split(";"); 121 for (int j = 0; j < lineSplit.length; j++) { 122 if (lineSplit[j].equals("lm_LOC")) { 123 metricsStartIndex = j; 124 } 125 if (lineSplit[j].equals("h_E")) { 126 metricsEndIndex = j; 127 } 128 } 129 if (metricsStartIndex < 0) { 130 throw new RuntimeException("could not find first metric, i.e., lm_LOC"); 131 } 132 if (metricsEndIndex < 0) { 133 throw new RuntimeException("could not find last metric, i.e., h_E"); 134 } 135 int numMetrics = metricsEndIndex - metricsStartIndex + 1; 136 137 // create sets of all filenames and revisions 138 SortedMap<EntityRevisionPair, Integer> entityRevisionPairs = new TreeMap<>(); 139 for (int i = 1; i < linesBug.length; i++) { 140 lineSplitBug = linesBug[i].split(";"); 141 entityRevisionPairs.put(new EntityRevisionPair(lineSplitBug[0], Integer 142 .parseInt(lineSplitBug[revisionIndex])), i); 143 } 144 145 // prepare weka instances 146 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 147 lineSplit = lines[0].split(";"); 148 for (int j = metricsStartIndex; j <= 
metricsEndIndex; j++) { 149 atts.add(new Attribute(lineSplit[j] + "_delta")); 150 } 151 for (int j = metricsStartIndex; j <= metricsEndIndex; j++) { 152 atts.add(new Attribute(lineSplit[j] + "_abs")); 153 } 154 final ArrayList<String> classAttVals = new ArrayList<String>(); 155 classAttVals.add("0"); 156 classAttVals.add("1"); 157 final Attribute classAtt = new Attribute("bug", classAttVals); 158 atts.add(classAtt); 159 160 final Instances data = new Instances(file.getName(), atts, 0); 161 data.setClass(classAtt); 162 163 // create data 164 String lastFile = null; 165 double[] lastValues = null; 166 int lastNumBugs = 0; 167 for (Entry<EntityRevisionPair, Integer> entry : entityRevisionPairs.entrySet()) { 168 try { 169 // first get values 170 lineSplit = lines[entry.getValue()].split(";"); 171 lineSplitBug = linesBug[entry.getValue()].split(";"); 172 int i = 0; 173 double[] values = new double[numMetrics]; 174 for (int j = metricsStartIndex; j <= metricsEndIndex; j++) { 175 values[i] = Double.parseDouble(lineSplit[j]); 176 i++; 177 } 178 int numBugs = Integer.parseInt(lineSplitBug[bugIndex]); 179 180 // then check if an entity must be created 181 if (entry.getKey().entity.equals(lastFile)) { 182 // create new instance 183 double[] instanceValues = new double[2 * numMetrics + 1]; 184 for (int j = 0; j < numMetrics; j++) { 185 instanceValues[j] = values[j] - lastValues[j]; 186 instanceValues[j + numMetrics] = values[j]; 187 } 188 // check if any value>0 189 boolean changeOccured = false; 190 for (int j = 0; j < numMetrics; j++) { 191 if (instanceValues[j] > 0) { 192 changeOccured = true; 193 } 194 } 195 if (changeOccured) { 196 instanceValues[instanceValues.length - 1] = numBugs <= lastNumBugs ? 0 : 1; 197 data.add(new DenseInstance(1.0, instanceValues)); 198 } 199 } 200 lastFile = entry.getKey().entity; 201 lastValues = values; 202 lastNumBugs = numBugs; 203 } 204 catch (IllegalArgumentException e) { 205 System.err.println("error in line " + entry.getValue() + ": " + e.getMessage()); 206 System.err.println("metrics line: " + lines[entry.getValue()]); 207 System.err.println("bugs line: " + linesBug[entry.getValue()]); 208 System.err.println("line is ignored"); 209 } 210 } 211 212 return data; 213 } 214 215 /* 216 * (non-Javadoc) 217 * 218 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( java.io.File) 219 */ 220 221 public Instances load(File file, String dummy) { 222 final String[] lines; 223 try { 224 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 225 } 226 catch (IOException e) { 227 throw new RuntimeException(e); 228 } 229 230 // information about bugs are in another file 231 String path = file.getAbsolutePath(); 232 path = path.substring(0, path.length() - 14) + "repro.csv"; 233 final String[] linesBug; 234 try { 235 linesBug = FileTools.getLinesFromFile(path); 236 } 237 catch (IOException e) { 238 throw new RuntimeException(e); 239 } 240 241 // configure Instances 242 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 243 244 String[] lineSplit = lines[0].split(";"); 245 // ignore first three/four and last two columns 246 int offset; 247 if (lineSplit[3].equals("project_rev")) { 248 offset = 4; 249 } 250 else { 251 offset = 3; 252 } 253 for (int j = 0; j < lineSplit.length - (offset + 2); j++) { 254 atts.add(new Attribute(lineSplit[j + offset])); 255 } 256 final ArrayList<String> classAttVals = new ArrayList<String>(); 257 classAttVals.add("0"); 258 classAttVals.add("1"); 259 final Attribute classAtt = new Attribute("bug", 
classAttVals); 260 atts.add(classAtt); 261 262 final Instances data = new Instances(file.getName(), atts, 0); 263 data.setClass(classAtt); 264 265 // fetch data 266 for (int i = 1; i < lines.length; i++) { 267 boolean validInstance = true; 268 lineSplit = lines[i].split(";"); 269 String[] lineSplitBug = linesBug[i].split(";"); 270 double[] values = new double[data.numAttributes()]; 271 for (int j = 0; validInstance && j < values.length - 1; j++) { 272 if (lineSplit[j + offset].trim().isEmpty()) { 273 validInstance = false; 274 } 275 else { 276 values[j] = Double.parseDouble(lineSplit[j + offset].trim()); 277 } 278 } 279 if (offset == 3) { 280 values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1; 281 } 282 else { 283 values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1; 284 } 285 286 if (validInstance) { 287 data.add(new DenseInstance(1.0, values)); 288 } 289 else { 290 System.out.println("instance " + i + " is invalid"); 291 } 292 } 293 return data; 294 } 295 296 /* 297 * (non-Javadoc) 298 * 299 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 300 * filenameFilter(java.lang.String) 301 */ 302 @Override 303 public boolean filenameFilter(String filename) { 304 return filename.endsWith("src.csv"); 305 } 283 306 284 307 } -
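Note: the instance layout produced by AUDIChangeLoader.load(File) is hard to follow in the reflowed diff above. The following standalone sketch (not part of the changeset; metric values and bug counts are made up for illustration) reproduces the core arithmetic: for two consecutive revisions of the same entity it stores the metric deltas first, the absolute values second, and the binary bug label last, and only keeps rows where at least one metric increased.

// Standalone sketch of the change-level instance layout built by AUDIChangeLoader.
public class ChangeInstanceLayoutSketch {
    public static void main(String[] args) {
        double[] previous = { 100, 10, 2 }; // metrics of an entity at revision r (illustrative)
        double[] current = { 120, 10, 3 };  // metrics of the same entity at revision r+1
        int lastNumBugs = 0;                // bugs traced to revision r
        int numBugs = 1;                    // bugs traced to revision r+1
        int numMetrics = previous.length;

        double[] instanceValues = new double[2 * numMetrics + 1];
        boolean changeOccured = false;
        for (int j = 0; j < numMetrics; j++) {
            instanceValues[j] = current[j] - previous[j]; // the *_delta attributes
            instanceValues[j + numMetrics] = current[j];  // the *_abs attributes
            if (instanceValues[j] > 0) {
                changeOccured = true;                     // at least one metric grew
            }
        }
        // class attribute "bug": 1 only if the number of traced bugs increased
        instanceValues[instanceValues.length - 1] = numBugs <= lastNumBugs ? 0 : 1;

        if (changeOccured) {
            System.out.println(java.util.Arrays.toString(instanceValues));
        }
    }
}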
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIDataLoader.java
r35 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 12 26 /** 13 27 * TODO 28 * 14 29 * @author sherbold 15 * 30 * 16 31 */ 17 32 class AUDIDataLoader implements SingleVersionLoader { 18 33 19 /* 20 * (non-Javadoc) 21 * 22 * @see 23 * de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( 24 * java.io.File) 25 */ 26 @Override 27 public Instances load(File file) { 28 final String[] lines; 29 try { 30 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 31 } catch (IOException e) { 32 throw new RuntimeException(e); 33 } 34 35 // information about bugs are in another file 36 String path = file.getAbsolutePath(); 37 path = path.substring(0, path.length()-14) + "repro.csv"; 38 final String[] linesBug; 39 try { 40 linesBug = FileTools.getLinesFromFile(path); 41 } catch (IOException e) { 42 throw new RuntimeException(e); 43 } 44 45 // configure Instances 46 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 34 /* 35 * (non-Javadoc) 36 * 37 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( java.io.File) 38 */ 39 @Override 40 public Instances load(File file) { 41 final String[] lines; 42 try { 43 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 44 } 45 catch (IOException e) { 46 throw new RuntimeException(e); 47 } 47 48 48 String[] lineSplit = lines[0].split(";"); 49 // ignore first three/four and last two columns 50 int offset; 51 if( lineSplit[3].equals("project_rev") ) { 52 offset = 4; 53 } else { 54 offset = 3; 55 } 56 for (int j = 0; j < lineSplit.length - (offset+2); j++) { 57 atts.add(new Attribute(lineSplit[j + offset])); 58 } 59 final ArrayList<String> classAttVals = new ArrayList<String>(); 60 classAttVals.add("0"); 61 classAttVals.add("1"); 62 final Attribute classAtt = new Attribute("bug", classAttVals); 63 atts.add(classAtt); 49 // information about bugs are in another file 50 String path = file.getAbsolutePath(); 51 path = path.substring(0, path.length() - 14) + "repro.csv"; 52 final String[] linesBug; 53 try { 54 linesBug = FileTools.getLinesFromFile(path); 55 } 56 catch (IOException e) { 57 throw new RuntimeException(e); 58 } 64 59 65 final Instances data = new Instances(file.getName(), atts, 0); 66 data.setClass(classAtt);60 // configure Instances 61 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 67 62 68 // fetch data 69 for (int i = 1; i < lines.length; i++) { 70 boolean validInstance = true; 71 lineSplit = lines[i].split(";"); 72 String[] lineSplitBug = linesBug[i].split(";"); 73 double[] values = new double[data.numAttributes()]; 74 for (int j = 0; validInstance && j < values.length-1; j++) { 75 if( lineSplit[j + offset].trim().isEmpty() ) { 76 validInstance = false; 77 } else { 78 values[j] = Double.parseDouble(lineSplit[j + offset].trim()); 79 } 80 } 81 if( offset==3 ) { 82 values[values.length - 1] = lineSplitBug[7].equals("0") ? 
0 : 1; 83 } else { 84 values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1; 85 } 86 87 if( validInstance ) { 88 data.add(new DenseInstance(1.0, values)); 89 } else { 90 System.out.println("instance " + i + " is invalid"); 91 } 92 } 93 return data; 94 } 63 String[] lineSplit = lines[0].split(";"); 64 // ignore first three/four and last two columns 65 int offset; 66 if (lineSplit[3].equals("project_rev")) { 67 offset = 4; 68 } 69 else { 70 offset = 3; 71 } 72 for (int j = 0; j < lineSplit.length - (offset + 2); j++) { 73 atts.add(new Attribute(lineSplit[j + offset])); 74 } 75 final ArrayList<String> classAttVals = new ArrayList<String>(); 76 classAttVals.add("0"); 77 classAttVals.add("1"); 78 final Attribute classAtt = new Attribute("bug", classAttVals); 79 atts.add(classAtt); 95 80 96 /* 97 * (non-Javadoc) 98 * 99 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 100 * filenameFilter(java.lang.String) 101 */ 102 @Override 103 public boolean filenameFilter(String filename) { 104 return filename.endsWith("src.csv"); 105 } 81 final Instances data = new Instances(file.getName(), atts, 0); 82 data.setClass(classAtt); 83 84 // fetch data 85 for (int i = 1; i < lines.length; i++) { 86 boolean validInstance = true; 87 lineSplit = lines[i].split(";"); 88 String[] lineSplitBug = linesBug[i].split(";"); 89 double[] values = new double[data.numAttributes()]; 90 for (int j = 0; validInstance && j < values.length - 1; j++) { 91 if (lineSplit[j + offset].trim().isEmpty()) { 92 validInstance = false; 93 } 94 else { 95 values[j] = Double.parseDouble(lineSplit[j + offset].trim()); 96 } 97 } 98 if (offset == 3) { 99 values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1; 100 } 101 else { 102 values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1; 103 } 104 105 if (validInstance) { 106 data.add(new DenseInstance(1.0, values)); 107 } 108 else { 109 System.out.println("instance " + i + " is invalid"); 110 } 111 } 112 return data; 113 } 114 115 /* 116 * (non-Javadoc) 117 * 118 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 119 * filenameFilter(java.lang.String) 120 */ 121 @Override 122 public boolean filenameFilter(String filename) { 123 return filename.endsWith("src.csv"); 124 } 106 125 107 126 } -
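Note: both AUDI loaders read a second file next to the metrics file: the bug data comes from a sibling repro.csv whose path is derived by cutting the last 14 characters off the metrics path, and the number of skipped leading columns depends on whether the fourth header column is project_rev. A small sketch of that derivation; the file name and column names are hypothetical, since the real AUDI naming scheme is not part of this changeset.

import java.io.File;

public class AudiFilePairingSketch {
    public static void main(String[] args) {
        // hypothetical metrics file; the loader only assumes a 14-character suffix
        // (here "x_size_src.csv") that gets replaced by "repro.csv"
        File metricsFile = new File("/data/audi/projectA/x_size_src.csv");
        String path = metricsFile.getAbsolutePath();
        String bugPath = path.substring(0, path.length() - 14) + "repro.csv";
        System.out.println("metrics: " + path);
        System.out.println("bugs:    " + bugPath);

        // the header decides how many leading columns are skipped: 4 if the fourth
        // column is "project_rev", otherwise 3; the last two columns are ignored as well
        String header = "file;project;lang;project_rev;lm_LOC;h_E;foo;bar"; // illustrative
        String[] lineSplit = header.split(";");
        int offset = lineSplit[3].equals("project_rev") ? 4 : 3;
        System.out.println("metric columns start at index " + offset);
    }
}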
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AUDIFolderLoader.java
r35 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 3 17 public class AUDIFolderLoader extends AbstractFolderLoader { 4 18 5 /*6 * (non-Javadoc)7 *8 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader()9 */10 @Override11 protected SingleVersionLoader getSingleLoader() {12 return new AUDIDataLoader();13 }19 /* 20 * (non-Javadoc) 21 * 22 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 23 */ 24 @Override 25 protected SingleVersionLoader getSingleLoader() { 26 return new AUDIDataLoader(); 27 } 14 28 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/AbstractFolderLoader.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 10 24 11 25 /** 12 * Abstract class for loading data from a folder. The subfolders of a defined 13 * folder define the projects, the file contained in the subfolder are the 14 * versions of a project. 26 * Abstract class for loading data from a folder. The subfolders of a defined folder define the 27 * projects, the file contained in the subfolder are the versions of a project. 15 28 * 16 29 * @author Steffen Herbold … … 18 31 public abstract class AbstractFolderLoader implements IVersionLoader { 19 32 20 /**21 * Path of the data.22 */23 protected String path = "";33 /** 34 * Path of the data. 35 */ 36 protected String path = ""; 24 37 25 /**26 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#setLocation(java.lang.String)27 */28 @Override29 public void setLocation(String location) {30 path = location;31 }38 /** 39 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#setLocation(java.lang.String) 40 */ 41 @Override 42 public void setLocation(String location) { 43 path = location; 44 } 32 45 33 /**34 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#load()35 */36 @Override37 public List<SoftwareVersion> load() {38 final List<SoftwareVersion> versions = new LinkedList<SoftwareVersion>();46 /** 47 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#load() 48 */ 49 @Override 50 public List<SoftwareVersion> load() { 51 final List<SoftwareVersion> versions = new LinkedList<SoftwareVersion>(); 39 52 40 final File dataDir = new File(path);41 final SingleVersionLoader instancesLoader = getSingleLoader();53 final File dataDir = new File(path); 54 final SingleVersionLoader instancesLoader = getSingleLoader(); 42 55 43 for (File projectDir : dataDir.listFiles()) { 44 if (projectDir.isDirectory()) { 45 String projectName = projectDir.getName(); 46 for (File versionFile : projectDir.listFiles()) { 47 if (versionFile.isFile() 48 && instancesLoader.filenameFilter(versionFile 49 .getName())) { 50 String versionName = versionFile.getName(); 51 Instances data = instancesLoader.load(versionFile); 52 versions.add(new SoftwareVersion(projectName, 53 versionName, data)); 54 } 55 } 56 } 57 } 58 return versions; 59 } 56 for (File projectDir : dataDir.listFiles()) { 57 if (projectDir.isDirectory()) { 58 String projectName = projectDir.getName(); 59 for (File versionFile : projectDir.listFiles()) { 60 if (versionFile.isFile() && 61 instancesLoader.filenameFilter(versionFile.getName())) 62 { 63 String versionName = versionFile.getName(); 64 Instances data = instancesLoader.load(versionFile); 65 versions.add(new SoftwareVersion(projectName, versionName, data)); 66 } 67 } 68 } 69 } 70 return versions; 71 } 60 72 61 /** 62 * Returns the concrete {@link SingleVersionLoader} to be used with this 63 * folder loader. 
64 * 65 * @return 66 */ 67 abstract protected SingleVersionLoader getSingleLoader(); 73 /** 74 * Returns the concrete {@link SingleVersionLoader} to be used with this folder loader. 75 * 76 * @return 77 */ 78 abstract protected SingleVersionLoader getSingleLoader(); 68 79 } -
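Note: the diff above shows the template-method pattern that every folder loader in this changeset follows: AbstractFolderLoader walks the project subfolders and delegates per-file parsing to a SingleVersionLoader. A minimal sketch of a new loader pair, assuming the CrossPare classes shown above and Weka are on the classpath; MyDataLoader, MyFolderLoader, and the ".mydata" extension are hypothetical.

package de.ugoe.cs.cpdp.loader;

import java.io.File;
import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

class MyDataLoader implements SingleVersionLoader {

    @Override
    public Instances load(File file) {
        // build a data set with a single metric and the nominal "bug" class
        final ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("loc"));
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        atts.add(classAtt);

        final Instances data = new Instances(file.getName(), atts, 0);
        data.setClass(classAtt);
        data.add(new DenseInstance(1.0, new double[] { 42.0, 0 })); // dummy instance
        return data;
    }

    @Override
    public boolean filenameFilter(String filename) {
        return filename.endsWith(".mydata"); // hypothetical file extension
    }
}

public class MyFolderLoader extends AbstractFolderLoader {

    @Override
    protected SingleVersionLoader getSingleLoader() {
        return new MyDataLoader();
    }
}

Such a loader is then used like the existing ones: setLocation(...) points it at the data folder, and load() returns one SoftwareVersion per matching file, with the subfolder name as project name and the file name as version name.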
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVDataLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 11 25 12 26 /** 13 * Loads the instances for a software version from a CSV file of the PROMISE 14 * data set mined byJurezko and Madeyski.27 * Loads the instances for a software version from a CSV file of the PROMISE data set mined by 28 * Jurezko and Madeyski. 15 29 * 16 30 * @author Steffen Herbold … … 18 32 class CSVDataLoader implements SingleVersionLoader { 19 33 20 /* 21 * (non-Javadoc) 22 * 23 * @see 24 * de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( 25 * java.io.File) 26 */ 27 @Override 28 public Instances load(File file) { 29 final String[] lines; 30 try { 31 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 32 } catch (IOException e) { 33 throw new RuntimeException(e); 34 } 34 /* 35 * (non-Javadoc) 36 * 37 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( java.io.File) 38 */ 39 @Override 40 public Instances load(File file) { 41 final String[] lines; 42 try { 43 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 44 } 45 catch (IOException e) { 46 throw new RuntimeException(e); 47 } 35 48 36 // configure Instances37 final ArrayList<Attribute> atts = new ArrayList<Attribute>();49 // configure Instances 50 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 38 51 39 String[] lineSplit = lines[0].split(",");40 for (int j = 0; j < lineSplit.length - 4; j++) {41 atts.add(new Attribute(lineSplit[j + 3]));42 }43 final ArrayList<String> classAttVals = new ArrayList<String>();44 classAttVals.add("0");45 classAttVals.add("1");46 final Attribute classAtt = new Attribute("bug", classAttVals);47 atts.add(classAtt);52 String[] lineSplit = lines[0].split(","); 53 for (int j = 0; j < lineSplit.length - 4; j++) { 54 atts.add(new Attribute(lineSplit[j + 3])); 55 } 56 final ArrayList<String> classAttVals = new ArrayList<String>(); 57 classAttVals.add("0"); 58 classAttVals.add("1"); 59 final Attribute classAtt = new Attribute("bug", classAttVals); 60 atts.add(classAtt); 48 61 49 final Instances data = new Instances(file.getName(), atts, 0);50 data.setClass(classAtt);62 final Instances data = new Instances(file.getName(), atts, 0); 63 data.setClass(classAtt); 51 64 52 // fetch data 53 for (int i = 1; i < lines.length; i++) { 54 lineSplit = lines[i].split(","); 55 double[] values = new double[lineSplit.length - 3]; 56 for (int j = 0; j < values.length - 1; j++) { 57 values[j] = Double.parseDouble(lineSplit[j + 3].trim()); 58 } 59 values[values.length - 1] = lineSplit[lineSplit.length - 1].trim() 60 .equals("0") ? 
0 : 1; 61 data.add(new DenseInstance(1.0, values)); 62 } 65 // fetch data 66 for (int i = 1; i < lines.length; i++) { 67 lineSplit = lines[i].split(","); 68 double[] values = new double[lineSplit.length - 3]; 69 for (int j = 0; j < values.length - 1; j++) { 70 values[j] = Double.parseDouble(lineSplit[j + 3].trim()); 71 } 72 values[values.length - 1] = lineSplit[lineSplit.length - 1].trim().equals("0") ? 0 : 1; 73 data.add(new DenseInstance(1.0, values)); 74 } 63 75 64 return data;65 }76 return data; 77 } 66 78 67 /*68 * (non-Javadoc)69 *70 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#71 * filenameFilter(java.lang.String)72 */73 @Override74 public boolean filenameFilter(String filename) {75 return filename.endsWith(".csv");76 }79 /* 80 * (non-Javadoc) 81 * 82 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 83 * filenameFilter(java.lang.String) 84 */ 85 @Override 86 public boolean filenameFilter(String filename) { 87 return filename.endsWith(".csv"); 88 } 77 89 78 90 } -
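Note: the column arithmetic of CSVDataLoader is easiest to verify against a concrete row. This self-contained sketch uses a made-up PROMISE-style header and data row (column names are illustrative): the first three columns are skipped, the metrics follow, and a non-zero bug count in the last column becomes the class label 1.

public class PromiseCsvRowSketch {
    public static void main(String[] args) {
        String header = "name,version,classname,wmc,dit,noc,bug"; // illustrative header
        String line = "ant,1.7,org.example.Foo,12,3,0,2";         // illustrative data row

        String[] head = header.split(",");
        String[] lineSplit = line.split(",");

        // one value per metric attribute plus the class attribute
        double[] values = new double[lineSplit.length - 3];
        for (int j = 0; j < values.length - 1; j++) {
            // attribute head[j + 3] gets the value from the same column of the row
            values[j] = Double.parseDouble(lineSplit[j + 3].trim());
        }
        // any non-zero bug count is treated as defective
        values[values.length - 1] = lineSplit[lineSplit.length - 1].trim().equals("0") ? 0 : 1;

        System.out.println(java.util.Arrays.toString(values)); // [12.0, 3.0, 0.0, 1.0]
    }
}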
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVFolderLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 3 17 /** 4 * Implements the {@link AbstractFolderLoader} for data from the PROMISE 5 * repository mined by Jurezkoand Madeyski.18 * Implements the {@link AbstractFolderLoader} for data from the PROMISE repository mined by Jurezko 19 * and Madeyski. 6 20 * 7 21 * @author Steffen Herbold … … 9 23 public class CSVFolderLoader extends AbstractFolderLoader { 10 24 11 /*12 * (non-Javadoc)13 *14 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader()15 */16 @Override17 protected SingleVersionLoader getSingleLoader() {18 return new CSVDataLoader();19 }25 /* 26 * (non-Javadoc) 27 * 28 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 29 */ 30 @Override 31 protected SingleVersionLoader getSingleLoader() { 32 return new CSVDataLoader(); 33 } 20 34 21 35 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVMockusDataLoader.java
r29 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 10 24 import de.ugoe.cs.util.FileTools; 11 25 12 13 26 class CSVMockusDataLoader implements SingleVersionLoader { 14 27 15 @Override 16 public Instances load(File file) { 17 final String[] lines; 18 try { 19 20 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 21 } catch (IOException e) { 22 throw new RuntimeException(e); 23 } 24 25 26 // configure Instances 27 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 28 @Override 29 public Instances load(File file) { 30 final String[] lines; 31 try { 28 32 29 String[] lineSplit = lines[0].split(","); 30 for (int j = 0; j < lineSplit.length - 3; j++) { 31 atts.add(new Attribute(lineSplit[j + 2])); 32 } 33 34 final ArrayList<String> classAttVals = new ArrayList<String>(); 35 classAttVals.add("0"); 36 classAttVals.add("1"); 37 final Attribute classAtt = new Attribute("bug", classAttVals); 38 atts.add(classAtt); 33 lines = FileTools.getLinesFromFile(file.getAbsolutePath()); 34 } 35 catch (IOException e) { 36 throw new RuntimeException(e); 37 } 39 38 40 final Instances data = new Instances(file.getName(), atts, 0); 41 data.setClass(classAtt);39 // configure Instances 40 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 42 41 43 // fetch data 44 for (int i = 1; i < lines.length; i++) { 45 lineSplit = lines[i].split(","); 46 double[] values = new double[lineSplit.length - 2]; 47 for (int j = 0; j < values.length - 1; j++) { 48 values[j] = Double.parseDouble(lineSplit[j + 2].trim()); 49 } 50 values[values.length - 1] = lineSplit[lineSplit.length - 1].trim() 51 .equals("0") ? 0 : 1; 52 data.add(new DenseInstance(1.0, values)); 53 } 42 String[] lineSplit = lines[0].split(","); 43 for (int j = 0; j < lineSplit.length - 3; j++) { 44 atts.add(new Attribute(lineSplit[j + 2])); 45 } 54 46 55 return data; 56 } 47 final ArrayList<String> classAttVals = new ArrayList<String>(); 48 classAttVals.add("0"); 49 classAttVals.add("1"); 50 final Attribute classAtt = new Attribute("bug", classAttVals); 51 atts.add(classAtt); 57 52 58 @Override 59 public boolean filenameFilter(String filename) { 60 return filename.endsWith(".csv"); 61 } 53 final Instances data = new Instances(file.getName(), atts, 0); 54 data.setClass(classAtt); 55 56 // fetch data 57 for (int i = 1; i < lines.length; i++) { 58 lineSplit = lines[i].split(","); 59 double[] values = new double[lineSplit.length - 2]; 60 for (int j = 0; j < values.length - 1; j++) { 61 values[j] = Double.parseDouble(lineSplit[j + 2].trim()); 62 } 63 values[values.length - 1] = lineSplit[lineSplit.length - 1].trim().equals("0") ? 0 : 1; 64 data.add(new DenseInstance(1.0, values)); 65 } 66 67 return data; 68 } 69 70 @Override 71 public boolean filenameFilter(String filename) { 72 return filename.endsWith(".csv"); 73 } 62 74 63 75 } 64 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/CSVMockusFolderLoader.java
r28 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 3 17 public class CSVMockusFolderLoader extends AbstractFolderLoader { 4 18 5 @Override6 protected SingleVersionLoader getSingleLoader() {7 return new CSVMockusDataLoader();8 }19 @Override 20 protected SingleVersionLoader getSingleLoader() { 21 return new CSVMockusDataLoader(); 22 } 9 23 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/DecentDataLoader.java
r36 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 38 52 39 53 /** 40 * Class for loading a decent model file. 41 * Loads a decent model file and (if no arff file is present) and does the 42 * following conversions: 43 * DECENT -> ARFFX -> ARFF 54 * Class for loading a decent model file. Loads a decent model file and (if no arff file is present) 55 * and does the following conversions: DECENT -> ARFFX -> ARFF 44 56 * 45 57 * @author Fabian Trautsch 46 * 58 * 47 59 */ 48 public class DecentDataLoader implements SingleVersionLoader{ 49 50 // Model Handler for Decent Models 51 private DECENTEpsilonModelHandler modelHandler = new DECENTEpsilonModelHandler(); 52 53 // Set log level 54 String logLevel = "1"; 55 String logToFile = "false"; 56 57 // This list contains attributes, that should be removed before building the arff file 58 private static List<String> attributeFilter = new LinkedList<String>(); 59 60 // This list contains all names of the different artifacts 61 private static Set<String> artifactNames = new LinkedHashSet<String>(); 62 63 // Name of the class attribute. 64 private static final String classAttributeName = "LABEL.Artifact.Target.BugFix.AverageWeight"; 65 66 67 private int getIndexOfArtifactName(String artifactName) { 68 int index = -1; 69 if(artifactNames.contains(artifactName)) { 70 int i=0; 71 for(String nameInSet: artifactNames) { 72 if(nameInSet.equals(artifactName)) { 73 index = i; 74 } else { 75 i++; 76 } 77 } 78 } 79 80 return index; 81 } 82 83 /** 84 * Defines attributes, that should be removed before building the 85 * ARFF File from. 
86 */ 87 private void setAttributeFilter() { 88 attributeFilter.add("Agent.Name"); 89 90 } 91 92 /** 93 * Saves the dataset as arff after transformation (decent->arffx) and 94 * filtering 95 * 96 * @param dataSet the WEKA dataset to save 97 * @param arffLocation location where it should be saved to 98 */ 99 public void save(Instances dataSet, String arffLocation) { 100 101 102 ArffSaver saver = new ArffSaver(); 103 saver.setInstances(dataSet); 104 try { 105 saver.setFile(new File(arffLocation)); 106 saver.writeBatch(); 107 } catch (IOException e) { 108 Console.printerrln("Cannot save the file to path: "+arffLocation); 109 e.printStackTrace(); 110 } 111 } 112 113 114 /** 115 * Loads the given decent file and tranform it from decent->arffx->arff 116 * @return Instances in WEKA format 117 */ 118 @Override 119 public Instances load(File file) { 120 121 // Set attributeFilter 122 setAttributeFilter(); 123 124 // Register MetaModels 125 try { 126 registerMetaModels(); 127 } catch (Exception e1) { 128 Console.printerrln("Metamodels cannot be registered!"); 129 e1.printStackTrace(); 130 } 131 132 // Set location of decent and arffx Model 133 String decentModelLocation = file.getAbsolutePath(); 134 String pathToDecentModelFolder = decentModelLocation.substring(0,decentModelLocation.lastIndexOf(File.separator)); 135 String arffxModelLocation = pathToDecentModelFolder+"/model.arffx"; 136 String logModelLocation = pathToDecentModelFolder+"/model.log"; 137 String arffLocation = pathToDecentModelFolder+"/model.arff"; 138 139 // If arff File exists, load from it! 140 if(new File(arffLocation).exists()) { 141 System.out.println("Loading arff File..."); 142 BufferedReader reader; 143 Instances data = null; 144 try { 145 reader = new BufferedReader(new FileReader(arffLocation)); 146 data = new Instances(reader); 147 reader.close(); 148 } catch (FileNotFoundException e) { 149 Console.printerrln("File with path: "+arffLocation+" was not found."); 150 e.printStackTrace(); 151 } catch (IOException e) { 152 Console.printerrln("File with path: "+arffLocation+" cannot be read."); 153 e.printStackTrace(); 154 } 155 156 // Set class attribute if not set 157 if(data.classIndex() == -1) { 158 Attribute classAttribute = data.attribute(classAttributeName); 159 data.setClass(classAttribute); 160 } 161 162 163 return data; 164 } 165 166 // Location of EOL Scripts 167 String preprocess = "./decent/epsilon/query/preprocess.eol"; 168 String arffxToArffSource = "./decent/epsilon/query/addLabels.eol"; 169 170 // Set Log Properties 171 System.setProperty("epsilon.logLevel", logLevel); 172 System.setProperty("epsilon.logToFile", logToFile); 173 System.setProperty("epsilon.logFileAvailable", "false"); 174 175 // Set decent2arffx Properties 176 System.setProperty("epsilon.transformation.decent2arffx.skipSource", "false"); 177 System.setProperty("epsilon.transformation.decent2arffx.type", "code"); 178 179 180 181 // Preprocess Data, transform from decent2arffx 182 try { 183 IEolExecutableModule preProcessModule = loadModule(preprocess); 184 IModel preProcessDecentModel = modelHandler.getDECENTModel(decentModelLocation, true, true); 185 IModel preProcessArffxarffxModel = modelHandler.getARFFxModel(arffxModelLocation, false, true); 186 preProcessModule.getContext().getModelRepository().addModel(preProcessDecentModel); 187 preProcessModule.getContext().getModelRepository().addModel(preProcessArffxarffxModel); 188 execute(preProcessModule, logModelLocation); 189 preProcessDecentModel.dispose(); 190 preProcessArffxarffxModel.dispose(); 
191 preProcessModule.reset(); 192 } catch (URISyntaxException e) { 193 Console.printerrln("URI Syntax for decent or arffx model is wrong."); 194 e.printStackTrace(); 195 } catch (Exception e) { 196 e.printStackTrace(); 197 } 198 199 200 201 202 // Transform to arff, for label and confidence attributes 203 try { 204 IEolExecutableModule arffxToArffModule = loadModule(arffxToArffSource); 205 IModel arffxToArffArffxModel = modelHandler.getARFFxModel(arffxModelLocation, true, true); 206 arffxToArffModule.getContext().getModelRepository().addModel(arffxToArffArffxModel); 207 execute(arffxToArffModule, logModelLocation); 208 arffxToArffArffxModel.dispose(); 209 // can be stored and retained alternatively 210 arffxToArffModule.reset(); 211 } catch (URISyntaxException e) { 212 Console.printerrln("URI Syntax for arffx model is wrong."); 213 e.printStackTrace(); 214 } catch (Exception e) { 215 e.printStackTrace(); 216 } 217 218 // Unregister MetaModels, otherwise cast will fail 219 HashMap<String, Object> metaModelCache = new HashMap<>(); 220 for (String key : EPackage.Registry.INSTANCE.keySet()) { 221 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 222 }; 223 224 for (String key : metaModelCache .keySet()) { 225 EPackage.Registry.INSTANCE.remove(key); 226 }; 227 228 229 // Workaround to gernerate a usable URI. Absolute path is not 230 // possible, therefore we need to construct a relative path 231 232 URL location = DecentDataLoader.class.getProtectionDomain().getCodeSource().getLocation(); 233 String basePath = location.getFile(); 234 235 // Location is the bin folder, so we need to delete the last 4 characters 236 basePath = basePath.substring(0, basePath.length() - 4); 237 String relativePath = new File(basePath).toURI().relativize(new File(arffxModelLocation).toURI()).getPath(); 238 239 // Loard arffx file and create WEKA Instances 240 ARFFxResourceTool tool = new ARFFxResourceTool(); 241 Resource resource = tool.loadResourceFromXMI(relativePath, "arffx"); 242 243 Instances dataSet = null; 244 for(EObject o: resource.getContents()) { 245 Model m = (Model) o; 246 dataSet = createWekaDataFormat(m); 247 248 for(Instance i : m.getData()) { 249 createWekaInstance(dataSet, i); 250 } 251 } 252 253 // Set class attribute 254 Attribute classAttribute = dataSet.attribute(classAttributeName); 255 dataSet.setClass(classAttribute); 256 257 // Save as ARFF 258 save(dataSet, arffLocation); 259 260 return dataSet; 261 262 } 263 264 265 /** 266 * Creates a WekaInstance from an ARFFX Model Instance 267 * 268 * @param dataSet WekaInstance dataset, where the arffx model instances should be 269 * added to 270 * @param i arffx model instance 271 */ 272 private void createWekaInstance(Instances dataSet, Instance i) { 273 double[] values = new double[dataSet.numAttributes()]; 274 int j=0; 275 276 for(Value value : i.getValues()) { 277 String dataValue = value.getContent(); 278 String attributeName = value.getOfAttribute().getName(); 279 280 if(attributeFilter.contains(attributeName)) { 281 continue; 282 } 283 284 // Is value a LABEL.* attribute? 285 if(isLabel(attributeName)) { 286 values[j] = dataSet.attribute(j).indexOfValue(dataValue); 287 } else if (isConfidenceLabel(attributeName)){ 288 // Is value a CONFIDENCE.* attribute? 289 values[j] = dataSet.attribute(j).indexOfValue(dataValue); 290 } else if(attributeName.equals("Artifact.Name")){ 291 // Is it the name of the artifact? 
292 artifactNames.add(dataValue); 293 values[j] = getIndexOfArtifactName(dataValue); 294 } else { 295 // Is it a numeric value? 296 values[j] = Double.parseDouble(dataValue); 297 } 298 299 j++; 300 } 301 302 DenseInstance inst = new DenseInstance(1.0, values); 303 dataSet.add(inst); 304 } 305 306 /** 307 * Creates a Weka Instances set out of a arffx model 308 * @param m arffx model 309 * @return 310 */ 311 private Instances createWekaDataFormat(Model m) { 312 313 // Bad solution, can be enhanced (continue in for loop) 314 ArrayList<Attribute> datasetAttributes = new ArrayList<Attribute>(); 315 for(de.ugoe.cs.cpdp.decentApp.models.arffx.Attribute attribute :m.getAttributes()) { 316 String attributeName = attribute.getName(); 317 318 if(attributeFilter.contains(attributeName)) { 319 continue; 320 } 321 322 Attribute wekaAttr; 323 324 // Is attribute a LABEL.* attribute? 325 if(isLabel(attributeName)) { 326 // Classattribute 327 final ArrayList<String> classAttVals = new ArrayList<String>(); 328 classAttVals.add("false"); 329 classAttVals.add("true"); 330 wekaAttr = new Attribute(attributeName, classAttVals); 331 } else if(isConfidenceLabel(attributeName)){ 332 // Is attribute a CONFIDENCE.* attribute? 333 ArrayList<String> labels = new ArrayList<String>(); 334 labels.add("high"); 335 labels.add("low"); 336 wekaAttr = new Attribute(attributeName, labels); 337 } else { 338 // Is it a numeric attribute? 339 wekaAttr = new Attribute(attributeName); 340 } 341 342 datasetAttributes.add(wekaAttr); 343 } 344 345 346 return new Instances("test-dataset", datasetAttributes, 0); 347 } 348 349 /** 350 * Helper methods which indicates if the given value starts with "LABEL" 351 * 352 * @param value to test 353 * @return 354 */ 355 private boolean isLabel(String value) { 356 if(value.length()>= 5 && value.substring(0, 5).equals("LABEL")) { 357 return true; 358 } 359 360 return false; 361 } 362 363 /** 364 * Helper method which indicates if the given value starts with "CONFIDENCE" 365 * @param value to test 366 * @return 367 */ 368 private boolean isConfidenceLabel(String value) { 369 if(value.length()>= 10 && value.substring(0, 10).equals("CONFIDENCE")) { 370 return true; 371 } 372 373 return false; 374 } 375 376 377 /** 378 * Returns if a filename ends with ".decent" 379 * @return 380 */ 381 @Override 382 public boolean filenameFilter(String filename) { 383 return filename.endsWith(".decent"); 384 } 385 386 /** 387 * Helper method for executing a eol scripts and adding the log model beforehand 388 * @param module module to execute 389 * @param logModelLocation location of the log model 390 * @throws Exception 391 */ 392 private void execute(IEolExecutableModule module, String logModelLocation) 393 throws Exception { 394 IModel logModel = modelHandler.getLOGModel(logModelLocation, true, true); 395 module.getContext().getModelRepository().addModel(logModel); 396 module.execute(); 397 logModel.dispose(); 398 } 399 400 /** 401 * Loads the module from a given source 402 * 403 * @param source where the module is (e.g. 
eol script) 404 * @return 405 * @throws Exception 406 * @throws URISyntaxException 407 */ 408 private IEolExecutableModule loadModule(String source) throws Exception, 409 URISyntaxException { 410 411 IEolExecutableModule module = null; 412 if (source.endsWith("etl")) { 413 module = new EtlModule(); 414 } else if (source.endsWith("eol")) { 415 module = new EolModule(); 416 } else { 417 418 } 419 420 module.parse(modelHandler.getFile(source)); 421 422 if (module.getParseProblems().size() > 0) { 423 Console.printerrln("Parse error occured..."); 424 for (ParseProblem problem : module.getParseProblems()) { 425 System.err.println(problem.toString()); 426 } 427 // System.exit(-1); 428 } 429 430 return module; 431 } 432 433 /** 434 * Helper method for registering the metamodels 435 * @throws Exception 436 */ 437 private void registerMetaModels() throws Exception { 438 String metaModelsPath = DECENTEpsilonModelHandler.metaPath; 439 File metaModelsLocation = new File(metaModelsPath); 440 for (File file : metaModelsLocation.listFiles()) { 441 if (file.getName().endsWith(".ecore")) { 442 EmfUtil.register(URI.createFileURI(file.getAbsolutePath()), EPackage.Registry.INSTANCE); 443 } 444 } 445 } 446 60 public class DecentDataLoader implements SingleVersionLoader { 61 62 // Model Handler for Decent Models 63 private DECENTEpsilonModelHandler modelHandler = new DECENTEpsilonModelHandler(); 64 65 // Set log level 66 String logLevel = "1"; 67 String logToFile = "false"; 68 69 // This list contains attributes, that should be removed before building the arff file 70 private static List<String> attributeFilter = new LinkedList<String>(); 71 72 // This list contains all names of the different artifacts 73 private static Set<String> artifactNames = new LinkedHashSet<String>(); 74 75 // Name of the class attribute. 76 private static final String classAttributeName = "LABEL.Artifact.Target.BugFix.AverageWeight"; 77 78 private int getIndexOfArtifactName(String artifactName) { 79 int index = -1; 80 if (artifactNames.contains(artifactName)) { 81 int i = 0; 82 for (String nameInSet : artifactNames) { 83 if (nameInSet.equals(artifactName)) { 84 index = i; 85 } 86 else { 87 i++; 88 } 89 } 90 } 91 92 return index; 93 } 94 95 /** 96 * Defines attributes, that should be removed before building the ARFF File from. 
97 */ 98 private void setAttributeFilter() { 99 attributeFilter.add("Agent.Name"); 100 101 } 102 103 /** 104 * Saves the dataset as arff after transformation (decent->arffx) and filtering 105 * 106 * @param dataSet 107 * the WEKA dataset to save 108 * @param arffLocation 109 * location where it should be saved to 110 */ 111 public void save(Instances dataSet, String arffLocation) { 112 113 ArffSaver saver = new ArffSaver(); 114 saver.setInstances(dataSet); 115 try { 116 saver.setFile(new File(arffLocation)); 117 saver.writeBatch(); 118 } 119 catch (IOException e) { 120 Console.printerrln("Cannot save the file to path: " + arffLocation); 121 e.printStackTrace(); 122 } 123 } 124 125 /** 126 * Loads the given decent file and tranform it from decent->arffx->arff 127 * 128 * @return Instances in WEKA format 129 */ 130 @Override 131 public Instances load(File file) { 132 133 // Set attributeFilter 134 setAttributeFilter(); 135 136 // Register MetaModels 137 try { 138 registerMetaModels(); 139 } 140 catch (Exception e1) { 141 Console.printerrln("Metamodels cannot be registered!"); 142 e1.printStackTrace(); 143 } 144 145 // Set location of decent and arffx Model 146 String decentModelLocation = file.getAbsolutePath(); 147 String pathToDecentModelFolder = 148 decentModelLocation.substring(0, decentModelLocation.lastIndexOf(File.separator)); 149 String arffxModelLocation = pathToDecentModelFolder + "/model.arffx"; 150 String logModelLocation = pathToDecentModelFolder + "/model.log"; 151 String arffLocation = pathToDecentModelFolder + "/model.arff"; 152 153 // If arff File exists, load from it! 154 if (new File(arffLocation).exists()) { 155 System.out.println("Loading arff File..."); 156 BufferedReader reader; 157 Instances data = null; 158 try { 159 reader = new BufferedReader(new FileReader(arffLocation)); 160 data = new Instances(reader); 161 reader.close(); 162 } 163 catch (FileNotFoundException e) { 164 Console.printerrln("File with path: " + arffLocation + " was not found."); 165 e.printStackTrace(); 166 } 167 catch (IOException e) { 168 Console.printerrln("File with path: " + arffLocation + " cannot be read."); 169 e.printStackTrace(); 170 } 171 172 // Set class attribute if not set 173 if (data.classIndex() == -1) { 174 Attribute classAttribute = data.attribute(classAttributeName); 175 data.setClass(classAttribute); 176 } 177 178 return data; 179 } 180 181 // Location of EOL Scripts 182 String preprocess = "./decent/epsilon/query/preprocess.eol"; 183 String arffxToArffSource = "./decent/epsilon/query/addLabels.eol"; 184 185 // Set Log Properties 186 System.setProperty("epsilon.logLevel", logLevel); 187 System.setProperty("epsilon.logToFile", logToFile); 188 System.setProperty("epsilon.logFileAvailable", "false"); 189 190 // Set decent2arffx Properties 191 System.setProperty("epsilon.transformation.decent2arffx.skipSource", "false"); 192 System.setProperty("epsilon.transformation.decent2arffx.type", "code"); 193 194 // Preprocess Data, transform from decent2arffx 195 try { 196 IEolExecutableModule preProcessModule = loadModule(preprocess); 197 IModel preProcessDecentModel = 198 modelHandler.getDECENTModel(decentModelLocation, true, true); 199 IModel preProcessArffxarffxModel = 200 modelHandler.getARFFxModel(arffxModelLocation, false, true); 201 preProcessModule.getContext().getModelRepository().addModel(preProcessDecentModel); 202 preProcessModule.getContext().getModelRepository().addModel(preProcessArffxarffxModel); 203 execute(preProcessModule, logModelLocation); 204 
preProcessDecentModel.dispose(); 205 preProcessArffxarffxModel.dispose(); 206 preProcessModule.reset(); 207 } 208 catch (URISyntaxException e) { 209 Console.printerrln("URI Syntax for decent or arffx model is wrong."); 210 e.printStackTrace(); 211 } 212 catch (Exception e) { 213 e.printStackTrace(); 214 } 215 216 // Transform to arff, for label and confidence attributes 217 try { 218 IEolExecutableModule arffxToArffModule = loadModule(arffxToArffSource); 219 IModel arffxToArffArffxModel = 220 modelHandler.getARFFxModel(arffxModelLocation, true, true); 221 arffxToArffModule.getContext().getModelRepository().addModel(arffxToArffArffxModel); 222 execute(arffxToArffModule, logModelLocation); 223 arffxToArffArffxModel.dispose(); 224 // can be stored and retained alternatively 225 arffxToArffModule.reset(); 226 } 227 catch (URISyntaxException e) { 228 Console.printerrln("URI Syntax for arffx model is wrong."); 229 e.printStackTrace(); 230 } 231 catch (Exception e) { 232 e.printStackTrace(); 233 } 234 235 // Unregister MetaModels, otherwise cast will fail 236 HashMap<String, Object> metaModelCache = new HashMap<>(); 237 for (String key : EPackage.Registry.INSTANCE.keySet()) { 238 metaModelCache.put(key, EPackage.Registry.INSTANCE.get(key)); 239 }; 240 241 for (String key : metaModelCache.keySet()) { 242 EPackage.Registry.INSTANCE.remove(key); 243 }; 244 245 // Workaround to gernerate a usable URI. Absolute path is not 246 // possible, therefore we need to construct a relative path 247 248 URL location = DecentDataLoader.class.getProtectionDomain().getCodeSource().getLocation(); 249 String basePath = location.getFile(); 250 251 // Location is the bin folder, so we need to delete the last 4 characters 252 basePath = basePath.substring(0, basePath.length() - 4); 253 String relativePath = 254 new File(basePath).toURI().relativize(new File(arffxModelLocation).toURI()).getPath(); 255 256 // Loard arffx file and create WEKA Instances 257 ARFFxResourceTool tool = new ARFFxResourceTool(); 258 Resource resource = tool.loadResourceFromXMI(relativePath, "arffx"); 259 260 Instances dataSet = null; 261 for (EObject o : resource.getContents()) { 262 Model m = (Model) o; 263 dataSet = createWekaDataFormat(m); 264 265 for (Instance i : m.getData()) { 266 createWekaInstance(dataSet, i); 267 } 268 } 269 270 // Set class attribute 271 Attribute classAttribute = dataSet.attribute(classAttributeName); 272 dataSet.setClass(classAttribute); 273 274 // Save as ARFF 275 save(dataSet, arffLocation); 276 277 return dataSet; 278 279 } 280 281 /** 282 * Creates a WekaInstance from an ARFFX Model Instance 283 * 284 * @param dataSet 285 * WekaInstance dataset, where the arffx model instances should be added to 286 * @param i 287 * arffx model instance 288 */ 289 private void createWekaInstance(Instances dataSet, Instance i) { 290 double[] values = new double[dataSet.numAttributes()]; 291 int j = 0; 292 293 for (Value value : i.getValues()) { 294 String dataValue = value.getContent(); 295 String attributeName = value.getOfAttribute().getName(); 296 297 if (attributeFilter.contains(attributeName)) { 298 continue; 299 } 300 301 // Is value a LABEL.* attribute? 302 if (isLabel(attributeName)) { 303 values[j] = dataSet.attribute(j).indexOfValue(dataValue); 304 } 305 else if (isConfidenceLabel(attributeName)) { 306 // Is value a CONFIDENCE.* attribute? 307 values[j] = dataSet.attribute(j).indexOfValue(dataValue); 308 } 309 else if (attributeName.equals("Artifact.Name")) { 310 // Is it the name of the artifact? 
311 artifactNames.add(dataValue); 312 values[j] = getIndexOfArtifactName(dataValue); 313 } 314 else { 315 // Is it a numeric value? 316 values[j] = Double.parseDouble(dataValue); 317 } 318 319 j++; 320 } 321 322 DenseInstance inst = new DenseInstance(1.0, values); 323 dataSet.add(inst); 324 } 325 326 /** 327 * Creates a Weka Instances set out of a arffx model 328 * 329 * @param m 330 * arffx model 331 * @return 332 */ 333 private Instances createWekaDataFormat(Model m) { 334 335 // Bad solution, can be enhanced (continue in for loop) 336 ArrayList<Attribute> datasetAttributes = new ArrayList<Attribute>(); 337 for (de.ugoe.cs.cpdp.decentApp.models.arffx.Attribute attribute : m.getAttributes()) { 338 String attributeName = attribute.getName(); 339 340 if (attributeFilter.contains(attributeName)) { 341 continue; 342 } 343 344 Attribute wekaAttr; 345 346 // Is attribute a LABEL.* attribute? 347 if (isLabel(attributeName)) { 348 // Classattribute 349 final ArrayList<String> classAttVals = new ArrayList<String>(); 350 classAttVals.add("false"); 351 classAttVals.add("true"); 352 wekaAttr = new Attribute(attributeName, classAttVals); 353 } 354 else if (isConfidenceLabel(attributeName)) { 355 // Is attribute a CONFIDENCE.* attribute? 356 ArrayList<String> labels = new ArrayList<String>(); 357 labels.add("high"); 358 labels.add("low"); 359 wekaAttr = new Attribute(attributeName, labels); 360 } 361 else { 362 // Is it a numeric attribute? 363 wekaAttr = new Attribute(attributeName); 364 } 365 366 datasetAttributes.add(wekaAttr); 367 } 368 369 return new Instances("test-dataset", datasetAttributes, 0); 370 } 371 372 /** 373 * Helper methods which indicates if the given value starts with "LABEL" 374 * 375 * @param value 376 * to test 377 * @return 378 */ 379 private boolean isLabel(String value) { 380 if (value.length() >= 5 && value.substring(0, 5).equals("LABEL")) { 381 return true; 382 } 383 384 return false; 385 } 386 387 /** 388 * Helper method which indicates if the given value starts with "CONFIDENCE" 389 * 390 * @param value 391 * to test 392 * @return 393 */ 394 private boolean isConfidenceLabel(String value) { 395 if (value.length() >= 10 && value.substring(0, 10).equals("CONFIDENCE")) { 396 return true; 397 } 398 399 return false; 400 } 401 402 /** 403 * Returns if a filename ends with ".decent" 404 * 405 * @return 406 */ 407 @Override 408 public boolean filenameFilter(String filename) { 409 return filename.endsWith(".decent"); 410 } 411 412 /** 413 * Helper method for executing a eol scripts and adding the log model beforehand 414 * 415 * @param module 416 * module to execute 417 * @param logModelLocation 418 * location of the log model 419 * @throws Exception 420 */ 421 private void execute(IEolExecutableModule module, String logModelLocation) throws Exception { 422 IModel logModel = modelHandler.getLOGModel(logModelLocation, true, true); 423 module.getContext().getModelRepository().addModel(logModel); 424 module.execute(); 425 logModel.dispose(); 426 } 427 428 /** 429 * Loads the module from a given source 430 * 431 * @param source 432 * where the module is (e.g. 
eol script) 433 * @return 434 * @throws Exception 435 * @throws URISyntaxException 436 */ 437 private IEolExecutableModule loadModule(String source) throws Exception, URISyntaxException { 438 439 IEolExecutableModule module = null; 440 if (source.endsWith("etl")) { 441 module = new EtlModule(); 442 } 443 else if (source.endsWith("eol")) { 444 module = new EolModule(); 445 } 446 else { 447 448 } 449 450 module.parse(modelHandler.getFile(source)); 451 452 if (module.getParseProblems().size() > 0) { 453 Console.printerrln("Parse error occurred..."); 454 for (ParseProblem problem : module.getParseProblems()) { 455 System.err.println(problem.toString()); 456 } 457 // System.exit(-1); 458 } 459 460 return module; 461 } 462 463 /** 464 * Helper method for registering the metamodels 465 * 466 * @throws Exception 467 */ 468 private void registerMetaModels() throws Exception { 469 String metaModelsPath = DECENTEpsilonModelHandler.metaPath; 470 File metaModelsLocation = new File(metaModelsPath); 471 for (File file : metaModelsLocation.listFiles()) { 472 if (file.getName().endsWith(".ecore")) { 473 EmfUtil.register(URI.createFileURI(file.getAbsolutePath()), 474 EPackage.Registry.INSTANCE); 475 } 476 } 477 } 478 447 479 } -
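For reference, a minimal and self-contained sketch of the Weka pattern used by createWekaDataFormat() and createWekaInstance() above: build the attribute list (nominal values for LABEL.*/CONFIDENCE.* attributes, numeric otherwise), create an empty Instances container, and append DenseInstance rows in which nominal entries are stored as value indices. The attribute names and values here are illustrative only; in DecentDataLoader they come from the ARFFx model.

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class ArffxToWekaSketch {
    public static void main(String[] args) {
        // nominal class values, same order as in createWekaDataFormat()
        ArrayList<String> classVals = new ArrayList<String>();
        classVals.add("false");
        classVals.add("true");

        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("SomeMetric"));               // numeric attribute (illustrative name)
        attributes.add(new Attribute("LABEL.BugFix", classVals));  // nominal label attribute

        Instances dataSet = new Instances("test-dataset", attributes, 0);
        dataSet.setClass(dataSet.attribute("LABEL.BugFix"));

        // one row: a numeric value plus the index of the nominal value
        double[] values = new double[dataSet.numAttributes()];
        values[0] = 42.0;
        values[1] = dataSet.attribute(1).indexOfValue("true");
        dataSet.add(new DenseInstance(1.0, values));

        System.out.println(dataSet);
    }
}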
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/DecentFolderLoader.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 16 30 public class DecentFolderLoader extends AbstractFolderLoader { 17 31 18 /* 19 * (non-Javadoc) 20 * 21 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 22 */ 23 @Override 24 protected SingleVersionLoader getSingleLoader() { 25 return new DecentDataLoader(); 26 } 27 28 /** 29 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#load() 30 */ 31 @Override 32 public List<SoftwareVersion> load() { 33 final List<SoftwareVersion> versions = new LinkedList<SoftwareVersion>(); 32 /* 33 * (non-Javadoc) 34 * 35 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 36 */ 37 @Override 38 protected SingleVersionLoader getSingleLoader() { 39 return new DecentDataLoader(); 40 } 34 41 35 final File dataDir = new File(path); 36 final SingleVersionLoader instancesLoader = getSingleLoader(); 42 /** 43 * @see de.ugoe.cs.cpdp.loader.IVersionLoader#load() 44 */ 45 @Override 46 public List<SoftwareVersion> load() { 47 final List<SoftwareVersion> versions = new LinkedList<SoftwareVersion>(); 37 48 38 String projectName = dataDir.getName(); 39 40 41 /* 42 * The following lines make it possible, that we can have two different possibilities 43 * to load data: 44 * 1) From one project (e.g. /decent/input/project1) 45 * 2) From more than one project (e.g. /decent/input/) 46 * 47 * Requirement is, that we have a folder structure like this: 48 * "/decent/input/project1/model.decent, /decent/input/project2/model.decent, ..." 49 * 50 * In the first one the "else" is executed, therefore it will just search the folder "project1" 51 * for a "model.decent" file. In the second one, it will look into each folder and searches for 52 * "model.decent" files. 53 */ 54 for (File projectDir : dataDir.listFiles()) { 55 if (projectDir.isDirectory()) { 56 projectName = projectDir.getName(); 57 for (File versionFile : projectDir.listFiles()) { 58 loadDataFromFile(versionFile,instancesLoader, projectName, versions); 59 } 60 } else { 61 loadDataFromFile(projectDir, instancesLoader, projectName, versions); 62 } 63 } 64 return versions; 65 } 66 67 /** 68 * Loads data from a file and adds the instances from the load method to the 69 * versions List. 
70 * 71 * @param versionFile file to load from 72 * @param instancesLoader loader that should be used 73 * @param projectName name of the project which was loaded 74 * @param versions list, where the weka instances are added to 75 */ 76 77 private void loadDataFromFile(File versionFile, 78 SingleVersionLoader instancesLoader, String projectName, List<SoftwareVersion> versions) { 79 if (versionFile.isFile() 80 && instancesLoader.filenameFilter(versionFile 81 .getName())) { 82 String versionName = versionFile.getName(); 83 Instances data = instancesLoader.load(versionFile); 84 versions.add(new SoftwareVersion(projectName, 85 versionName, data)); 86 } 87 } 49 final File dataDir = new File(path); 50 final SingleVersionLoader instancesLoader = getSingleLoader(); 51 52 String projectName = dataDir.getName(); 53 54 /* 55 * The following lines make it possible, that we can have two different possibilities to 56 * load data: 1) From one project (e.g. /decent/input/project1) 2) From more than one 57 * project (e.g. /decent/input/) 58 * 59 * Requirement is, that we have a folder structure like this: 60 * "/decent/input/project1/model.decent, /decent/input/project2/model.decent, ..." 61 * 62 * In the first one the "else" is executed, therefore it will just search the folder 63 * "project1" for a "model.decent" file. In the second one, it will look into each folder 64 * and searches for "model.decent" files. 65 */ 66 for (File projectDir : dataDir.listFiles()) { 67 if (projectDir.isDirectory()) { 68 projectName = projectDir.getName(); 69 for (File versionFile : projectDir.listFiles()) { 70 loadDataFromFile(versionFile, instancesLoader, projectName, versions); 71 } 72 } 73 else { 74 loadDataFromFile(projectDir, instancesLoader, projectName, versions); 75 } 76 } 77 return versions; 78 } 79 80 /** 81 * Loads data from a file and adds the instances from the load method to the versions List. 82 * 83 * @param versionFile 84 * file to load from 85 * @param instancesLoader 86 * loader that should be used 87 * @param projectName 88 * name of the project which was loaded 89 * @param versions 90 * list, where the weka instances are added to 91 */ 92 93 private void loadDataFromFile(File versionFile, 94 SingleVersionLoader instancesLoader, 95 String projectName, 96 List<SoftwareVersion> versions) 97 { 98 if (versionFile.isFile() && instancesLoader.filenameFilter(versionFile.getName())) { 99 String versionName = versionFile.getName(); 100 Instances data = instancesLoader.load(versionFile); 101 versions.add(new SoftwareVersion(projectName, versionName, data)); 102 } 103 } 88 104 89 105 } -
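A short usage sketch for the folder loader above. The location is a placeholder path, and the call sequence assumes that DecentFolderLoader inherits setLocation(String) from AbstractFolderLoader, as declared in IVersionLoader. It covers both supported layouts: a single project folder containing a model.decent file, or a parent folder with one sub-folder per project.

import java.util.List;
import de.ugoe.cs.cpdp.loader.DecentFolderLoader;
import de.ugoe.cs.cpdp.versions.SoftwareVersion;

public class DecentFolderLoaderSketch {
    public static void main(String[] args) {
        DecentFolderLoader loader = new DecentFolderLoader();

        // layout 1: /decent/input/project1/model.decent (single project)
        // layout 2: /decent/input/project1/model.decent, /decent/input/project2/model.decent, ...
        loader.setLocation("/decent/input");

        List<SoftwareVersion> versions = loader.load();
        System.out.println("loaded " + versions.size() + " software versions");
    }
}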
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/IDecentVersionLoader.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 5 19 import de.ugoe.cs.cpdp.versions.SoftwareVersion; 6 20 7 public interface IDecentVersionLoader extends IVersionLoader {8 9 public List<SoftwareVersion> load(List<String> decentAttributes);21 public interface IDecentVersionLoader extends IVersionLoader { 22 23 public List<SoftwareVersion> load(List<String> decentAttributes); 10 24 11 25 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/IVersionLoader.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 12 26 public interface IVersionLoader { 13 27 14 /**15 * Sets the location of the data.16 *17 * @param location18 * location of the data19 */20 public void setLocation(String location);28 /** 29 * Sets the location of the data. 30 * 31 * @param location 32 * location of the data 33 */ 34 public void setLocation(String location); 21 35 22 /**23 * Loads the data.24 *25 * @return the data26 */27 public List<SoftwareVersion> load();36 /** 37 * Loads the data. 38 * 39 * @return the data 40 */ 41 public List<SoftwareVersion> load(); 28 42 29 43 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/NasaARFFFolderLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 3 17 /** 4 * Implements the {@link AbstractFolderLoader} for the NASA/SOFTLAB/MDP data 5 * set. 18 * Implements the {@link AbstractFolderLoader} for the NASA/SOFTLAB/MDP data set. 6 19 * 7 20 * @author Steffen Herbold … … 9 22 public class NasaARFFFolderLoader extends AbstractFolderLoader { 10 23 11 /*12 * (non-Javadoc)13 *14 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader()15 */16 @Override17 protected SingleVersionLoader getSingleLoader() {18 return new NasaARFFLoader();19 }24 /* 25 * (non-Javadoc) 26 * 27 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader#getSingleLoader() 28 */ 29 @Override 30 protected SingleVersionLoader getSingleLoader() { 31 return new NasaARFFLoader(); 32 } 20 33 21 34 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/NasaARFFLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 16 30 17 31 /** 18 * Loads the instances for a software version from an ARFF file of the 19 * NASA/SOFTLAB/MDP data. 32 * Loads the instances for a software version from an ARFF file of the NASA/SOFTLAB/MDP data. 20 33 * 21 34 * @author Steffen Herbold … … 23 36 public class NasaARFFLoader implements SingleVersionLoader { 24 37 25 /**26 * used to map attributes the same attribute with different names to each 27 * other 28 */ 29 Map<String, String> attributeNameMap; 30 31 /** 32 * used to ensure that the attribute order is the same after loading 33 */ 34 List<String> attributeOrder; 35 36 /** 37 * Constructor. Creates a new NasaARFFLoader. 38 */ 39 public NasaARFFLoader() { 40 attributeNameMap = new HashMap<>(); 41 42 // Map entries for ar project 43 attributeNameMap.put("total_loc", "LOC_TOTAL");44 attributeNameMap.put("comment_loc", "LOC_COMMENTS");45 attributeNameMap.put("code_and_comment_loc", "LOC_CODE_AND_COMMENT");46 attributeNameMap.put("executable_loc", "LOC_EXECUTABLE");47 attributeNameMap.put("unique_operands", "NUM_UNIQUE_OPERANDS");48 attributeNameMap.put("unique_operators", "NUM_UNIQUE_OPERATORS");49 attributeNameMap.put("total_operands", "NUM_OPERANDS");50 attributeNameMap.put("total_operators", "NUM_OPERATORS");51 attributeNameMap.put("halstead_length", "HALSTEAD_LENGTH");52 attributeNameMap.put("halstead_volume", "HALSTEAD_VOLUME");53 attributeNameMap.put("halstead_difficulty", "HALSTEAD_DIFFICULTY");54 attributeNameMap.put("halstead_effort", "HALSTEAD_EFFORT");55 attributeNameMap.put("halstead_error", "HALSTEAD_ERROR_EST");56 attributeNameMap.put("halstead_time", "HALSTEAD_PROG_TIME");57 attributeNameMap.put("branch_count", "BRANCH_COUNT");58 attributeNameMap.put("cyclomatic_complexity", "CYCLOMATIC_COMPLEXITY");59 attributeNameMap.put("design_complexity", "DESIGN_COMPLEXITY"); 60 61 // Map entries for KC2 62 attributeNameMap.put("loc", "LOC_TOTAL");63 attributeNameMap.put("lOCode", "LOC_EXECUTABLE");64 attributeNameMap.put("lOComment", "LOC_COMMENTS");65 attributeNameMap.put("lOCodeAndComment", "LOC_CODE_AND_COMMENT");66 attributeNameMap.put("uniq_Op", "NUM_UNIQUE_OPERATORS");67 attributeNameMap.put("uniq_Opnd", "NUM_UNIQUE_OPERANDS");68 attributeNameMap.put("total_Op", "NUM_OPERATORS");69 attributeNameMap.put("total_Opnd", "NUM_OPERANDS");70 attributeNameMap.put("v", "HALSTEAD_VOLUME");71 attributeNameMap.put("l", "HALSTEAD_LENGTH");72 attributeNameMap.put("d", "HALSTEAD_DIFFICULTY");73 attributeNameMap.put("e", "HALSTEAD_EFFORT");74 attributeNameMap.put("b", "HALSTEAD_ERROR_EST");75 attributeNameMap.put("t", "HALSTEAD_PROG_TIME");76 attributeNameMap.put("branchCount", "BRANCH_COUNT");77 attributeNameMap.put("v(g)", "CYCLOMATIC_COMPLEXITY");78 attributeNameMap.put("iv(g)", "DESIGN_COMPLEXITY"); 79 80 attributeNameMap.put("defects", "bug");81 attributeNameMap.put("Defective", 
"bug");82 attributeNameMap.put("problems", "bug");83 attributeNameMap.put("label", "bug"); 84 85 // build list with normalized attribute order 86 attributeOrder = new LinkedList<>(); 87 88 attributeOrder.add("LOC_TOTAL");89 attributeOrder.add("LOC_EXECUTABLE");90 attributeOrder.add("LOC_COMMENTS");91 attributeOrder.add("LOC_CODE_AND_COMMENT");92 attributeOrder.add("NUM_UNIQUE_OPERATORS");93 attributeOrder.add("NUM_UNIQUE_OPERANDS");94 attributeOrder.add("NUM_OPERATORS");95 attributeOrder.add("NUM_OPERANDS");96 attributeOrder.add("HALSTEAD_VOLUME");97 attributeOrder.add("HALSTEAD_LENGTH");98 attributeOrder.add("HALSTEAD_DIFFICULTY");99 attributeOrder.add("HALSTEAD_EFFORT");100 attributeOrder.add("HALSTEAD_ERROR_EST");101 attributeOrder.add("HALSTEAD_PROG_TIME");102 attributeOrder.add("BRANCH_COUNT");103 attributeOrder.add("CYCLOMATIC_COMPLEXITY");104 attributeOrder.add("DESIGN_COMPLEXITY");105 attributeOrder.add("bug"); 106 } 107 108 /* 109 * (non-Javadoc) 110 * 111 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 112 */ 113 @Override 114 public Instances load(File file) { 115 BufferedReader reader;116 Instances data; 117 try { 118 reader = new BufferedReader(new FileReader(file));119 data = new Instances(reader);120 reader.close(); 121 }catch (IOException e) {122 throw new RuntimeException("Error reading data", e);123 }124 125 // setting class attribute126 data.setClassIndex(data.numAttributes() - 1);127 128 // normalize attribute names129 for (int i = 0; i < data.numAttributes(); i++) {130 String mapValue = attributeNameMap.get(data.attribute(i).name());131 if (mapValue != null) {132 data.renameAttribute(i, mapValue);133 }134 }135 136 // determine new attribute order (unwanted attributes are implicitly137 // removed138 String orderString = "";139 for (String attName : attributeOrder) {140 for (int i = 0; i < data.numAttributes(); i++) {141 if (attName.equals(data.attribute(i).name())) {142 orderString += (i + 1) + ",";143 }144 }145 }146 orderString = orderString.substring(0, orderString.length() - 1);147 148 String relationName = data.relationName();149 String[] options = new String[2];150 options[0] = "-R";151 options[1] = orderString;152 Reorder reorder = new Reorder();153 try {154 reorder.setOptions(options);155 reorder.setInputFormat(data);156 data = Filter.useFilter(data, reorder);157 } catch (Exception e) { 158 throw new RuntimeException("Error while reordering the data", e); 159 } 160 if (data.numAttributes() != attributeOrder.size()) { 161 throw new RuntimeException( 162 "Invalid number of attributes; filename: " + file.getName());163 }164 165 // normalize bug nominal values166 Add add = new Add();167 add.setAttributeIndex("last");168 add.setNominalLabels("0,1");169 add.setAttributeName("bug-new");170 try {171 add.setInputFormat(data);172 data = Filter.useFilter(data, add);173 } catch (Exception e) { 174 throw new RuntimeException( 175 "Error while normalizing the bug nonminal values", e);176 }177 data.setRelationName(relationName);178 179 double classValue;180 181 String firstValue = data.classAttribute().enumerateValues() 182 .nextElement().toString(); 183 if (firstValue.equals("Y") || firstValue.equals("yes") 184 || firstValue.equals("true")) { 185 classValue = 0.0; 186 } else { 187 classValue = 1.0; 188 } 189 190 for (int i = 0; i < data.numInstances(); i++) {191 if (data.instance(i).classValue() == classValue) { 192 data.instance(i).setValue(data.classIndex() + 1, 1.0); 193 }else {194 data.instance(i).setValue(data.classIndex() + 1, 0.0);195 }196 }197 198 
int oldClassIndex = data.classIndex();199 data.setClassIndex(oldClassIndex + 1);200 data.deleteAttributeAt(oldClassIndex);201 202 return data;203 }204 205 /*206 * (non-Javadoc)207 *208 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#209 * filenameFilter(java.lang.String)210 */211 @Override212 public boolean filenameFilter(String filename) {213 return filename.endsWith(".arff");214 }38 /** 39 * used to map attributes the same attribute with different names to each other 40 */ 41 Map<String, String> attributeNameMap; 42 43 /** 44 * used to ensure that the attribute order is the same after loading 45 */ 46 List<String> attributeOrder; 47 48 /** 49 * Constructor. Creates a new NasaARFFLoader. 50 */ 51 public NasaARFFLoader() { 52 attributeNameMap = new HashMap<>(); 53 54 // Map entries for ar project 55 attributeNameMap.put("total_loc", "LOC_TOTAL"); 56 attributeNameMap.put("comment_loc", "LOC_COMMENTS"); 57 attributeNameMap.put("code_and_comment_loc", "LOC_CODE_AND_COMMENT"); 58 attributeNameMap.put("executable_loc", "LOC_EXECUTABLE"); 59 attributeNameMap.put("unique_operands", "NUM_UNIQUE_OPERANDS"); 60 attributeNameMap.put("unique_operators", "NUM_UNIQUE_OPERATORS"); 61 attributeNameMap.put("total_operands", "NUM_OPERANDS"); 62 attributeNameMap.put("total_operators", "NUM_OPERATORS"); 63 attributeNameMap.put("halstead_length", "HALSTEAD_LENGTH"); 64 attributeNameMap.put("halstead_volume", "HALSTEAD_VOLUME"); 65 attributeNameMap.put("halstead_difficulty", "HALSTEAD_DIFFICULTY"); 66 attributeNameMap.put("halstead_effort", "HALSTEAD_EFFORT"); 67 attributeNameMap.put("halstead_error", "HALSTEAD_ERROR_EST"); 68 attributeNameMap.put("halstead_time", "HALSTEAD_PROG_TIME"); 69 attributeNameMap.put("branch_count", "BRANCH_COUNT"); 70 attributeNameMap.put("cyclomatic_complexity", "CYCLOMATIC_COMPLEXITY"); 71 attributeNameMap.put("design_complexity", "DESIGN_COMPLEXITY"); 72 73 // Map entries for KC2 74 attributeNameMap.put("loc", "LOC_TOTAL"); 75 attributeNameMap.put("lOCode", "LOC_EXECUTABLE"); 76 attributeNameMap.put("lOComment", "LOC_COMMENTS"); 77 attributeNameMap.put("lOCodeAndComment", "LOC_CODE_AND_COMMENT"); 78 attributeNameMap.put("uniq_Op", "NUM_UNIQUE_OPERATORS"); 79 attributeNameMap.put("uniq_Opnd", "NUM_UNIQUE_OPERANDS"); 80 attributeNameMap.put("total_Op", "NUM_OPERATORS"); 81 attributeNameMap.put("total_Opnd", "NUM_OPERANDS"); 82 attributeNameMap.put("v", "HALSTEAD_VOLUME"); 83 attributeNameMap.put("l", "HALSTEAD_LENGTH"); 84 attributeNameMap.put("d", "HALSTEAD_DIFFICULTY"); 85 attributeNameMap.put("e", "HALSTEAD_EFFORT"); 86 attributeNameMap.put("b", "HALSTEAD_ERROR_EST"); 87 attributeNameMap.put("t", "HALSTEAD_PROG_TIME"); 88 attributeNameMap.put("branchCount", "BRANCH_COUNT"); 89 attributeNameMap.put("v(g)", "CYCLOMATIC_COMPLEXITY"); 90 attributeNameMap.put("iv(g)", "DESIGN_COMPLEXITY"); 91 92 attributeNameMap.put("defects", "bug"); 93 attributeNameMap.put("Defective", "bug"); 94 attributeNameMap.put("problems", "bug"); 95 attributeNameMap.put("label", "bug"); 96 97 // build list with normalized attribute order 98 attributeOrder = new LinkedList<>(); 99 100 attributeOrder.add("LOC_TOTAL"); 101 attributeOrder.add("LOC_EXECUTABLE"); 102 attributeOrder.add("LOC_COMMENTS"); 103 attributeOrder.add("LOC_CODE_AND_COMMENT"); 104 attributeOrder.add("NUM_UNIQUE_OPERATORS"); 105 attributeOrder.add("NUM_UNIQUE_OPERANDS"); 106 attributeOrder.add("NUM_OPERATORS"); 107 attributeOrder.add("NUM_OPERANDS"); 108 attributeOrder.add("HALSTEAD_VOLUME"); 109 
attributeOrder.add("HALSTEAD_LENGTH"); 110 attributeOrder.add("HALSTEAD_DIFFICULTY"); 111 attributeOrder.add("HALSTEAD_EFFORT"); 112 attributeOrder.add("HALSTEAD_ERROR_EST"); 113 attributeOrder.add("HALSTEAD_PROG_TIME"); 114 attributeOrder.add("BRANCH_COUNT"); 115 attributeOrder.add("CYCLOMATIC_COMPLEXITY"); 116 attributeOrder.add("DESIGN_COMPLEXITY"); 117 attributeOrder.add("bug"); 118 } 119 120 /* 121 * (non-Javadoc) 122 * 123 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 124 */ 125 @Override 126 public Instances load(File file) { 127 BufferedReader reader; 128 Instances data; 129 try { 130 reader = new BufferedReader(new FileReader(file)); 131 data = new Instances(reader); 132 reader.close(); 133 } 134 catch (IOException e) { 135 throw new RuntimeException("Error reading data", e); 136 } 137 138 // setting class attribute 139 data.setClassIndex(data.numAttributes() - 1); 140 141 // normalize attribute names 142 for (int i = 0; i < data.numAttributes(); i++) { 143 String mapValue = attributeNameMap.get(data.attribute(i).name()); 144 if (mapValue != null) { 145 data.renameAttribute(i, mapValue); 146 } 147 } 148 149 // determine new attribute order (unwanted attributes are implicitly 150 // removed 151 String orderString = ""; 152 for (String attName : attributeOrder) { 153 for (int i = 0; i < data.numAttributes(); i++) { 154 if (attName.equals(data.attribute(i).name())) { 155 orderString += (i + 1) + ","; 156 } 157 } 158 } 159 orderString = orderString.substring(0, orderString.length() - 1); 160 161 String relationName = data.relationName(); 162 String[] options = new String[2]; 163 options[0] = "-R"; 164 options[1] = orderString; 165 Reorder reorder = new Reorder(); 166 try { 167 reorder.setOptions(options); 168 reorder.setInputFormat(data); 169 data = Filter.useFilter(data, reorder); 170 } 171 catch (Exception e) { 172 throw new RuntimeException("Error while reordering the data", e); 173 } 174 if (data.numAttributes() != attributeOrder.size()) { 175 throw new RuntimeException("Invalid number of attributes; filename: " + file.getName()); 176 } 177 178 // normalize bug nominal values 179 Add add = new Add(); 180 add.setAttributeIndex("last"); 181 add.setNominalLabels("0,1"); 182 add.setAttributeName("bug-new"); 183 try { 184 add.setInputFormat(data); 185 data = Filter.useFilter(data, add); 186 } 187 catch (Exception e) { 188 throw new RuntimeException("Error while normalizing the bug nonminal values", e); 189 } 190 data.setRelationName(relationName); 191 192 double classValue; 193 194 String firstValue = data.classAttribute().enumerateValues().nextElement().toString(); 195 if (firstValue.equals("Y") || firstValue.equals("yes") || firstValue.equals("true")) { 196 classValue = 0.0; 197 } 198 else { 199 classValue = 1.0; 200 } 201 202 for (int i = 0; i < data.numInstances(); i++) { 203 if (data.instance(i).classValue() == classValue) { 204 data.instance(i).setValue(data.classIndex() + 1, 1.0); 205 } 206 else { 207 data.instance(i).setValue(data.classIndex() + 1, 0.0); 208 } 209 } 210 211 int oldClassIndex = data.classIndex(); 212 data.setClassIndex(oldClassIndex + 1); 213 data.deleteAttributeAt(oldClassIndex); 214 215 return data; 216 } 217 218 /* 219 * (non-Javadoc) 220 * 221 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 222 * filenameFilter(java.lang.String) 223 */ 224 @Override 225 public boolean filenameFilter(String filename) { 226 return filename.endsWith(".arff"); 227 } 215 228 216 229 } -
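The renaming and reordering logic in NasaARFFLoader is built on two standard Weka filters, Reorder and Add. The following stand-alone sketch shows the same filter calls on a toy data set, which may make the 1-based "-R" index string that the loader assembles easier to follow; attribute names and indices are illustrative.

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Add;
import weka.filters.unsupervised.attribute.Reorder;

public class NasaFilterSketch {
    public static void main(String[] args) throws Exception {
        // toy data set with three numeric attributes in the "wrong" order
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("LOC_EXECUTABLE"));
        atts.add(new Attribute("LOC_TOTAL"));
        atts.add(new Attribute("BRANCH_COUNT"));
        Instances data = new Instances("toy", atts, 0);

        // Reorder: 1-based index list; attributes left out of the list are dropped
        Reorder reorder = new Reorder();
        reorder.setOptions(new String[] { "-R", "2,1,3" });
        reorder.setInputFormat(data);
        data = Filter.useFilter(data, reorder);

        // Add: append a nominal attribute with labels 0,1, as done for "bug-new"
        Add add = new Add();
        add.setAttributeIndex("last");
        add.setNominalLabels("0,1");
        add.setAttributeName("bug-new");
        add.setInputFormat(data);
        data = Filter.useFilter(data, add);

        System.out.println(data); // LOC_TOTAL, LOC_EXECUTABLE, BRANCH_COUNT, bug-new
    }
}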
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/SingleVersionLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 6 20 7 21 /** 8 * Interface for version loaders, i.e., loading of a set of instances from a 9 * file 22 * Interface for version loaders, i.e., loading of a set of instances from a file 10 23 * 11 24 * @author Steffen Herbold … … 13 26 public interface SingleVersionLoader { 14 27 15 /**16 * Loads the instances.17 *18 * @param file19 * handle to the file of the instances20 * @return the instances21 */22 Instances load(File file);28 /** 29 * Loads the instances. 30 * 31 * @param file 32 * handle to the file of the instances 33 * @return the instances 34 */ 35 Instances load(File file); 23 36 24 /**25 * Defines a filter for the files to be loaded; only strings that end with 26 * the filter areconsidered.27 *28 * @param filename29 * string defining the filename filter30 * @return true if a filename shall be considered31 */32 boolean filenameFilter(String endsWith);37 /** 38 * Defines a filter for the files to be loaded; only strings that end with the filter are 39 * considered. 40 * 41 * @param filename 42 * string defining the filename filter 43 * @return true if a filename shall be considered 44 */ 45 boolean filenameFilter(String endsWith); 33 46 } -
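To illustrate the contract of this interface (filenameFilter gates which files the folder loaders pass to load), here is a hedged sketch of a minimal implementation based on a plain ARFF reader. It is not one of the loaders shipped with CrossPare and assumes the usual convention that the last attribute is the class attribute.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;

import weka.core.Instances;

import de.ugoe.cs.cpdp.loader.SingleVersionLoader;

public class PlainArffLoaderSketch implements SingleVersionLoader {

    @Override
    public Instances load(File file) {
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            Instances data = new Instances(reader);
            // convention: the last attribute is the class attribute
            data.setClassIndex(data.numAttributes() - 1);
            return data;
        }
        catch (Exception e) {
            throw new RuntimeException("error reading " + file.getName(), e);
        }
    }

    @Override
    public boolean filenameFilter(String filename) {
        return filename.endsWith(".arff");
    }
}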
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/FixClass.java
r31 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 14 28 * @author Steffen Herbold 15 29 */ 16 public class FixClass extends AbstractClassifier implements ITrainingStrategy, IWekaCompatibleTrainer { 30 public class FixClass extends AbstractClassifier implements ITrainingStrategy, 31 IWekaCompatibleTrainer 32 { 17 33 18 private static final long serialVersionUID = 1L;34 private static final long serialVersionUID = 1L; 19 35 20 private double fixedClassValue = 0.0d;36 private double fixedClassValue = 0.0d; 21 37 22 /**23 * Returns default capabilities of the classifier.24 *25 * @return the capabilities of this classifier26 */27 @Override28 public Capabilities getCapabilities() {29 Capabilities result = super.getCapabilities();30 result.disableAll();38 /** 39 * Returns default capabilities of the classifier. 40 * 41 * @return the capabilities of this classifier 42 */ 43 @Override 44 public Capabilities getCapabilities() { 45 Capabilities result = super.getCapabilities(); 46 result.disableAll(); 31 47 32 // attributes33 result.enable(Capability.NOMINAL_ATTRIBUTES);34 result.enable(Capability.NUMERIC_ATTRIBUTES);35 result.enable(Capability.DATE_ATTRIBUTES);36 result.enable(Capability.STRING_ATTRIBUTES);37 result.enable(Capability.RELATIONAL_ATTRIBUTES);38 result.enable(Capability.MISSING_VALUES);48 // attributes 49 result.enable(Capability.NOMINAL_ATTRIBUTES); 50 result.enable(Capability.NUMERIC_ATTRIBUTES); 51 result.enable(Capability.DATE_ATTRIBUTES); 52 result.enable(Capability.STRING_ATTRIBUTES); 53 result.enable(Capability.RELATIONAL_ATTRIBUTES); 54 result.enable(Capability.MISSING_VALUES); 39 55 40 // class41 result.enable(Capability.NOMINAL_CLASS);42 result.enable(Capability.NUMERIC_CLASS);43 result.enable(Capability.MISSING_CLASS_VALUES);56 // class 57 result.enable(Capability.NOMINAL_CLASS); 58 result.enable(Capability.NUMERIC_CLASS); 59 result.enable(Capability.MISSING_CLASS_VALUES); 44 60 45 // instances46 result.setMinimumNumberInstances(0);61 // instances 62 result.setMinimumNumberInstances(0); 47 63 48 return result;49 }64 return result; 65 } 50 66 51 @Override52 public void setOptions(String[] options) throws Exception {53 fixedClassValue = Double.parseDouble(Utils.getOption('C', options));54 }67 @Override 68 public void setOptions(String[] options) throws Exception { 69 fixedClassValue = Double.parseDouble(Utils.getOption('C', options)); 70 } 55 71 56 @Override57 public double classifyInstance(Instance instance) {58 return fixedClassValue;59 }72 @Override 73 public double classifyInstance(Instance instance) { 74 return fixedClassValue; 75 } 60 76 61 @Override62 public void buildClassifier(Instances traindata) throws Exception {63 // do nothing64 }77 @Override 78 public void buildClassifier(Instances traindata) throws Exception { 79 // do nothing 80 } 65 81 66 @Override 67 public void setParameter(String parameters) { 68 try { 69 
this.setOptions(parameters.split(" ")); 70 } catch (Exception e) { 71 e.printStackTrace(); 72 } 73 } 82 @Override 83 public void setParameter(String parameters) { 84 try { 85 this.setOptions(parameters.split(" ")); 86 } 87 catch (Exception e) { 88 e.printStackTrace(); 89 } 90 } 74 91 75 @Override76 public void apply(Instances traindata) {77 // do nothing!78 }92 @Override 93 public void apply(Instances traindata) { 94 // do nothing! 95 } 79 96 80 @Override81 public String getName() {82 return "FixClass";83 }97 @Override 98 public String getName() { 99 return "FixClass"; 100 } 84 101 85 @Override86 public Classifier getClassifier() {87 return this;88 }102 @Override 103 public Classifier getClassifier() { 104 return this; 105 } 89 106 90 107 } -
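FixClass is a trivial baseline: every instance is predicted as the class value passed via the -C option, and the training data is ignored. A short usage sketch with a synthetic two-attribute data set (names and values are illustrative):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;
import de.ugoe.cs.cpdp.training.FixClass;

public class FixClassSketch {
    public static void main(String[] args) throws Exception {
        // tiny synthetic data set: one numeric feature, nominal class {0,1}
        ArrayList<String> classVals = new ArrayList<String>();
        classVals.add("0");
        classVals.add("1");
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("metric"));
        atts.add(new Attribute("bug", classVals));
        Instances data = new Instances("toy", atts, 0);
        data.setClassIndex(1);
        data.add(new DenseInstance(1.0, new double[] { 3.0, 0.0 }));

        FixClass trainer = new FixClass();
        trainer.setParameter("-C 1.0"); // always predict class index 1, i.e. "defective"
        trainer.apply(data);            // no-op, FixClass ignores the training data

        // prints 1.0 regardless of the instance
        System.out.println(trainer.getClassifier().classifyInstance(data.instance(0)));
    }
}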
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ISetWiseTrainingStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 7 21 // Bagging Strategy: separate models for each training data set 8 22 public interface ISetWiseTrainingStrategy extends ITrainer { 9 10 void apply(SetUniqueList<Instances> traindataSet);11 12 String getName();23 24 void apply(SetUniqueList<Instances> traindataSet); 25 26 String getName(); 13 27 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ITrainer.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/ITrainingStrategy.java
r6 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 4 18 5 19 public interface ITrainingStrategy extends ITrainer { 6 7 void apply(Instances traindata);8 9 String getName();20 21 void apply(Instances traindata); 22 23 String getName(); 10 24 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/IWekaCompatibleTrainer.java
r24 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 4 18 5 19 public interface IWekaCompatibleTrainer extends ITrainer { 6 7 Classifier getClassifier();8 9 String getName();20 21 Classifier getClassifier(); 22 23 String getName(); 10 24 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/QuadTree.java
r23 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 12 26 * QuadTree implementation 13 27 * 14 * QuadTree gets a list of instances and then recursively split them into 4 childs 15 * For this it usesthe median of the 2 values x,y28 * QuadTree gets a list of instances and then recursively split them into 4 childs For this it uses 29 * the median of the 2 values x,y 16 30 */ 17 31 public class QuadTree { 18 19 /* 1 parent or null */ 20 private QuadTree parent = null; 21 22 /* 4 childs, 1 per quadrant */ 23 private QuadTree child_nw; 24 private QuadTree child_ne; 25 private QuadTree child_se; 26 private QuadTree child_sw; 27 28 /* list (only helps with generation of list of childs!) */ 29 private ArrayList<QuadTree> l = new ArrayList<QuadTree>(); 30 31 /* level only used for debugging */ 32 public int level = 0; 33 34 /* size of the quadrant */ 35 private double[] x; 36 private double[] y; 37 38 public static boolean verbose = false; 39 public static int size = 0; 40 public static double alpha = 0; 41 42 /* cluster payloads */ 43 public static ArrayList<ArrayList<QuadTreePayload<Instance>>> ccluster = new ArrayList<ArrayList<QuadTreePayload<Instance>>>(); 44 45 /* cluster sizes (index is cluster number, arraylist is list of boxes (x0,y0,x1,y1) */ 46 public static HashMap<Integer, ArrayList<Double[][]>> csize = new HashMap<Integer, ArrayList<Double[][]>>(); 47 48 /* payload of this instance */ 49 private ArrayList<QuadTreePayload<Instance>> payload; 50 51 52 public QuadTree(QuadTree parent, ArrayList<QuadTreePayload<Instance>> payload) { 53 this.parent = parent; 54 this.payload = payload; 55 } 56 57 58 public String toString() { 59 String n = ""; 60 if(this.parent == null) { 61 n += "rootnode "; 62 } 63 String level = new String(new char[this.level]).replace("\0", "-"); 64 n += level + " instances: " + this.getNumbers(); 65 return n; 66 } 67 68 /** 69 * Returns the payload, used for clustering 70 * in the clustering list we only have children with paylod 71 * 72 * @return payload 73 */ 74 public ArrayList<QuadTreePayload<Instance>> getPayload() { 75 return this.payload; 76 } 77 78 /** 79 * Calculate the density of this quadrant 80 * 81 * density = number of instances / global size (all instances) 82 * 83 * @return density 84 */ 85 public double getDensity() { 86 double dens = 0; 87 dens = (double)this.getNumbers() / QuadTree.size; 88 return dens; 89 } 90 91 public void setSize(double[] x, double[] y){ 92 this.x = x; 93 this.y = y; 94 } 95 96 public double[][] getSize() { 97 return new double[][] {this.x, this.y}; 98 } 99 100 public Double[][] getSizeDouble() { 101 Double[] tmpX = new Double[2]; 102 Double[] tmpY = new Double[2]; 103 104 tmpX[0] = this.x[0]; 105 tmpX[1] = this.x[1]; 106 107 tmpY[0] = this.y[0]; 108 tmpY[1] = this.y[1]; 109 110 return new Double[][] {tmpX, tmpY}; 111 } 112 113 /** 114 * TODO: DRY, median ist immer dasselbe 115 * 116 * 
@return median for x 117 */ 118 private double getMedianForX() { 119 double med_x =0 ; 120 121 Collections.sort(this.payload, new Comparator<QuadTreePayload<Instance>>() { 122 @Override 123 public int compare(QuadTreePayload<Instance> x1, QuadTreePayload<Instance> x2) { 124 return Double.compare(x1.x, x2.x); 125 } 126 }); 127 128 if(this.payload.size() % 2 == 0) { 129 int mid = this.payload.size() / 2; 130 med_x = (this.payload.get(mid).x + this.payload.get(mid+1).x) / 2; 131 }else { 132 int mid = this.payload.size() / 2; 133 med_x = this.payload.get(mid).x; 134 } 135 136 if(QuadTree.verbose) { 137 System.out.println("sorted:"); 138 for(int i = 0; i < this.payload.size(); i++) { 139 System.out.print(""+this.payload.get(i).x+","); 140 } 141 System.out.println("median x: " + med_x); 142 } 143 return med_x; 144 } 145 146 private double getMedianForY() { 147 double med_y =0 ; 148 149 Collections.sort(this.payload, new Comparator<QuadTreePayload<Instance>>() { 150 @Override 151 public int compare(QuadTreePayload<Instance> y1, QuadTreePayload<Instance> y2) { 152 return Double.compare(y1.y, y2.y); 153 } 154 }); 155 156 if(this.payload.size() % 2 == 0) { 157 int mid = this.payload.size() / 2; 158 med_y = (this.payload.get(mid).y + this.payload.get(mid+1).y) / 2; 159 }else { 160 int mid = this.payload.size() / 2; 161 med_y = this.payload.get(mid).y; 162 } 163 164 if(QuadTree.verbose) { 165 System.out.println("sorted:"); 166 for(int i = 0; i < this.payload.size(); i++) { 167 System.out.print(""+this.payload.get(i).y+","); 168 } 169 System.out.println("median y: " + med_y); 170 } 171 return med_y; 172 } 173 174 /** 175 * Reurns the number of instances in the payload 176 * 177 * @return int number of instances 178 */ 179 public int getNumbers() { 180 int number = 0; 181 if(this.payload != null) { 182 number = this.payload.size(); 183 } 184 return number; 185 } 186 187 /** 188 * Calculate median values of payload for x, y and split into 4 sectors 189 * 190 * @return Array of QuadTree nodes (4 childs) 191 * @throws Exception if we would run into an recursive loop 192 */ 193 public QuadTree[] split() throws Exception { 194 195 double medx = this.getMedianForX(); 196 double medy = this.getMedianForY(); 197 198 // Payload lists for each child 199 ArrayList<QuadTreePayload<Instance>> nw = new ArrayList<QuadTreePayload<Instance>>(); 200 ArrayList<QuadTreePayload<Instance>> sw = new ArrayList<QuadTreePayload<Instance>>(); 201 ArrayList<QuadTreePayload<Instance>> ne = new ArrayList<QuadTreePayload<Instance>>(); 202 ArrayList<QuadTreePayload<Instance>> se = new ArrayList<QuadTreePayload<Instance>>(); 203 204 // sort the payloads to new payloads 205 // here we have the problem that payloads with the same values are sorted 206 // into the same slots and it could happen that medx and medy = size_x[1] and size_y[1] 207 // in that case we would have an endless loop 208 for(int i=0; i < this.payload.size(); i++) { 209 210 QuadTreePayload<Instance> item = this.payload.get(i); 211 212 // north west 213 if(item.x <= medx && item.y >= medy) { 214 nw.add(item); 215 } 216 217 // south west 218 else if(item.x <= medx && item.y <= medy) { 219 sw.add(item); 220 } 221 222 // north east 223 else if(item.x >= medx && item.y >= medy) { 224 ne.add(item); 225 } 226 227 // south east 228 else if(item.x >= medx && item.y <= medy) { 229 se.add(item); 230 } 231 } 232 233 // if we assign one child a payload equal to our own (see problem above) 234 // we throw an exceptions which stops the recursion on this node 235 
if(nw.equals(this.payload)) { 236 throw new Exception("payload equal"); 237 } 238 if(sw.equals(this.payload)) { 239 throw new Exception("payload equal"); 240 } 241 if(ne.equals(this.payload)) { 242 throw new Exception("payload equal"); 243 } 244 if(se.equals(this.payload)) { 245 throw new Exception("payload equal"); 246 } 247 248 this.child_nw = new QuadTree(this, nw); 249 this.child_nw.setSize(new double[] {this.x[0], medx}, new double[] {medy, this.y[1]}); 250 this.child_nw.level = this.level + 1; 251 252 this.child_sw = new QuadTree(this, sw); 253 this.child_sw.setSize(new double[] {this.x[0], medx}, new double[] {this.y[0], medy}); 254 this.child_sw.level = this.level + 1; 255 256 this.child_ne = new QuadTree(this, ne); 257 this.child_ne.setSize(new double[] {medx, this.x[1]}, new double[] {medy, this.y[1]}); 258 this.child_ne.level = this.level + 1; 259 260 this.child_se = new QuadTree(this, se); 261 this.child_se.setSize(new double[] {medx, this.x[1]}, new double[] {this.y[0], medy}); 262 this.child_se.level = this.level + 1; 263 264 this.payload = null; 265 return new QuadTree[] {this.child_nw, this.child_ne, this.child_se, this.child_sw}; 266 } 267 268 /** 269 * TODO: static method 270 * 271 * @param q 272 */ 273 public void recursiveSplit(QuadTree q) { 274 if(QuadTree.verbose) { 275 System.out.println("splitting: "+ q); 276 } 277 if(q.getNumbers() < QuadTree.alpha) { 278 return; 279 }else{ 280 // exception is thrown if we would run into an endless loop (see comments in split()) 281 try { 282 QuadTree[] childs = q.split(); 283 this.recursiveSplit(childs[0]); 284 this.recursiveSplit(childs[1]); 285 this.recursiveSplit(childs[2]); 286 this.recursiveSplit(childs[3]); 287 }catch(Exception e) { 288 return; 289 } 290 } 291 } 292 293 /** 294 * returns an list of childs sorted by density 295 * 296 * @param q QuadTree 297 * @return list of QuadTrees 298 */ 299 private void generateList(QuadTree q) { 300 301 // we only have all childs or none at all 302 if(q.child_ne == null) { 303 this.l.add(q); 304 } 305 306 if(q.child_ne != null) { 307 this.generateList(q.child_ne); 308 } 309 if(q.child_nw != null) { 310 this.generateList(q.child_nw); 311 } 312 if(q.child_se != null) { 313 this.generateList(q.child_se); 314 } 315 if(q.child_sw != null) { 316 this.generateList(q.child_sw); 317 } 318 } 319 320 /** 321 * Checks if passed QuadTree is neighboring to us 322 * 323 * @param q QuadTree 324 * @return true if passed QuadTree is a neighbor 325 */ 326 public boolean isNeighbour(QuadTree q) { 327 boolean is_neighbour = false; 328 329 double[][] our_size = this.getSize(); 330 double[][] new_size = q.getSize(); 331 332 // X is i=0, Y is i=1 333 for(int i =0; i < 2; i++) { 334 // we are smaller than q 335 // -------------- q 336 // ------- we 337 if(our_size[i][0] >= new_size[i][0] && our_size[i][1] <= new_size[i][1]) { 338 is_neighbour = true; 339 } 340 // we overlap with q at some point 341 //a) ---------------q 342 // ----------- we 343 //b) --------- q 344 // --------- we 345 if((our_size[i][0] >= new_size[i][0] && our_size[i][0] <= new_size[i][1]) || 346 (our_size[i][1] >= new_size[i][0] && our_size[i][1] <= new_size[i][1])) { 347 is_neighbour = true; 348 } 349 // we are larger than q 350 // ---- q 351 // ---------- we 352 if(our_size[i][1] >= new_size[i][1] && our_size[i][0] <= new_size[i][0]) { 353 is_neighbour = true; 354 } 355 } 356 357 if(is_neighbour && QuadTree.verbose) { 358 System.out.println(this + " neighbour of: " + q); 359 } 360 361 return is_neighbour; 362 } 363 364 /** 365 * Perform 
pruning and clustering of the quadtree 366 * 367 * Pruning according to: 368 * Tim Menzies, Andrew Butcher, David Cok, Andrian Marcus, Lucas Layman, 369 * Forrest Shull, Burak Turhan, Thomas Zimmermann, 370 * "Local versus Global Lessons for Defect Prediction and Effort Estimation," 371 * IEEE Transactions on Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013 372 * 373 * 1) get list of leaf quadrants 374 * 2) sort by their density 375 * 3) set stop_rule to 0.5 * highest Density in the list 376 * 4) merge all nodes with a density > stop_rule to the new cluster and remove all from list 377 * 5) repeat 378 * 379 * @param q List of QuadTree (children only) 380 */ 381 public void gridClustering(ArrayList<QuadTree> list) { 382 383 if(list.size() == 0) { 384 return; 385 } 386 387 double stop_rule; 388 QuadTree biggest; 389 QuadTree current; 390 391 // current clusterlist 392 ArrayList<QuadTreePayload<Instance>> current_cluster; 393 394 // remove list (for removal of items after scanning of the list) 395 ArrayList<Integer> remove = new ArrayList<Integer>(); 396 397 // 1. find biggest, and add it 398 biggest = list.get(list.size()-1); 399 stop_rule = biggest.getDensity() * 0.5; 400 401 current_cluster = new ArrayList<QuadTreePayload<Instance>>(); 402 current_cluster.addAll(biggest.getPayload()); 403 404 // remove the biggest because we are starting with it 405 remove.add(list.size()-1); 406 407 ArrayList<Double[][]> tmpSize = new ArrayList<Double[][]>(); 408 tmpSize.add(biggest.getSizeDouble()); 409 410 // check the items for their density 411 for(int i=list.size()-1; i >= 0; i--) { 412 current = list.get(i); 413 414 // 2. find neighbors with correct density 415 // if density > stop_rule and is_neighbour add to cluster and remove from list 416 if(current.getDensity() > stop_rule && !current.equals(biggest) && current.isNeighbour(biggest)) { 417 current_cluster.addAll(current.getPayload()); 418 419 // add it to remove list (we cannot remove it inside the loop because it would move the index) 420 remove.add(i); 421 422 // get the size 423 tmpSize.add(current.getSizeDouble()); 424 } 425 } 426 427 // 3. remove our removal candidates from the list 428 for(Integer item: remove) { 429 list.remove((int)item); 430 } 431 432 // 4. add to cluster 433 QuadTree.ccluster.add(current_cluster); 434 435 // 5. 
add sizes of our current (biggest) this adds a number of sizes (all QuadTree Instances belonging to this cluster) 436 // we need that to classify test instances to a cluster later 437 Integer cnumber = new Integer(QuadTree.ccluster.size()-1); 438 if(QuadTree.csize.containsKey(cnumber) == false) { 439 QuadTree.csize.put(cnumber, tmpSize); 440 } 441 442 // repeat 443 this.gridClustering(list); 444 } 445 446 public void printInfo() { 447 System.out.println("we have " + ccluster.size() + " clusters"); 448 449 for(int i=0; i < ccluster.size(); i++) { 450 System.out.println("cluster: "+i+ " size: "+ ccluster.get(i).size()); 451 } 452 } 453 454 /** 455 * Helper Method to get a sorted list (by density) for all 456 * children 457 * 458 * @param q QuadTree 459 * @return Sorted ArrayList of quadtrees 460 */ 461 public ArrayList<QuadTree> getList(QuadTree q) { 462 this.generateList(q); 463 464 Collections.sort(this.l, new Comparator<QuadTree>() { 465 @Override 466 public int compare(QuadTree x1, QuadTree x2) { 467 return Double.compare(x1.getDensity(), x2.getDensity()); 468 } 469 }); 470 471 return this.l; 472 } 32 33 /* 1 parent or null */ 34 private QuadTree parent = null; 35 36 /* 4 childs, 1 per quadrant */ 37 private QuadTree child_nw; 38 private QuadTree child_ne; 39 private QuadTree child_se; 40 private QuadTree child_sw; 41 42 /* list (only helps with generation of list of childs!) */ 43 private ArrayList<QuadTree> l = new ArrayList<QuadTree>(); 44 45 /* level only used for debugging */ 46 public int level = 0; 47 48 /* size of the quadrant */ 49 private double[] x; 50 private double[] y; 51 52 public static boolean verbose = false; 53 public static int size = 0; 54 public static double alpha = 0; 55 56 /* cluster payloads */ 57 public static ArrayList<ArrayList<QuadTreePayload<Instance>>> ccluster = 58 new ArrayList<ArrayList<QuadTreePayload<Instance>>>(); 59 60 /* cluster sizes (index is cluster number, arraylist is list of boxes (x0,y0,x1,y1) */ 61 public static HashMap<Integer, ArrayList<Double[][]>> csize = 62 new HashMap<Integer, ArrayList<Double[][]>>(); 63 64 /* payload of this instance */ 65 private ArrayList<QuadTreePayload<Instance>> payload; 66 67 public QuadTree(QuadTree parent, ArrayList<QuadTreePayload<Instance>> payload) { 68 this.parent = parent; 69 this.payload = payload; 70 } 71 72 public String toString() { 73 String n = ""; 74 if (this.parent == null) { 75 n += "rootnode "; 76 } 77 String level = new String(new char[this.level]).replace("\0", "-"); 78 n += level + " instances: " + this.getNumbers(); 79 return n; 80 } 81 82 /** 83 * Returns the payload, used for clustering in the clustering list we only have children with 84 * paylod 85 * 86 * @return payload 87 */ 88 public ArrayList<QuadTreePayload<Instance>> getPayload() { 89 return this.payload; 90 } 91 92 /** 93 * Calculate the density of this quadrant 94 * 95 * density = number of instances / global size (all instances) 96 * 97 * @return density 98 */ 99 public double getDensity() { 100 double dens = 0; 101 dens = (double) this.getNumbers() / QuadTree.size; 102 return dens; 103 } 104 105 public void setSize(double[] x, double[] y) { 106 this.x = x; 107 this.y = y; 108 } 109 110 public double[][] getSize() { 111 return new double[][] 112 { this.x, this.y }; 113 } 114 115 public Double[][] getSizeDouble() { 116 Double[] tmpX = new Double[2]; 117 Double[] tmpY = new Double[2]; 118 119 tmpX[0] = this.x[0]; 120 tmpX[1] = this.x[1]; 121 122 tmpY[0] = this.y[0]; 123 tmpY[1] = this.y[1]; 124 125 return new Double[][] 126 { 
tmpX, tmpY }; 127 } 128 129 /** 130 * TODO: DRY, median ist immer dasselbe 131 * 132 * @return median for x 133 */ 134 private double getMedianForX() { 135 double med_x = 0; 136 137 Collections.sort(this.payload, new Comparator<QuadTreePayload<Instance>>() { 138 @Override 139 public int compare(QuadTreePayload<Instance> x1, QuadTreePayload<Instance> x2) { 140 return Double.compare(x1.x, x2.x); 141 } 142 }); 143 144 if (this.payload.size() % 2 == 0) { 145 int mid = this.payload.size() / 2; 146 med_x = (this.payload.get(mid).x + this.payload.get(mid + 1).x) / 2; 147 } 148 else { 149 int mid = this.payload.size() / 2; 150 med_x = this.payload.get(mid).x; 151 } 152 153 if (QuadTree.verbose) { 154 System.out.println("sorted:"); 155 for (int i = 0; i < this.payload.size(); i++) { 156 System.out.print("" + this.payload.get(i).x + ","); 157 } 158 System.out.println("median x: " + med_x); 159 } 160 return med_x; 161 } 162 163 private double getMedianForY() { 164 double med_y = 0; 165 166 Collections.sort(this.payload, new Comparator<QuadTreePayload<Instance>>() { 167 @Override 168 public int compare(QuadTreePayload<Instance> y1, QuadTreePayload<Instance> y2) { 169 return Double.compare(y1.y, y2.y); 170 } 171 }); 172 173 if (this.payload.size() % 2 == 0) { 174 int mid = this.payload.size() / 2; 175 med_y = (this.payload.get(mid).y + this.payload.get(mid + 1).y) / 2; 176 } 177 else { 178 int mid = this.payload.size() / 2; 179 med_y = this.payload.get(mid).y; 180 } 181 182 if (QuadTree.verbose) { 183 System.out.println("sorted:"); 184 for (int i = 0; i < this.payload.size(); i++) { 185 System.out.print("" + this.payload.get(i).y + ","); 186 } 187 System.out.println("median y: " + med_y); 188 } 189 return med_y; 190 } 191 192 /** 193 * Reurns the number of instances in the payload 194 * 195 * @return int number of instances 196 */ 197 public int getNumbers() { 198 int number = 0; 199 if (this.payload != null) { 200 number = this.payload.size(); 201 } 202 return number; 203 } 204 205 /** 206 * Calculate median values of payload for x, y and split into 4 sectors 207 * 208 * @return Array of QuadTree nodes (4 childs) 209 * @throws Exception 210 * if we would run into an recursive loop 211 */ 212 public QuadTree[] split() throws Exception { 213 214 double medx = this.getMedianForX(); 215 double medy = this.getMedianForY(); 216 217 // Payload lists for each child 218 ArrayList<QuadTreePayload<Instance>> nw = new ArrayList<QuadTreePayload<Instance>>(); 219 ArrayList<QuadTreePayload<Instance>> sw = new ArrayList<QuadTreePayload<Instance>>(); 220 ArrayList<QuadTreePayload<Instance>> ne = new ArrayList<QuadTreePayload<Instance>>(); 221 ArrayList<QuadTreePayload<Instance>> se = new ArrayList<QuadTreePayload<Instance>>(); 222 223 // sort the payloads to new payloads 224 // here we have the problem that payloads with the same values are sorted 225 // into the same slots and it could happen that medx and medy = size_x[1] and size_y[1] 226 // in that case we would have an endless loop 227 for (int i = 0; i < this.payload.size(); i++) { 228 229 QuadTreePayload<Instance> item = this.payload.get(i); 230 231 // north west 232 if (item.x <= medx && item.y >= medy) { 233 nw.add(item); 234 } 235 236 // south west 237 else if (item.x <= medx && item.y <= medy) { 238 sw.add(item); 239 } 240 241 // north east 242 else if (item.x >= medx && item.y >= medy) { 243 ne.add(item); 244 } 245 246 // south east 247 else if (item.x >= medx && item.y <= medy) { 248 se.add(item); 249 } 250 } 251 252 // if we assign one child a payload 
equal to our own (see problem above) 253 // we throw an exceptions which stops the recursion on this node 254 if (nw.equals(this.payload)) { 255 throw new Exception("payload equal"); 256 } 257 if (sw.equals(this.payload)) { 258 throw new Exception("payload equal"); 259 } 260 if (ne.equals(this.payload)) { 261 throw new Exception("payload equal"); 262 } 263 if (se.equals(this.payload)) { 264 throw new Exception("payload equal"); 265 } 266 267 this.child_nw = new QuadTree(this, nw); 268 this.child_nw.setSize(new double[] 269 { this.x[0], medx }, new double[] 270 { medy, this.y[1] }); 271 this.child_nw.level = this.level + 1; 272 273 this.child_sw = new QuadTree(this, sw); 274 this.child_sw.setSize(new double[] 275 { this.x[0], medx }, new double[] 276 { this.y[0], medy }); 277 this.child_sw.level = this.level + 1; 278 279 this.child_ne = new QuadTree(this, ne); 280 this.child_ne.setSize(new double[] 281 { medx, this.x[1] }, new double[] 282 { medy, this.y[1] }); 283 this.child_ne.level = this.level + 1; 284 285 this.child_se = new QuadTree(this, se); 286 this.child_se.setSize(new double[] 287 { medx, this.x[1] }, new double[] 288 { this.y[0], medy }); 289 this.child_se.level = this.level + 1; 290 291 this.payload = null; 292 return new QuadTree[] 293 { this.child_nw, this.child_ne, this.child_se, this.child_sw }; 294 } 295 296 /** 297 * TODO: static method 298 * 299 * @param q 300 */ 301 public void recursiveSplit(QuadTree q) { 302 if (QuadTree.verbose) { 303 System.out.println("splitting: " + q); 304 } 305 if (q.getNumbers() < QuadTree.alpha) { 306 return; 307 } 308 else { 309 // exception is thrown if we would run into an endless loop (see comments in split()) 310 try { 311 QuadTree[] childs = q.split(); 312 this.recursiveSplit(childs[0]); 313 this.recursiveSplit(childs[1]); 314 this.recursiveSplit(childs[2]); 315 this.recursiveSplit(childs[3]); 316 } 317 catch (Exception e) { 318 return; 319 } 320 } 321 } 322 323 /** 324 * returns an list of childs sorted by density 325 * 326 * @param q 327 * QuadTree 328 * @return list of QuadTrees 329 */ 330 private void generateList(QuadTree q) { 331 332 // we only have all childs or none at all 333 if (q.child_ne == null) { 334 this.l.add(q); 335 } 336 337 if (q.child_ne != null) { 338 this.generateList(q.child_ne); 339 } 340 if (q.child_nw != null) { 341 this.generateList(q.child_nw); 342 } 343 if (q.child_se != null) { 344 this.generateList(q.child_se); 345 } 346 if (q.child_sw != null) { 347 this.generateList(q.child_sw); 348 } 349 } 350 351 /** 352 * Checks if passed QuadTree is neighboring to us 353 * 354 * @param q 355 * QuadTree 356 * @return true if passed QuadTree is a neighbor 357 */ 358 public boolean isNeighbour(QuadTree q) { 359 boolean is_neighbour = false; 360 361 double[][] our_size = this.getSize(); 362 double[][] new_size = q.getSize(); 363 364 // X is i=0, Y is i=1 365 for (int i = 0; i < 2; i++) { 366 // we are smaller than q 367 // -------------- q 368 // ------- we 369 if (our_size[i][0] >= new_size[i][0] && our_size[i][1] <= new_size[i][1]) { 370 is_neighbour = true; 371 } 372 // we overlap with q at some point 373 // a) ---------------q 374 // ----------- we 375 // b) --------- q 376 // --------- we 377 if ((our_size[i][0] >= new_size[i][0] && our_size[i][0] <= new_size[i][1]) || 378 (our_size[i][1] >= new_size[i][0] && our_size[i][1] <= new_size[i][1])) 379 { 380 is_neighbour = true; 381 } 382 // we are larger than q 383 // ---- q 384 // ---------- we 385 if (our_size[i][1] >= new_size[i][1] && our_size[i][0] <= 
new_size[i][0]) { 386 is_neighbour = true; 387 } 388 } 389 390 if (is_neighbour && QuadTree.verbose) { 391 System.out.println(this + " neighbour of: " + q); 392 } 393 394 return is_neighbour; 395 } 396 397 /** 398 * Perform pruning and clustering of the quadtree 399 * 400 * Pruning according to: Tim Menzies, Andrew Butcher, David Cok, Andrian Marcus, Lucas Layman, 401 * Forrest Shull, Burak Turhan, Thomas Zimmermann, 402 * "Local versus Global Lessons for Defect Prediction and Effort Estimation," IEEE Transactions 403 * on Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013 404 * 405 * 1) get list of leaf quadrants 2) sort by their density 3) set stop_rule to 0.5 * highest 406 * Density in the list 4) merge all nodes with a density > stop_rule to the new cluster and 407 * remove all from list 5) repeat 408 * 409 * @param q 410 * List of QuadTree (children only) 411 */ 412 public void gridClustering(ArrayList<QuadTree> list) { 413 414 if (list.size() == 0) { 415 return; 416 } 417 418 double stop_rule; 419 QuadTree biggest; 420 QuadTree current; 421 422 // current clusterlist 423 ArrayList<QuadTreePayload<Instance>> current_cluster; 424 425 // remove list (for removal of items after scanning of the list) 426 ArrayList<Integer> remove = new ArrayList<Integer>(); 427 428 // 1. find biggest, and add it 429 biggest = list.get(list.size() - 1); 430 stop_rule = biggest.getDensity() * 0.5; 431 432 current_cluster = new ArrayList<QuadTreePayload<Instance>>(); 433 current_cluster.addAll(biggest.getPayload()); 434 435 // remove the biggest because we are starting with it 436 remove.add(list.size() - 1); 437 438 ArrayList<Double[][]> tmpSize = new ArrayList<Double[][]>(); 439 tmpSize.add(biggest.getSizeDouble()); 440 441 // check the items for their density 442 for (int i = list.size() - 1; i >= 0; i--) { 443 current = list.get(i); 444 445 // 2. find neighbors with correct density 446 // if density > stop_rule and is_neighbour add to cluster and remove from list 447 if (current.getDensity() > stop_rule && !current.equals(biggest) && 448 current.isNeighbour(biggest)) 449 { 450 current_cluster.addAll(current.getPayload()); 451 452 // add it to remove list (we cannot remove it inside the loop because it would move 453 // the index) 454 remove.add(i); 455 456 // get the size 457 tmpSize.add(current.getSizeDouble()); 458 } 459 } 460 461 // 3. remove our removal candidates from the list 462 for (Integer item : remove) { 463 list.remove((int) item); 464 } 465 466 // 4. add to cluster 467 QuadTree.ccluster.add(current_cluster); 468 469 // 5. 
add sizes of our current (biggest) this adds a number of sizes (all QuadTree Instances 470 // belonging to this cluster) 471 // we need that to classify test instances to a cluster later 472 Integer cnumber = new Integer(QuadTree.ccluster.size() - 1); 473 if (QuadTree.csize.containsKey(cnumber) == false) { 474 QuadTree.csize.put(cnumber, tmpSize); 475 } 476 477 // repeat 478 this.gridClustering(list); 479 } 480 481 public void printInfo() { 482 System.out.println("we have " + ccluster.size() + " clusters"); 483 484 for (int i = 0; i < ccluster.size(); i++) { 485 System.out.println("cluster: " + i + " size: " + ccluster.get(i).size()); 486 } 487 } 488 489 /** 490 * Helper Method to get a sorted list (by density) for all children 491 * 492 * @param q 493 * QuadTree 494 * @return Sorted ArrayList of quadtrees 495 */ 496 public ArrayList<QuadTree> getList(QuadTree q) { 497 this.generateList(q); 498 499 Collections.sort(this.l, new Comparator<QuadTree>() { 500 @Override 501 public int compare(QuadTree x1, QuadTree x2) { 502 return Double.compare(x1.getDensity(), x2.getDensity()); 503 } 504 }); 505 506 return this.l; 507 } 473 508 } -
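The getMedianForX and getMedianForY methods in the QuadTree diff above differ only in the axis they sort on, which is what the "TODO: DRY" note (median is always the same) points at. Below is a minimal sketch of a shared helper, assuming it would live inside QuadTree next to the two existing methods; note that for an even-sized payload the conventional median averages the elements at mid - 1 and mid, whereas the code above averages mid and mid + 1.

    // Hypothetical shared median helper (sketch only, not part of this changeset).
    // Sorts the payload along one axis and returns the median of that coordinate.
    private double getMedian(final boolean useX) {
        Collections.sort(this.payload, new Comparator<QuadTreePayload<Instance>>() {
            @Override
            public int compare(QuadTreePayload<Instance> p1, QuadTreePayload<Instance> p2) {
                return useX ? Double.compare(p1.x, p2.x) : Double.compare(p1.y, p2.y);
            }
        });

        int mid = this.payload.size() / 2;
        double median = useX ? this.payload.get(mid).x : this.payload.get(mid).y;
        if (this.payload.size() % 2 == 0) {
            // conventional median of an even-sized list: average the two middle elements
            double lower = useX ? this.payload.get(mid - 1).x : this.payload.get(mid - 1).y;
            median = (lower + median) / 2;
        }
        return median;
    }

getMedianForX() would then reduce to getMedian(true) and getMedianForY() to getMedian(false).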
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/RandomClass.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 11 25 * Assigns a random class label to the instance it is evaluated on. 12 26 * 13 * The range of class labels are hardcoded in fixedClassValues. 14 * This can later be extended to take values from the XML configuration.27 * The range of class labels are hardcoded in fixedClassValues. This can later be extended to take 28 * values from the XML configuration. 15 29 */ 16 public class RandomClass extends AbstractClassifier implements ITrainingStrategy, IWekaCompatibleTrainer { 30 public class RandomClass extends AbstractClassifier implements ITrainingStrategy, 31 IWekaCompatibleTrainer 32 { 17 33 18 private static final long serialVersionUID = 1L;34 private static final long serialVersionUID = 1L; 19 35 20 private double[] fixedClassValues = {0.0d, 1.0d}; 21 22 @Override 23 public void setParameter(String parameters) { 24 // do nothing, maybe take percentages for distribution later 25 } 36 private double[] fixedClassValues = 37 { 0.0d, 1.0d }; 26 38 27 @Override28 public void buildClassifier(Instances arg0) throws Exception{29 // do nothing 30 }39 @Override 40 public void setParameter(String parameters) { 41 // do nothing, maybe take percentages for distribution later 42 } 31 43 32 @Override33 public Classifier getClassifier(){34 return this; 35 }44 @Override 45 public void buildClassifier(Instances arg0) throws Exception { 46 // do nothing 47 } 36 48 37 @Override38 public void apply(Instances traindata) {39 // nothing to do 40 }49 @Override 50 public Classifier getClassifier() { 51 return this; 52 } 41 53 42 @Override 43 public String getName() { 44 return "RandomClass"; 45 } 46 47 @Override 48 public double classifyInstance(Instance instance) { 49 Random rand = new Random(); 50 int randomNum = rand.nextInt(this.fixedClassValues.length); 51 return this.fixedClassValues[randomNum]; 52 } 54 @Override 55 public void apply(Instances traindata) { 56 // nothing to do 57 } 58 59 @Override 60 public String getName() { 61 return "RandomClass"; 62 } 63 64 @Override 65 public double classifyInstance(Instance instance) { 66 Random rand = new Random(); 67 int randomNum = rand.nextInt(this.fixedClassValues.length); 68 return this.fixedClassValues[randomNum]; 69 } 53 70 } -
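The RandomClass Javadoc above notes that the label range is hardcoded in fixedClassValues and could later be taken from the XML configuration. A hedged sketch of such an extension follows, assuming a space-separated list of class values as the parameter format; this format is an assumption for illustration, not part of the changeset.

    // Sketch: setParameter extended to read class values from the XML configuration,
    // e.g. <trainer name="RandomClass" param="0.0 1.0"/> (assumed parameter format).
    @Override
    public void setParameter(String parameters) {
        if (parameters == null || parameters.trim().isEmpty()) {
            return; // keep the hardcoded default { 0.0, 1.0 }
        }
        String[] parts = parameters.trim().split(" ");
        double[] values = new double[parts.length];
        for (int i = 0; i < parts.length; i++) {
            values[i] = Double.parseDouble(parts[i]);
        }
        this.fixedClassValues = values;
    }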
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaBaggingTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 18 32 /** 19 33 * Programmatic WekaBaggingTraining 20 *21 * first parameter is Trainer Name.22 * second parameter is class name23 34 * 24 * all subsequent parameters are configuration params (for example for trees) 25 * Cross Validation params always come last and are prepended with -CVPARAM 35 * first parameter is Trainer Name. second parameter is class name 36 * 37 * all subsequent parameters are configuration params (for example for trees) Cross Validation 38 * params always come last and are prepended with -CVPARAM 26 39 * 27 40 * XML Configurations for Weka Classifiers: 41 * 28 42 * <pre> 29 43 * {@code … … 37 51 public class WekaBaggingTraining extends WekaBaseTraining implements ISetWiseTrainingStrategy { 38 52 39 private final TraindatasetBagging classifier = new TraindatasetBagging(); 40 41 @Override 42 public Classifier getClassifier() { 43 return classifier; 44 } 45 46 @Override 47 public void apply(SetUniqueList<Instances> traindataSet) { 48 PrintStream errStr = System.err; 49 System.setErr(new PrintStream(new NullOutputStream())); 50 try { 51 classifier.buildClassifier(traindataSet); 52 } catch (Exception e) { 53 throw new RuntimeException(e); 54 } finally { 55 System.setErr(errStr); 56 } 57 } 58 59 public class TraindatasetBagging extends AbstractClassifier { 60 61 private static final long serialVersionUID = 1L; 53 private final TraindatasetBagging classifier = new TraindatasetBagging(); 62 54 63 private List<Instances> trainingData = null; 64 65 private List<Classifier> classifiers = null; 66 67 @Override 68 public double classifyInstance(Instance instance) { 69 if( classifiers==null ) { 70 return 0.0; 71 } 72 73 double classification = 0.0; 74 for( int i=0 ; i<classifiers.size(); i++ ) { 75 Classifier classifier = classifiers.get(i); 76 Instances traindata = trainingData.get(i); 77 78 Set<String> attributeNames = new HashSet<>(); 79 for( int j=0; j<traindata.numAttributes(); j++ ) { 80 attributeNames.add(traindata.attribute(j).name()); 81 } 82 83 double[] values = new double[traindata.numAttributes()]; 84 int index = 0; 85 for( int j=0; j<instance.numAttributes(); j++ ) { 86 if( attributeNames.contains(instance.attribute(j).name())) { 87 values[index] = instance.value(j); 88 index++; 89 } 90 } 91 92 Instances tmp = new Instances(traindata); 93 tmp.clear(); 94 Instance instCopy = new DenseInstance(instance.weight(), values); 95 instCopy.setDataset(tmp); 96 try { 97 classification += classifier.classifyInstance(instCopy); 98 } catch (Exception e) { 99 throw new RuntimeException("bagging classifier could not classify an instance", e); 100 } 101 } 102 classification /= classifiers.size(); 103 return (classification>=0.5) ? 
1.0 : 0.0; 104 } 105 106 public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { 107 classifiers = new LinkedList<>(); 108 trainingData = new LinkedList<>(); 109 for( Instances traindata : traindataSet ) { 110 Classifier classifier = setupClassifier(); 111 classifier.buildClassifier(traindata); 112 classifiers.add(classifier); 113 trainingData.add(new Instances(traindata)); 114 } 115 } 116 117 @Override 118 public void buildClassifier(Instances traindata) throws Exception { 119 classifiers = new LinkedList<>(); 120 trainingData = new LinkedList<>(); 121 final Classifier classifier = setupClassifier(); 122 classifier.buildClassifier(traindata); 123 classifiers.add(classifier); 124 trainingData.add(new Instances(traindata)); 125 } 126 } 55 @Override 56 public Classifier getClassifier() { 57 return classifier; 58 } 59 60 @Override 61 public void apply(SetUniqueList<Instances> traindataSet) { 62 PrintStream errStr = System.err; 63 System.setErr(new PrintStream(new NullOutputStream())); 64 try { 65 classifier.buildClassifier(traindataSet); 66 } 67 catch (Exception e) { 68 throw new RuntimeException(e); 69 } 70 finally { 71 System.setErr(errStr); 72 } 73 } 74 75 public class TraindatasetBagging extends AbstractClassifier { 76 77 private static final long serialVersionUID = 1L; 78 79 private List<Instances> trainingData = null; 80 81 private List<Classifier> classifiers = null; 82 83 @Override 84 public double classifyInstance(Instance instance) { 85 if (classifiers == null) { 86 return 0.0; 87 } 88 89 double classification = 0.0; 90 for (int i = 0; i < classifiers.size(); i++) { 91 Classifier classifier = classifiers.get(i); 92 Instances traindata = trainingData.get(i); 93 94 Set<String> attributeNames = new HashSet<>(); 95 for (int j = 0; j < traindata.numAttributes(); j++) { 96 attributeNames.add(traindata.attribute(j).name()); 97 } 98 99 double[] values = new double[traindata.numAttributes()]; 100 int index = 0; 101 for (int j = 0; j < instance.numAttributes(); j++) { 102 if (attributeNames.contains(instance.attribute(j).name())) { 103 values[index] = instance.value(j); 104 index++; 105 } 106 } 107 108 Instances tmp = new Instances(traindata); 109 tmp.clear(); 110 Instance instCopy = new DenseInstance(instance.weight(), values); 111 instCopy.setDataset(tmp); 112 try { 113 classification += classifier.classifyInstance(instCopy); 114 } 115 catch (Exception e) { 116 throw new RuntimeException("bagging classifier could not classify an instance", 117 e); 118 } 119 } 120 classification /= classifiers.size(); 121 return (classification >= 0.5) ? 1.0 : 0.0; 122 } 123 124 public void buildClassifier(SetUniqueList<Instances> traindataSet) throws Exception { 125 classifiers = new LinkedList<>(); 126 trainingData = new LinkedList<>(); 127 for (Instances traindata : traindataSet) { 128 Classifier classifier = setupClassifier(); 129 classifier.buildClassifier(traindata); 130 classifiers.add(classifier); 131 trainingData.add(new Instances(traindata)); 132 } 133 } 134 135 @Override 136 public void buildClassifier(Instances traindata) throws Exception { 137 classifiers = new LinkedList<>(); 138 trainingData = new LinkedList<>(); 139 final Classifier classifier = setupClassifier(); 140 classifier.buildClassifier(traindata); 141 classifiers.add(classifier); 142 trainingData.add(new Instances(traindata)); 143 } 144 } 127 145 } -
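TraindatasetBagging.classifyInstance above averages the per-training-set predictions and thresholds the mean at 0.5, which amounts to a majority vote over binary labels. A minimal standalone sketch of that voting rule:

    // Minimal sketch of the voting rule used in classifyInstance above:
    // average the 0/1 votes of all classifiers and threshold the mean at 0.5.
    static double majorityVote(double[] predictions) {
        double sum = 0.0;
        for (double p : predictions) {
            sum += p; // each entry is expected to be 0.0 or 1.0
        }
        return (sum / predictions.length >= 0.5) ? 1.0 : 0.0;
    }

Because of the >= comparison, a tie with an even number of training sets resolves to 1.0 (defective).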
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaBaseTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 15 29 * Allows specification of the Weka classifier and its params in the XML experiment configuration. 16 30 * 17 * Important conventions of the XML format: 18 * Cross Validation params always come last and are prepended with -CVPARAM19 * Example: <trainer name="WekaTraining"param="RandomForestLocal weka.classifiers.trees.RandomForest -CVPARAM I 5 25 5"/>31 * Important conventions of the XML format: Cross Validation params always come last and are 32 * prepended with -CVPARAM Example: <trainer name="WekaTraining" 33 * param="RandomForestLocal weka.classifiers.trees.RandomForest -CVPARAM I 5 25 5"/> 20 34 */ 21 35 public abstract class WekaBaseTraining implements IWekaCompatibleTrainer { 22 23 protected Classifier classifier = null;24 protected String classifierClassName;25 protected String classifierName;26 protected String[] classifierParams;27 28 @Override29 public void setParameter(String parameters) {30 String[] params = parameters.split(" ");31 36 32 // first part of the params is the classifierName (e.g. SMORBF) 33 classifierName = params[0]; 34 35 // the following parameters can be copied from weka! 36 37 // second param is classifierClassName (e.g. weka.classifiers.functions.SMO) 38 classifierClassName = params[1]; 39 40 // rest are params to the specified classifier (e.g. 
-K weka.classifiers.functions.supportVector.RBFKernel) 41 classifierParams = Arrays.copyOfRange(params, 2, params.length); 42 43 classifier = setupClassifier(); 44 } 37 protected Classifier classifier = null; 38 protected String classifierClassName; 39 protected String classifierName; 40 protected String[] classifierParams; 45 41 46 @Override 47 public Classifier getClassifier() { 48 return classifier; 49 } 42 @Override 43 public void setParameter(String parameters) { 44 String[] params = parameters.split(" "); 50 45 51 public Classifier setupClassifier() { 52 Classifier cl = null; 53 try{ 54 @SuppressWarnings("rawtypes") 55 Class c = Class.forName(classifierClassName); 56 Classifier obj = (Classifier) c.newInstance(); 57 58 // Filter out -CVPARAM, these are special because they do not belong to the Weka classifier class as parameters 59 String[] param = Arrays.copyOf(classifierParams, classifierParams.length); 60 String[] cvparam = {}; 61 boolean cv = false; 62 for ( int i=0; i < classifierParams.length; i++ ) { 63 if(classifierParams[i].equals("-CVPARAM")) { 64 // rest of array are cvparam 65 cvparam = Arrays.copyOfRange(classifierParams, i+1, classifierParams.length); 66 67 // before this we have normal params 68 param = Arrays.copyOfRange(classifierParams, 0, i); 69 70 cv = true; 71 break; 72 } 73 } 74 75 // set classifier params 76 ((OptionHandler)obj).setOptions(param); 77 cl = obj; 78 79 // we have cross val params 80 // cant check on cvparam.length here, it may not be initialized 81 if(cv) { 82 final CVParameterSelection ps = new CVParameterSelection(); 83 ps.setClassifier(obj); 84 ps.setNumFolds(5); 85 //ps.addCVParameter("I 5 25 5"); 86 for( int i=1 ; i<cvparam.length/4 ; i++ ) { 87 ps.addCVParameter(Arrays.asList(Arrays.copyOfRange(cvparam, 0, 4*i)).toString().replaceAll(", ", " ").replaceAll("^\\[|\\]$", "")); 88 } 89 90 cl = ps; 91 } 46 // first part of the params is the classifierName (e.g. SMORBF) 47 classifierName = params[0]; 92 48 93 }catch(ClassNotFoundException e) { 94 Console.traceln(Level.WARNING, String.format("class not found: %s", e.toString())); 95 e.printStackTrace(); 96 } catch (InstantiationException e) { 97 Console.traceln(Level.WARNING, String.format("Instantiation Exception: %s", e.toString())); 98 e.printStackTrace(); 99 } catch (IllegalAccessException e) { 100 Console.traceln(Level.WARNING, String.format("Illegal Access Exception: %s", e.toString())); 101 e.printStackTrace(); 102 } catch (Exception e) { 103 Console.traceln(Level.WARNING, String.format("Exception: %s", e.toString())); 104 e.printStackTrace(); 105 } 106 107 return cl; 108 } 49 // the following parameters can be copied from weka! 109 50 110 @Override 111 public String getName() { 112 return classifierName; 113 } 114 51 // second param is classifierClassName (e.g. weka.classifiers.functions.SMO) 52 classifierClassName = params[1]; 53 54 // rest are params to the specified classifier (e.g. 
-K 55 // weka.classifiers.functions.supportVector.RBFKernel) 56 classifierParams = Arrays.copyOfRange(params, 2, params.length); 57 58 classifier = setupClassifier(); 59 } 60 61 @Override 62 public Classifier getClassifier() { 63 return classifier; 64 } 65 66 public Classifier setupClassifier() { 67 Classifier cl = null; 68 try { 69 @SuppressWarnings("rawtypes") 70 Class c = Class.forName(classifierClassName); 71 Classifier obj = (Classifier) c.newInstance(); 72 73 // Filter out -CVPARAM, these are special because they do not belong to the Weka 74 // classifier class as parameters 75 String[] param = Arrays.copyOf(classifierParams, classifierParams.length); 76 String[] cvparam = { }; 77 boolean cv = false; 78 for (int i = 0; i < classifierParams.length; i++) { 79 if (classifierParams[i].equals("-CVPARAM")) { 80 // rest of array are cvparam 81 cvparam = Arrays.copyOfRange(classifierParams, i + 1, classifierParams.length); 82 83 // before this we have normal params 84 param = Arrays.copyOfRange(classifierParams, 0, i); 85 86 cv = true; 87 break; 88 } 89 } 90 91 // set classifier params 92 ((OptionHandler) obj).setOptions(param); 93 cl = obj; 94 95 // we have cross val params 96 // cant check on cvparam.length here, it may not be initialized 97 if (cv) { 98 final CVParameterSelection ps = new CVParameterSelection(); 99 ps.setClassifier(obj); 100 ps.setNumFolds(5); 101 // ps.addCVParameter("I 5 25 5"); 102 for (int i = 1; i < cvparam.length / 4; i++) { 103 ps.addCVParameter(Arrays.asList(Arrays.copyOfRange(cvparam, 0, 4 * i)) 104 .toString().replaceAll(", ", " ").replaceAll("^\\[|\\]$", "")); 105 } 106 107 cl = ps; 108 } 109 110 } 111 catch (ClassNotFoundException e) { 112 Console.traceln(Level.WARNING, String.format("class not found: %s", e.toString())); 113 e.printStackTrace(); 114 } 115 catch (InstantiationException e) { 116 Console.traceln(Level.WARNING, 117 String.format("Instantiation Exception: %s", e.toString())); 118 e.printStackTrace(); 119 } 120 catch (IllegalAccessException e) { 121 Console.traceln(Level.WARNING, 122 String.format("Illegal Access Exception: %s", e.toString())); 123 e.printStackTrace(); 124 } 125 catch (Exception e) { 126 Console.traceln(Level.WARNING, String.format("Exception: %s", e.toString())); 127 e.printStackTrace(); 128 } 129 130 return cl; 131 } 132 133 @Override 134 public String getName() { 135 return classifierName; 136 } 137 115 138 } -
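The parameter convention handled by WekaBaseTraining above is: trainer name, Weka classifier class name, classifier options, and finally an optional -CVPARAM block that is handed to CVParameterSelection instead of the classifier. A small sketch of how such a string is split; the example string is an assumption that follows the documented convention.

    import java.util.Arrays;

    // Sketch: how setParameter/setupClassifier above split a parameter string.
    public class ParamConventionExample {
        public static void main(String[] args) {
            String parameters =
                "RandomForestLocal weka.classifiers.trees.RandomForest -I 100 -CVPARAM I 5 25 5";
            String[] params = parameters.split(" ");
            System.out.println("classifierName:      " + params[0]);
            System.out.println("classifierClassName: " + params[1]);
            System.out.println("remaining params:    " +
                Arrays.toString(Arrays.copyOfRange(params, 2, params.length)));
            // setupClassifier() later splits the remaining params at -CVPARAM:
            // everything before it goes to ((OptionHandler) classifier).setOptions(...),
            // everything after it to CVParameterSelection.addCVParameter(...)
        }
    }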
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalEMTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 24 38 * WekaLocalEMTraining 25 39 * 26 * Local Trainer with EM Clustering for data partitioning. 27 * Currently supports only EM Clustering. 28 * 29 * 1. Cluster training data 30 * 2. for each cluster train a classifier with training data from cluster 40 * Local Trainer with EM Clustering for data partitioning. Currently supports only EM Clustering. 41 * 42 * 1. Cluster training data 2. for each cluster train a classifier with training data from cluster 31 43 * 3. match test data instance to a cluster, then classify with classifier from the cluster 32 44 * 33 * XML configuration: 34 * <!-- because of clustering --> 35 * <preprocessor name="Normalization" param=""/> 36 * 37 * <!-- cluster trainer --> 38 * <trainer name="WekaLocalEMTraining" param="NaiveBayes weka.classifiers.bayes.NaiveBayes" /> 45 * XML configuration: <!-- because of clustering --> <preprocessor name="Normalization" param=""/> 46 * 47 * <!-- cluster trainer --> <trainer name="WekaLocalEMTraining" 48 * param="NaiveBayes weka.classifiers.bayes.NaiveBayes" /> 39 49 */ 40 50 public class WekaLocalEMTraining extends WekaBaseTraining implements ITrainingStrategy { 41 51 42 private final TraindatasetCluster classifier = new TraindatasetCluster(); 43 44 @Override 45 public Classifier getClassifier() { 46 return classifier; 47 } 48 49 @Override 50 public void apply(Instances traindata) { 51 PrintStream errStr = System.err; 52 System.setErr(new PrintStream(new NullOutputStream())); 53 try { 54 classifier.buildClassifier(traindata); 55 } catch (Exception e) { 56 throw new RuntimeException(e); 57 } finally { 58 System.setErr(errStr); 59 } 60 } 61 62 63 public class TraindatasetCluster extends AbstractClassifier { 64 65 private static final long serialVersionUID = 1L; 66 67 private EM clusterer = null; 68 69 private HashMap<Integer, Classifier> cclassifier; 70 private HashMap<Integer, Instances> ctraindata; 71 72 73 /** 74 * Helper method that gives us a clean instance copy with 75 * the values of the instancelist of the first parameter. 
76 * 77 * @param instancelist with attributes 78 * @param instance with only values 79 * @return copy of the instance 80 */ 81 private Instance createInstance(Instances instances, Instance instance) { 82 // attributes for feeding instance to classifier 83 Set<String> attributeNames = new HashSet<>(); 84 for( int j=0; j<instances.numAttributes(); j++ ) { 85 attributeNames.add(instances.attribute(j).name()); 86 } 87 88 double[] values = new double[instances.numAttributes()]; 89 int index = 0; 90 for( int j=0; j<instance.numAttributes(); j++ ) { 91 if( attributeNames.contains(instance.attribute(j).name())) { 92 values[index] = instance.value(j); 93 index++; 94 } 95 } 96 97 Instances tmp = new Instances(instances); 98 tmp.clear(); 99 Instance instCopy = new DenseInstance(instance.weight(), values); 100 instCopy.setDataset(tmp); 101 102 return instCopy; 103 } 104 105 @Override 106 public double classifyInstance(Instance instance) { 107 double ret = 0; 108 try { 109 // 1. copy the instance (keep the class attribute) 110 Instances traindata = ctraindata.get(0); 111 Instance classInstance = createInstance(traindata, instance); 112 113 // 2. remove class attribute before clustering 114 Remove filter = new Remove(); 115 filter.setAttributeIndices("" + (traindata.classIndex() + 1)); 116 filter.setInputFormat(traindata); 117 traindata = Filter.useFilter(traindata, filter); 118 119 // 3. copy the instance (without the class attribute) for clustering 120 Instance clusterInstance = createInstance(traindata, instance); 121 122 // 4. match instance without class attribute to a cluster number 123 int cnum = clusterer.clusterInstance(clusterInstance); 124 125 // 5. classify instance with class attribute to the classifier of that cluster number 126 ret = cclassifier.get(cnum).classifyInstance(classInstance); 127 128 }catch( Exception e ) { 129 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!")); 130 throw new RuntimeException(e); 131 } 132 return ret; 133 } 134 135 @Override 136 public void buildClassifier(Instances traindata) throws Exception { 137 138 // 1. copy training data 139 Instances train = new Instances(traindata); 140 141 // 2. remove class attribute for clustering 142 Remove filter = new Remove(); 143 filter.setAttributeIndices("" + (train.classIndex() + 1)); 144 filter.setInputFormat(train); 145 train = Filter.useFilter(train, filter); 146 147 // new objects 148 cclassifier = new HashMap<Integer, Classifier>(); 149 ctraindata = new HashMap<Integer, Instances>(); 150 151 Instances ctrain; 152 int maxNumClusters = train.size(); 153 boolean sufficientInstancesInEachCluster; 154 do { // while(onlyTarget) 155 sufficientInstancesInEachCluster = true; 156 clusterer = new EM(); 157 clusterer.setMaximumNumberOfClusters(maxNumClusters); 158 clusterer.buildClusterer(train); 159 160 // 4. 
get cluster membership of our traindata 161 //AddCluster cfilter = new AddCluster(); 162 //cfilter.setClusterer(clusterer); 163 //cfilter.setInputFormat(train); 164 //Instances ctrain = Filter.useFilter(train, cfilter); 165 166 ctrain = new Instances(train); 167 ctraindata = new HashMap<>(); 168 169 // get traindata per cluster 170 for ( int j=0; j < ctrain.numInstances(); j++ ) { 171 // get the cluster number from the attributes, subract 1 because if we clusterInstance we get 0-n, and this is 1-n 172 //cnumber = Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes()-1).replace("cluster", "")) - 1; 173 174 int cnumber = clusterer.clusterInstance(ctrain.get(j)); 175 // add training data to list of instances for this cluster number 176 if ( !ctraindata.containsKey(cnumber) ) { 177 ctraindata.put(cnumber, new Instances(traindata)); 178 ctraindata.get(cnumber).delete(); 179 } 180 ctraindata.get(cnumber).add(traindata.get(j)); 181 } 182 183 for( Entry<Integer,Instances> entry : ctraindata.entrySet() ) { 184 Instances instances = entry.getValue(); 185 int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 186 for( int count : counts ) { 187 sufficientInstancesInEachCluster &= count>0; 188 } 189 sufficientInstancesInEachCluster &= instances.numInstances()>=5; 190 } 191 maxNumClusters = clusterer.numberOfClusters()-1; 192 } while(!sufficientInstancesInEachCluster); 193 194 // train one classifier per cluster, we get the cluster number from the training data 195 Iterator<Integer> clusternumber = ctraindata.keySet().iterator(); 196 while ( clusternumber.hasNext() ) { 197 int cnumber = clusternumber.next(); 198 cclassifier.put(cnumber,setupClassifier()); 199 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber)); 200 201 //Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber)); 202 } 203 } 204 } 52 private final TraindatasetCluster classifier = new TraindatasetCluster(); 53 54 @Override 55 public Classifier getClassifier() { 56 return classifier; 57 } 58 59 @Override 60 public void apply(Instances traindata) { 61 PrintStream errStr = System.err; 62 System.setErr(new PrintStream(new NullOutputStream())); 63 try { 64 classifier.buildClassifier(traindata); 65 } 66 catch (Exception e) { 67 throw new RuntimeException(e); 68 } 69 finally { 70 System.setErr(errStr); 71 } 72 } 73 74 public class TraindatasetCluster extends AbstractClassifier { 75 76 private static final long serialVersionUID = 1L; 77 78 private EM clusterer = null; 79 80 private HashMap<Integer, Classifier> cclassifier; 81 private HashMap<Integer, Instances> ctraindata; 82 83 /** 84 * Helper method that gives us a clean instance copy with the values of the instancelist of 85 * the first parameter. 
86 * 87 * @param instancelist 88 * with attributes 89 * @param instance 90 * with only values 91 * @return copy of the instance 92 */ 93 private Instance createInstance(Instances instances, Instance instance) { 94 // attributes for feeding instance to classifier 95 Set<String> attributeNames = new HashSet<>(); 96 for (int j = 0; j < instances.numAttributes(); j++) { 97 attributeNames.add(instances.attribute(j).name()); 98 } 99 100 double[] values = new double[instances.numAttributes()]; 101 int index = 0; 102 for (int j = 0; j < instance.numAttributes(); j++) { 103 if (attributeNames.contains(instance.attribute(j).name())) { 104 values[index] = instance.value(j); 105 index++; 106 } 107 } 108 109 Instances tmp = new Instances(instances); 110 tmp.clear(); 111 Instance instCopy = new DenseInstance(instance.weight(), values); 112 instCopy.setDataset(tmp); 113 114 return instCopy; 115 } 116 117 @Override 118 public double classifyInstance(Instance instance) { 119 double ret = 0; 120 try { 121 // 1. copy the instance (keep the class attribute) 122 Instances traindata = ctraindata.get(0); 123 Instance classInstance = createInstance(traindata, instance); 124 125 // 2. remove class attribute before clustering 126 Remove filter = new Remove(); 127 filter.setAttributeIndices("" + (traindata.classIndex() + 1)); 128 filter.setInputFormat(traindata); 129 traindata = Filter.useFilter(traindata, filter); 130 131 // 3. copy the instance (without the class attribute) for clustering 132 Instance clusterInstance = createInstance(traindata, instance); 133 134 // 4. match instance without class attribute to a cluster number 135 int cnum = clusterer.clusterInstance(clusterInstance); 136 137 // 5. classify instance with class attribute to the classifier of that cluster 138 // number 139 ret = cclassifier.get(cnum).classifyInstance(classInstance); 140 141 } 142 catch (Exception e) { 143 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!")); 144 throw new RuntimeException(e); 145 } 146 return ret; 147 } 148 149 @Override 150 public void buildClassifier(Instances traindata) throws Exception { 151 152 // 1. copy training data 153 Instances train = new Instances(traindata); 154 155 // 2. remove class attribute for clustering 156 Remove filter = new Remove(); 157 filter.setAttributeIndices("" + (train.classIndex() + 1)); 158 filter.setInputFormat(train); 159 train = Filter.useFilter(train, filter); 160 161 // new objects 162 cclassifier = new HashMap<Integer, Classifier>(); 163 ctraindata = new HashMap<Integer, Instances>(); 164 165 Instances ctrain; 166 int maxNumClusters = train.size(); 167 boolean sufficientInstancesInEachCluster; 168 do { // while(onlyTarget) 169 sufficientInstancesInEachCluster = true; 170 clusterer = new EM(); 171 clusterer.setMaximumNumberOfClusters(maxNumClusters); 172 clusterer.buildClusterer(train); 173 174 // 4. 
get cluster membership of our traindata 175 // AddCluster cfilter = new AddCluster(); 176 // cfilter.setClusterer(clusterer); 177 // cfilter.setInputFormat(train); 178 // Instances ctrain = Filter.useFilter(train, cfilter); 179 180 ctrain = new Instances(train); 181 ctraindata = new HashMap<>(); 182 183 // get traindata per cluster 184 for (int j = 0; j < ctrain.numInstances(); j++) { 185 // get the cluster number from the attributes, subract 1 because if we 186 // clusterInstance we get 0-n, and this is 1-n 187 // cnumber = 188 // Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes()-1).replace("cluster", 189 // "")) - 1; 190 191 int cnumber = clusterer.clusterInstance(ctrain.get(j)); 192 // add training data to list of instances for this cluster number 193 if (!ctraindata.containsKey(cnumber)) { 194 ctraindata.put(cnumber, new Instances(traindata)); 195 ctraindata.get(cnumber).delete(); 196 } 197 ctraindata.get(cnumber).add(traindata.get(j)); 198 } 199 200 for (Entry<Integer, Instances> entry : ctraindata.entrySet()) { 201 Instances instances = entry.getValue(); 202 int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 203 for (int count : counts) { 204 sufficientInstancesInEachCluster &= count > 0; 205 } 206 sufficientInstancesInEachCluster &= instances.numInstances() >= 5; 207 } 208 maxNumClusters = clusterer.numberOfClusters() - 1; 209 } 210 while (!sufficientInstancesInEachCluster); 211 212 // train one classifier per cluster, we get the cluster number from the training data 213 Iterator<Integer> clusternumber = ctraindata.keySet().iterator(); 214 while (clusternumber.hasNext()) { 215 int cnumber = clusternumber.next(); 216 cclassifier.put(cnumber, setupClassifier()); 217 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber)); 218 219 // Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber)); 220 } 221 } 222 } 205 223 } -
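WekaLocalEMTraining above clusters the training data with EM, trains one classifier per cluster, and at prediction time first assigns an instance to a cluster and then uses that cluster's classifier. Below is a minimal sketch of the clustering and per-cluster data split, assuming traindata (with class attribute) and train (class attribute removed) as in buildClassifier above; the method name buildLocalModels is hypothetical and the required imports are weka.clusterers.EM, weka.core.Instances, java.util.HashMap and java.util.Map.

    // Sketch of the cluster-then-collect step of WekaLocalEMTraining.buildClassifier.
    // traindata keeps the class attribute, train is the same data without it.
    void buildLocalModels(Instances traindata, Instances train) throws Exception {
        EM em = new EM();
        em.setMaximumNumberOfClusters(train.size());
        em.buildClusterer(train);

        Map<Integer, Instances> perCluster = new HashMap<>();
        for (int i = 0; i < train.numInstances(); i++) {
            int c = em.clusterInstance(train.get(i));
            if (!perCluster.containsKey(c)) {
                Instances empty = new Instances(traindata);
                empty.delete(); // keep the header only
                perCluster.put(c, empty);
            }
            perCluster.get(c).add(traindata.get(i));
        }
        // one classifier per entry of perCluster is then trained via setupClassifier();
        // a test instance is first assigned a cluster with em.clusterInstance(...) and
        // classified by that cluster's classifier.
    }

The original loop additionally retries with a smaller maximum number of clusters until every cluster has at least 5 instances and both classes are present.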
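The WekaLocalFQTraining diff that follows reimplements WHERE clustering on top of a Fastmap projection; its core is projecting each object onto the line between two pivot objects a and b (equation 4 / algorithm 2 of Faloutsos & Lin, 1995), which is what Fastmap.calculate() below computes. A minimal sketch of that projection, where d2(x, y) is a hypothetical helper returning the squared distance that remains after subtracting the projections of earlier dimensions (the role of dist() in the diff):

    // Sketch of the Fastmap projection used in Fastmap.calculate() below:
    //   x_i = (d(a,i)^2 + d(a,b)^2 - d(b,i)^2) / (2 * d(a,b))
    double project(int i, int a, int b) {
        double dab2 = d2(a, b); // squared pivot distance
        if (dab2 == 0.0) {
            return 0.0;         // pivots coincide, no spread along this dimension
        }
        return (d2(a, i) + dab2 - d2(b, i)) / (2 * Math.sqrt(dab2));
    }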
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalFQTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 24 38 25 39 /** 26 * Trainer with reimplementation of WHERE clustering algorithm from: 27 * Tim Menzies, Andrew Butcher, David Cok, Andrian Marcus, Lucas Layman, 28 * Forrest Shull, Burak Turhan, Thomas Zimmermann, 29 * "Local versus Global Lessons for Defect Prediction and Effort Estimation," 30 * IEEE Transactions on Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013 40 * Trainer with reimplementation of WHERE clustering algorithm from: Tim Menzies, Andrew Butcher, 41 * David Cok, Andrian Marcus, Lucas Layman, Forrest Shull, Burak Turhan, Thomas Zimmermann, 42 * "Local versus Global Lessons for Defect Prediction and Effort Estimation," IEEE Transactions on 43 * Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013 31 44 * 32 * With WekaLocalFQTraining we do the following: 33 * 1) Run the Fastmap algorithm on all training data, let it calculate the 2 most significant34 * dimensions and projections of each instance to these dimensions35 * 2) With these 2 dimensions we span a QuadTree which gets recursively split on median(x) and median(y) values.36 * 3) We cluster the QuadTree nodes together if they have similar density (50%)37 * 4) We save the clusters and their training data38 * 5) We only use clusters with > ALPHA instances (currently Math.sqrt(SIZE)), rest is discarded with the training data of this cluster39 * 6) We train a Weka classifier for each cluster with the clusters training data40 * 7) We recalculate Fastmap distances for a single instance with the old pivots and then try to find a cluster containing the coords of the instance.41 * 7.1.) If we can not find a cluster (due to coords outside of all clusters) we find the nearest cluster.42 * 8) We classify the Instance with theclassifier and traindata from the Cluster we found in 7.45 * With WekaLocalFQTraining we do the following: 1) Run the Fastmap algorithm on all training data, 46 * let it calculate the 2 most significant dimensions and projections of each instance to these 47 * dimensions 2) With these 2 dimensions we span a QuadTree which gets recursively split on 48 * median(x) and median(y) values. 3) We cluster the QuadTree nodes together if they have similar 49 * density (50%) 4) We save the clusters and their training data 5) We only use clusters with > 50 * ALPHA instances (currently Math.sqrt(SIZE)), rest is discarded with the training data of this 51 * cluster 6) We train a Weka classifier for each cluster with the clusters training data 7) We 52 * recalculate Fastmap distances for a single instance with the old pivots and then try to find a 53 * cluster containing the coords of the instance. 7.1.) If we can not find a cluster (due to coords 54 * outside of all clusters) we find the nearest cluster. 8) We classify the Instance with the 55 * classifier and traindata from the Cluster we found in 7. 
43 56 */ 44 57 public class WekaLocalFQTraining extends WekaBaseTraining implements ITrainingStrategy { 45 46 private final TraindatasetCluster classifier = new TraindatasetCluster(); 47 48 @Override 49 public Classifier getClassifier() { 50 return classifier; 51 } 52 53 @Override 54 public void apply(Instances traindata) { 55 PrintStream errStr = System.err; 56 System.setErr(new PrintStream(new NullOutputStream())); 57 try { 58 classifier.buildClassifier(traindata); 59 } catch (Exception e) { 60 throw new RuntimeException(e); 61 } finally { 62 System.setErr(errStr); 63 } 64 } 65 66 67 public class TraindatasetCluster extends AbstractClassifier { 68 69 private static final long serialVersionUID = 1L; 70 71 /* classifier per cluster */ 72 private HashMap<Integer, Classifier> cclassifier; 73 74 /* instances per cluster */ 75 private HashMap<Integer, Instances> ctraindata; 76 77 /* holds the instances and indices of the pivot objects of the Fastmap calculation in buildClassifier*/ 78 private HashMap<Integer, Instance> cpivots; 79 80 /* holds the indices of the pivot objects for x,y and the dimension [x,y][dimension]*/ 81 private int[][] cpivotindices; 82 83 /* holds the sizes of the cluster multiple "boxes" per cluster */ 84 private HashMap<Integer, ArrayList<Double[][]>> csize; 85 86 /* debug vars */ 87 @SuppressWarnings("unused") 88 private boolean show_biggest = true; 89 90 @SuppressWarnings("unused") 91 private int CFOUND = 0; 92 @SuppressWarnings("unused") 93 private int CNOTFOUND = 0; 94 95 96 private Instance createInstance(Instances instances, Instance instance) { 97 // attributes for feeding instance to classifier 98 Set<String> attributeNames = new HashSet<>(); 99 for( int j=0; j<instances.numAttributes(); j++ ) { 100 attributeNames.add(instances.attribute(j).name()); 101 } 102 103 double[] values = new double[instances.numAttributes()]; 104 int index = 0; 105 for( int j=0; j<instance.numAttributes(); j++ ) { 106 if( attributeNames.contains(instance.attribute(j).name())) { 107 values[index] = instance.value(j); 108 index++; 109 } 110 } 111 112 Instances tmp = new Instances(instances); 113 tmp.clear(); 114 Instance instCopy = new DenseInstance(instance.weight(), values); 115 instCopy.setDataset(tmp); 116 117 return instCopy; 118 } 119 120 /** 121 * Because Fastmap saves only the image not the values of the attributes it used 122 * we can not use the old data directly to classify single instances to clusters. 123 * 124 * To classify a single instance we do a new fastmap computation with only the instance and 125 * the old pivot elements. 126 * 127 * After that we find the cluster with our fastmap result for x and y. 
128 */ 129 @Override 130 public double classifyInstance(Instance instance) { 131 132 double ret = 0; 133 try { 134 // classinstance gets passed to classifier 135 Instances traindata = ctraindata.get(0); 136 Instance classInstance = createInstance(traindata, instance); 137 138 // this one keeps the class attribute 139 Instances traindata2 = ctraindata.get(1); 140 141 // remove class attribute before clustering 142 Remove filter = new Remove(); 143 filter.setAttributeIndices("" + (traindata.classIndex() + 1)); 144 filter.setInputFormat(traindata); 145 traindata = Filter.useFilter(traindata, filter); 146 Instance clusterInstance = createInstance(traindata, instance); 147 148 Fastmap FMAP = new Fastmap(2); 149 EuclideanDistance dist = new EuclideanDistance(traindata); 150 151 // we set our pivot indices [x=0,y=1][dimension] 152 int[][] npivotindices = new int[2][2]; 153 npivotindices[0][0] = 1; 154 npivotindices[1][0] = 2; 155 npivotindices[0][1] = 3; 156 npivotindices[1][1] = 4; 157 158 // build temp dist matrix (2 pivots per dimension + 1 instance we want to classify) 159 // the instance we want to classify comes first after that the pivot elements in the order defined above 160 double[][] distmat = new double[2*FMAP.target_dims+1][2*FMAP.target_dims+1]; 161 distmat[0][0] = 0; 162 distmat[0][1] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[0][0])); 163 distmat[0][2] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[1][0])); 164 distmat[0][3] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[0][1])); 165 distmat[0][4] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[1][1])); 166 167 distmat[1][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), clusterInstance); 168 distmat[1][1] = 0; 169 distmat[1][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[1][0])); 170 distmat[1][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[0][1])); 171 distmat[1][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[1][1])); 172 173 distmat[2][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), clusterInstance); 174 distmat[2][1] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[0][0])); 175 distmat[2][2] = 0; 176 distmat[2][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[0][1])); 177 distmat[2][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[1][1])); 178 179 distmat[3][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), clusterInstance); 180 distmat[3][1] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[0][0])); 181 distmat[3][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[1][0])); 182 distmat[3][3] = 0; 183 distmat[3][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[1][1])); 184 185 distmat[4][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), clusterInstance); 186 distmat[4][1] = 
dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[0][0])); 187 distmat[4][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[1][0])); 188 distmat[4][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[0][1])); 189 distmat[4][4] = 0; 190 191 192 /* debug output: show biggest distance found within the new distance matrix 193 double biggest = 0; 194 for(int i=0; i < distmat.length; i++) { 195 for(int j=0; j < distmat[0].length; j++) { 196 if(biggest < distmat[i][j]) { 197 biggest = distmat[i][j]; 198 } 199 } 200 } 201 if(this.show_biggest) { 202 Console.traceln(Level.INFO, String.format(""+clusterInstance)); 203 Console.traceln(Level.INFO, String.format("biggest distances: "+ biggest)); 204 this.show_biggest = false; 205 } 206 */ 207 208 FMAP.setDistmat(distmat); 209 FMAP.setPivots(npivotindices); 210 FMAP.calculate(); 211 double[][] x = FMAP.getX(); 212 double[] proj = x[0]; 213 214 // debug output: show the calculated distance matrix, our result vektor for the instance and the complete result matrix 215 /* 216 Console.traceln(Level.INFO, "distmat:"); 217 for(int i=0; i<distmat.length; i++){ 218 for(int j=0; j<distmat[0].length; j++){ 219 Console.trace(Level.INFO, String.format("%20s", distmat[i][j])); 220 } 221 Console.traceln(Level.INFO, ""); 222 } 223 224 Console.traceln(Level.INFO, "vector:"); 225 for(int i=0; i < proj.length; i++) { 226 Console.trace(Level.INFO, String.format("%20s", proj[i])); 227 } 228 Console.traceln(Level.INFO, ""); 229 230 Console.traceln(Level.INFO, "resultmat:"); 231 for(int i=0; i<x.length; i++){ 232 for(int j=0; j<x[0].length; j++){ 233 Console.trace(Level.INFO, String.format("%20s", x[i][j])); 234 } 235 Console.traceln(Level.INFO, ""); 236 } 237 */ 238 239 // now we iterate over all clusters (well, boxes of sizes per cluster really) and save the number of the 240 // cluster in which we are 241 int cnumber; 242 int found_cnumber = -1; 243 Iterator<Integer> clusternumber = this.csize.keySet().iterator(); 244 while ( clusternumber.hasNext() && found_cnumber == -1) { 245 cnumber = clusternumber.next(); 246 247 // now iterate over the boxes of the cluster and hope we find one (cluster could have been removed) 248 // or we are too far away from any cluster because of the fastmap calculation with the initial pivot objects 249 for ( int box=0; box < this.csize.get(cnumber).size(); box++ ) { 250 Double[][] current = this.csize.get(cnumber).get(box); 251 252 if(proj[0] >= current[0][0] && proj[0] <= current[0][1] && // x 253 proj[1] >= current[1][0] && proj[1] <= current[1][1]) { // y 254 found_cnumber = cnumber; 255 } 256 } 257 } 258 259 // we want to count how often we are really inside a cluster 260 //if ( found_cnumber == -1 ) { 261 // CNOTFOUND += 1; 262 //}else { 263 // CFOUND += 1; 264 //} 265 266 // now it can happen that we do not find a cluster because we deleted it previously (too few instances) 267 // or we get bigger distance measures from weka so that we are completely outside of our clusters. 268 // in these cases we just find the nearest cluster to our instance and use it for classification. 
269 // to do that we use the EuclideanDistance again to compare our distance to all other Instances 270 // then we take the cluster of the closest weka instance 271 dist = new EuclideanDistance(traindata2); 272 if( !this.ctraindata.containsKey(found_cnumber) ) { 273 double min_distance = Double.MAX_VALUE; 274 clusternumber = ctraindata.keySet().iterator(); 275 while ( clusternumber.hasNext() ) { 276 cnumber = clusternumber.next(); 277 for(int i=0; i < ctraindata.get(cnumber).size(); i++) { 278 if(dist.distance(instance, ctraindata.get(cnumber).get(i)) <= min_distance) { 279 found_cnumber = cnumber; 280 min_distance = dist.distance(instance, ctraindata.get(cnumber).get(i)); 281 } 282 } 283 } 284 } 285 286 // here we have the cluster where an instance has the minimum distance between itself and the 287 // instance we want to classify 288 // if we still have not found a cluster we exit because something is really wrong 289 if( found_cnumber == -1 ) { 290 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster with full search!")); 291 throw new RuntimeException("cluster not found with full search"); 292 } 293 294 // classify the passed instance with the cluster we found and its training data 295 ret = cclassifier.get(found_cnumber).classifyInstance(classInstance); 296 297 }catch( Exception e ) { 298 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!")); 299 throw new RuntimeException(e); 300 } 301 return ret; 302 } 303 304 @Override 305 public void buildClassifier(Instances traindata) throws Exception { 306 307 //Console.traceln(Level.INFO, String.format("found: "+ CFOUND + ", notfound: " + CNOTFOUND)); 308 this.show_biggest = true; 309 310 cclassifier = new HashMap<Integer, Classifier>(); 311 ctraindata = new HashMap<Integer, Instances>(); 312 cpivots = new HashMap<Integer, Instance>(); 313 cpivotindices = new int[2][2]; 314 315 // 1. copy traindata 316 Instances train = new Instances(traindata); 317 Instances train2 = new Instances(traindata); // this one keeps the class attribute 318 319 // 2. remove class attribute for clustering 320 Remove filter = new Remove(); 321 filter.setAttributeIndices("" + (train.classIndex() + 1)); 322 filter.setInputFormat(train); 323 train = Filter.useFilter(train, filter); 324 325 // 3. calculate distance matrix (needed for Fastmap because it starts at dimension 1) 326 double biggest = 0; 327 EuclideanDistance dist = new EuclideanDistance(train); 328 double[][] distmat = new double[train.size()][train.size()]; 329 for( int i=0; i < train.size(); i++ ) { 330 for( int j=0; j < train.size(); j++ ) { 331 distmat[i][j] = dist.distance(train.get(i), train.get(j)); 332 if( distmat[i][j] > biggest ) { 333 biggest = distmat[i][j]; 334 } 335 } 336 } 337 //Console.traceln(Level.INFO, String.format("biggest distances: "+ biggest)); 338 339 // 4. 
run fastmap for 2 dimensions on the distance matrix 340 Fastmap FMAP = new Fastmap(2); 341 FMAP.setDistmat(distmat); 342 FMAP.calculate(); 343 344 cpivotindices = FMAP.getPivots(); 345 346 double[][] X = FMAP.getX(); 347 distmat = new double[0][0]; 348 System.gc(); 349 350 // quadtree payload generation 351 ArrayList<QuadTreePayload<Instance>> qtp = new ArrayList<QuadTreePayload<Instance>>(); 352 353 // we need these for the sizes of the quadrants 354 double[] big = {0,0}; 355 double[] small = {Double.MAX_VALUE,Double.MAX_VALUE}; 356 357 // set quadtree payload values and get max and min x and y values for size 358 for( int i=0; i<X.length; i++ ){ 359 if(X[i][0] >= big[0]) { 360 big[0] = X[i][0]; 361 } 362 if(X[i][1] >= big[1]) { 363 big[1] = X[i][1]; 364 } 365 if(X[i][0] <= small[0]) { 366 small[0] = X[i][0]; 367 } 368 if(X[i][1] <= small[1]) { 369 small[1] = X[i][1]; 370 } 371 QuadTreePayload<Instance> tmp = new QuadTreePayload<Instance>(X[i][0], X[i][1], train2.get(i)); 372 qtp.add(tmp); 373 } 374 375 //Console.traceln(Level.INFO, String.format("size for cluster ("+small[0]+","+small[1]+") - ("+big[0]+","+big[1]+")")); 376 377 // 5. generate quadtree 378 QuadTree TREE = new QuadTree(null, qtp); 379 QuadTree.size = train.size(); 380 QuadTree.alpha = Math.sqrt(train.size()); 381 QuadTree.ccluster = new ArrayList<ArrayList<QuadTreePayload<Instance>>>(); 382 QuadTree.csize = new HashMap<Integer, ArrayList<Double[][]>>(); 383 384 //Console.traceln(Level.INFO, String.format("Generate QuadTree with "+ QuadTree.size + " size, Alpha: "+ QuadTree.alpha+ "")); 385 386 // set the size and then split the tree recursively at the median value for x, y 387 TREE.setSize(new double[] {small[0], big[0]}, new double[] {small[1], big[1]}); 388 389 // recursive split und grid clustering eher static 390 TREE.recursiveSplit(TREE); 391 392 // generate list of nodes sorted by density (childs only) 393 ArrayList<QuadTree> l = new ArrayList<QuadTree>(TREE.getList(TREE)); 394 395 // recursive grid clustering (tree pruning), the values are stored in ccluster 396 TREE.gridClustering(l); 397 398 // wir iterieren durch die cluster und sammeln uns die instanzen daraus 399 //ctraindata.clear(); 400 for( int i=0; i < QuadTree.ccluster.size(); i++ ) { 401 ArrayList<QuadTreePayload<Instance>> current = QuadTree.ccluster.get(i); 402 403 // i is the clusternumber 404 // we only allow clusters with Instances > ALPHA, other clusters are not considered! 
405 //if(current.size() > QuadTree.alpha) { 406 if( current.size() > 4 ) { 407 for( int j=0; j < current.size(); j++ ) { 408 if( !ctraindata.containsKey(i) ) { 409 ctraindata.put(i, new Instances(train2)); 410 ctraindata.get(i).delete(); 411 } 412 ctraindata.get(i).add(current.get(j).getInst()); 413 } 414 }else{ 415 Console.traceln(Level.INFO, String.format("drop cluster, only: " + current.size() + " instances")); 416 } 417 } 418 419 // here we keep things we need later on 420 // QuadTree sizes for later use (matching new instances) 421 this.csize = new HashMap<Integer, ArrayList<Double[][]>>(QuadTree.csize); 422 423 // pivot elements 424 //this.cpivots.clear(); 425 for( int i=0; i < FMAP.PA[0].length; i++ ) { 426 this.cpivots.put(FMAP.PA[0][i], (Instance)train.get(FMAP.PA[0][i]).copy()); 427 } 428 for( int j=0; j < FMAP.PA[0].length; j++ ) { 429 this.cpivots.put(FMAP.PA[1][j], (Instance)train.get(FMAP.PA[1][j]).copy()); 430 } 431 432 433 /* debug output 434 int pnumber; 435 Iterator<Integer> pivotnumber = cpivots.keySet().iterator(); 436 while ( pivotnumber.hasNext() ) { 437 pnumber = pivotnumber.next(); 438 Console.traceln(Level.INFO, String.format("pivot: "+pnumber+ " inst: "+cpivots.get(pnumber))); 439 } 440 */ 441 442 // train one classifier per cluster, we get the cluster number from the traindata 443 int cnumber; 444 Iterator<Integer> clusternumber = ctraindata.keySet().iterator(); 445 //cclassifier.clear(); 446 447 //int traindata_count = 0; 448 while ( clusternumber.hasNext() ) { 449 cnumber = clusternumber.next(); 450 cclassifier.put(cnumber,setupClassifier()); // this is the classifier used for the cluster 451 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber)); 452 //Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber)); 453 //traindata_count += ctraindata.get(cnumber).size(); 454 //Console.traceln(Level.INFO, String.format("building classifier in cluster "+cnumber +" with "+ ctraindata.get(cnumber).size() +" traindata instances")); 455 } 456 457 // add all traindata 458 //Console.traceln(Level.INFO, String.format("traindata in all clusters: " + traindata_count)); 459 } 460 } 461 462 463 /** 464 * Payload for the QuadTree. 465 * x and y are the calculated Fastmap values. 466 * T is a weka instance. 467 */ 468 public class QuadTreePayload<T> { 469 470 public double x; 471 public double y; 472 private T inst; 473 474 public QuadTreePayload(double x, double y, T value) { 475 this.x = x; 476 this.y = y; 477 this.inst = value; 478 } 479 480 public T getInst() { 481 return this.inst; 482 } 483 } 484 485 486 /** 487 * Fastmap implementation 488 * 489 * Faloutsos, C., & Lin, K. I. (1995). 490 * FastMap: A fast algorithm for indexing, data-mining and visualization of traditional and multimedia datasets 491 * (Vol. 24, No. 2, pp. 163-174). ACM. 
492 */ 493 public class Fastmap { 494 495 /*N x k Array, at the end, the i-th row will be the image of the i-th object*/ 496 private double[][] X; 497 498 /*2 x k pivot Array one pair per recursive call*/ 499 private int[][] PA; 500 501 /*Objects we got (distance matrix)*/ 502 private double[][] O; 503 504 /*column of X currently updated (also the dimension)*/ 505 private int col = 0; 506 507 /*number of dimensions we want*/ 508 private int target_dims = 0; 509 510 // if we already have the pivot elements 511 private boolean pivot_set = false; 512 513 514 public Fastmap(int k) { 515 this.target_dims = k; 516 } 517 518 /** 519 * Sets the distance matrix 520 * and params that depend on this 521 * @param O 522 */ 523 public void setDistmat(double[][] O) { 524 this.O = O; 525 int N = O.length; 526 this.X = new double[N][this.target_dims]; 527 this.PA = new int[2][this.target_dims]; 528 } 529 530 /** 531 * Set pivot elements, we need that to classify instances 532 * after the calculation is complete (because we then want to reuse 533 * only the pivot elements). 534 * 535 * @param pi 536 */ 537 public void setPivots(int[][] pi) { 538 this.pivot_set = true; 539 this.PA = pi; 540 } 541 542 /** 543 * Return the pivot elements that were chosen during the calculation 544 * 545 * @return 546 */ 547 public int[][] getPivots() { 548 return this.PA; 549 } 550 551 /** 552 * The distance function for euclidean distance 553 * 554 * Acts according to equation 4 of the fastmap paper 555 * 556 * @param x x index of x image (if k==0 x object) 557 * @param y y index of y image (if k==0 y object) 558 * @param kdimensionality 559 * @return distance 560 */ 561 private double dist(int x, int y, int k) { 562 563 // basis is object distance, we get this from our distance matrix 564 double tmp = this.O[x][y] * this.O[x][y]; 565 566 // decrease by projections 567 for( int i=0; i < k; i++ ) { 568 double tmp2 = (this.X[x][i] - this.X[y][i]); 569 tmp -= tmp2 * tmp2; 570 } 571 572 return Math.abs(tmp); 573 } 574 575 /** 576 * Find the object farthest from the given index 577 * This method is a helper Method for findDistandObjects 578 * 579 * @param index of the object 580 * @return index of the farthest object from the given index 581 */ 582 private int findFarthest(int index) { 583 double furthest = Double.MIN_VALUE; 584 int ret = 0; 585 586 for( int i=0; i < O.length; i++ ) { 587 double dist = this.dist(i, index, this.col); 588 if( i != index && dist > furthest ) { 589 furthest = dist; 590 ret = i; 591 } 592 } 593 return ret; 594 } 595 596 /** 597 * Finds the pivot objects 598 * 599 * This method is basically algorithm 1 of the fastmap paper. 600 * 601 * @return 2 indexes of the choosen pivot objects 602 */ 603 private int[] findDistantObjects() { 604 // 1. choose object randomly 605 Random r = new Random(); 606 int obj = r.nextInt(this.O.length); 607 608 // 2. find farthest object from randomly chosen object 609 int idx1 = this.findFarthest(obj); 610 611 // 3. find farthest object from previously farthest object 612 int idx2 = this.findFarthest(idx1); 613 614 return new int[] {idx1, idx2}; 615 } 616 617 /** 618 * Calculates the new k-vector values (projections) 619 * 620 * This is basically algorithm 2 of the fastmap paper. 621 * We just added the possibility to pre-set the pivot elements because 622 * we need to classify single instances after the computation is already done. 
623 * 624 * @param dims dimensionality 625 */ 626 public void calculate() { 627 628 for( int k=0; k < this.target_dims; k++ ) { 629 // 2) choose pivot objects 630 if ( !this.pivot_set ) { 631 int[] pivots = this.findDistantObjects(); 632 633 // 3) record ids of pivot objects 634 this.PA[0][this.col] = pivots[0]; 635 this.PA[1][this.col] = pivots[1]; 636 } 637 638 // 4) inter object distances are zero (this.X is initialized with 0 so we just continue) 639 if( this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col) == 0 ) { 640 continue; 641 } 642 643 // 5) project the objects on the line between the pivots 644 double dxy = this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col); 645 for( int i=0; i < this.O.length; i++ ) { 646 647 double dix = this.dist(i, this.PA[0][this.col], this.col); 648 double diy = this.dist(i, this.PA[1][this.col], this.col); 649 650 double tmp = (dix + dxy - diy) / (2 * Math.sqrt(dxy)); 651 652 // save the projection 653 this.X[i][this.col] = tmp; 654 } 655 656 this.col += 1; 657 } 658 } 659 660 /** 661 * returns the result matrix of the projections 662 * 663 * @return calculated result 664 */ 665 public double[][] getX() { 666 return this.X; 667 } 668 } 58 59 private final TraindatasetCluster classifier = new TraindatasetCluster(); 60 61 @Override 62 public Classifier getClassifier() { 63 return classifier; 64 } 65 66 @Override 67 public void apply(Instances traindata) { 68 PrintStream errStr = System.err; 69 System.setErr(new PrintStream(new NullOutputStream())); 70 try { 71 classifier.buildClassifier(traindata); 72 } 73 catch (Exception e) { 74 throw new RuntimeException(e); 75 } 76 finally { 77 System.setErr(errStr); 78 } 79 } 80 81 public class TraindatasetCluster extends AbstractClassifier { 82 83 private static final long serialVersionUID = 1L; 84 85 /* classifier per cluster */ 86 private HashMap<Integer, Classifier> cclassifier; 87 88 /* instances per cluster */ 89 private HashMap<Integer, Instances> ctraindata; 90 91 /* 92 * holds the instances and indices of the pivot objects of the Fastmap calculation in 93 * buildClassifier 94 */ 95 private HashMap<Integer, Instance> cpivots; 96 97 /* holds the indices of the pivot objects for x,y and the dimension [x,y][dimension] */ 98 private int[][] cpivotindices; 99 100 /* holds the sizes of the cluster multiple "boxes" per cluster */ 101 private HashMap<Integer, ArrayList<Double[][]>> csize; 102 103 /* debug vars */ 104 @SuppressWarnings("unused") 105 private boolean show_biggest = true; 106 107 @SuppressWarnings("unused") 108 private int CFOUND = 0; 109 @SuppressWarnings("unused") 110 private int CNOTFOUND = 0; 111 112 private Instance createInstance(Instances instances, Instance instance) { 113 // attributes for feeding instance to classifier 114 Set<String> attributeNames = new HashSet<>(); 115 for (int j = 0; j < instances.numAttributes(); j++) { 116 attributeNames.add(instances.attribute(j).name()); 117 } 118 119 double[] values = new double[instances.numAttributes()]; 120 int index = 0; 121 for (int j = 0; j < instance.numAttributes(); j++) { 122 if (attributeNames.contains(instance.attribute(j).name())) { 123 values[index] = instance.value(j); 124 index++; 125 } 126 } 127 128 Instances tmp = new Instances(instances); 129 tmp.clear(); 130 Instance instCopy = new DenseInstance(instance.weight(), values); 131 instCopy.setDataset(tmp); 132 133 return instCopy; 134 } 135 136 /** 137 * Because Fastmap saves only the image not the values of the attributes it used we can not 138 * use the old data 
directly to classify single instances to clusters. 139 * 140 * To classify a single instance we do a new fastmap computation with only the instance and 141 * the old pivot elements. 142 * 143 * After that we find the cluster with our fastmap result for x and y. 144 */ 145 @Override 146 public double classifyInstance(Instance instance) { 147 148 double ret = 0; 149 try { 150 // classinstance gets passed to classifier 151 Instances traindata = ctraindata.get(0); 152 Instance classInstance = createInstance(traindata, instance); 153 154 // this one keeps the class attribute 155 Instances traindata2 = ctraindata.get(1); 156 157 // remove class attribute before clustering 158 Remove filter = new Remove(); 159 filter.setAttributeIndices("" + (traindata.classIndex() + 1)); 160 filter.setInputFormat(traindata); 161 traindata = Filter.useFilter(traindata, filter); 162 Instance clusterInstance = createInstance(traindata, instance); 163 164 Fastmap FMAP = new Fastmap(2); 165 EuclideanDistance dist = new EuclideanDistance(traindata); 166 167 // we set our pivot indices [x=0,y=1][dimension] 168 int[][] npivotindices = new int[2][2]; 169 npivotindices[0][0] = 1; 170 npivotindices[1][0] = 2; 171 npivotindices[0][1] = 3; 172 npivotindices[1][1] = 4; 173 174 // build temp dist matrix (2 pivots per dimension + 1 instance we want to classify) 175 // the instance we want to classify comes first after that the pivot elements in the 176 // order defined above 177 double[][] distmat = new double[2 * FMAP.target_dims + 1][2 * FMAP.target_dims + 1]; 178 distmat[0][0] = 0; 179 distmat[0][1] = 180 dist.distance(clusterInstance, 181 this.cpivots.get((Integer) this.cpivotindices[0][0])); 182 distmat[0][2] = 183 dist.distance(clusterInstance, 184 this.cpivots.get((Integer) this.cpivotindices[1][0])); 185 distmat[0][3] = 186 dist.distance(clusterInstance, 187 this.cpivots.get((Integer) this.cpivotindices[0][1])); 188 distmat[0][4] = 189 dist.distance(clusterInstance, 190 this.cpivots.get((Integer) this.cpivotindices[1][1])); 191 192 distmat[1][0] = 193 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 194 clusterInstance); 195 distmat[1][1] = 0; 196 distmat[1][2] = 197 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 198 this.cpivots.get((Integer) this.cpivotindices[1][0])); 199 distmat[1][3] = 200 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 201 this.cpivots.get((Integer) this.cpivotindices[0][1])); 202 distmat[1][4] = 203 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), 204 this.cpivots.get((Integer) this.cpivotindices[1][1])); 205 206 distmat[2][0] = 207 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 208 clusterInstance); 209 distmat[2][1] = 210 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 211 this.cpivots.get((Integer) this.cpivotindices[0][0])); 212 distmat[2][2] = 0; 213 distmat[2][3] = 214 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 215 this.cpivots.get((Integer) this.cpivotindices[0][1])); 216 distmat[2][4] = 217 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), 218 this.cpivots.get((Integer) this.cpivotindices[1][1])); 219 220 distmat[3][0] = 221 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 222 clusterInstance); 223 distmat[3][1] = 224 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 225 this.cpivots.get((Integer) this.cpivotindices[0][0])); 226 distmat[3][2] = 227 dist.distance(this.cpivots.get((Integer) 
this.cpivotindices[0][1]), 228 this.cpivots.get((Integer) this.cpivotindices[1][0])); 229 distmat[3][3] = 0; 230 distmat[3][4] = 231 dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), 232 this.cpivots.get((Integer) this.cpivotindices[1][1])); 233 234 distmat[4][0] = 235 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 236 clusterInstance); 237 distmat[4][1] = 238 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 239 this.cpivots.get((Integer) this.cpivotindices[0][0])); 240 distmat[4][2] = 241 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 242 this.cpivots.get((Integer) this.cpivotindices[1][0])); 243 distmat[4][3] = 244 dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), 245 this.cpivots.get((Integer) this.cpivotindices[0][1])); 246 distmat[4][4] = 0; 247 248 /* 249 * debug output: show biggest distance found within the new distance matrix double 250 * biggest = 0; for(int i=0; i < distmat.length; i++) { for(int j=0; j < 251 * distmat[0].length; j++) { if(biggest < distmat[i][j]) { biggest = distmat[i][j]; 252 * } } } if(this.show_biggest) { Console.traceln(Level.INFO, 253 * String.format(""+clusterInstance)); Console.traceln(Level.INFO, 254 * String.format("biggest distances: "+ biggest)); this.show_biggest = false; } 255 */ 256 257 FMAP.setDistmat(distmat); 258 FMAP.setPivots(npivotindices); 259 FMAP.calculate(); 260 double[][] x = FMAP.getX(); 261 double[] proj = x[0]; 262 263 // debug output: show the calculated distance matrix, our result vektor for the 264 // instance and the complete result matrix 265 /* 266 * Console.traceln(Level.INFO, "distmat:"); for(int i=0; i<distmat.length; i++){ 267 * for(int j=0; j<distmat[0].length; j++){ Console.trace(Level.INFO, 268 * String.format("%20s", distmat[i][j])); } Console.traceln(Level.INFO, ""); } 269 * 270 * Console.traceln(Level.INFO, "vector:"); for(int i=0; i < proj.length; i++) { 271 * Console.trace(Level.INFO, String.format("%20s", proj[i])); } 272 * Console.traceln(Level.INFO, ""); 273 * 274 * Console.traceln(Level.INFO, "resultmat:"); for(int i=0; i<x.length; i++){ for(int 275 * j=0; j<x[0].length; j++){ Console.trace(Level.INFO, String.format("%20s", 276 * x[i][j])); } Console.traceln(Level.INFO, ""); } 277 */ 278 279 // now we iterate over all clusters (well, boxes of sizes per cluster really) and 280 // save the number of the 281 // cluster in which we are 282 int cnumber; 283 int found_cnumber = -1; 284 Iterator<Integer> clusternumber = this.csize.keySet().iterator(); 285 while (clusternumber.hasNext() && found_cnumber == -1) { 286 cnumber = clusternumber.next(); 287 288 // now iterate over the boxes of the cluster and hope we find one (cluster could 289 // have been removed) 290 // or we are too far away from any cluster because of the fastmap calculation 291 // with the initial pivot objects 292 for (int box = 0; box < this.csize.get(cnumber).size(); box++) { 293 Double[][] current = this.csize.get(cnumber).get(box); 294 295 if (proj[0] >= current[0][0] && proj[0] <= current[0][1] && // x 296 proj[1] >= current[1][0] && proj[1] <= current[1][1]) 297 { // y 298 found_cnumber = cnumber; 299 } 300 } 301 } 302 303 // we want to count how often we are really inside a cluster 304 // if ( found_cnumber == -1 ) { 305 // CNOTFOUND += 1; 306 // }else { 307 // CFOUND += 1; 308 // } 309 310 // now it can happen that we do not find a cluster because we deleted it previously 311 // (too few instances) 312 // or we get bigger distance measures from weka 
so that we are completely outside of 313 // our clusters. 314 // in these cases we just find the nearest cluster to our instance and use it for 315 // classification. 316 // to do that we use the EuclideanDistance again to compare our distance to all 317 // other Instances 318 // then we take the cluster of the closest weka instance 319 dist = new EuclideanDistance(traindata2); 320 if (!this.ctraindata.containsKey(found_cnumber)) { 321 double min_distance = Double.MAX_VALUE; 322 clusternumber = ctraindata.keySet().iterator(); 323 while (clusternumber.hasNext()) { 324 cnumber = clusternumber.next(); 325 for (int i = 0; i < ctraindata.get(cnumber).size(); i++) { 326 if (dist.distance(instance, ctraindata.get(cnumber).get(i)) <= min_distance) 327 { 328 found_cnumber = cnumber; 329 min_distance = 330 dist.distance(instance, ctraindata.get(cnumber).get(i)); 331 } 332 } 333 } 334 } 335 336 // here we have the cluster where an instance has the minimum distance between 337 // itself and the 338 // instance we want to classify 339 // if we still have not found a cluster we exit because something is really wrong 340 if (found_cnumber == -1) { 341 Console.traceln(Level.INFO, String 342 .format("ERROR matching instance to cluster with full search!")); 343 throw new RuntimeException("cluster not found with full search"); 344 } 345 346 // classify the passed instance with the cluster we found and its training data 347 ret = cclassifier.get(found_cnumber).classifyInstance(classInstance); 348 349 } 350 catch (Exception e) { 351 Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!")); 352 throw new RuntimeException(e); 353 } 354 return ret; 355 } 356 357 @Override 358 public void buildClassifier(Instances traindata) throws Exception { 359 360 // Console.traceln(Level.INFO, String.format("found: "+ CFOUND + ", notfound: " + 361 // CNOTFOUND)); 362 this.show_biggest = true; 363 364 cclassifier = new HashMap<Integer, Classifier>(); 365 ctraindata = new HashMap<Integer, Instances>(); 366 cpivots = new HashMap<Integer, Instance>(); 367 cpivotindices = new int[2][2]; 368 369 // 1. copy traindata 370 Instances train = new Instances(traindata); 371 Instances train2 = new Instances(traindata); // this one keeps the class attribute 372 373 // 2. remove class attribute for clustering 374 Remove filter = new Remove(); 375 filter.setAttributeIndices("" + (train.classIndex() + 1)); 376 filter.setInputFormat(train); 377 train = Filter.useFilter(train, filter); 378 379 // 3. calculate distance matrix (needed for Fastmap because it starts at dimension 1) 380 double biggest = 0; 381 EuclideanDistance dist = new EuclideanDistance(train); 382 double[][] distmat = new double[train.size()][train.size()]; 383 for (int i = 0; i < train.size(); i++) { 384 for (int j = 0; j < train.size(); j++) { 385 distmat[i][j] = dist.distance(train.get(i), train.get(j)); 386 if (distmat[i][j] > biggest) { 387 biggest = distmat[i][j]; 388 } 389 } 390 } 391 // Console.traceln(Level.INFO, String.format("biggest distances: "+ biggest)); 392 393 // 4. 
run fastmap for 2 dimensions on the distance matrix 394 Fastmap FMAP = new Fastmap(2); 395 FMAP.setDistmat(distmat); 396 FMAP.calculate(); 397 398 cpivotindices = FMAP.getPivots(); 399 400 double[][] X = FMAP.getX(); 401 distmat = new double[0][0]; 402 System.gc(); 403 404 // quadtree payload generation 405 ArrayList<QuadTreePayload<Instance>> qtp = new ArrayList<QuadTreePayload<Instance>>(); 406 407 // we need these for the sizes of the quadrants 408 double[] big = 409 { 0, 0 }; 410 double[] small = 411 { Double.MAX_VALUE, Double.MAX_VALUE }; 412 413 // set quadtree payload values and get max and min x and y values for size 414 for (int i = 0; i < X.length; i++) { 415 if (X[i][0] >= big[0]) { 416 big[0] = X[i][0]; 417 } 418 if (X[i][1] >= big[1]) { 419 big[1] = X[i][1]; 420 } 421 if (X[i][0] <= small[0]) { 422 small[0] = X[i][0]; 423 } 424 if (X[i][1] <= small[1]) { 425 small[1] = X[i][1]; 426 } 427 QuadTreePayload<Instance> tmp = 428 new QuadTreePayload<Instance>(X[i][0], X[i][1], train2.get(i)); 429 qtp.add(tmp); 430 } 431 432 // Console.traceln(Level.INFO, 433 // String.format("size for cluster ("+small[0]+","+small[1]+") - ("+big[0]+","+big[1]+")")); 434 435 // 5. generate quadtree 436 QuadTree TREE = new QuadTree(null, qtp); 437 QuadTree.size = train.size(); 438 QuadTree.alpha = Math.sqrt(train.size()); 439 QuadTree.ccluster = new ArrayList<ArrayList<QuadTreePayload<Instance>>>(); 440 QuadTree.csize = new HashMap<Integer, ArrayList<Double[][]>>(); 441 442 // Console.traceln(Level.INFO, String.format("Generate QuadTree with "+ QuadTree.size + 443 // " size, Alpha: "+ QuadTree.alpha+ "")); 444 445 // set the size and then split the tree recursively at the median value for x, y 446 TREE.setSize(new double[] 447 { small[0], big[0] }, new double[] 448 { small[1], big[1] }); 449 450 // recursive split und grid clustering eher static 451 TREE.recursiveSplit(TREE); 452 453 // generate list of nodes sorted by density (childs only) 454 ArrayList<QuadTree> l = new ArrayList<QuadTree>(TREE.getList(TREE)); 455 456 // recursive grid clustering (tree pruning), the values are stored in ccluster 457 TREE.gridClustering(l); 458 459 // wir iterieren durch die cluster und sammeln uns die instanzen daraus 460 // ctraindata.clear(); 461 for (int i = 0; i < QuadTree.ccluster.size(); i++) { 462 ArrayList<QuadTreePayload<Instance>> current = QuadTree.ccluster.get(i); 463 464 // i is the clusternumber 465 // we only allow clusters with Instances > ALPHA, other clusters are not considered! 
466 // if(current.size() > QuadTree.alpha) { 467 if (current.size() > 4) { 468 for (int j = 0; j < current.size(); j++) { 469 if (!ctraindata.containsKey(i)) { 470 ctraindata.put(i, new Instances(train2)); 471 ctraindata.get(i).delete(); 472 } 473 ctraindata.get(i).add(current.get(j).getInst()); 474 } 475 } 476 else { 477 Console.traceln(Level.INFO, 478 String.format("drop cluster, only: " + current.size() + 479 " instances")); 480 } 481 } 482 483 // here we keep things we need later on 484 // QuadTree sizes for later use (matching new instances) 485 this.csize = new HashMap<Integer, ArrayList<Double[][]>>(QuadTree.csize); 486 487 // pivot elements 488 // this.cpivots.clear(); 489 for (int i = 0; i < FMAP.PA[0].length; i++) { 490 this.cpivots.put(FMAP.PA[0][i], (Instance) train.get(FMAP.PA[0][i]).copy()); 491 } 492 for (int j = 0; j < FMAP.PA[0].length; j++) { 493 this.cpivots.put(FMAP.PA[1][j], (Instance) train.get(FMAP.PA[1][j]).copy()); 494 } 495 496 /* 497 * debug output int pnumber; Iterator<Integer> pivotnumber = 498 * cpivots.keySet().iterator(); while ( pivotnumber.hasNext() ) { pnumber = 499 * pivotnumber.next(); Console.traceln(Level.INFO, String.format("pivot: "+pnumber+ 500 * " inst: "+cpivots.get(pnumber))); } 501 */ 502 503 // train one classifier per cluster, we get the cluster number from the traindata 504 int cnumber; 505 Iterator<Integer> clusternumber = ctraindata.keySet().iterator(); 506 // cclassifier.clear(); 507 508 // int traindata_count = 0; 509 while (clusternumber.hasNext()) { 510 cnumber = clusternumber.next(); 511 cclassifier.put(cnumber, setupClassifier()); // this is the classifier used for the 512 // cluster 513 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber)); 514 // Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber)); 515 // traindata_count += ctraindata.get(cnumber).size(); 516 // Console.traceln(Level.INFO, 517 // String.format("building classifier in cluster "+cnumber +" with "+ 518 // ctraindata.get(cnumber).size() +" traindata instances")); 519 } 520 521 // add all traindata 522 // Console.traceln(Level.INFO, String.format("traindata in all clusters: " + 523 // traindata_count)); 524 } 525 } 526 527 /** 528 * Payload for the QuadTree. x and y are the calculated Fastmap values. T is a weka instance. 529 */ 530 public class QuadTreePayload<T> { 531 532 public double x; 533 public double y; 534 private T inst; 535 536 public QuadTreePayload(double x, double y, T value) { 537 this.x = x; 538 this.y = y; 539 this.inst = value; 540 } 541 542 public T getInst() { 543 return this.inst; 544 } 545 } 546 547 /** 548 * Fastmap implementation 549 * 550 * Faloutsos, C., & Lin, K. I. (1995). FastMap: A fast algorithm for indexing, data-mining and 551 * visualization of traditional and multimedia datasets (Vol. 24, No. 2, pp. 163-174). ACM. 
552 */ 553 public class Fastmap { 554 555 /* N x k Array, at the end, the i-th row will be the image of the i-th object */ 556 private double[][] X; 557 558 /* 2 x k pivot Array one pair per recursive call */ 559 private int[][] PA; 560 561 /* Objects we got (distance matrix) */ 562 private double[][] O; 563 564 /* column of X currently updated (also the dimension) */ 565 private int col = 0; 566 567 /* number of dimensions we want */ 568 private int target_dims = 0; 569 570 // if we already have the pivot elements 571 private boolean pivot_set = false; 572 573 public Fastmap(int k) { 574 this.target_dims = k; 575 } 576 577 /** 578 * Sets the distance matrix and params that depend on this 579 * 580 * @param O 581 */ 582 public void setDistmat(double[][] O) { 583 this.O = O; 584 int N = O.length; 585 this.X = new double[N][this.target_dims]; 586 this.PA = new int[2][this.target_dims]; 587 } 588 589 /** 590 * Set pivot elements, we need that to classify instances after the calculation is complete 591 * (because we then want to reuse only the pivot elements). 592 * 593 * @param pi 594 */ 595 public void setPivots(int[][] pi) { 596 this.pivot_set = true; 597 this.PA = pi; 598 } 599 600 /** 601 * Return the pivot elements that were chosen during the calculation 602 * 603 * @return 604 */ 605 public int[][] getPivots() { 606 return this.PA; 607 } 608 609 /** 610 * The distance function for euclidean distance 611 * 612 * Acts according to equation 4 of the fastmap paper 613 * 614 * @param x 615 * x index of x image (if k==0 x object) 616 * @param y 617 * y index of y image (if k==0 y object) 618 * @param kdimensionality 619 * @return distance 620 */ 621 private double dist(int x, int y, int k) { 622 623 // basis is object distance, we get this from our distance matrix 624 double tmp = this.O[x][y] * this.O[x][y]; 625 626 // decrease by projections 627 for (int i = 0; i < k; i++) { 628 double tmp2 = (this.X[x][i] - this.X[y][i]); 629 tmp -= tmp2 * tmp2; 630 } 631 632 return Math.abs(tmp); 633 } 634 635 /** 636 * Find the object farthest from the given index This method is a helper Method for 637 * findDistandObjects 638 * 639 * @param index 640 * of the object 641 * @return index of the farthest object from the given index 642 */ 643 private int findFarthest(int index) { 644 double furthest = Double.MIN_VALUE; 645 int ret = 0; 646 647 for (int i = 0; i < O.length; i++) { 648 double dist = this.dist(i, index, this.col); 649 if (i != index && dist > furthest) { 650 furthest = dist; 651 ret = i; 652 } 653 } 654 return ret; 655 } 656 657 /** 658 * Finds the pivot objects 659 * 660 * This method is basically algorithm 1 of the fastmap paper. 661 * 662 * @return 2 indexes of the choosen pivot objects 663 */ 664 private int[] findDistantObjects() { 665 // 1. choose object randomly 666 Random r = new Random(); 667 int obj = r.nextInt(this.O.length); 668 669 // 2. find farthest object from randomly chosen object 670 int idx1 = this.findFarthest(obj); 671 672 // 3. find farthest object from previously farthest object 673 int idx2 = this.findFarthest(idx1); 674 675 return new int[] 676 { idx1, idx2 }; 677 } 678 679 /** 680 * Calculates the new k-vector values (projections) 681 * 682 * This is basically algorithm 2 of the fastmap paper. We just added the possibility to 683 * pre-set the pivot elements because we need to classify single instances after the 684 * computation is already done. 
685 * 686 * @param dims 687 * dimensionality 688 */ 689 public void calculate() { 690 691 for (int k = 0; k < this.target_dims; k++) { 692 // 2) choose pivot objects 693 if (!this.pivot_set) { 694 int[] pivots = this.findDistantObjects(); 695 696 // 3) record ids of pivot objects 697 this.PA[0][this.col] = pivots[0]; 698 this.PA[1][this.col] = pivots[1]; 699 } 700 701 // 4) inter object distances are zero (this.X is initialized with 0 so we just 702 // continue) 703 if (this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col) == 0) { 704 continue; 705 } 706 707 // 5) project the objects on the line between the pivots 708 double dxy = this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col); 709 for (int i = 0; i < this.O.length; i++) { 710 711 double dix = this.dist(i, this.PA[0][this.col], this.col); 712 double diy = this.dist(i, this.PA[1][this.col], this.col); 713 714 double tmp = (dix + dxy - diy) / (2 * Math.sqrt(dxy)); 715 716 // save the projection 717 this.X[i][this.col] = tmp; 718 } 719 720 this.col += 1; 721 } 722 } 723 724 /** 725 * returns the result matrix of the projections 726 * 727 * @return calculated result 728 */ 729 public double[][] getX() { 730 return this.X; 731 } 732 } 669 733 } -
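For orientation, the Fastmap class reformatted above keeps a small API: a constructor taking the target dimensionality, setDistmat() for the pairwise distance matrix, calculate() to choose pivot pairs and project all objects, and getX()/getPivots() to read the results. The residual distance follows equation 4 of the cited FastMap paper and calculate() implements its algorithm 2, so each projected coordinate is (d(a,i)² + d(a,b)² − d(b,i)²) / (2·d(a,b)), where a and b are the pivots of the current dimension and d is the distance in the subspace orthogonal to the earlier projections. A minimal usage sketch follows; the distance values are invented, and since Fastmap is declared as an inner class of the trainer the fragment only illustrates the API.

// Minimal usage sketch of the Fastmap class shown above; the distances are
// invented and the fragment assumes access to the (inner) Fastmap class.
double[][] distmat = {
    { 0.0, 1.0, 2.0 },
    { 1.0, 0.0, 1.5 },
    { 2.0, 1.5, 0.0 }
};
Fastmap fm = new Fastmap(2);      // target dimensionality k = 2
fm.setDistmat(distmat);           // allocates X (N x k) and PA (2 x k)
fm.calculate();                   // chooses pivot pairs and projects every object
double[][] images = fm.getX();    // images[i] is the 2-dimensional image of object i
int[][] pivots = fm.getPivots();  // pivot indices, reusable later via setPivots()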
trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaTraining.java
r25 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.training; 2 16 … … 11 25 /** 12 26 * Programmatic WekaTraining 13 *14 * first parameter is Trainer Name.15 * second parameter is class name16 27 * 17 * all subsequent parameters are configuration params (for example for trees) 18 * Cross Validation params always come last and are prepended with -CVPARAM 28 * first parameter is Trainer Name. second parameter is class name 29 * 30 * all subsequent parameters are configuration params (for example for trees) Cross Validation 31 * params always come last and are prepended with -CVPARAM 19 32 * 20 33 * XML Configurations for Weka Classifiers: 34 * 21 35 * <pre> 22 36 * {@code … … 30 44 public class WekaTraining extends WekaBaseTraining implements ITrainingStrategy { 31 45 32 @Override 33 public void apply(Instances traindata) { 34 PrintStream errStr = System.err; 35 System.setErr(new PrintStream(new NullOutputStream())); 36 try { 37 if(classifier == null) { 38 Console.traceln(Level.WARNING, String.format("classifier null!")); 39 } 40 classifier.buildClassifier(traindata); 41 } catch (Exception e) { 42 throw new RuntimeException(e); 43 } finally { 44 System.setErr(errStr); 45 } 46 } 46 @Override 47 public void apply(Instances traindata) { 48 PrintStream errStr = System.err; 49 System.setErr(new PrintStream(new NullOutputStream())); 50 try { 51 if (classifier == null) { 52 Console.traceln(Level.WARNING, String.format("classifier null!")); 53 } 54 classifier.buildClassifier(traindata); 55 } 56 catch (Exception e) { 57 throw new RuntimeException(e); 58 } 59 finally { 60 System.setErr(errStr); 61 } 62 } 47 63 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/AbstractVersionFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 6 20 /** 7 21 * Implements a skeletal {@link IVersionFilter}. 22 * 8 23 * @author Steffen Herbold 9 24 */ 10 25 public abstract class AbstractVersionFilter implements IVersionFilter { 11 26 12 /**13 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(java.util.List)14 */15 @Override16 public int apply(List<SoftwareVersion> versions) {17 int removed = 0;18 for( final Iterator<SoftwareVersion> iter=versions.iterator() ; iter.hasNext() ;) {19 SoftwareVersion version = iter.next();20 21 if( apply(version)) {22 iter.remove();23 removed++;24 }25 }26 return removed;27 }27 /** 28 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(java.util.List) 29 */ 30 @Override 31 public int apply(List<SoftwareVersion> versions) { 32 int removed = 0; 33 for (final Iterator<SoftwareVersion> iter = versions.iterator(); iter.hasNext();) { 34 SoftwareVersion version = iter.next(); 35 36 if (apply(version)) { 37 iter.remove(); 38 removed++; 39 } 40 } 41 return removed; 42 } 28 43 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/IVersionFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 6 20 7 21 /** 8 * Implements the interface for a {@link SoftwareVersion} filter. 22 * Implements the interface for a {@link SoftwareVersion} filter. 23 * 9 24 * @author Steffen Herbold 10 25 */ 11 26 public interface IVersionFilter extends IParameterizable { 12 27 13 /** 14 * Applies the filter to a single version. 15 * @param version the version 16 * @return true if filter applies to version, false otherwise 17 */ 18 boolean apply(SoftwareVersion version); 19 20 /** 21 * Applies the filter a a list of versions. Versions were the filter applies are automatically removed from the list. 22 * @param versions list of versions 23 * @return number of removed versions 24 */ 25 int apply(List<SoftwareVersion> versions); 28 /** 29 * Applies the filter to a single version. 30 * 31 * @param version 32 * the version 33 * @return true if filter applies to version, false otherwise 34 */ 35 boolean apply(SoftwareVersion version); 36 37 /** 38 * Applies the filter a a list of versions. Versions were the filter applies are automatically 39 * removed from the list. 40 * 41 * @param versions 42 * list of versions 43 * @return number of removed versions 44 */ 45 int apply(List<SoftwareVersion> versions); 26 46 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/MaxInstanceNumberFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 3 17 /** 4 * Applies to large data sets. All data sets that have more than the required maximum number of instances are removed. 18 * Applies to large data sets. All data sets that have more than the required maximum number of 19 * instances are removed. 20 * 5 21 * @author Steffen Herbold 6 22 */ 7 23 public class MaxInstanceNumberFilter extends AbstractVersionFilter { 8 24 9 /** 10 * maximum number of instances required 11 */ 12 private int maxInstances = 0; 13 14 /** 15 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 16 */ 17 @Override 18 public boolean apply(SoftwareVersion version) { 19 return version.getInstances().numInstances()>maxInstances; 20 } 25 /** 26 * maximum number of instances required 27 */ 28 private int maxInstances = 0; 21 29 22 /** 23 * Sets the minimal number of instances. 24 * @param parameters number of instances 25 */ 26 @Override 27 public void setParameter(String parameters) { 28 maxInstances = Integer.parseInt(parameters); 29 } 30 /** 31 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 32 */ 33 @Override 34 public boolean apply(SoftwareVersion version) { 35 return version.getInstances().numInstances() > maxInstances; 36 } 37 38 /** 39 * Sets the minimal number of instances. 40 * 41 * @param parameters 42 * number of instances 43 */ 44 @Override 45 public void setParameter(String parameters) { 46 maxInstances = Integer.parseInt(parameters); 47 } 30 48 31 49 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/MinClassNumberFilter.java
r26 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 4 18 5 19 /** 6 * Applies to small data sets. All data sets that do not have the required minimal number of instances in each class (i.e., positive, negative) are removed. 20 * Applies to small data sets. All data sets that do not have the required minimal number of 21 * instances in each class (i.e., positive, negative) are removed. 22 * 7 23 * @author Steffen Herbold 8 24 */ 9 25 public class MinClassNumberFilter extends AbstractVersionFilter { 10 26 11 /** 12 * minimal number of instances required 13 */ 14 private int minInstances = 0; 15 16 /** 17 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 18 */ 19 @Override 20 public boolean apply(SoftwareVersion version) { 21 Instances instances = version.getInstances(); 22 int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 23 boolean toSmall = false; 24 for( int count : counts ) { 25 toSmall |= count<minInstances; 26 } 27 return toSmall; 28 } 27 /** 28 * minimal number of instances required 29 */ 30 private int minInstances = 0; 29 31 30 /** 31 * Sets the minimal number of instances for each class. 32 * @param parameters number of instances 33 */ 34 @Override 35 public void setParameter(String parameters) { 36 minInstances = Integer.parseInt(parameters); 37 } 32 /** 33 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 34 */ 35 @Override 36 public boolean apply(SoftwareVersion version) { 37 Instances instances = version.getInstances(); 38 int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 39 boolean toSmall = false; 40 for (int count : counts) { 41 toSmall |= count < minInstances; 42 } 43 return toSmall; 44 } 45 46 /** 47 * Sets the minimal number of instances for each class. 48 * 49 * @param parameters 50 * number of instances 51 */ 52 @Override 53 public void setParameter(String parameters) { 54 minInstances = Integer.parseInt(parameters); 55 } 38 56 39 57 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/MinInstanceNumberFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 3 17 /** 4 * Applies to small data sets. All data sets that do not have the required minimal number of instances are removed. 18 * Applies to small data sets. All data sets that do not have the required minimal number of 19 * instances are removed. 20 * 5 21 * @author Steffen Herbold 6 22 */ 7 23 public class MinInstanceNumberFilter extends AbstractVersionFilter { 8 24 9 /** 10 * minimal number of instances required 11 */ 12 private int minInstances = 0; 13 14 /** 15 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 16 */ 17 @Override 18 public boolean apply(SoftwareVersion version) { 19 return version.getInstances().numInstances()<minInstances; 20 } 25 /** 26 * minimal number of instances required 27 */ 28 private int minInstances = 0; 21 29 22 /** 23 * Sets the minimal number of instances. 24 * @param parameters number of instances 25 */ 26 @Override 27 public void setParameter(String parameters) { 28 minInstances = Integer.parseInt(parameters); 29 } 30 /** 31 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 32 */ 33 @Override 34 public boolean apply(SoftwareVersion version) { 35 return version.getInstances().numInstances() < minInstances; 36 } 37 38 /** 39 * Sets the minimal number of instances. 40 * 41 * @param parameters 42 * number of instances 43 */ 44 @Override 45 public void setParameter(String parameters) { 46 minInstances = Integer.parseInt(parameters); 47 } 30 48 31 49 } -
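The concrete version filters in this changeset all inherit the list-based apply() from AbstractVersionFilter, which walks the list with an iterator, removes every version the filter matches, and returns the number of removed versions. A hedged sketch of how such a filter is driven; the SoftwareVersion objects and their Instances (antData, ivyData) are placeholders and the CrossPare classes from this changeset are assumed to be on the classpath.

// Sketch only: needs java.util.List/ArrayList and weka.core.Instances;
// antData and ivyData stand for arbitrary Instances objects.
List<SoftwareVersion> versions = new ArrayList<>();
versions.add(new SoftwareVersion("ant", "1.5", antData));
versions.add(new SoftwareVersion("ivy", "2.0", ivyData));

MinInstanceNumberFilter filter = new MinInstanceNumberFilter();
filter.setParameter("100");            // versions need at least 100 instances
int removed = filter.apply(versions);  // smaller versions are removed from the list in place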
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/SoftwareVersion.java
r27 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 4 18 5 19 /** 6 * Data class for software versions. 20 * Data class for software versions. 21 * 7 22 * @author Steffen Herbold 8 23 */ 9 24 public class SoftwareVersion implements Comparable<SoftwareVersion> { 10 25 11 /** 12 * name of the project 13 */ 14 private final String project; 15 16 /** 17 * version of the project 18 */ 19 private final String version; 26 /** 27 * name of the project 28 */ 29 private final String project; 20 30 21 /** 22 * data of the version 23 */ 24 private final Instances instances; 25 26 /** 27 * Constructor. Creates a new version. 28 * @param project name of the project 29 * @param version name of the version 30 * @param instances data of the version 31 */ 32 public SoftwareVersion(String project, String version, Instances instances) { 33 this.project = project; 34 this.version = version; 35 this.instances = instances; 36 } 37 38 /** 39 * returns the project name 40 * @return project name 41 */ 42 public String getProject() { 43 return project; 44 } 45 46 /** 47 * returns the name of the version 48 * @return name of the version 49 */ 50 public String getVersion() { 51 return version; 52 } 53 54 /** 55 * returns the data of the version 56 * @return data 57 */ 58 public Instances getInstances() { 59 return new Instances(instances); 60 } 31 /** 32 * version of the project 33 */ 34 private final String version; 61 35 62 /** 63 * Compares first based on project name and then based on version. Only string comparisons are performed. 64 * @see java.lang.Comparable#compareTo(java.lang.Object) 65 */ 66 @Override 67 public int compareTo(SoftwareVersion o) { 68 int projectStrCmp = 0; 69 if( project!=null ) { 70 projectStrCmp = project.compareTo(o.project); 71 } 72 if( projectStrCmp==0 && version!=null ) { 73 return version.compareTo(o.version); 74 } else { 75 return projectStrCmp; 76 } 77 } 36 /** 37 * data of the version 38 */ 39 private final Instances instances; 40 41 /** 42 * Constructor. Creates a new version. 43 * 44 * @param project 45 * name of the project 46 * @param version 47 * name of the version 48 * @param instances 49 * data of the version 50 */ 51 public SoftwareVersion(String project, String version, Instances instances) { 52 this.project = project; 53 this.version = version; 54 this.instances = instances; 55 } 56 57 /** 58 * returns the project name 59 * 60 * @return project name 61 */ 62 public String getProject() { 63 return project; 64 } 65 66 /** 67 * returns the name of the version 68 * 69 * @return name of the version 70 */ 71 public String getVersion() { 72 return version; 73 } 74 75 /** 76 * returns the data of the version 77 * 78 * @return data 79 */ 80 public Instances getInstances() { 81 return new Instances(instances); 82 } 83 84 /** 85 * Compares first based on project name and then based on version. Only string comparisons are 86 * performed. 
87 * 88 * @see java.lang.Comparable#compareTo(java.lang.Object) 89 */ 90 @Override 91 public int compareTo(SoftwareVersion o) { 92 int projectStrCmp = 0; 93 if (project != null) { 94 projectStrCmp = project.compareTo(o.project); 95 } 96 if (projectStrCmp == 0 && version != null) { 97 return version.compareTo(o.version); 98 } 99 else { 100 return projectStrCmp; 101 } 102 } 78 103 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/versions/UnbalancedFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.versions; 2 16 … … 4 18 5 19 /** 6 * Removes unbalanced data sets in terms of classification. All data sets that are outside of the quantil defined 7 * by setParameter (default=0.1) are removed. 20 * Removes unbalanced data sets in terms of classification. All data sets that are outside of the 21 * quantil defined by setParameter (default=0.1) are removed. 22 * 8 23 * @author Steffen Herbold 9 24 */ 10 25 public class UnbalancedFilter extends AbstractVersionFilter { 11 26 12 /** 13 * quantil where outside lying versions are removed 14 */ 15 private double quantil = 0.1; 16 17 /** 18 * Sets the quantil. 19 * @param parameters the quantil as string 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 quantil = Double.parseDouble(parameters); 24 } 27 /** 28 * quantil where outside lying versions are removed 29 */ 30 private double quantil = 0.1; 25 31 26 /** 27 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 28 */ 29 @Override 30 public boolean apply(SoftwareVersion version) { 31 final Instances instances = version.getInstances(); 32 33 final int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 34 return ((double) counts[0])/instances.numInstances() >= (1-quantil) || 35 ((double) counts[0])/instances.numInstances() <= (quantil); 36 } 32 /** 33 * Sets the quantil. 34 * 35 * @param parameters 36 * the quantil as string 37 */ 38 @Override 39 public void setParameter(String parameters) { 40 quantil = Double.parseDouble(parameters); 41 } 42 43 /** 44 * @see de.ugoe.cs.cpdp.versions.IVersionFilter#apply(de.ugoe.cs.cpdp.versions.SoftwareVersion) 45 */ 46 @Override 47 public boolean apply(SoftwareVersion version) { 48 final Instances instances = version.getInstances(); 49 50 final int[] counts = instances.attributeStats(instances.classIndex()).nominalCounts; 51 return ((double) counts[0]) / instances.numInstances() >= (1 - quantil) || 52 ((double) counts[0]) / instances.numInstances() <= (quantil); 53 } 37 54 38 55 } -
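For illustration (invented numbers): with the default quantil of 0.1, a version whose first class covers 470 of 500 instances (94 %) is removed because 0.94 >= 0.9, and so is one where that class covers only 20 of 500 (4 %), because 0.04 <= 0.1; a 300-of-500 split (60 %) passes the filter.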
trunk/CrossPare/src/de/ugoe/cs/cpdp/wekaclassifier/FixClass.java
r30 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.wekaclassifier; 2 3 16 4 17 import weka.classifiers.AbstractClassifier; … … 16 29 public class FixClass extends AbstractClassifier { 17 30 18 private static final long serialVersionUID = 1L;31 private static final long serialVersionUID = 1L; 19 32 20 private double fixedClassValue = 0.0d;33 private double fixedClassValue = 0.0d; 21 34 22 public FixClass() {23 // TODO Auto-generated constructor stub24 }35 public FixClass() { 36 // TODO Auto-generated constructor stub 37 } 25 38 26 /**27 * Returns default capabilities of the classifier.28 *29 * @return the capabilities of this classifier30 */31 @Override32 public Capabilities getCapabilities() {33 Capabilities result = super.getCapabilities();34 result.disableAll();39 /** 40 * Returns default capabilities of the classifier. 41 * 42 * @return the capabilities of this classifier 43 */ 44 @Override 45 public Capabilities getCapabilities() { 46 Capabilities result = super.getCapabilities(); 47 result.disableAll(); 35 48 36 // attributes37 result.enable(Capability.NOMINAL_ATTRIBUTES);38 result.enable(Capability.NUMERIC_ATTRIBUTES);39 result.enable(Capability.DATE_ATTRIBUTES);40 result.enable(Capability.STRING_ATTRIBUTES);41 result.enable(Capability.RELATIONAL_ATTRIBUTES);42 result.enable(Capability.MISSING_VALUES);49 // attributes 50 result.enable(Capability.NOMINAL_ATTRIBUTES); 51 result.enable(Capability.NUMERIC_ATTRIBUTES); 52 result.enable(Capability.DATE_ATTRIBUTES); 53 result.enable(Capability.STRING_ATTRIBUTES); 54 result.enable(Capability.RELATIONAL_ATTRIBUTES); 55 result.enable(Capability.MISSING_VALUES); 43 56 44 // class45 result.enable(Capability.NOMINAL_CLASS);46 result.enable(Capability.NUMERIC_CLASS);47 result.enable(Capability.MISSING_CLASS_VALUES);57 // class 58 result.enable(Capability.NOMINAL_CLASS); 59 result.enable(Capability.NUMERIC_CLASS); 60 result.enable(Capability.MISSING_CLASS_VALUES); 48 61 49 // instances50 result.setMinimumNumberInstances(0);62 // instances 63 result.setMinimumNumberInstances(0); 51 64 52 return result;53 }65 return result; 66 } 54 67 55 @Override56 public void setOptions(String[] options) throws Exception {57 fixedClassValue = Double.parseDouble(Utils.getOption('C', options));58 }68 @Override 69 public void setOptions(String[] options) throws Exception { 70 fixedClassValue = Double.parseDouble(Utils.getOption('C', options)); 71 } 59 72 60 @Override61 public double classifyInstance(Instance instance) {62 return fixedClassValue;63 }73 @Override 74 public double classifyInstance(Instance instance) { 75 return fixedClassValue; 76 } 64 77 65 @Override66 public void buildClassifier(Instances traindata) throws Exception {67 // do nothing68 }78 @Override 79 public void buildClassifier(Instances traindata) throws Exception { 80 // do nothing 81 } 69 82 }
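FixClass ignores the training data and always predicts the class value passed via the -C option, so it can serve as a constant baseline. A hedged sketch follows; traindata and instance are placeholders, and setOptions() declares throws Exception, so the fragment belongs in a method that propagates or handles it.

// Sketch: FixClass as a constant-prediction baseline.
FixClass baseline = new FixClass();
baseline.setOptions(new String[] { "-C", "1.0" });        // always predict class value 1.0
baseline.buildClassifier(traindata);                      // intentionally a no-op
double prediction = baseline.classifyInstance(instance);  // returns 1.0 for any instance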