- Timestamp:
- 09/24/15 10:59:05 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Oversampling.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Implements oversampling, a strategy for 11 * handling bias in data. In case there are less positive samples (i.e. 12 * defect-prone) samples in the data than negative samples (i.e. 13 * non-defect-prone), the defect-prone entities are over-sampled such that the 14 * number of defect-prone and non-defect-prone instances is the same afterwards. 15 * This means, that some of the defect-prone entities will be more than once 16 * within the data. 24 * Implements oversampling, a strategy for handling bias in data. In case there are less positive 25 * samples (i.e. defect-prone) samples in the data than negative samples (i.e. non-defect-prone), 26 * the defect-prone entities are over-sampled such that the number of defect-prone and 27 * non-defect-prone instances is the same afterwards. This means, that some of the defect-prone 28 * entities will be more than once within the data. 17 29 * 18 30 * @author Steffen Herbold 19 31 */ 20 public class Oversampling implements IProcessesingStrategy, 21 ISetWiseProcessingStrategy { 32 public class Oversampling implements IProcessesingStrategy, ISetWiseProcessingStrategy { 22 33 23 24 25 26 27 28 29 30 31 32 34 /** 35 * Does not have parameters. String is ignored. 36 * 37 * @param parameters 38 * ignored 39 */ 40 @Override 41 public void setParameter(String parameters) { 42 // dummy 43 } 33 44 34 /* 35 * (non-Javadoc) 36 * 37 * @see 38 * de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka. 39 * core.Instances, org.apache.commons.collections4.list.SetUniqueList) 40 */ 41 @Override 42 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 43 for (Instances traindata : traindataSet) { 44 apply(testdata, traindata); 45 } 46 } 45 /* 46 * (non-Javadoc) 47 * 48 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka. core.Instances, 49 * org.apache.commons.collections4.list.SetUniqueList) 50 */ 51 @Override 52 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 53 for (Instances traindata : traindataSet) { 54 apply(testdata, traindata); 55 } 56 } 47 57 48 /* 49 * (non-Javadoc) 50 * 51 * @see 52 * de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. 53 * Instances, weka.core.Instances) 54 */ 55 @Override 56 public void apply(Instances testdata, Instances traindata) { 58 /* 59 * (non-Javadoc) 60 * 61 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. Instances, 62 * weka.core.Instances) 63 */ 64 @Override 65 public void apply(Instances testdata, Instances traindata) { 57 66 58 59 60 61 67 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 68 if (counts[1] < counts[0]) { 69 Instances negatives = new Instances(traindata); 70 Instances positives = new Instances(traindata); 62 71 63 64 65 66 67 68 69 70 72 for (int i = traindata.size() - 1; i >= 0; i--) { 73 if (Double.compare(1.0, negatives.get(i).classValue()) == 0) { 74 negatives.remove(i); 75 } 76 if (Double.compare(0.0, positives.get(i).classValue()) == 0) { 77 positives.remove(i); 78 } 79 } 71 80 72 Resample resample = new Resample(); 73 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 74 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative 75 // weniger zurückgegeben 76 resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]); 77 try { 78 resample.setInputFormat(traindata); 79 positives = Filter.useFilter(positives, resample); 80 } catch (Exception e) { 81 throw new RuntimeException(e); 82 } 83 traindata.clear(); 84 for (int i = 0; i < negatives.size(); i++) { 85 traindata.add(negatives.get(i)); 86 } 87 for (int i = 0; i < positives.size(); i++) { 88 traindata.add(positives.get(i)); 89 } 90 } 91 } 81 Resample resample = new Resample(); 82 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 83 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative 84 // weniger zurückgegeben 85 resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]); 86 try { 87 resample.setInputFormat(traindata); 88 positives = Filter.useFilter(positives, resample); 89 } 90 catch (Exception e) { 91 throw new RuntimeException(e); 92 } 93 traindata.clear(); 94 for (int i = 0; i < negatives.size(); i++) { 95 traindata.add(negatives.get(i)); 96 } 97 for (int i = 0; i < positives.size(); i++) { 98 traindata.add(positives.get(i)); 99 } 100 } 101 } 92 102 93 103 }
Note: See TracChangeset
for help on using the changeset viewer.