- Timestamp:
- 09/24/15 10:59:05 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Undersampling.java
r18 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Implements undersampling, a strategy for handling bias in data. In case there are less positive samples (i.e. defect-prone) samples in the 11 * data than negative samples (i.e. non-defect-prone), the non-defect-prone entities are sampled such thatthe number of defect-prone and non-defect-prone instances is the same afterwards. 24 * Implements undersampling, a strategy for handling bias in data. In case there are less positive 25 * samples (i.e. defect-prone) samples in the data than negative samples (i.e. non-defect-prone), 26 * the non-defect-prone entities are sampled such thatthe number of defect-prone and 27 * non-defect-prone instances is the same afterwards. 28 * 12 29 * @author Steffen Herbold 13 30 */ 14 public class Undersampling implements IProcessesingStrategy, 15 ISetWiseProcessingStrategy { 31 public class Undersampling implements IProcessesingStrategy, ISetWiseProcessingStrategy { 16 32 17 18 /** 19 * Does not have parameters. String is ignored. 20 * @param parameters ignored 21 */ 22 @Override 23 public void setParameter(String parameters) { 24 // dummy 25 } 33 /** 34 * Does not have parameters. String is ignored. 35 * 36 * @param parameters 37 * ignored 38 */ 39 @Override 40 public void setParameter(String parameters) { 41 // dummy 42 } 26 43 27 /* 28 * (non-Javadoc) 29 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 30 */ 31 @Override 32 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 33 for( Instances traindata : traindataSet ) { 34 apply(testdata, traindata); 35 } 36 } 44 /* 45 * (non-Javadoc) 46 * 47 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, 48 * org.apache.commons.collections4.list.SetUniqueList) 49 */ 50 @Override 51 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 52 for (Instances traindata : traindataSet) { 53 apply(testdata, traindata); 54 } 55 } 37 56 38 /* 39 * (non-Javadoc) 40 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 41 */ 42 @Override 43 public void apply(Instances testdata, Instances traindata) { 44 45 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 46 47 if( counts[1]<counts[0] ) { 48 Instances negatives = new Instances(traindata); 49 Instances positives = new Instances(traindata); 50 51 for( int i=traindata.size()-1 ; i>=0 ; i-- ) { 52 if( Double.compare(1.0, negatives.get(i).classValue())==0 ) { 53 negatives.remove(i); 54 } 55 if( Double.compare(0.0, positives.get(i).classValue())==0 ) { 56 positives.remove(i); 57 } 58 } 59 60 Resample resample = new Resample(); 61 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 62 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger zurückgegeben 63 resample.setSampleSizePercent((100.0* counts[1])/counts[0]); 64 try { 65 resample.setInputFormat(traindata); 66 negatives = Filter.useFilter(negatives, resample); 67 } catch (Exception e) { 68 throw new RuntimeException(e); 69 } 70 traindata.clear(); 71 for( int i=0 ; i<negatives.size() ; i++ ) { 72 traindata.add(negatives.get(i)); 73 } 74 for( int i=0 ; i<positives.size() ; i++ ) { 75 traindata.add(positives.get(i)); 76 } 77 } 78 } 57 /* 58 * (non-Javadoc) 59 * 60 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 61 * weka.core.Instances) 62 */ 63 @Override 64 public void apply(Instances testdata, Instances traindata) { 65 66 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 67 68 if (counts[1] < counts[0]) { 69 Instances negatives = new Instances(traindata); 70 Instances positives = new Instances(traindata); 71 72 for (int i = traindata.size() - 1; i >= 0; i--) { 73 if (Double.compare(1.0, negatives.get(i).classValue()) == 0) { 74 negatives.remove(i); 75 } 76 if (Double.compare(0.0, positives.get(i).classValue()) == 0) { 77 positives.remove(i); 78 } 79 } 80 81 Resample resample = new Resample(); 82 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 83 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger 84 // zurückgegeben 85 resample.setSampleSizePercent((100.0 * counts[1]) / counts[0]); 86 try { 87 resample.setInputFormat(traindata); 88 negatives = Filter.useFilter(negatives, resample); 89 } 90 catch (Exception e) { 91 throw new RuntimeException(e); 92 } 93 traindata.clear(); 94 for (int i = 0; i < negatives.size(); i++) { 95 traindata.add(negatives.get(i)); 96 } 97 for (int i = 0; i < positives.size(); i++) { 98 traindata.add(positives.get(i)); 99 } 100 } 101 } 79 102 80 103 }
Note: See TracChangeset
for help on using the changeset viewer.