source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Oversampling.java @ 40

Last change on this file since 40 was 38, checked in by sherbold, 9 years ago
  • added Oversampling and Resampling processors
  • fixed bug in ZScoreNormalizations
  • added new load for the Audi data set that is based on changes
  • minor changes to remove warnings
  • Property svn:mime-type set to text/plain
File size: 2.7 KB
Line 
1package de.ugoe.cs.cpdp.dataprocessing;
2
3import org.apache.commons.collections4.list.SetUniqueList;
4
5import weka.core.Instances;
6import weka.filters.Filter;
7import weka.filters.supervised.instance.Resample;
8
9/**
10 * Implements oversampling, a strategy for
11 * handling bias in data. In case there are less positive samples (i.e.
12 * defect-prone) samples in the data than negative samples (i.e.
13 * non-defect-prone), the defect-prone entities are over-sampled such that the
14 * number of defect-prone and non-defect-prone instances is the same afterwards.
15 * This means, that some of the defect-prone entities will be more than once
16 * within the data.
17 *
18 * @author Steffen Herbold
19 */
20public class Oversampling implements IProcessesingStrategy,
21                ISetWiseProcessingStrategy {
22
23        /**
24         * Does not have parameters. String is ignored.
25         *
26         * @param parameters
27         *            ignored
28         */
29        @Override
30        public void setParameter(String parameters) {
31                // dummy
32        }
33
34        /*
35         * (non-Javadoc)
36         *
37         * @see
38         * de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.
39         * core.Instances, org.apache.commons.collections4.list.SetUniqueList)
40         */
41        @Override
42        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
43                for (Instances traindata : traindataSet) {
44                        apply(testdata, traindata);
45                }
46        }
47
48        /*
49         * (non-Javadoc)
50         *
51         * @see
52         * de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.
53         * Instances, weka.core.Instances)
54         */
55        @Override
56        public void apply(Instances testdata, Instances traindata) {
57
58                final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
59                if (counts[1] < counts[0]) {
60                        Instances negatives = new Instances(traindata);
61                        Instances positives = new Instances(traindata);
62
63                        for (int i = traindata.size() - 1; i >= 0; i--) {
64                                if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
65                                        negatives.remove(i);
66                                }
67                                if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
68                                        positives.remove(i);
69                                }
70                        }
71
72                        Resample resample = new Resample();
73                        // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01);
74                        // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative
75                        // weniger zurückgegeben
76                        resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
77                        try {
78                                resample.setInputFormat(traindata);
79                                positives = Filter.useFilter(positives, resample);
80                        } catch (Exception e) {
81                                throw new RuntimeException(e);
82                        }
83                        traindata.clear();
84                        for (int i = 0; i < negatives.size(); i++) {
85                                traindata.add(negatives.get(i));
86                        }
87                        for (int i = 0; i < positives.size(); i++) {
88                                traindata.add(positives.get(i));
89                        }
90                }
91        }
92
93}
Note: See TracBrowser for help on using the repository browser.