source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Oversampling.java @ 117

Last change on this file since 117 was 86, checked in by sherbold, 9 years ago
  • switched workspace encoding to UTF-8 and fixed broken characters
  • Property svn:mime-type set to text/plain
File size: 3.8 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataprocessing;
16
17import org.apache.commons.collections4.list.SetUniqueList;
18
19import weka.core.Instances;
20import weka.filters.Filter;
21import weka.filters.supervised.instance.Resample;
22
23/**
24 * Implements oversampling, a strategy for handling bias in data. In case there are less positive
25 * samples (i.e. defect-prone) samples in the data than negative samples (i.e. non-defect-prone),
26 * the defect-prone entities are over-sampled such that the number of defect-prone and
27 * non-defect-prone instances is the same afterwards. This means, that some of the defect-prone
28 * entities will be more than once within the data.
29 *
30 * @author Steffen Herbold
31 */
32public class Oversampling implements IProcessesingStrategy, ISetWiseProcessingStrategy {
33
34    /**
35     * Does not have parameters. String is ignored.
36     *
37     * @param parameters
38     *            ignored
39     */
40    @Override
41    public void setParameter(String parameters) {
42        // dummy
43    }
44
45    /*
46     * (non-Javadoc)
47     *
48     * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka. core.Instances,
49     * org.apache.commons.collections4.list.SetUniqueList)
50     */
51    @Override
52    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
53        for (Instances traindata : traindataSet) {
54            apply(testdata, traindata);
55        }
56    }
57
58    /*
59     * (non-Javadoc)
60     *
61     * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. Instances,
62     * weka.core.Instances)
63     */
64    @Override
65    public void apply(Instances testdata, Instances traindata) {
66
67        final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
68        if (counts[1] < counts[0]) {
69            Instances negatives = new Instances(traindata);
70            Instances positives = new Instances(traindata);
71
72            for (int i = traindata.size() - 1; i >= 0; i--) {
73                if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
74                    negatives.remove(i);
75                }
76                if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
77                    positives.remove(i);
78                }
79            }
80
81            Resample resample = new Resample();
82            // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01);
83            // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative
84            // weniger zurückgegeben
85            resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
86            try {
87                resample.setInputFormat(traindata);
88                positives = Filter.useFilter(positives, resample);
89            }
90            catch (Exception e) {
91                throw new RuntimeException(e);
92            }
93            traindata.clear();
94            for (int i = 0; i < negatives.size(); i++) {
95                traindata.add(negatives.get(i));
96            }
97            for (int i = 0; i < positives.size(); i++) {
98                traindata.add(positives.get(i));
99            }
100        }
101    }
102
103}
Note: See TracBrowser for help on using the repository browser.