source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Undersampling.java @ 117

Last change on this file since 117 was 86, checked in by sherbold, 9 years ago
  • switched workspace encoding to UTF-8 and fixed broken characters
  • Property svn:mime-type set to text/plain
File size: 3.7 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataprocessing;
16
17import org.apache.commons.collections4.list.SetUniqueList;
18
19import weka.core.Instances;
20import weka.filters.Filter;
21import weka.filters.supervised.instance.Resample;
22
23/**
24 * Implements undersampling, a strategy for handling bias in data. In case there are less positive
25 * samples (i.e. defect-prone) samples in the data than negative samples (i.e. non-defect-prone),
26 * the non-defect-prone entities are sampled such thatthe number of defect-prone and
27 * non-defect-prone instances is the same afterwards.
28 *
29 * @author Steffen Herbold
30 */
31public class Undersampling implements IProcessesingStrategy, ISetWiseProcessingStrategy {
32
33    /**
34     * Does not have parameters. String is ignored.
35     *
36     * @param parameters
37     *            ignored
38     */
39    @Override
40    public void setParameter(String parameters) {
41        // dummy
42    }
43
44    /*
45     * (non-Javadoc)
46     *
47     * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances,
48     * org.apache.commons.collections4.list.SetUniqueList)
49     */
50    @Override
51    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
52        for (Instances traindata : traindataSet) {
53            apply(testdata, traindata);
54        }
55    }
56
57    /*
58     * (non-Javadoc)
59     *
60     * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances,
61     * weka.core.Instances)
62     */
63    @Override
64    public void apply(Instances testdata, Instances traindata) {
65
66        final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
67
68        if (counts[1] < counts[0]) {
69            Instances negatives = new Instances(traindata);
70            Instances positives = new Instances(traindata);
71
72            for (int i = traindata.size() - 1; i >= 0; i--) {
73                if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
74                    negatives.remove(i);
75                }
76                if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
77                    positives.remove(i);
78                }
79            }
80
81            Resample resample = new Resample();
82            // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01);
83            // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger
84            // zurückgegeben
85            resample.setSampleSizePercent((100.0 * counts[1]) / counts[0]);
86            try {
87                resample.setInputFormat(traindata);
88                negatives = Filter.useFilter(negatives, resample);
89            }
90            catch (Exception e) {
91                throw new RuntimeException(e);
92            }
93            traindata.clear();
94            for (int i = 0; i < negatives.size(); i++) {
95                traindata.add(negatives.get(i));
96            }
97            for (int i = 0; i < positives.size(); i++) {
98                traindata.add(positives.get(i));
99            }
100        }
101    }
102
103}
Note: See TracBrowser for help on using the repository browser.