source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SeparatabilitySelection.java @ 75

Last change on this file since 75 was 41, checked in by sherbold, 9 years ago
  • formatted code and added copyrights
  • Property svn:mime-type set to text/plain
File size: 4.4 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import java.util.Arrays;
18import java.util.Random;
19
20import org.apache.commons.collections4.list.SetUniqueList;
21
22import weka.classifiers.Evaluation;
23import weka.classifiers.functions.Logistic;
24import weka.core.DenseInstance;
25import weka.core.Instance;
26import weka.core.Instances;
27
28/**
29 * A setwise data selection strategy based on the separatability of the training data from the test
30 * data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An
31 * Empirical Study on Defect Prediction. <br>
32 * <br>
33 * This is calculated through the error of a logistic regression classifier that tries to separate
34 * the sets.
35 *
36 * @author Steffen Herbold
37 */
38public class SeparatabilitySelection implements ISetWiseDataselectionStrategy {
39
40    /**
41     * size of the random sample that is drawn from both test data and training data
42     */
43    private final int sampleSize = 500;
44
45    /**
46     * number of repetitions of the sample drawing
47     */
48    private final int maxRep = 10;
49
50    /**
51     * number of neighbors that are selected
52     */
53    private int neighbors = 10;
54
55    /**
56     * Sets the number of neighbors that are selected.
57     */
58    @Override
59    public void setParameter(String parameters) {
60        if (!"".equals(parameters)) {
61            neighbors = Integer.parseInt(parameters);
62        }
63    }
64
65    /**
66     * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
67     *      org.apache.commons.collections4.list.SetUniqueList)
68     */
69    @Override
70    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
71        final Random rand = new Random(1);
72
73        // calculate distances between testdata and traindata
74        final double[] distances = new double[traindataSet.size()];
75
76        int i = 0;
77        for (Instances traindata : traindataSet) {
78            double distance = 0.0;
79            for (int rep = 0; rep < maxRep; rep++) {
80                // sample instances
81                Instances sample = new Instances(testdata);
82                for (int j = 0; j < sampleSize; j++) {
83                    Instance inst =
84                        new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances())));
85                    inst.setDataset(sample);
86                    inst.setClassValue(1.0);
87                    sample.add(inst);
88                    inst =
89                        new DenseInstance(
90                                          traindata.instance(rand.nextInt(traindata.numInstances())));
91                    inst.setDataset(sample);
92                    inst.setClassValue(0.0);
93                    sample.add(inst);
94                }
95
96                // calculate separation
97                Evaluation eval;
98                try {
99                    eval = new Evaluation(sample);
100                    eval.crossValidateModel(new Logistic(), sample, 5, rand);
101                }
102                catch (Exception e) {
103                    throw new RuntimeException(
104                                               "cross-validation during calculation of separatability failed",
105                                               e);
106                }
107                distance += eval.pctCorrect() / 100.0;
108            }
109            distances[i++] = 2 * ((distance / maxRep) - 0.5);
110        }
111
112        // select closest neighbors
113        final double[] distancesCopy = Arrays.copyOf(distances, distances.length);
114        Arrays.sort(distancesCopy);
115        final double cutoffDistance = distancesCopy[neighbors];
116
117        for (i = traindataSet.size() - 1; i >= 0; i--) {
118            if (distances[i] > cutoffDistance) {
119                traindataSet.remove(i);
120            }
121        }
122    }
123}
Note: See TracBrowser for help on using the repository browser.