1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
---|
2 | //
|
---|
3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
---|
4 | // you may not use this file except in compliance with the License.
|
---|
5 | // You may obtain a copy of the License at
|
---|
6 | //
|
---|
7 | // http://www.apache.org/licenses/LICENSE-2.0
|
---|
8 | //
|
---|
9 | // Unless required by applicable law or agreed to in writing, software
|
---|
10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
---|
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
12 | // See the License for the specific language governing permissions and
|
---|
13 | // limitations under the License.
|
---|
14 |
|
---|
15 | package de.ugoe.cs.cpdp.dataselection;
|
---|
16 |
|
---|
17 | import java.util.Arrays;
|
---|
18 | import java.util.Random;
|
---|
19 |
|
---|
20 | import org.apache.commons.collections4.list.SetUniqueList;
|
---|
21 |
|
---|
22 | import weka.classifiers.Evaluation;
|
---|
23 | import weka.classifiers.functions.Logistic;
|
---|
24 | import weka.core.DenseInstance;
|
---|
25 | import weka.core.Instance;
|
---|
26 | import weka.core.Instances;
|
---|
27 |
|
---|
28 | /**
|
---|
29 | * A setwise data selection strategy based on the separatability of the training data from the test
|
---|
30 | * data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An
|
---|
31 | * Empirical Study on Defect Prediction. <br>
|
---|
32 | * <br>
|
---|
33 | * This is calculated through the error of a logistic regression classifier that tries to separate
|
---|
34 | * the sets.
|
---|
35 | *
|
---|
36 | * @author Steffen Herbold
|
---|
37 | */
|
---|
38 | public class SeparatabilitySelection implements ISetWiseDataselectionStrategy {
|
---|
39 |
|
---|
40 | /**
|
---|
41 | * size of the random sample that is drawn from both test data and training data
|
---|
42 | */
|
---|
43 | private final int sampleSize = 500;
|
---|
44 |
|
---|
45 | /**
|
---|
46 | * number of repetitions of the sample drawing
|
---|
47 | */
|
---|
48 | private final int maxRep = 10;
|
---|
49 |
|
---|
50 | /**
|
---|
51 | * number of neighbors that are selected
|
---|
52 | */
|
---|
53 | private int neighbors = 10;
|
---|
54 |
|
---|
55 | /**
|
---|
56 | * Sets the number of neighbors that are selected.
|
---|
57 | */
|
---|
58 | @Override
|
---|
59 | public void setParameter(String parameters) {
|
---|
60 | if (!"".equals(parameters)) {
|
---|
61 | neighbors = Integer.parseInt(parameters);
|
---|
62 | }
|
---|
63 | }
|
---|
64 |
|
---|
65 | /**
|
---|
66 | * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
|
---|
67 | * org.apache.commons.collections4.list.SetUniqueList)
|
---|
68 | */
|
---|
69 | @Override
|
---|
70 | public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
|
---|
71 | final Random rand = new Random(1);
|
---|
72 |
|
---|
73 | // calculate distances between testdata and traindata
|
---|
74 | final double[] distances = new double[traindataSet.size()];
|
---|
75 |
|
---|
76 | int i = 0;
|
---|
77 | for (Instances traindata : traindataSet) {
|
---|
78 | double distance = 0.0;
|
---|
79 | for (int rep = 0; rep < maxRep; rep++) {
|
---|
80 | // sample instances
|
---|
81 | Instances sample = new Instances(testdata);
|
---|
82 | for (int j = 0; j < sampleSize; j++) {
|
---|
83 | Instance inst =
|
---|
84 | new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances())));
|
---|
85 | inst.setDataset(sample);
|
---|
86 | inst.setClassValue(1.0);
|
---|
87 | sample.add(inst);
|
---|
88 | inst =
|
---|
89 | new DenseInstance(
|
---|
90 | traindata.instance(rand.nextInt(traindata.numInstances())));
|
---|
91 | inst.setDataset(sample);
|
---|
92 | inst.setClassValue(0.0);
|
---|
93 | sample.add(inst);
|
---|
94 | }
|
---|
95 |
|
---|
96 | // calculate separation
|
---|
97 | Evaluation eval;
|
---|
98 | try {
|
---|
99 | eval = new Evaluation(sample);
|
---|
100 | eval.crossValidateModel(new Logistic(), sample, 5, rand);
|
---|
101 | }
|
---|
102 | catch (Exception e) {
|
---|
103 | throw new RuntimeException(
|
---|
104 | "cross-validation during calculation of separatability failed",
|
---|
105 | e);
|
---|
106 | }
|
---|
107 | distance += eval.pctCorrect() / 100.0;
|
---|
108 | }
|
---|
109 | distances[i++] = 2 * ((distance / maxRep) - 0.5);
|
---|
110 | }
|
---|
111 |
|
---|
112 | // select closest neighbors
|
---|
113 | final double[] distancesCopy = Arrays.copyOf(distances, distances.length);
|
---|
114 | Arrays.sort(distancesCopy);
|
---|
115 | final double cutoffDistance = distancesCopy[neighbors];
|
---|
116 |
|
---|
117 | for (i = traindataSet.size() - 1; i >= 0; i--) {
|
---|
118 | if (distances[i] > cutoffDistance) {
|
---|
119 | traindataSet.remove(i);
|
---|
120 | }
|
---|
121 | }
|
---|
122 | }
|
---|
123 | }
|
---|