// Copyright 2015 Georg-August-Universität Göttingen, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| 14 |
|
---|
[2] | 15 | package de.ugoe.cs.cpdp.dataselection;
|
---|
| 16 |
|
---|
| 17 | import java.util.Arrays;
|
---|
| 18 | import java.util.Random;
|
---|
| 19 |
|
---|
| 20 | import org.apache.commons.collections4.list.SetUniqueList;
|
---|
| 21 |
|
---|
| 22 | import weka.classifiers.Evaluation;
|
---|
| 23 | import weka.classifiers.functions.Logistic;
|
---|
| 24 | import weka.core.DenseInstance;
|
---|
| 25 | import weka.core.Instance;
|
---|
| 26 | import weka.core.Instances;
|
---|
| 27 |
|
---|
| 28 | /**
|
---|
[41] | 29 | * A setwise data selection strategy based on the separatability of the training data from the test
|
---|
| 30 | * data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An
|
---|
| 31 | * Empirical Study on Defect Prediction. <br>
|
---|
| 32 | * <br>
|
---|
| 33 | * This is calculated through the error of a logistic regression classifier that tries to separate
|
---|
| 34 | * the sets.
|
---|
| 35 | *
|
---|
[2] | 36 | * @author Steffen Herbold
|
---|
| 37 | */
|
---|
| 38 | public class SeparatabilitySelection implements ISetWiseDataselectionStrategy {
|
---|
| 39 |
|
---|
[41] | 40 | /**
|
---|
| 41 | * size of the random sample that is drawn from both test data and training data
|
---|
| 42 | */
|
---|
| 43 | private final int sampleSize = 500;
|
---|
[2] | 44 |
|
---|
[41] | 45 | /**
|
---|
| 46 | * number of repetitions of the sample drawing
|
---|
| 47 | */
|
---|
| 48 | private final int maxRep = 10;
|
---|
| 49 |
|
---|
| 50 | /**
|
---|
| 51 | * number of neighbors that are selected
|
---|
| 52 | */
|
---|
| 53 | private int neighbors = 10;
|
---|
| 54 |
|
---|
| 55 | /**
|
---|
| 56 | * Sets the number of neighbors that are selected.
|
---|
| 57 | */
|
---|
| 58 | @Override
|
---|
| 59 | public void setParameter(String parameters) {
|
---|
| 60 | if (!"".equals(parameters)) {
|
---|
| 61 | neighbors = Integer.parseInt(parameters);
|
---|
| 62 | }
|
---|
| 63 | }
|
---|
| 64 |
|
---|
| 65 | /**
|
---|
| 66 | * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
|
---|
| 67 | * org.apache.commons.collections4.list.SetUniqueList)
|
---|
| 68 | */
|
---|
| 69 | @Override
|
---|
| 70 | public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
|
---|
| 71 | final Random rand = new Random(1);
|
---|
| 72 |
|
---|
| 73 | // calculate distances between testdata and traindata
|
---|
| 74 | final double[] distances = new double[traindataSet.size()];
|
---|
| 75 |
|
---|
| 76 | int i = 0;
|
---|
| 77 | for (Instances traindata : traindataSet) {
|
---|
| 78 | double distance = 0.0;
|
---|
| 79 | for (int rep = 0; rep < maxRep; rep++) {
|
---|
| 80 | // sample instances
|
---|
| 81 | Instances sample = new Instances(testdata);
|
---|
| 82 | for (int j = 0; j < sampleSize; j++) {
|
---|
| 83 | Instance inst =
|
---|
| 84 | new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances())));
|
---|
| 85 | inst.setDataset(sample);
|
---|
| 86 | inst.setClassValue(1.0);
|
---|
| 87 | sample.add(inst);
|
---|
[135] | 88 | inst = new DenseInstance(traindata
|
---|
| 89 | .instance(rand.nextInt(traindata.numInstances())));
|
---|
[41] | 90 | inst.setDataset(sample);
|
---|
| 91 | inst.setClassValue(0.0);
|
---|
| 92 | sample.add(inst);
|
---|
| 93 | }
|
---|
| 94 |
|
---|
| 95 | // calculate separation
|
---|
| 96 | Evaluation eval;
|
---|
| 97 | try {
|
---|
| 98 | eval = new Evaluation(sample);
|
---|
| 99 | eval.crossValidateModel(new Logistic(), sample, 5, rand);
|
---|
| 100 | }
|
---|
| 101 | catch (Exception e) {
|
---|
[135] | 102 | throw new RuntimeException("cross-validation during calculation of separatability failed",
|
---|
[41] | 103 | e);
|
---|
| 104 | }
|
---|
| 105 | distance += eval.pctCorrect() / 100.0;
|
---|
| 106 | }
|
---|
| 107 | distances[i++] = 2 * ((distance / maxRep) - 0.5);
|
---|
| 108 | }
|
---|
| 109 |
|
---|
| 110 | // select closest neighbors
|
---|
| 111 | final double[] distancesCopy = Arrays.copyOf(distances, distances.length);
|
---|
| 112 | Arrays.sort(distancesCopy);
|
---|
| 113 | final double cutoffDistance = distancesCopy[neighbors];
|
---|
| 114 |
|
---|
| 115 | for (i = traindataSet.size() - 1; i >= 0; i--) {
|
---|
| 116 | if (distances[i] > cutoffDistance) {
|
---|
| 117 | traindataSet.remove(i);
|
---|
| 118 | }
|
---|
| 119 | }
|
---|
| 120 | }
|
---|
[2] | 121 | }
|
---|