| 1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
|---|
| 2 | //
|
|---|
| 3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
|---|
| 4 | // you may not use this file except in compliance with the License.
|
|---|
| 5 | // You may obtain a copy of the License at
|
|---|
| 6 | //
|
|---|
| 7 | // http://www.apache.org/licenses/LICENSE-2.0
|
|---|
| 8 | //
|
|---|
| 9 | // Unless required by applicable law or agreed to in writing, software
|
|---|
| 10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
|---|
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|---|
| 12 | // See the License for the specific language governing permissions and
|
|---|
| 13 | // limitations under the License.
|
|---|
| 14 |
|
|---|
| 15 | package de.ugoe.cs.cpdp.dataselection;
|
|---|
| 16 |
|
|---|
| 17 | import java.util.Arrays;
|
|---|
| 18 | import java.util.Random;
|
|---|
| 19 |
|
|---|
| 20 | import org.apache.commons.collections4.list.SetUniqueList;
|
|---|
| 21 |
|
|---|
| 22 | import weka.classifiers.Evaluation;
|
|---|
| 23 | import weka.classifiers.functions.Logistic;
|
|---|
| 24 | import weka.core.DenseInstance;
|
|---|
| 25 | import weka.core.Instance;
|
|---|
| 26 | import weka.core.Instances;
|
|---|
| 27 |
|
|---|
| 28 | /**
|
|---|
| 29 | * A setwise data selection strategy based on the separatability of the training data from the test
|
|---|
| 30 | * data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An
|
|---|
| 31 | * Empirical Study on Defect Prediction. <br>
|
|---|
| 32 | * <br>
|
|---|
| 33 | * This is calculated through the error of a logistic regression classifier that tries to separate
|
|---|
| 34 | * the sets.
|
|---|
| 35 | *
|
|---|
| 36 | * @author Steffen Herbold
|
|---|
| 37 | */
|
|---|
| 38 | public class SeparatabilitySelection implements ISetWiseDataselectionStrategy {
|
|---|
| 39 |
|
|---|
| 40 | /**
|
|---|
| 41 | * size of the random sample that is drawn from both test data and training data
|
|---|
| 42 | */
|
|---|
| 43 | private final int sampleSize = 500;
|
|---|
| 44 |
|
|---|
| 45 | /**
|
|---|
| 46 | * number of repetitions of the sample drawing
|
|---|
| 47 | */
|
|---|
| 48 | private final int maxRep = 10;
|
|---|
| 49 |
|
|---|
| 50 | /**
|
|---|
| 51 | * number of neighbors that are selected
|
|---|
| 52 | */
|
|---|
| 53 | private int neighbors = 10;
|
|---|
| 54 |
|
|---|
| 55 | /**
|
|---|
| 56 | * Sets the number of neighbors that are selected.
|
|---|
| 57 | */
|
|---|
| 58 | @Override
|
|---|
| 59 | public void setParameter(String parameters) {
|
|---|
| 60 | if (!"".equals(parameters)) {
|
|---|
| 61 | neighbors = Integer.parseInt(parameters);
|
|---|
| 62 | }
|
|---|
| 63 | }
|
|---|
| 64 |
|
|---|
| 65 | /**
|
|---|
| 66 | * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
|
|---|
| 67 | * org.apache.commons.collections4.list.SetUniqueList)
|
|---|
| 68 | */
|
|---|
| 69 | @Override
|
|---|
| 70 | public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
|
|---|
| 71 | final Random rand = new Random(1);
|
|---|
| 72 |
|
|---|
| 73 | // calculate distances between testdata and traindata
|
|---|
| 74 | final double[] distances = new double[traindataSet.size()];
|
|---|
| 75 |
|
|---|
| 76 | int i = 0;
|
|---|
| 77 | for (Instances traindata : traindataSet) {
|
|---|
| 78 | double distance = 0.0;
|
|---|
| 79 | for (int rep = 0; rep < maxRep; rep++) {
|
|---|
| 80 | // sample instances
|
|---|
| 81 | Instances sample = new Instances(testdata);
|
|---|
| 82 | for (int j = 0; j < sampleSize; j++) {
|
|---|
| 83 | Instance inst =
|
|---|
| 84 | new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances())));
|
|---|
| 85 | inst.setDataset(sample);
|
|---|
| 86 | inst.setClassValue(1.0);
|
|---|
| 87 | sample.add(inst);
|
|---|
| 88 | inst = new DenseInstance(traindata
|
|---|
| 89 | .instance(rand.nextInt(traindata.numInstances())));
|
|---|
| 90 | inst.setDataset(sample);
|
|---|
| 91 | inst.setClassValue(0.0);
|
|---|
| 92 | sample.add(inst);
|
|---|
| 93 | }
|
|---|
| 94 |
|
|---|
| 95 | // calculate separation
|
|---|
| 96 | Evaluation eval;
|
|---|
| 97 | try {
|
|---|
| 98 | eval = new Evaluation(sample);
|
|---|
| 99 | eval.crossValidateModel(new Logistic(), sample, 5, rand);
|
|---|
| 100 | }
|
|---|
| 101 | catch (Exception e) {
|
|---|
| 102 | throw new RuntimeException("cross-validation during calculation of separatability failed",
|
|---|
| 103 | e);
|
|---|
| 104 | }
|
|---|
| 105 | distance += eval.pctCorrect() / 100.0;
|
|---|
| 106 | }
|
|---|
| 107 | distances[i++] = 2 * ((distance / maxRep) - 0.5);
|
|---|
| 108 | }
|
|---|
| 109 |
|
|---|
| 110 | // select closest neighbors
|
|---|
| 111 | final double[] distancesCopy = Arrays.copyOf(distances, distances.length);
|
|---|
| 112 | Arrays.sort(distancesCopy);
|
|---|
| 113 | final double cutoffDistance = distancesCopy[neighbors];
|
|---|
| 114 |
|
|---|
| 115 | for (i = traindataSet.size() - 1; i >= 0; i--) {
|
|---|
| 116 | if (distances[i] > cutoffDistance) {
|
|---|
| 117 | traindataSet.remove(i);
|
|---|
| 118 | }
|
|---|
| 119 | }
|
|---|
| 120 | }
|
|---|
| 121 | }
|
|---|