// Copyright 2015 Georg-August-Universität Göttingen, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de.ugoe.cs.cpdp.dataselection;
import java.util.Arrays;
import java.util.Random;
import org.apache.commons.collections4.list.SetUniqueList;
import weka.classifiers.Evaluation;
import weka.classifiers.functions.Logistic;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
/**
* A setwise data selection strategy based on the separatability of the training data from the test
* data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An
* Empirical Study on Defect Prediction.
*
* This is calculated through the error of a logistic regression classifier that tries to separate
* the sets.
*
* @author Steffen Herbold
*/
public class SeparatabilitySelection implements ISetWiseDataselectionStrategy {
/**
* size of the random sample that is drawn from both test data and training data
*/
private final int sampleSize = 500;
/**
* number of repetitions of the sample drawing
*/
private final int maxRep = 10;
/**
* number of neighbors that are selected
*/
private int neighbors = 10;
/**
* Sets the number of neighbors that are selected.
*/
@Override
public void setParameter(String parameters) {
if (!"".equals(parameters)) {
neighbors = Integer.parseInt(parameters);
}
}
/**
* @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
* org.apache.commons.collections4.list.SetUniqueList)
*/
@Override
public void apply(Instances testdata, SetUniqueList traindataSet) {
final Random rand = new Random(1);
// calculate distances between testdata and traindata
final double[] distances = new double[traindataSet.size()];
int i = 0;
for (Instances traindata : traindataSet) {
double distance = 0.0;
for (int rep = 0; rep < maxRep; rep++) {
// sample instances
Instances sample = new Instances(testdata);
for (int j = 0; j < sampleSize; j++) {
Instance inst =
new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances())));
inst.setDataset(sample);
inst.setClassValue(1.0);
sample.add(inst);
inst =
new DenseInstance(
traindata.instance(rand.nextInt(traindata.numInstances())));
inst.setDataset(sample);
inst.setClassValue(0.0);
sample.add(inst);
}
// calculate separation
Evaluation eval;
try {
eval = new Evaluation(sample);
eval.crossValidateModel(new Logistic(), sample, 5, rand);
}
catch (Exception e) {
throw new RuntimeException(
"cross-validation during calculation of separatability failed",
e);
}
distance += eval.pctCorrect() / 100.0;
}
distances[i++] = 2 * ((distance / maxRep) - 0.5);
}
// select closest neighbors
final double[] distancesCopy = Arrays.copyOf(distances, distances.length);
Arrays.sort(distancesCopy);
final double cutoffDistance = distancesCopy[neighbors];
for (i = traindataSet.size() - 1; i >= 0; i--) {
if (distances[i] > cutoffDistance) {
traindataSet.remove(i);
}
}
}
}