package de.ugoe.cs.cpdp.dataselection;
import java.util.Arrays;
import java.util.Random;
import org.apache.commons.collections4.list.SetUniqueList;
import weka.classifiers.Evaluation;
import weka.classifiers.functions.Logistic;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
/**
* Setwise data selection strategy based on how well the training data can be separated from the
* test data, following Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects:
* An Empirical Study on Defect Prediction.
*
* The separability is calculated through the error of a logistic regression classifier that tries
* to separate the sets.
*
* NOTE(review): the body of {@link #apply(Instances, SetUniqueList)} is damaged in this copy of
* the file (see the notes inside the method); restore it from version control before relying on
* this class.
*
* @author Steffen Herbold
*/
public class SeparatabilitySelection implements ISetWiseDataselectionStrategy {
/**
* Size of the random sample that is drawn from both test data and training data.
*/
private final int sampleSize = 500;
/**
* Number of repetitions of the sample drawing.
*/
private final int maxRep = 10;
/**
* Number of neighbors that are selected; defaults to 10 and can be overridden through
* {@link #setParameter(String)}.
*/
private int neighbors = 10;
/**
* Sets the number of neighbors that are selected.
*
* @param parameters the number of neighbors as a decimal integer string; an empty string leaves
*                   the current value unchanged
* @throws NumberFormatException if {@code parameters} is non-empty and cannot be parsed by
*                               {@link Integer#parseInt(String)}; this includes {@code null},
*                               which is not filtered by the empty-string check below
*/
@Override
public void setParameter(String parameters) {
// only the empty string is treated as "keep the current value"; null falls through to parseInt
if( !"".equals(parameters) ) {
neighbors = Integer.parseInt(parameters);
}
}
/**
* Removes training data sets from {@code traindataSet} whose distance to the test data exceeds
* a cutoff. The set is modified in place; nothing is returned.
*
* @param testdata     the test data the training data sets are compared against
* @param traindataSet the candidate training data sets; entries are removed from this list by
*                     index
* @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)
*/
@Override
public void apply(Instances testdata, SetUniqueList traindataSet) {
// fixed seed, so the sequence of random numbers is the same on every call
final Random rand = new Random(1);
// calculate distances between testdata and traindata
// one distance slot per training data set, in iteration order of traindataSet
final double[] distances = new double[traindataSet.size()];
int i=0;
for( Instances traindata : traindataSet ) {
// accumulator for the distance of the current training data set
double distance = 0.0;
// NOTE(review): the following line is corrupted in this copy of the file. Its header
// initializes "rep" but uses the assignment "rep=0" as condition and decrements "i", the
// variable "cutoffDistance" used below has no visible declaration, and the braces of the
// method no longer balance. It looks like the text between the "rep" loop condition and the
// condition of a later "i" loop was lost. Presumably the missing part draws the samples,
// evaluates the logistic regression classifier, fills "distances" and derives
// "cutoffDistance" from "neighbors" - TODO confirm against the original source.
for( int rep=0; rep=0 ; i-- ) {
// drop every training data set that is farther away than the cutoff; "i" is decremented,
// presumably so that removing by index does not shift entries that are still to be visited
if( distances[i]>cutoffDistance ) {
traindataSet.remove(i);
}
}
}
}