package de.ugoe.cs.cpdp.dataselection;

import java.util.Arrays;
import java.util.Random;

import org.apache.commons.collections4.list.SetUniqueList;

import weka.classifiers.Evaluation;
import weka.classifiers.functions.Logistic;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

/**
 * A setwise data selection strategy based on the separability of the training data from the
 * test data, after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects:
 * An Empirical Study on Defect Prediction.
 * <p>
 * The separability is calculated through the error of a logistic regression classifier that
 * tries to separate the sets.
 * <p>
 * NOTE(review): this copy of the file is damaged. Part of {@code apply} is missing, so the class
 * does not compile as it stands; see the notes inside that method.
 *
 * @author Steffen Herbold
 */
public class SeparatabilitySelection implements ISetWiseDataselectionStrategy {

    /**
     * Size of the random sample that is drawn from both test data and training data.
     * NOTE(review): no surviving statement in this file reads this field.
     */
    private final int sampleSize = 500;

    /**
     * Number of repetitions of the sample drawing.
     * NOTE(review): no surviving statement in this file reads this field.
     */
    private final int maxRep = 10;

    /**
     * Number of neighbors that are selected. Starts at 10 and can be overwritten through
     * {@link #setParameter(String)}.
     */
    private int neighbors = 10;

    /**
     * Sets the number of neighbors that are selected.
     *
     * @param parameters
     *            the number of neighbors as a decimal integer string; an empty string leaves
     *            the current value untouched
     * @throws NumberFormatException
     *             if {@code parameters} is {@code null} or otherwise not parsable by
     *             {@link Integer#parseInt(String)}
     */
    @Override
    public void setParameter(String parameters) {
        // only the empty string is skipped; null is passed on to parseInt and is rejected there
        if( !"".equals(parameters) ) {
            neighbors = Integer.parseInt(parameters);
        }
    }

    /**
     * Filters {@code traindataSet} in place. In the statements that survive in this copy, an
     * entry at index {@code i} is removed when {@code distances[i]} is greater than
     * {@code cutoffDistance}.
     *
     * @param testdata
     *            the test data; according to the inline comment the distances are calculated
     *            between it and each training data set (the surviving statements do not
     *            reference it)
     * @param traindataSet
     *            the candidate training data sets; modified by this method
     * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances,
     *      org.apache.commons.collections4.list.SetUniqueList)
     */
    @Override
    // NOTE(review): traindataSet is declared as a raw SetUniqueList although the for-each loop
    // below declares its elements as Instances; presumably a type argument was lost together
    // with the other missing text -- TODO confirm against version control.
    public void apply(Instances testdata, SetUniqueList traindataSet) {
        // fixed seed, so every run sees the same sequence of random numbers
        final Random rand = new Random(1);

        // calculate distances between testdata and traindata
        // one slot per training data set, indexed by position in traindataSet
        final double[] distances = new double[traindataSet.size()];

        int i=0;
        for( Instances traindata : traindataSet ) {
            double distance = 0.0;
            // NOTE(review): the next loop header is not valid Java ("rep=0" is an assignment,
            // not a boolean condition) and the braces of this method no longer balance: the
            // class body is never closed. It looks as if the text between "rep" and "=0 ; i--"
            // was lost -- TODO restore it from version control. Presumably the lost part holds
            // the repeated sampling and logistic-regression evaluation that fills distances[],
            // the declaration of cutoffDistance (referenced below but declared nowhere in this
            // file) and the header of a final loop that walks i downwards; the otherwise unused
            // imports (Arrays, Evaluation, Logistic, DenseInstance, Instance) and fields
            // (sampleSize, maxRep) point that way.
            for( int rep=0; rep=0 ; i-- ) {
                // drop every training data set whose distance lies above the cutoff
                if( distances[i]>cutoffDistance ) {
                    traindataSet.remove(i);
                }
            }
        }
    }