1 | package de.ugoe.cs.cpdp.dataselection;
|
---|
2 |
|
---|
3 | import java.util.HashSet;
|
---|
4 | import java.util.Set;
|
---|
5 |
|
---|
6 | import org.apache.commons.collections4.list.SetUniqueList;
|
---|
7 | import org.apache.commons.math3.util.MathArrays;
|
---|
8 |
|
---|
9 | import weka.core.Instances;
|
---|
10 |
|
---|
11 | /**
|
---|
12 | * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for cross-project defect prediction
|
---|
13 | * @author Steffen Herbold
|
---|
14 | */
|
---|
15 | public class SetWiseKNNSelection extends AbstractCharacteristicSelection {
|
---|
16 |
|
---|
17 | /**
|
---|
18 | * number of neighbors selected
|
---|
19 | */
|
---|
20 | private int k = 1;
|
---|
21 |
|
---|
22 | /**
|
---|
23 | * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)
|
---|
24 | */
|
---|
25 | @Override
|
---|
26 | public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
|
---|
27 | final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
|
---|
28 |
|
---|
29 | final Set<Integer> selected = new HashSet<Integer>();
|
---|
30 | for( int i=0 ; i<k ; i++ ) {
|
---|
31 | int closestIndex = getClosest(data);
|
---|
32 |
|
---|
33 | selected.add(closestIndex);
|
---|
34 | data.delete(closestIndex);
|
---|
35 | }
|
---|
36 |
|
---|
37 | for( int i=traindataSet.size()-1; i>=0 ; i-- ) {
|
---|
38 | if( selected.contains(i) ) {
|
---|
39 | traindataSet.remove(i);
|
---|
40 | }
|
---|
41 | }
|
---|
42 | }
|
---|
43 |
|
---|
44 | /**
|
---|
45 | * Helper method that determines the index of the instance with the smallest distance to the first instance (index 0).
|
---|
46 | * @param data data set
|
---|
47 | * @return index of the closest instance
|
---|
48 | */
|
---|
49 | private int getClosest(Instances data) {
|
---|
50 | double closestDistance = Double.MAX_VALUE;
|
---|
51 | int closestIndex = 1;
|
---|
52 | for( int i=1 ; i<data.numInstances() ; i++ ) {
|
---|
53 | double distance = MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i).toDoubleArray());
|
---|
54 | if( distance < closestDistance) {
|
---|
55 | closestDistance = distance;
|
---|
56 | closestIndex = i;
|
---|
57 | }
|
---|
58 | }
|
---|
59 | return closestIndex;
|
---|
60 | }
|
---|
61 |
|
---|
62 | /**
|
---|
63 | * Sets the number of neighbors followed by the distributional characteristics, the values are separated by blanks.
|
---|
64 | * @see AbstractCharacteristicSelection#setParameter(String)
|
---|
65 | */
|
---|
66 | @Override
|
---|
67 | public void setParameter(String parameters) {
|
---|
68 | if( !"".equals(parameters) ) {
|
---|
69 | final String[] split = parameters.split(" ");
|
---|
70 | k = Integer.parseInt(split[0]);
|
---|
71 | String str = "";
|
---|
72 | for( int i=1 ; i<split.length; i++ ) {
|
---|
73 | str += split[i];
|
---|
74 | if( i<split.length-1 ) {
|
---|
75 | str += " ";
|
---|
76 | }
|
---|
77 | }
|
---|
78 | super.setParameter(str);
|
---|
79 | }
|
---|
80 | }
|
---|
81 | }
|
---|