1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
---|
2 | //
|
---|
3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
---|
4 | // you may not use this file except in compliance with the License.
|
---|
5 | // You may obtain a copy of the License at
|
---|
6 | //
|
---|
7 | // http://www.apache.org/licenses/LICENSE-2.0
|
---|
8 | //
|
---|
9 | // Unless required by applicable law or agreed to in writing, software
|
---|
10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
---|
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
12 | // See the License for the specific language governing permissions and
|
---|
13 | // limitations under the License.
|
---|
14 |
|
---|
15 | package de.ugoe.cs.cpdp.dataselection;
|
---|
16 |
|
---|
17 | import java.util.HashSet;
|
---|
18 | import java.util.Set;
|
---|
19 |
|
---|
20 | import org.apache.commons.collections4.list.SetUniqueList;
|
---|
21 | import org.apache.commons.math3.util.MathArrays;
|
---|
22 |
|
---|
23 | import weka.core.Instances;
|
---|
24 |
|
---|
25 | /**
|
---|
26 | * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for
|
---|
27 | * cross-project defect prediction
|
---|
28 | *
|
---|
29 | * @author Steffen Herbold
|
---|
30 | */
|
---|
31 | public class SetWiseKNNSelection extends AbstractCharacteristicSelection {
|
---|
32 |
|
---|
33 | /**
|
---|
34 | * number of neighbors selected
|
---|
35 | */
|
---|
36 | private int k = 1;
|
---|
37 |
|
---|
38 | /**
|
---|
39 | * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
|
---|
40 | * org.apache.commons.collections4.list.SetUniqueList)
|
---|
41 | */
|
---|
42 | @Override
|
---|
43 | public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
|
---|
44 | final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
|
---|
45 |
|
---|
46 | final Set<Integer> selected = new HashSet<Integer>();
|
---|
47 | for (int i = 0; i < k; i++) {
|
---|
48 | int closestIndex = getClosest(data);
|
---|
49 |
|
---|
50 | selected.add(closestIndex);
|
---|
51 | data.delete(closestIndex);
|
---|
52 | }
|
---|
53 |
|
---|
54 | for (int i = traindataSet.size() - 1; i >= 0; i--) {
|
---|
55 | if (selected.contains(i)) {
|
---|
56 | traindataSet.remove(i);
|
---|
57 | }
|
---|
58 | }
|
---|
59 | }
|
---|
60 |
|
---|
61 | /**
|
---|
62 | * Helper method that determines the index of the instance with the smallest distance to the
|
---|
63 | * first instance (index 0).
|
---|
64 | *
|
---|
65 | * @param data
|
---|
66 | * data set
|
---|
67 | * @return index of the closest instance
|
---|
68 | */
|
---|
69 | private int getClosest(Instances data) {
|
---|
70 | double closestDistance = Double.MAX_VALUE;
|
---|
71 | int closestIndex = 1;
|
---|
72 | for (int i = 1; i < data.numInstances(); i++) {
|
---|
73 | double distance = MathArrays.distance(data.instance(0).toDoubleArray(),
|
---|
74 | data.instance(i).toDoubleArray());
|
---|
75 | if (distance < closestDistance) {
|
---|
76 | closestDistance = distance;
|
---|
77 | closestIndex = i;
|
---|
78 | }
|
---|
79 | }
|
---|
80 | return closestIndex;
|
---|
81 | }
|
---|
82 |
|
---|
83 | /**
|
---|
84 | * Sets the number of neighbors followed by the distributional characteristics, the values are
|
---|
85 | * separated by blanks.
|
---|
86 | *
|
---|
87 | * @see AbstractCharacteristicSelection#setParameter(String)
|
---|
88 | */
|
---|
89 | @Override
|
---|
90 | public void setParameter(String parameters) {
|
---|
91 | if (!"".equals(parameters)) {
|
---|
92 | final String[] split = parameters.split(" ");
|
---|
93 | k = Integer.parseInt(split[0]);
|
---|
94 | String str = "";
|
---|
95 | for (int i = 1; i < split.length; i++) {
|
---|
96 | str += split[i];
|
---|
97 | if (i < split.length - 1) {
|
---|
98 | str += " ";
|
---|
99 | }
|
---|
100 | }
|
---|
101 | super.setParameter(str);
|
---|
102 | }
|
---|
103 | }
|
---|
104 | }
|
---|