[86] | 1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
---|
[41] | 2 | //
|
---|
| 3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
---|
| 4 | // you may not use this file except in compliance with the License.
|
---|
| 5 | // You may obtain a copy of the License at
|
---|
| 6 | //
|
---|
| 7 | // http://www.apache.org/licenses/LICENSE-2.0
|
---|
| 8 | //
|
---|
| 9 | // Unless required by applicable law or agreed to in writing, software
|
---|
| 10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
---|
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
| 12 | // See the License for the specific language governing permissions and
|
---|
| 13 | // limitations under the License.
|
---|
| 14 |
|
---|
[2] | 15 | package de.ugoe.cs.cpdp.dataselection;
|
---|
| 16 |
|
---|
| 17 | import java.util.HashSet;
|
---|
| 18 | import java.util.Set;
|
---|
| 19 |
|
---|
| 20 | import org.apache.commons.collections4.list.SetUniqueList;
|
---|
| 21 | import org.apache.commons.math3.util.MathArrays;
|
---|
| 22 |
|
---|
| 23 | import weka.core.Instances;
|
---|
| 24 |
|
---|
| 25 | /**
|
---|
[41] | 26 | * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for
|
---|
| 27 | * cross-project defect prediction
|
---|
| 28 | *
|
---|
[2] | 29 | * @author Steffen Herbold
|
---|
| 30 | */
|
---|
| 31 | public class SetWiseKNNSelection extends AbstractCharacteristicSelection {
|
---|
| 32 |
|
---|
[41] | 33 | /**
|
---|
| 34 | * number of neighbors selected
|
---|
| 35 | */
|
---|
| 36 | private int k = 1;
|
---|
| 37 |
|
---|
| 38 | /**
|
---|
| 39 | * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
|
---|
| 40 | * org.apache.commons.collections4.list.SetUniqueList)
|
---|
| 41 | */
|
---|
| 42 | @Override
|
---|
| 43 | public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
|
---|
| 44 | final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
|
---|
| 45 |
|
---|
| 46 | final Set<Integer> selected = new HashSet<Integer>();
|
---|
| 47 | for (int i = 0; i < k; i++) {
|
---|
| 48 | int closestIndex = getClosest(data);
|
---|
| 49 |
|
---|
| 50 | selected.add(closestIndex);
|
---|
| 51 | data.delete(closestIndex);
|
---|
| 52 | }
|
---|
| 53 |
|
---|
| 54 | for (int i = traindataSet.size() - 1; i >= 0; i--) {
|
---|
| 55 | if (selected.contains(i)) {
|
---|
| 56 | traindataSet.remove(i);
|
---|
| 57 | }
|
---|
| 58 | }
|
---|
| 59 | }
|
---|
| 60 |
|
---|
| 61 | /**
|
---|
| 62 | * Helper method that determines the index of the instance with the smallest distance to the
|
---|
| 63 | * first instance (index 0).
|
---|
| 64 | *
|
---|
| 65 | * @param data
|
---|
| 66 | * data set
|
---|
| 67 | * @return index of the closest instance
|
---|
| 68 | */
|
---|
| 69 | private int getClosest(Instances data) {
|
---|
| 70 | double closestDistance = Double.MAX_VALUE;
|
---|
| 71 | int closestIndex = 1;
|
---|
| 72 | for (int i = 1; i < data.numInstances(); i++) {
|
---|
[135] | 73 | double distance = MathArrays.distance(data.instance(0).toDoubleArray(),
|
---|
| 74 | data.instance(i).toDoubleArray());
|
---|
[41] | 75 | if (distance < closestDistance) {
|
---|
| 76 | closestDistance = distance;
|
---|
| 77 | closestIndex = i;
|
---|
| 78 | }
|
---|
| 79 | }
|
---|
| 80 | return closestIndex;
|
---|
| 81 | }
|
---|
| 82 |
|
---|
| 83 | /**
|
---|
| 84 | * Sets the number of neighbors followed by the distributional characteristics, the values are
|
---|
| 85 | * separated by blanks.
|
---|
| 86 | *
|
---|
| 87 | * @see AbstractCharacteristicSelection#setParameter(String)
|
---|
| 88 | */
|
---|
| 89 | @Override
|
---|
| 90 | public void setParameter(String parameters) {
|
---|
| 91 | if (!"".equals(parameters)) {
|
---|
| 92 | final String[] split = parameters.split(" ");
|
---|
| 93 | k = Integer.parseInt(split[0]);
|
---|
| 94 | String str = "";
|
---|
| 95 | for (int i = 1; i < split.length; i++) {
|
---|
| 96 | str += split[i];
|
---|
| 97 | if (i < split.length - 1) {
|
---|
| 98 | str += " ";
|
---|
| 99 | }
|
---|
| 100 | }
|
---|
| 101 | super.setParameter(str);
|
---|
| 102 | }
|
---|
| 103 | }
|
---|
[2] | 104 | }
|
---|