source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseKNNSelection.java @ 146

Last change on this file since 146 was 135, checked in by sherbold, 8 years ago
  • code documentation and formatting
  • Property svn:mime-type set to text/plain
File size: 3.5 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import java.util.HashSet;
18import java.util.Set;
19
20import org.apache.commons.collections4.list.SetUniqueList;
21import org.apache.commons.math3.util.MathArrays;
22
23import weka.core.Instances;
24
25/**
26 * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for
27 * cross-project defect prediction
28 *
29 * @author Steffen Herbold
30 */
31public class SetWiseKNNSelection extends AbstractCharacteristicSelection {
32
33    /**
34     * number of neighbors selected
35     */
36    private int k = 1;
37
38    /**
39     * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
40     *      org.apache.commons.collections4.list.SetUniqueList)
41     */
42    @Override
43    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
44        final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
45
46        final Set<Integer> selected = new HashSet<Integer>();
47        for (int i = 0; i < k; i++) {
48            int closestIndex = getClosest(data);
49
50            selected.add(closestIndex);
51            data.delete(closestIndex);
52        }
53
54        for (int i = traindataSet.size() - 1; i >= 0; i--) {
55            if (selected.contains(i)) {
56                traindataSet.remove(i);
57            }
58        }
59    }
60
61    /**
62     * Helper method that determines the index of the instance with the smallest distance to the
63     * first instance (index 0).
64     *
65     * @param data
66     *            data set
67     * @return index of the closest instance
68     */
69    private int getClosest(Instances data) {
70        double closestDistance = Double.MAX_VALUE;
71        int closestIndex = 1;
72        for (int i = 1; i < data.numInstances(); i++) {
73            double distance = MathArrays.distance(data.instance(0).toDoubleArray(),
74                                                  data.instance(i).toDoubleArray());
75            if (distance < closestDistance) {
76                closestDistance = distance;
77                closestIndex = i;
78            }
79        }
80        return closestIndex;
81    }
82
83    /**
84     * Sets the number of neighbors followed by the distributional characteristics, the values are
85     * separated by blanks.
86     *
87     * @see AbstractCharacteristicSelection#setParameter(String)
88     */
89    @Override
90    public void setParameter(String parameters) {
91        if (!"".equals(parameters)) {
92            final String[] split = parameters.split(" ");
93            k = Integer.parseInt(split[0]);
94            String str = "";
95            for (int i = 1; i < split.length; i++) {
96                str += split[i];
97                if (i < split.length - 1) {
98                    str += " ";
99                }
100            }
101            super.setParameter(str);
102        }
103    }
104}
Note: See TracBrowser for help on using the repository browser.