source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseKNNSelection.java @ 118

Last change on this file since 118 was 86, checked in by sherbold, 9 years ago
  • switched workspace encoding to UTF-8 and fixed broken characters
  • Property svn:mime-type set to text/plain
File size: 3.5 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import java.util.HashSet;
18import java.util.Set;
19
20import org.apache.commons.collections4.list.SetUniqueList;
21import org.apache.commons.math3.util.MathArrays;
22
23import weka.core.Instances;
24
25/**
26 * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for
27 * cross-project defect prediction
28 *
29 * @author Steffen Herbold
30 */
31public class SetWiseKNNSelection extends AbstractCharacteristicSelection {
32
33    /**
34     * number of neighbors selected
35     */
36    private int k = 1;
37
38    /**
39     * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
40     *      org.apache.commons.collections4.list.SetUniqueList)
41     */
42    @Override
43    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
44        final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
45
46        final Set<Integer> selected = new HashSet<Integer>();
47        for (int i = 0; i < k; i++) {
48            int closestIndex = getClosest(data);
49
50            selected.add(closestIndex);
51            data.delete(closestIndex);
52        }
53
54        for (int i = traindataSet.size() - 1; i >= 0; i--) {
55            if (selected.contains(i)) {
56                traindataSet.remove(i);
57            }
58        }
59    }
60
61    /**
62     * Helper method that determines the index of the instance with the smallest distance to the
63     * first instance (index 0).
64     *
65     * @param data
66     *            data set
67     * @return index of the closest instance
68     */
69    private int getClosest(Instances data) {
70        double closestDistance = Double.MAX_VALUE;
71        int closestIndex = 1;
72        for (int i = 1; i < data.numInstances(); i++) {
73            double distance =
74                MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i)
75                    .toDoubleArray());
76            if (distance < closestDistance) {
77                closestDistance = distance;
78                closestIndex = i;
79            }
80        }
81        return closestIndex;
82    }
83
84    /**
85     * Sets the number of neighbors followed by the distributional characteristics, the values are
86     * separated by blanks.
87     *
88     * @see AbstractCharacteristicSelection#setParameter(String)
89     */
90    @Override
91    public void setParameter(String parameters) {
92        if (!"".equals(parameters)) {
93            final String[] split = parameters.split(" ");
94            k = Integer.parseInt(split[0]);
95            String str = "";
96            for (int i = 1; i < split.length; i++) {
97                str += split[i];
98                if (i < split.length - 1) {
99                    str += " ";
100                }
101            }
102            super.setParameter(str);
103        }
104    }
105}
Note: See TracBrowser for help on using the repository browser.