source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMClusterSelection.java @ 136

Last change on this file since 136 was 135, checked in by sherbold, 8 years ago
  • code documentation and formatting
  • Property svn:mime-type set to text/plain
File size: 3.1 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import java.util.LinkedList;
18import java.util.List;
19
20import org.apache.commons.collections4.list.SetUniqueList;
21
22import weka.clusterers.EM;
23import weka.core.Instance;
24import weka.core.Instances;
25
26/**
27 * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect
28 * prediction
29 *
30 * @author Steffen Herbold
31 */
32public class SetWiseEMClusterSelection extends AbstractCharacteristicSelection {
33
34    /**
35     * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
36     *      org.apache.commons.collections4.list.SetUniqueList)
37     */
38    @Override
39    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
40        final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
41        final Instance targetInstance = data.instance(0);
42        final List<Instance> candidateInstances = new LinkedList<Instance>();
43        for (int i = 1; i < data.numInstances(); i++) {
44            candidateInstances.add(data.instance(i));
45        }
46
47        // cluster and select
48        try {
49            final EM emeans = new EM();
50            boolean onlyTarget = true;
51            int targetCluster;
52            int maxNumClusters = candidateInstances.size();
53            do { // while(onlyTarget)
54                emeans.setMaximumNumberOfClusters(maxNumClusters);
55                emeans.buildClusterer(data);
56
57                targetCluster = emeans.clusterInstance(targetInstance);
58
59                // check if cluster only contains target project
60                for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
61                    onlyTarget &=
62                        !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
63                }
64                maxNumClusters = emeans.numberOfClusters() - 1;
65            }
66            while (onlyTarget);
67
68            int numRemoved = 0;
69            for (int i = 0; i < candidateInstances.size(); i++) {
70                if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
71                    traindataSet.remove(i - numRemoved++);
72                }
73            }
74        }
75        catch (Exception e) {
76            throw new RuntimeException("error applying setwise EM clustering training data selection",
77                                       e);
78        }
79    }
80}
Note: See TracBrowser for help on using the repository browser.