1 | package de.ugoe.cs.cpdp.dataselection;
|
---|
2 |
|
---|
3 | import java.util.LinkedList;
|
---|
4 | import java.util.List;
|
---|
5 |
|
---|
6 | import org.apache.commons.collections4.list.SetUniqueList;
|
---|
7 |
|
---|
8 | import weka.clusterers.EM;
|
---|
9 | import weka.core.Instance;
|
---|
10 | import weka.core.Instances;
|
---|
11 |
|
---|
12 | /**
|
---|
13 | * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect prediction
|
---|
14 | * @author Steffen Herbold
|
---|
15 | */
|
---|
16 | public class SetWiseEMClusterSelection extends AbstractCharacteristicSelection {
|
---|
17 |
|
---|
18 | /**
|
---|
19 | * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)
|
---|
20 | */
|
---|
21 | @Override
|
---|
22 | public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
|
---|
23 | final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
|
---|
24 | final Instance targetInstance = data.instance(0);
|
---|
25 | final List<Instance> candidateInstances = new LinkedList<Instance>();
|
---|
26 | for( int i=1; i<data.numInstances(); i++ ) {
|
---|
27 | candidateInstances.add(data.instance(i));
|
---|
28 | }
|
---|
29 |
|
---|
30 | // cluster and select
|
---|
31 | try {
|
---|
32 | final EM emeans = new EM();
|
---|
33 | boolean onlyTarget = true;
|
---|
34 | int targetCluster;
|
---|
35 | int maxNumClusters = candidateInstances.size();
|
---|
36 | do { // while(onlyTarget)
|
---|
37 | emeans.setMaximumNumberOfClusters(maxNumClusters);
|
---|
38 | emeans.buildClusterer(data);
|
---|
39 |
|
---|
40 | targetCluster = emeans.clusterInstance(targetInstance);
|
---|
41 |
|
---|
42 | // check if cluster only contains target project
|
---|
43 | for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) {
|
---|
44 | onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster);
|
---|
45 | }
|
---|
46 | maxNumClusters = emeans.numberOfClusters()-1;
|
---|
47 | } while(onlyTarget);
|
---|
48 |
|
---|
49 | int numRemoved = 0;
|
---|
50 | for( int i=0 ; i<candidateInstances.size() ; i++ ) {
|
---|
51 | if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) {
|
---|
52 | traindataSet.remove(i-numRemoved++);
|
---|
53 | }
|
---|
54 | }
|
---|
55 | } catch(Exception e) {
|
---|
56 | throw new RuntimeException("error applying setwise EM clustering training data selection", e);
|
---|
57 | }
|
---|
58 | }
|
---|
59 | }
|
---|