source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/CLAMIProcessor.java @ 74

Last change on this file since 74 was 43, checked in by sherbold, 9 years ago
  • fix of the CLAMI processor after clarification with the author
  • Property svn:mime-type set to text/plain
File size: 8.7 KB
// Copyright 2015 Georg-August-Universität Göttingen, Germany
//
//   Licensed under the Apache License, Version 2.0 (the "License");
//   you may not use this file except in compliance with the License.
//   You may obtain a copy of the License at
//
//       http://www.apache.org/licenses/LICENSE-2.0
//
//   Unless required by applicable law or agreed to in writing, software
//   distributed under the License is distributed on an "AS IS" BASIS,
//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//   See the License for the specific language governing permissions and
//   limitations under the License.

package de.ugoe.cs.cpdp.dataprocessing;

import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;

import org.apache.commons.math3.stat.descriptive.rank.Median;

import de.ugoe.cs.util.console.Console;
import weka.core.Instance;
import weka.core.Instances;

/**
 * <p>
 * This processor implements the CLAMI strategy from the CLAMI paper by Nam et al. (ASE 2014).
 * With CLAMI, the original classification of the data is discarded and a new classification is
 * created instead: instances are tentatively labeled based on how many of their metric values
 * lie above the median of the respective metric. Afterwards, the subset of metrics with the
 * fewest violations of this median-threshold classification is selected. Finally, all instances
 * that still violate the threshold for any selected metric are dropped, leaving data that is
 * noise-free with respect to the median-threshold classification.
 * </p>
 * <p>
 * This can also be done for the test data (i.e., the TestAsTraining data selection), as the
 * original classification is completely ignored. Hence, CLAMI is an approach for unsupervised
 * learning.
 * </p>
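 * <p>
 * A minimal usage sketch (the variable names are illustrative only; it assumes both data sets
 * are Weka {@link Instances} with the class attribute already set):
 * </p>
 *
 * <pre>
 * Instances testdata = ...;  // metrics may be dropped, instances stay untouched
 * Instances traindata = ...; // relabeled and filtered in place by CLAMI
 * new CLAMIProcessor().apply(testdata, traindata);
 * </pre>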
 *
 * @author Steffen Herbold
 */
public class CLAMIProcessor implements IProcessesingStrategy {

    /*
     * (non-Javadoc)
     *
     * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
     */
    @Override
    public void setParameter(String parameters) {
        // dummy implementation: the CLAMI processor does not take any parameters
    }

    /*
     * (non-Javadoc)
     *
     * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances,
     * weka.core.Instances)
     */
    @Override
    public void apply(Instances testdata, Instances traindata) {
        applyCLAMI(testdata, traindata);
    }

    /**
     * <p>
     * Applies the CLAMI processor to the data. The test data is also required in order to
     * guarantee a consistent metric set.
     * </p>
     *
     * @param testdata
     *            test data; the data is not modified, only metrics are dropped
     * @param data
     *            data to which the CLAMI processor is applied
     */
    public void applyCLAMI(Instances testdata, Instances data) {

        // first determine the median of each metric
        double[] medians = new double[data.numAttributes()];
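        // note: Weka's kthSmallestValue is 1-based, so (numInstances+1)>>1 selects the
        // lower median when the number of instances is even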
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex()) {
                medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
            }
        }
        // now determine the cluster number of each instance, i.e., the number of its
        // metric values that lie above the median of the respective metric
        double[] clusterNumber = new double[data.numInstances()];
        for (int i = 0; i < data.numInstances(); i++) {
            int countHighValues = 0;
            Instance currentInstance = data.get(i);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex()) {
                    if (currentInstance.value(j) > medians[j]) {
                        countHighValues++;
                    }
                }
            }
            clusterNumber[i] = countHighValues;
        }

        // determine the median of the cluster numbers
        Median m = new Median();
        double medianClusterNumber = m.evaluate(clusterNumber);
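        // instances with a cluster number above this median form the tentatively "buggy"
        // half; all other instances are tentatively "not buggy"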

        // count, for each metric, how often it violates the tentative classification: a
        // violation is a value at or below the median for a "buggy" instance, or a value
        // above the median for a "not buggy" instance
        int[] numMetricViolations = new int[data.numAttributes()];
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j == data.classIndex()) {
                continue;
            }
            int currentViolations = 0;
            for (int i = 0; i < data.numInstances(); i++) {
                Instance currentInstance = data.get(i);
                if (clusterNumber[i] > medianClusterNumber) {
                    // "buggy"
                    if (currentInstance.value(j) <= medians[j]) {
                        currentViolations++;
                    }
                }
                else {
                    // "not buggy"
                    if (currentInstance.value(j) > medians[j]) {
                        currentViolations++;
                    }
                }
            }
            numMetricViolations[j] = currentViolations;
        }

        SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
        for (int currentViolations : numMetricViolations) {
            distinctViolationCounts.add(currentViolations);
        }
        Iterator<Integer> violationCountIterator = distinctViolationCounts.iterator();

        // consume the smallest violation count; it is always 0 here, because the class
        // attribute itself never counts as a violation
        int violationCutoff = violationCountIterator.next();
        // now we filter the data;
        // this is first tried with the metrics that have the fewest violations. If no buggy or
        // bug-free instances remain, it is repeated with the metrics that have the second-fewest
        // violations, and so on.
        // this part is somewhat unclear from the description in the paper, but I confirmed with
        // the author that this is how they implemented it
        boolean[] cleanInstances = new boolean[data.numInstances()];
        int numCleanBuggyInstances = 0;
        int numCleanBugfreeInstances = 0;
        do {
            violationCutoff = violationCountIterator.next();
            cleanInstances = new boolean[data.numInstances()];
            numCleanBuggyInstances = 0;
            numCleanBugfreeInstances = 0;
            for (int i = 0; i < data.numInstances(); i++) {
                int currentViolations = 0;
                Instance currentInstance = data.get(i);
                for (int j = 0; j < data.numAttributes(); j++) {
                    if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                        if (clusterNumber[i] > medianClusterNumber) {
                            // "buggy"
                            if (currentInstance.value(j) <= medians[j]) {
                                currentViolations++;
                            }
                        }
                        else {
                            // "not buggy"
                            if (currentInstance.value(j) > medians[j]) {
                                currentViolations++;
                            }
                        }
                    }
                }
                if (currentViolations == 0) {
                    cleanInstances[i] = true;
                    if (clusterNumber[i] > medianClusterNumber) {
                        numCleanBuggyInstances++;
                    }
                    else {
                        numCleanBugfreeInstances++;
                    }
                }
                else {
                    cleanInstances[i] = false;
                }
            }
        }
        while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);
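        // note that this loop assumes that some cutoff yields clean instances of both classes;
        // if all cutoffs are exhausted, the iterator throws a NoSuchElementException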

        // output some interesting information to provide insights into the CLAMI model
        Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
        for (int j = 0; j < data.numAttributes(); j++) {
            if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
                Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
            }
        }

        // finally modify the instances
        // drop the metrics that were not selected (also from the testdata)
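        // iterate backwards so that deleting an attribute does not shift the indices
        // that still have to be visited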
        for (int j = data.numAttributes() - 1; j >= 0; j--) {
            if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
                data.deleteAttributeAt(j);
                testdata.deleteAttributeAt(j);
            }
        }
        // drop the unclean instances, again iterating backwards to keep the indices valid
        for (int i = data.numInstances() - 1; i >= 0; i--) {
            if (!cleanInstances[i]) {
                data.delete(i);
            }
            else {
                // set the classification to the CLAMI label
                if (clusterNumber[i] > medianClusterNumber) {
                    data.get(i).setClassValue(1.0d);
                }
                else {
                    data.get(i).setClassValue(0.0d);
                }
            }
        }
    }

}