source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/CLAMIProcessor.java @ 140

Last change on this file since 140 was 135, checked in by sherbold, 8 years ago
  • code documentation and formatting
  • Property svn:mime-type set to text/plain
File size: 8.7 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataprocessing;
16
17import java.util.Iterator;
18import java.util.SortedSet;
19import java.util.TreeSet;
20import java.util.logging.Level;
21
22import org.apache.commons.math3.stat.descriptive.rank.Median;
23
24import de.ugoe.cs.util.console.Console;
25import weka.core.Instance;
26import weka.core.Instances;
27
28/**
29 * <p>
30 * This processor implements the CLAMI strategy from the CLAMI paper at ASE 2014 be Nam et al. With
31 * CLAMI, the original classification of the data is removed and instead a new classification is
32 * created based on metric values that are higher than the median of the metric. Afterwards, a
33 * subset of the metrics is selected, where the violations of this median threshold is minimal.
34 * Finally, all instances where the metric violations are not correct are dropped, leaving
35 * noise-free data regarding the median threshold classification.
36 * </p>
37 * <p>
38 * This can also be done for the test data (i.e., TestAsTraining data selection), as the original
39 * classification is completely ignored. Hence, CLAMI is an approach for unsupervised learning.
40 * </p>
41 *
42 * @author Steffen Herbold
43 */
44public class CLAMIProcessor implements IProcessesingStrategy {
45
46    /*
47     * (non-Javadoc)
48     *
49     * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
50     */
51    @Override
52    public void setParameter(String parameters) {
53        // dummy, parameters not used
54    }
55
56    /*
57     * (non-Javadoc)
58     *
59     * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances,
60     * weka.core.Instances)
61     */
62    @Override
63    public void apply(Instances testdata, Instances traindata) {
64        applyCLAMI(testdata, traindata);
65    }
66
67    /**
68     * <p>
69     * Applies the CLAMI processor to the data. The test data is also required, in order to
70     * guarantee a consistent metric set.
71     * </p>
72     *
73     * @param testdata
74     *            test data; the data is not modified, only metrics are dropped
75     * @param data
76     *            data to which the CLAMI processor is applied
77     */
78    public void applyCLAMI(Instances testdata, Instances data) {
79
80        // first determine medians
81        double[] medians = new double[data.numAttributes()];
82        // get medians
83        for (int j = 0; j < data.numAttributes(); j++) {
84            if (j != data.classIndex()) {
85                medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
86            }
87        }
88        // now determine cluster number for each instance
89        double[] clusterNumber = new double[data.numInstances()];
90        for (int i = 0; i < data.numInstances(); i++) {
91            int countHighValues = 0;
92            Instance currentInstance = data.get(i);
93            for (int j = 0; j < data.numAttributes(); j++) {
94                if (j != data.classIndex()) {
95                    if (currentInstance.value(j) > medians[j]) {
96                        countHighValues++;
97                    }
98                }
99            }
100            clusterNumber[i] = countHighValues;
101        }
102
103        // determine median of cluster number
104        Median m = new Median();
105        double medianClusterNumber = m.evaluate(clusterNumber);
106
107        // now we filter the metrics
108        int[] numMetricViolations = new int[data.numAttributes()];
109        for (int j = 0; j < data.numAttributes(); j++) {
110            int currentViolations = 0;
111            for (int i = 0; i < data.numInstances(); i++) {
112                Instance currentInstance = data.get(i);
113                if (j != data.classIndex()) {
114                    if (clusterNumber[i] > medianClusterNumber) {
115                        // "buggy"
116                        if (currentInstance.value(j) <= medians[j]) {
117                            currentViolations++;
118                        }
119                    }
120                    else {
121                        // "not buggy"
122                        if (currentInstance.value(j) > medians[j]) {
123                            currentViolations++;
124                        }
125                    }
126                }
127            }
128            numMetricViolations[j] = currentViolations;
129        }
130
131        SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
132        for (int currentViolations : numMetricViolations) {
133            distinctViolationCounts.add(currentViolations);
134        }
135        Iterator<Integer> violationCountInterator = distinctViolationCounts.iterator();
136
137        int violationCutoff = violationCountInterator.next();
138        // now we filter the data;
139        // this is first tried with the metrics with fewest violations. if no buggy/bugfree
140        // instances remain, this is repeated with the next metrics with second fewest violations,
141        // and so on.
142        // this part is a bit unclear from the description in the paper, but I confirmed with the
143        // author that this is how they implemented it
144        boolean[] cleanInstances = new boolean[data.numInstances()];
145        int numCleanBuggyInstances = 0;
146        int numCleanBugfreeInstances = 0;
147        do {
148            violationCutoff = violationCountInterator.next();
149            cleanInstances = new boolean[data.numInstances()];
150            numCleanBuggyInstances = 0;
151            numCleanBugfreeInstances = 0;
152            for (int i = 0; i < data.numInstances(); i++) {
153                int currentViolations = 0;
154                Instance currentInstance = data.get(i);
155                for (int j = 0; j < data.numAttributes(); j++) {
156                    if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
157                        if (clusterNumber[i] > medianClusterNumber) {
158                            // "buggy"
159                            if (currentInstance.value(j) <= medians[j]) {
160                                currentViolations++;
161                            }
162                        }
163                        else {
164                            // "not buggy"
165                            if (currentInstance.value(j) > medians[j]) {
166                                currentViolations++;
167                            }
168                        }
169                    }
170                }
171                if (currentViolations == 0) {
172                    cleanInstances[i] = true;
173                    if (clusterNumber[i] > medianClusterNumber) {
174                        numCleanBuggyInstances++;
175                    }
176                    else {
177                        numCleanBugfreeInstances++;
178                    }
179                }
180                else {
181                    cleanInstances[i] = false;
182                }
183            }
184        }
185        while (numCleanBuggyInstances == 0 || numCleanBugfreeInstances == 0);
186
187        // output some interesting information to provide insights into the CLAMI model
188        Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
189        for (int j = 0; j < data.numAttributes(); j++) {
190            if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
191                Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
192            }
193        }
194
195        // finally modify the instances
196        // drop the metrics (also from the testdata)
197        for (int j = data.numAttributes() - 1; j >= 0; j--) {
198            if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
199                data.deleteAttributeAt(j);
200                testdata.deleteAttributeAt(j);
201            }
202        }
203        // drop the unclean instances
204        for (int i = data.numInstances() - 1; i >= 0; i--) {
205            if (!cleanInstances[i]) {
206                data.delete(i);
207            }
208            else {
209                // set the classification
210                if (clusterNumber[i] > medianClusterNumber) {
211                    data.get(i).setClassValue(1.0d);
212                }
213                else {
214                    data.get(i).setClassValue(0.0d);
215                }
216            }
217        }
218    }
219
220}
Note: See TracBrowser for help on using the repository browser.