// Copyright 2015 Georg-August-Universität Göttingen, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.ugoe.cs.cpdp.dataprocessing; import java.util.Iterator; import java.util.SortedSet; import java.util.TreeSet; import java.util.logging.Level; import org.apache.commons.math3.stat.descriptive.rank.Median; import de.ugoe.cs.util.console.Console; import weka.core.Instance; import weka.core.Instances; /** *
* This processor implements the CLAMI strategy from the CLAMI paper at ASE 2014 be Nam et al. With * CLAMI, the original classification of the data is removed and instead a new classification is * created based on metric values that are higher than the median of the metric. Afterwards, a * subset of the metrics is selected, where the violations of this median threshold is minimal. * Finally, all instances where the metric violations are not correct are dropped, leaving * noise-free data regarding the median threshold classification. *
** This can also be done for the test data (i.e., TestAsTraining data selection), as the original * classification is completely ignored. Hence, CLAMI is an approach for unsupervised learning. *
* * @author Steffen Herbold */ public class CLAMIProcessor implements IProcessesingStrategy { /* * (non-Javadoc) * * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) */ @Override public void setParameter(String parameters) { // TODO Auto-generated method stub } /* * (non-Javadoc) * * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, * weka.core.Instances) */ @Override public void apply(Instances testdata, Instances traindata) { applyCLAMI(testdata, traindata); } /** ** Applies the CLAMI processor to the data. The test data is also required, in order to * guarantee a consistent metric set. *
* * @param testdata * test data; the data is not modified, only metrics are dropped * @param data * data to which the CLAMI processor is applied */ public void applyCLAMI(Instances testdata, Instances data) { // first determine medians double[] medians = new double[data.numAttributes()]; // get medians for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1); } } // now determine cluster number for each instance double[] clusterNumber = new double[data.numInstances()]; for (int i = 0; i < data.numInstances(); i++) { int countHighValues = 0; Instance currentInstance = data.get(i); for (int j = 0; j < data.numAttributes(); j++) { if (j != data.classIndex()) { if (currentInstance.value(j) > medians[j]) { countHighValues++; } } } clusterNumber[i] = countHighValues; } // determine median of cluster number Median m = new Median(); double medianClusterNumber = m.evaluate(clusterNumber); // now we filter the metrics int[] numMetricViolations = new int[data.numAttributes()]; for (int j = 0; j < data.numAttributes(); j++) { int currentViolations = 0; for (int i = 0; i < data.numInstances(); i++) { Instance currentInstance = data.get(i); if (j != data.classIndex()) { if (clusterNumber[i] > medianClusterNumber) { // "buggy" if (currentInstance.value(j) <= medians[j]) { currentViolations++; } } else { // "not buggy" if (currentInstance.value(j) > medians[j]) { currentViolations++; } } } } numMetricViolations[j] = currentViolations; } SortedSet