Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

CLAMIProcessor.java

Last change on this file was 135, checked in by sherbold, 8 years ago
code documentation and formatting
Property svn:mime-type set to `text/plain`
File size: 8.7 KB

Rev	Line
[86]	1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
[42]	2	//
	3	// Licensed under the Apache License, Version 2.0 (the "License");
	4	// you may not use this file except in compliance with the License.
	5	// You may obtain a copy of the License at
	6	//
	7	// http://www.apache.org/licenses/LICENSE-2.0
	8	//
	9	// Unless required by applicable law or agreed to in writing, software
	10	// distributed under the License is distributed on an "AS IS" BASIS,
	11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	12	// See the License for the specific language governing permissions and
	13	// limitations under the License.
	14
	15	package de.ugoe.cs.cpdp.dataprocessing;
	16
	17	import java.util.Iterator;
	18	import java.util.SortedSet;
	19	import java.util.TreeSet;
	20	import java.util.logging.Level;
	21
	22	import org.apache.commons.math3.stat.descriptive.rank.Median;
	23
	24	import de.ugoe.cs.util.console.Console;
	25	import weka.core.Instance;
	26	import weka.core.Instances;
	27
	28	/**
	29	* <p>
	30	* This processor implements the CLAMI strategy from the CLAMI paper at ASE 2014 be Nam et al. With
	31	* CLAMI, the original classification of the data is removed and instead a new classification is
	32	* created based on metric values that are higher than the median of the metric. Afterwards, a
	33	* subset of the metrics is selected, where the violations of this median threshold is minimal.
	34	* Finally, all instances where the metric violations are not correct are dropped, leaving
	35	* noise-free data regarding the median threshold classification.
	36	* </p>
	37	* <p>
	38	* This can also be done for the test data (i.e., TestAsTraining data selection), as the original
	39	* classification is completely ignored. Hence, CLAMI is an approach for unsupervised learning.
	40	* </p>
	41	*
	42	* @author Steffen Herbold
	43	*/
	44	public class CLAMIProcessor implements IProcessesingStrategy {
	45
	46	/*
	47	* (non-Javadoc)
	48	*
	49	* @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
	50	*/
	51	@Override
	52	public void setParameter(String parameters) {
[135]	53	// dummy, parameters not used
[42]	54	}
	55
	56	/*
	57	* (non-Javadoc)
	58	*
	59	* @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances,
	60	* weka.core.Instances)
	61	*/
	62	@Override
	63	public void apply(Instances testdata, Instances traindata) {
	64	applyCLAMI(testdata, traindata);
	65	}
	66
	67	/**
	68	* <p>
	69	* Applies the CLAMI processor to the data. The test data is also required, in order to
	70	* guarantee a consistent metric set.
	71	* </p>
	72	*
	73	* @param testdata
	74	* test data; the data is not modified, only metrics are dropped
	75	* @param data
	76	* data to which the CLAMI processor is applied
	77	*/
	78	public void applyCLAMI(Instances testdata, Instances data) {
	79
	80	// first determine medians
	81	double[] medians = new double[data.numAttributes()];
	82	// get medians
	83	for (int j = 0; j < data.numAttributes(); j++) {
	84	if (j != data.classIndex()) {
	85	medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
	86	}
	87	}
	88	// now determine cluster number for each instance
	89	double[] clusterNumber = new double[data.numInstances()];
	90	for (int i = 0; i < data.numInstances(); i++) {
	91	int countHighValues = 0;
	92	Instance currentInstance = data.get(i);
	93	for (int j = 0; j < data.numAttributes(); j++) {
	94	if (j != data.classIndex()) {
	95	if (currentInstance.value(j) > medians[j]) {
	96	countHighValues++;
	97	}
	98	}
	99	}
	100	clusterNumber[i] = countHighValues;
	101	}
	102
	103	// determine median of cluster number
	104	Median m = new Median();
	105	double medianClusterNumber = m.evaluate(clusterNumber);
	106
	107	// now we filter the metrics
	108	int[] numMetricViolations = new int[data.numAttributes()];
	109	for (int j = 0; j < data.numAttributes(); j++) {
	110	int currentViolations = 0;
	111	for (int i = 0; i < data.numInstances(); i++) {
	112	Instance currentInstance = data.get(i);
	113	if (j != data.classIndex()) {
	114	if (clusterNumber[i] > medianClusterNumber) {
	115	// "buggy"
	116	if (currentInstance.value(j) <= medians[j]) {
	117	currentViolations++;
	118	}
	119	}
	120	else {
	121	// "not buggy"
	122	if (currentInstance.value(j) > medians[j]) {
	123	currentViolations++;
	124	}
	125	}
	126	}
	127	}
	128	numMetricViolations[j] = currentViolations;
	129	}
	130
	131	SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
	132	for (int currentViolations : numMetricViolations) {
	133	distinctViolationCounts.add(currentViolations);
	134	}
	135	Iterator<Integer> violationCountInterator = distinctViolationCounts.iterator();
	136
	137	int violationCutoff = violationCountInterator.next();
	138	// now we filter the data;
[43]	139	// this is first tried with the metrics with fewest violations. if no buggy/bugfree
	140	// instances remain, this is repeated with the next metrics with second fewest violations,
	141	// and so on.
	142	// this part is a bit unclear from the description in the paper, but I confirmed with the
	143	// author that this is how they implemented it
[42]	144	boolean[] cleanInstances = new boolean[data.numInstances()];
	145	int numCleanBuggyInstances = 0;
	146	int numCleanBugfreeInstances = 0;
	147	do {
	148	violationCutoff = violationCountInterator.next();
	149	cleanInstances = new boolean[data.numInstances()];
	150	numCleanBuggyInstances = 0;
	151	numCleanBugfreeInstances = 0;
	152	for (int i = 0; i < data.numInstances(); i++) {
	153	int currentViolations = 0;
	154	Instance currentInstance = data.get(i);
	155	for (int j = 0; j < data.numAttributes(); j++) {
[43]	156	if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
[42]	157	if (clusterNumber[i] > medianClusterNumber) {
	158	// "buggy"
	159	if (currentInstance.value(j) <= medians[j]) {
	160	currentViolations++;
	161	}
	162	}
	163	else {
	164	// "not buggy"
	165	if (currentInstance.value(j) > medians[j]) {
	166	currentViolations++;
	167	}
	168	}
	169	}
	170	}
	171	if (currentViolations == 0) {
	172	cleanInstances[i] = true;
	173	if (clusterNumber[i] > medianClusterNumber) {
	174	numCleanBuggyInstances++;
	175	}
	176	else {
	177	numCleanBugfreeInstances++;
	178	}
	179	}
	180	else {
	181	cleanInstances[i] = false;
	182	}
	183	}
	184	}
	185	while (numCleanBuggyInstances == 0 \|\| numCleanBugfreeInstances == 0);
[43]	186
[42]	187	// output some interesting information to provide insights into the CLAMI model
	188	Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
[43]	189	for (int j = 0; j < data.numAttributes(); j++) {
	190	if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
[42]	191	Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
	192	}
	193	}
[43]	194
[42]	195	// finally modify the instances
	196	// drop the metrics (also from the testdata)
	197	for (int j = data.numAttributes() - 1; j >= 0; j--) {
[43]	198	if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
[42]	199	data.deleteAttributeAt(j);
	200	testdata.deleteAttributeAt(j);
	201	}
	202	}
	203	// drop the unclean instances
	204	for (int i = data.numInstances() - 1; i >= 0; i--) {
	205	if (!cleanInstances[i]) {
	206	data.delete(i);
	207	}
	208	else {
	209	// set the classification
	210	if (clusterNumber[i] > medianClusterNumber) {
	211	data.get(i).setClassValue(1.0d);
	212	}
	213	else {
	214	data.get(i).setClassValue(0.0d);
	215	}
	216	}
	217	}
	218	}
	219
	220	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format