Context Navigation

CLAMIProcessor.java @ 135

Last change on this file since 135 was 135, checked in by sherbold, 9 years ago
code documentation and formatting
Property svn:mime-type set to `text/plain`
File size: 8.7 KB

Line
1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	package de.ugoe.cs.cpdp.dataprocessing;
16
17	import java.util.Iterator;
18	import java.util.SortedSet;
19	import java.util.TreeSet;
20	import java.util.logging.Level;
21
22	import org.apache.commons.math3.stat.descriptive.rank.Median;
23
24	import de.ugoe.cs.util.console.Console;
25	import weka.core.Instance;
26	import weka.core.Instances;
27
28	/**
29	* <p>
30	* This processor implements the CLAMI strategy from the CLAMI paper at ASE 2014 be Nam et al. With
31	* CLAMI, the original classification of the data is removed and instead a new classification is
32	* created based on metric values that are higher than the median of the metric. Afterwards, a
33	* subset of the metrics is selected, where the violations of this median threshold is minimal.
34	* Finally, all instances where the metric violations are not correct are dropped, leaving
35	* noise-free data regarding the median threshold classification.
36	* </p>
37	* <p>
38	* This can also be done for the test data (i.e., TestAsTraining data selection), as the original
39	* classification is completely ignored. Hence, CLAMI is an approach for unsupervised learning.
40	* </p>
41	*
42	* @author Steffen Herbold
43	*/
44	public class CLAMIProcessor implements IProcessesingStrategy {
45
46	/*
47	* (non-Javadoc)
48	*
49	* @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
50	*/
51	@Override
52	public void setParameter(String parameters) {
53	// dummy, parameters not used
54	}
55
56	/*
57	* (non-Javadoc)
58	*
59	* @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances,
60	* weka.core.Instances)
61	*/
62	@Override
63	public void apply(Instances testdata, Instances traindata) {
64	applyCLAMI(testdata, traindata);
65	}
66
67	/**
68	* <p>
69	* Applies the CLAMI processor to the data. The test data is also required, in order to
70	* guarantee a consistent metric set.
71	* </p>
72	*
73	* @param testdata
74	* test data; the data is not modified, only metrics are dropped
75	* @param data
76	* data to which the CLAMI processor is applied
77	*/
78	public void applyCLAMI(Instances testdata, Instances data) {
79
80	// first determine medians
81	double[] medians = new double[data.numAttributes()];
82	// get medians
83	for (int j = 0; j < data.numAttributes(); j++) {
84	if (j != data.classIndex()) {
85	medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
86	}
87	}
88	// now determine cluster number for each instance
89	double[] clusterNumber = new double[data.numInstances()];
90	for (int i = 0; i < data.numInstances(); i++) {
91	int countHighValues = 0;
92	Instance currentInstance = data.get(i);
93	for (int j = 0; j < data.numAttributes(); j++) {
94	if (j != data.classIndex()) {
95	if (currentInstance.value(j) > medians[j]) {
96	countHighValues++;
97	}
98	}
99	}
100	clusterNumber[i] = countHighValues;
101	}
102
103	// determine median of cluster number
104	Median m = new Median();
105	double medianClusterNumber = m.evaluate(clusterNumber);
106
107	// now we filter the metrics
108	int[] numMetricViolations = new int[data.numAttributes()];
109	for (int j = 0; j < data.numAttributes(); j++) {
110	int currentViolations = 0;
111	for (int i = 0; i < data.numInstances(); i++) {
112	Instance currentInstance = data.get(i);
113	if (j != data.classIndex()) {
114	if (clusterNumber[i] > medianClusterNumber) {
115	// "buggy"
116	if (currentInstance.value(j) <= medians[j]) {
117	currentViolations++;
118	}
119	}
120	else {
121	// "not buggy"
122	if (currentInstance.value(j) > medians[j]) {
123	currentViolations++;
124	}
125	}
126	}
127	}
128	numMetricViolations[j] = currentViolations;
129	}
130
131	SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
132	for (int currentViolations : numMetricViolations) {
133	distinctViolationCounts.add(currentViolations);
134	}
135	Iterator<Integer> violationCountInterator = distinctViolationCounts.iterator();
136
137	int violationCutoff = violationCountInterator.next();
138	// now we filter the data;
139	// this is first tried with the metrics with fewest violations. if no buggy/bugfree
140	// instances remain, this is repeated with the next metrics with second fewest violations,
141	// and so on.
142	// this part is a bit unclear from the description in the paper, but I confirmed with the
143	// author that this is how they implemented it
144	boolean[] cleanInstances = new boolean[data.numInstances()];
145	int numCleanBuggyInstances = 0;
146	int numCleanBugfreeInstances = 0;
147	do {
148	violationCutoff = violationCountInterator.next();
149	cleanInstances = new boolean[data.numInstances()];
150	numCleanBuggyInstances = 0;
151	numCleanBugfreeInstances = 0;
152	for (int i = 0; i < data.numInstances(); i++) {
153	int currentViolations = 0;
154	Instance currentInstance = data.get(i);
155	for (int j = 0; j < data.numAttributes(); j++) {
156	if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
157	if (clusterNumber[i] > medianClusterNumber) {
158	// "buggy"
159	if (currentInstance.value(j) <= medians[j]) {
160	currentViolations++;
161	}
162	}
163	else {
164	// "not buggy"
165	if (currentInstance.value(j) > medians[j]) {
166	currentViolations++;
167	}
168	}
169	}
170	}
171	if (currentViolations == 0) {
172	cleanInstances[i] = true;
173	if (clusterNumber[i] > medianClusterNumber) {
174	numCleanBuggyInstances++;
175	}
176	else {
177	numCleanBugfreeInstances++;
178	}
179	}
180	else {
181	cleanInstances[i] = false;
182	}
183	}
184	}
185	while (numCleanBuggyInstances == 0 \|\| numCleanBugfreeInstances == 0);
186
187	// output some interesting information to provide insights into the CLAMI model
188	Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
189	for (int j = 0; j < data.numAttributes(); j++) {
190	if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
191	Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
192	}
193	}
194
195	// finally modify the instances
196	// drop the metrics (also from the testdata)
197	for (int j = data.numAttributes() - 1; j >= 0; j--) {
198	if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
199	data.deleteAttributeAt(j);
200	testdata.deleteAttributeAt(j);
201	}
202	}
203	// drop the unclean instances
204	for (int i = data.numInstances() - 1; i >= 0; i--) {
205	if (!cleanInstances[i]) {
206	data.delete(i);
207	}
208	else {
209	// set the classification
210	if (clusterNumber[i] > medianClusterNumber) {
211	data.get(i).setClassValue(1.0d);
212	}
213	else {
214	data.get(i).setClassValue(0.0d);
215	}
216	}
217	}
218	}
219
220	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format