Context Navigation

CLAMIProcessor.java @ 86

Last change on this file since 86 was 86, checked in by sherbold, 8 years ago
switched workspace encoding to UTF-8 and fixed broken characters
Property svn:mime-type set to `text/plain`
File size: 8.7 KB

Line
1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	package de.ugoe.cs.cpdp.dataprocessing;
16
17	import java.util.Iterator;
18	import java.util.SortedSet;
19	import java.util.TreeSet;
20	import java.util.logging.Level;
21
22	import org.apache.commons.math3.stat.descriptive.rank.Median;
23
24	import de.ugoe.cs.util.console.Console;
25	import weka.core.Instance;
26	import weka.core.Instances;
27
28	/**
29	* <p>
30	* This processor implements the CLAMI strategy from the CLAMI paper at ASE 2014 be Nam et al. With
31	* CLAMI, the original classification of the data is removed and instead a new classification is
32	* created based on metric values that are higher than the median of the metric. Afterwards, a
33	* subset of the metrics is selected, where the violations of this median threshold is minimal.
34	* Finally, all instances where the metric violations are not correct are dropped, leaving
35	* noise-free data regarding the median threshold classification.
36	* </p>
37	* <p>
38	* This can also be done for the test data (i.e., TestAsTraining data selection), as the original
39	* classification is completely ignored. Hence, CLAMI is an approach for unsupervised learning.
40	* </p>
41	*
42	* @author Steffen Herbold
43	*/
44	public class CLAMIProcessor implements IProcessesingStrategy {
45
46	/*
47	* (non-Javadoc)
48	*
49	* @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
50	*/
51	@Override
52	public void setParameter(String parameters) {
53	// TODO Auto-generated method stub
54
55	}
56
57	/*
58	* (non-Javadoc)
59	*
60	* @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances,
61	* weka.core.Instances)
62	*/
63	@Override
64	public void apply(Instances testdata, Instances traindata) {
65	applyCLAMI(testdata, traindata);
66	}
67
68	/**
69	* <p>
70	* Applies the CLAMI processor to the data. The test data is also required, in order to
71	* guarantee a consistent metric set.
72	* </p>
73	*
74	* @param testdata
75	* test data; the data is not modified, only metrics are dropped
76	* @param data
77	* data to which the CLAMI processor is applied
78	*/
79	public void applyCLAMI(Instances testdata, Instances data) {
80
81	// first determine medians
82	double[] medians = new double[data.numAttributes()];
83	// get medians
84	for (int j = 0; j < data.numAttributes(); j++) {
85	if (j != data.classIndex()) {
86	medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
87	}
88	}
89	// now determine cluster number for each instance
90	double[] clusterNumber = new double[data.numInstances()];
91	for (int i = 0; i < data.numInstances(); i++) {
92	int countHighValues = 0;
93	Instance currentInstance = data.get(i);
94	for (int j = 0; j < data.numAttributes(); j++) {
95	if (j != data.classIndex()) {
96	if (currentInstance.value(j) > medians[j]) {
97	countHighValues++;
98	}
99	}
100	}
101	clusterNumber[i] = countHighValues;
102	}
103
104	// determine median of cluster number
105	Median m = new Median();
106	double medianClusterNumber = m.evaluate(clusterNumber);
107
108	// now we filter the metrics
109	int[] numMetricViolations = new int[data.numAttributes()];
110	for (int j = 0; j < data.numAttributes(); j++) {
111	int currentViolations = 0;
112	for (int i = 0; i < data.numInstances(); i++) {
113	Instance currentInstance = data.get(i);
114	if (j != data.classIndex()) {
115	if (clusterNumber[i] > medianClusterNumber) {
116	// "buggy"
117	if (currentInstance.value(j) <= medians[j]) {
118	currentViolations++;
119	}
120	}
121	else {
122	// "not buggy"
123	if (currentInstance.value(j) > medians[j]) {
124	currentViolations++;
125	}
126	}
127	}
128	}
129	numMetricViolations[j] = currentViolations;
130	}
131
132	SortedSet<Integer> distinctViolationCounts = new TreeSet<>();
133	for (int currentViolations : numMetricViolations) {
134	distinctViolationCounts.add(currentViolations);
135	}
136	Iterator<Integer> violationCountInterator = distinctViolationCounts.iterator();
137
138	int violationCutoff = violationCountInterator.next();
139	// now we filter the data;
140	// this is first tried with the metrics with fewest violations. if no buggy/bugfree
141	// instances remain, this is repeated with the next metrics with second fewest violations,
142	// and so on.
143	// this part is a bit unclear from the description in the paper, but I confirmed with the
144	// author that this is how they implemented it
145	boolean[] cleanInstances = new boolean[data.numInstances()];
146	int numCleanBuggyInstances = 0;
147	int numCleanBugfreeInstances = 0;
148	do {
149	violationCutoff = violationCountInterator.next();
150	cleanInstances = new boolean[data.numInstances()];
151	numCleanBuggyInstances = 0;
152	numCleanBugfreeInstances = 0;
153	for (int i = 0; i < data.numInstances(); i++) {
154	int currentViolations = 0;
155	Instance currentInstance = data.get(i);
156	for (int j = 0; j < data.numAttributes(); j++) {
157	if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
158	if (clusterNumber[i] > medianClusterNumber) {
159	// "buggy"
160	if (currentInstance.value(j) <= medians[j]) {
161	currentViolations++;
162	}
163	}
164	else {
165	// "not buggy"
166	if (currentInstance.value(j) > medians[j]) {
167	currentViolations++;
168	}
169	}
170	}
171	}
172	if (currentViolations == 0) {
173	cleanInstances[i] = true;
174	if (clusterNumber[i] > medianClusterNumber) {
175	numCleanBuggyInstances++;
176	}
177	else {
178	numCleanBugfreeInstances++;
179	}
180	}
181	else {
182	cleanInstances[i] = false;
183	}
184	}
185	}
186	while (numCleanBuggyInstances == 0 \|\| numCleanBugfreeInstances == 0);
187
188	// output some interesting information to provide insights into the CLAMI model
189	Console.traceln(Level.FINE, "Selected Metrics and Median-threshold: ");
190	for (int j = 0; j < data.numAttributes(); j++) {
191	if (j != data.classIndex() && numMetricViolations[j] == violationCutoff) {
192	Console.traceln(Level.FINE, "\t" + data.attribute(j).name() + ": " + medians[j]);
193	}
194	}
195
196	// finally modify the instances
197	// drop the metrics (also from the testdata)
198	for (int j = data.numAttributes() - 1; j >= 0; j--) {
199	if (j != data.classIndex() && numMetricViolations[j] != violationCutoff) {
200	data.deleteAttributeAt(j);
201	testdata.deleteAttributeAt(j);
202	}
203	}
204	// drop the unclean instances
205	for (int i = data.numInstances() - 1; i >= 0; i--) {
206	if (!cleanInstances[i]) {
207	data.delete(i);
208	}
209	else {
210	// set the classification
211	if (clusterNumber[i] > medianClusterNumber) {
212	data.get(i).setClassValue(1.0d);
213	}
214	else {
215	data.get(i).setClassValue(0.0d);
216	}
217	}
218	}
219	}
220
221	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format