source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/CLAProcessor.java @ 47

Last change on this file since 47 was 42, checked in by sherbold, 9 years ago
  • added CLA/CLAMI data processors (ASE 2015)
  • modified normal WEKA training to have a fallback in case too few instances of a certain class (defect-prone, non-defect-prone) are available: it now uses ZeroR in that case, i.e., a trivial classifier that always predicts the class that appears more often.
  • Property svn:mime-type set to text/plain
File size: 3.6 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataprocessing;
16
17import org.apache.commons.math3.stat.descriptive.rank.Median;
18
19import weka.core.Instance;
20import weka.core.Instances;
21
22/**
23 * <p>
24 * This processor implements the CLA strategy from the CLAMI paper at ASE 2014 be Nam et al. With
25 * CLA, the original classification of the data is removed and instead a new classification is
26 * created based on metric values that are higher than the median of the metric.
27 * </p>
28 * <p>
29 * This can also be done for the test data (i.e., TestAsTraining data selection), as the original
30 * classification is completely ignored. Hence, CLA is an approach for unsupervised learning.
31 * </p>
32 *
33 * @author Steffen Herbold
34 */
35public class CLAProcessor implements IProcessesingStrategy {
36
37    /*
38     * (non-Javadoc)
39     *
40     * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
41     */
42    @Override
43    public void setParameter(String parameters) {
44        // TODO Auto-generated method stub
45
46    }
47
48    /*
49     * (non-Javadoc)
50     *
51     * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. Instances,
52     * weka.core.Instances)
53     */
54    @Override
55    public void apply(Instances testdata, Instances traindata) {
56        applyCLA(traindata);
57    }
58
59    /**
60     * Applies the CLA processor the the data.
61     *
62     * @param data
63     *            data to which the processor is applied
64     */
65    public void applyCLA(Instances data) {
66        // first determine medians
67        double[] medians = new double[data.numAttributes()];
68        // get medians
69        for (int j = 0; j < data.numAttributes(); j++) {
70            if (j != data.classIndex()) {
71                medians[j] = data.kthSmallestValue(j, (data.numInstances() + 1) >> 1);
72            }
73        }
74        // now determine cluster number for each instance
75        double[] clusterNumber = new double[data.numInstances()];
76        for (int i = 0; i < data.numInstances(); i++) {
77            int countHighValues = 0;
78            Instance currentInstance = data.get(i);
79            for (int j = 0; j < data.numAttributes(); j++) {
80                if (j != data.classIndex()) {
81                    if (currentInstance.value(j) > medians[j]) {
82                        countHighValues++;
83                    }
84                }
85            }
86            clusterNumber[i] = countHighValues;
87        }
88
89        // determine median of cluster number
90        Median m = new Median();
91        double medianClusterNumber = m.evaluate(clusterNumber);
92
93        // finally modify the instances
94        // drop the unclean instances
95        for (int i = data.numInstances() - 1; i >= 0; i--) {
96            // set the classification
97            if (clusterNumber[i] > medianClusterNumber) {
98                data.get(i).setClassValue(1.0d);
99            }
100            else {
101                data.get(i).setClassValue(0.0d);
102            }
103        }
104    }
105
106}
Note: See TracBrowser for help on using the repository browser.