// Copyright 2015 Georg-August-Universität Göttingen, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.ugoe.cs.cpdp.dataprocessing; import org.apache.commons.collections4.list.SetUniqueList; import weka.core.Attribute; import weka.core.Instance; import weka.core.Instances; /** * Median as reference transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression * Models for Predicting Fault-prone Code across Software Projects
*
* For each attribute value x, the new value is x + (median of the test data - median of the current * project) * * @author Steffen Herbold */ public class MedianAsReference implements ISetWiseProcessingStrategy, IProcessesingStrategy { /** * Does not have parameters. String is ignored. * * @param parameters * ignored */ @Override public void setParameter(String parameters) { // dummy } /** * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, * org.apache.commons.collections4.list.SetUniqueList) */ @Override public void apply(Instances testdata, SetUniqueList traindataSet) { final Attribute classAttribute = testdata.classAttribute(); final double[] median = new double[testdata.numAttributes()]; // test and train have the same number of attributes Attribute traindataClassAttribute; double[] currentmedian = new double[testdata.numAttributes()]; // get medians for (int j = 0; j < testdata.numAttributes(); j++) { if (testdata.attribute(j) != classAttribute) { median[j] = testdata.kthSmallestValue(j, (testdata.numInstances() + 1) >> 1); // (>>2 // -> // /2) } } // preprocess training data for (Instances traindata : traindataSet) { // get median of current training set traindataClassAttribute = traindata.classAttribute(); for (int j = 0; j < traindata.numAttributes(); j++) { if (traindata.attribute(j) != traindataClassAttribute && traindata.attribute(j).isNumeric()) { currentmedian[j] = traindata.kthSmallestValue(j, (traindata.numInstances() + 1) >> 1); // (>>2 // -> // /2) } } for (int i = 0; i < traindata.numInstances(); i++) { Instance instance = traindata.instance(i); for (int j = 0; j < traindata.numAttributes(); j++) { if (traindata.attribute(j) != classAttribute && traindata.attribute(j).isNumeric()) { instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); } } } } } /** * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, * weka.core.Instances) */ @Override public void apply(Instances testdata, Instances traindata) { final Attribute classAttribute = testdata.classAttribute(); final Attribute traindataClassAttribute = traindata.classAttribute(); final double[] median = new double[testdata.numAttributes()]; // test and train have the same number of attributes double[] currentmedian = new double[testdata.numAttributes()]; // get medians for (int j = 0; j < testdata.numAttributes(); j++) { if (testdata.attribute(j) != classAttribute) { median[j] = testdata.kthSmallestValue(j, (testdata.numInstances() + 1) >> 1); // (>>2 // -> // /2) } } // get median of current training set for (int j = 0; j < traindata.numAttributes(); j++) { if (traindata.attribute(j) != traindataClassAttribute && traindata.attribute(j).isNumeric()) { currentmedian[j] = traindata.kthSmallestValue(j, (traindata.numInstances() + 1) >> 1); // (>>2 -> // /2) } } // preprocess training data for (int i = 0; i < traindata.numInstances(); i++) { Instance instance = traindata.instance(i); for (int j = 0; j < traindata.numAttributes(); j++) { if (traindata.attribute(j) != classAttribute && traindata.attribute(j).isNumeric()) { instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); } } } } }