Changeset 41 for trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing
- Timestamp:
- 09/24/15 10:59:05 (9 years ago)
- Location:
- trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing
- Files:
-
- 18 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AttributeNonRemoval.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Removes attributes from all data sets, except the one defined, using their name. 24 * Removes attributes from all data sets, except the one defined, using their name. 25 * 11 26 * @author Fabian Trautsch 12 27 */ 13 28 public class AttributeNonRemoval implements ISetWiseProcessingStrategy, IProcessesingStrategy { 14 29 15 /** 16 * names of the attributes to be kept (determined by {@link #setParameter(String)}) 17 */ 18 private ArrayList<String> attributeNames = new ArrayList<String>(); 19 20 /** 21 * Sets that attributes that will be kept. The string contains the blank-separated names of the attributes to be kept. 22 * <br><br> 23 * Note, that keeping of attributes with blanks is currently not supported! 24 * @param parameters string with the blank-separated attribute names 25 */ 26 @Override 27 public void setParameter(String parameters) { 28 if( parameters!=null ) { 29 String[] attributeNamesArray = parameters.split(" "); 30 for(String attributeName : attributeNamesArray) { 31 attributeNames.add(attributeName); 32 } 33 } 34 } 30 /** 31 * names of the attributes to be kept (determined by {@link #setParameter(String)}) 32 */ 33 private ArrayList<String> attributeNames = new ArrayList<String>(); 35 34 36 /** 37 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 38 */ 39 @Override 40 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 41 for( String attributeName : attributeNames ) { 42 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 43 if(!attributeName.equals(testdata.attribute(i).name()) ) { 44 testdata.deleteAttributeAt(i); 45 for( Instances traindata : traindataSet ) { 46 traindata.deleteAttributeAt(i); 47 } 48 } 49 } 50 } 51 } 35 /** 36 * Sets that attributes that will be kept. The string contains the blank-separated names of the 37 * attributes to be kept. <br> 38 * <br> 39 * Note, that keeping of attributes with blanks is currently not supported! 40 * 41 * @param parameters 42 * string with the blank-separated attribute names 43 */ 44 @Override 45 public void setParameter(String parameters) { 46 if (parameters != null) { 47 String[] attributeNamesArray = parameters.split(" "); 48 for (String attributeName : attributeNamesArray) { 49 attributeNames.add(attributeName); 50 } 51 } 52 } 52 53 53 /** 54 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 55 */ 56 @Override 57 public void apply(Instances testdata, Instances traindata) { 58 for(int i=testdata.numAttributes()-1; i>=0; i--) { 59 if(!attributeNames.contains(testdata.attribute(i).name())) { 60 testdata.deleteAttributeAt(i); 61 traindata.deleteAttributeAt(i); 62 } 63 } 64 } 54 /** 55 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 56 * org.apache.commons.collections4.list.SetUniqueList) 57 */ 58 @Override 59 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 60 for (String attributeName : attributeNames) { 61 for (int i = 0; i < testdata.numAttributes(); i++) { 62 if (!attributeName.equals(testdata.attribute(i).name())) { 63 testdata.deleteAttributeAt(i); 64 for (Instances traindata : traindataSet) { 65 traindata.deleteAttributeAt(i); 66 } 67 } 68 } 69 } 70 } 71 72 /** 73 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 74 * weka.core.Instances) 75 */ 76 @Override 77 public void apply(Instances testdata, Instances traindata) { 78 for (int i = testdata.numAttributes() - 1; i >= 0; i--) { 79 if (!attributeNames.contains(testdata.attribute(i).name())) { 80 testdata.deleteAttributeAt(i); 81 traindata.deleteAttributeAt(i); 82 } 83 } 84 } 65 85 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AttributeRemoval.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 6 20 7 21 /** 8 * Removes an attributes from all data sets using their name. 22 * Removes an attributes from all data sets using their name. 23 * 9 24 * @author Steffen Herbold 10 25 */ 11 26 public class AttributeRemoval implements ISetWiseProcessingStrategy, IProcessesingStrategy { 12 27 13 /** 14 * names of the attributes to be removed (determined by {@link #setParameter(String)}) 15 */ 16 private String[] attributeNames = new String[]{}; 17 18 /** 19 * Sets that attributes that will be removed. The string contains the blank-separated names of the attributes to be removed. 20 * <br><br> 21 * Note, that removal of attributes with blanks is currently not supported! 22 * @param parameters string with the blank-separated attribute names 23 */ 24 @Override 25 public void setParameter(String parameters) { 26 if( parameters!=null ) { 27 attributeNames = parameters.split(" "); 28 } 29 } 28 /** 29 * names of the attributes to be removed (determined by {@link #setParameter(String)}) 30 */ 31 private String[] attributeNames = new String[] { }; 30 32 31 /** 32 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 33 */ 34 @Override 35 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 36 for( String attributeName : attributeNames ) { 37 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 38 if( attributeName.equals(testdata.attribute(i).name()) ) { 39 testdata.deleteAttributeAt(i); 40 for( Instances traindata : traindataSet ) { 41 traindata.deleteAttributeAt(i); 42 } 43 } 44 } 45 } 46 } 33 /** 34 * Sets that attributes that will be removed. The string contains the blank-separated names of 35 * the attributes to be removed. <br> 36 * <br> 37 * Note, that removal of attributes with blanks is currently not supported! 38 * 39 * @param parameters 40 * string with the blank-separated attribute names 41 */ 42 @Override 43 public void setParameter(String parameters) { 44 if (parameters != null) { 45 attributeNames = parameters.split(" "); 46 } 47 } 47 48 48 /** 49 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 50 */ 51 @Override 52 public void apply(Instances testdata, Instances traindata) { 53 for( String attributeName : attributeNames ) { 54 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 55 if( attributeName.equals(testdata.attribute(i).name()) ) { 56 testdata.deleteAttributeAt(i); 57 traindata.deleteAttributeAt(i); 58 } 59 } 60 } 61 } 49 /** 50 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 51 * org.apache.commons.collections4.list.SetUniqueList) 52 */ 53 @Override 54 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 55 for (String attributeName : attributeNames) { 56 for (int i = 0; i < testdata.numAttributes(); i++) { 57 if (attributeName.equals(testdata.attribute(i).name())) { 58 testdata.deleteAttributeAt(i); 59 for (Instances traindata : traindataSet) { 60 traindata.deleteAttributeAt(i); 61 } 62 } 63 } 64 } 65 } 66 67 /** 68 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 69 * weka.core.Instances) 70 */ 71 @Override 72 public void apply(Instances testdata, Instances traindata) { 73 for (String attributeName : attributeNames) { 74 for (int i = 0; i < testdata.numAttributes(); i++) { 75 if (attributeName.equals(testdata.attribute(i).name())) { 76 testdata.deleteAttributeAt(i); 77 traindata.deleteAttributeAt(i); 78 } 79 } 80 } 81 } 62 82 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/AverageStandardization.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Standardization procedure after Watanabe et al.: Adapting a Fault Prediction Model to Allow Inter Language Reuse. 11 * <br><br> 12 * In comparison to Watanabe et al., we transform training data instead of the test data. Otherwise, this approach would not be feasible with multiple projects. 24 * Standardization procedure after Watanabe et al.: Adapting a Fault Prediction Model to Allow Inter 25 * Language Reuse. <br> 26 * <br> 27 * In comparison to Watanabe et al., we transform training data instead of the test data. Otherwise, 28 * this approach would not be feasible with multiple projects. 29 * 13 30 * @author Steffen Herbold 14 31 */ 15 32 public class AverageStandardization implements ISetWiseProcessingStrategy, IProcessesingStrategy { 16 33 17 /** 18 * Does not have parameters. String is ignored. 19 * @param parameters ignored 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 // dummy 24 } 34 /** 35 * Does not have parameters. String is ignored. 36 * 37 * @param parameters 38 * ignored 39 */ 40 @Override 41 public void setParameter(String parameters) { 42 // dummy 43 } 25 44 26 /** 27 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 28 */ 29 @Override 30 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 31 final Attribute classAttribute = testdata.classAttribute(); 32 33 final double[] meanTest = new double[testdata.numAttributes()]; 34 35 // get means of testdata 36 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 37 if( testdata.attribute(j)!=classAttribute ) { 38 meanTest[j] = testdata.meanOrMode(j); 39 } 40 } 41 42 // preprocess training data 43 for( Instances traindata : traindataSet ) { 44 double[] meanTrain = new double[testdata.numAttributes()]; 45 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 46 if( testdata.attribute(j)!=classAttribute ) { 47 meanTrain[j] = traindata.meanOrMode(j); 48 } 49 } 50 51 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 52 Instance instance = traindata.instance(i); 53 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 54 if( testdata.attribute(j)!=classAttribute ) { 55 instance.setValue(j, instance.value(j)*meanTest[j]/meanTrain[j]); 56 } 57 } 58 } 59 } 60 } 45 /** 46 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 47 * org.apache.commons.collections4.list.SetUniqueList) 48 */ 49 @Override 50 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 51 final Attribute classAttribute = testdata.classAttribute(); 61 52 62 /** 63 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 64 */ 65 @Override 66 public void apply(Instances testdata, Instances traindata) { 67 final Attribute classAttribute = testdata.classAttribute(); 68 69 final double[] meanTest = new double[testdata.numAttributes()]; 70 71 // get means of testdata 72 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 73 if( testdata.attribute(j)!=classAttribute ) { 74 meanTest[j] = testdata.meanOrMode(j); 75 } 76 } 77 78 // preprocess training data 79 final double[] meanTrain = new double[testdata.numAttributes()]; 80 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 81 if( testdata.attribute(j)!=classAttribute ) { 82 meanTrain[j] = traindata.meanOrMode(j); 83 } 84 } 85 86 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 87 Instance instance = traindata.instance(i); 88 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 89 if( testdata.attribute(j)!=classAttribute ) { 90 instance.setValue(j, instance.value(j)*meanTest[j]/meanTrain[j]); 91 } 92 } 93 } 94 } 53 final double[] meanTest = new double[testdata.numAttributes()]; 54 55 // get means of testdata 56 for (int j = 0; j < testdata.numAttributes(); j++) { 57 if (testdata.attribute(j) != classAttribute) { 58 meanTest[j] = testdata.meanOrMode(j); 59 } 60 } 61 62 // preprocess training data 63 for (Instances traindata : traindataSet) { 64 double[] meanTrain = new double[testdata.numAttributes()]; 65 for (int j = 0; j < testdata.numAttributes(); j++) { 66 if (testdata.attribute(j) != classAttribute) { 67 meanTrain[j] = traindata.meanOrMode(j); 68 } 69 } 70 71 for (int i = 0; i < traindata.numInstances(); i++) { 72 Instance instance = traindata.instance(i); 73 for (int j = 0; j < testdata.numAttributes(); j++) { 74 if (testdata.attribute(j) != classAttribute) { 75 instance.setValue(j, instance.value(j) * meanTest[j] / meanTrain[j]); 76 } 77 } 78 } 79 } 80 } 81 82 /** 83 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 84 * weka.core.Instances) 85 */ 86 @Override 87 public void apply(Instances testdata, Instances traindata) { 88 final Attribute classAttribute = testdata.classAttribute(); 89 90 final double[] meanTest = new double[testdata.numAttributes()]; 91 92 // get means of testdata 93 for (int j = 0; j < testdata.numAttributes(); j++) { 94 if (testdata.attribute(j) != classAttribute) { 95 meanTest[j] = testdata.meanOrMode(j); 96 } 97 } 98 99 // preprocess training data 100 final double[] meanTrain = new double[testdata.numAttributes()]; 101 for (int j = 0; j < testdata.numAttributes(); j++) { 102 if (testdata.attribute(j) != classAttribute) { 103 meanTrain[j] = traindata.meanOrMode(j); 104 } 105 } 106 107 for (int i = 0; i < traindata.numInstances(); i++) { 108 Instance instance = traindata.instance(i); 109 for (int j = 0; j < testdata.numAttributes(); j++) { 110 if (testdata.attribute(j) != classAttribute) { 111 instance.setValue(j, instance.value(j) * meanTest[j] / meanTrain[j]); 112 } 113 } 114 } 115 } 95 116 96 117 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/BiasedWeights.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 7 21 8 22 /** 9 * Sets the bias of the weights of the training data. By using a bias of 0.5 (default value) the total weight of the positive instances (i.e. 10 * fault-prone) is equal to the total weight of the negative instances (i.e. non-fault-prone). Otherwise the weights between the two will be 11 * distributed according to the bias, where <0.5 means in favor of the negative instances and >0.5 in favor of the positive instances. 12 * equal to the total weight of the test 23 * Sets the bias of the weights of the training data. By using a bias of 0.5 (default value) the 24 * total weight of the positive instances (i.e. fault-prone) is equal to the total weight of the 25 * negative instances (i.e. non-fault-prone). Otherwise the weights between the two will be 26 * distributed according to the bias, where <0.5 means in favor of the negative instances and 27 * >0.5 in favor of the positive instances. equal to the total weight of the test 28 * 13 29 * @author Steffen Herbold 14 30 */ 15 31 public class BiasedWeights implements IProcessesingStrategy, ISetWiseProcessingStrategy { 16 32 17 /** 18 * bias used for the weighting 19 */ 20 private double bias = 0.5; 21 22 23 /** 24 * Sets the bias to be used for weighting. 25 * @param parameters string with the bias 26 */ 27 @Override 28 public void setParameter(String parameters) { 29 bias = Double.parseDouble(parameters); 30 } 33 /** 34 * bias used for the weighting 35 */ 36 private double bias = 0.5; 31 37 32 /** 33 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 34 */ 35 @Override 36 public void apply(Instances testdata, Instances traindata) { 37 //setBiasedWeights(testdata); 38 setBiasedWeights(traindata); 39 } 40 41 /** 42 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 43 */ 44 @Override 45 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 46 for( Instances traindata : traindataSet ) { 47 setBiasedWeights(traindata); 48 } 49 } 50 51 /** 52 * Helper method that sets the weights for a given data set. 53 * @param data data set whose weights are set 54 */ 55 private void setBiasedWeights(Instances data) { 56 final int classIndex = data.classIndex(); 57 58 final int[] counts = data.attributeStats(classIndex).nominalCounts; 59 60 final double weightNegatives = ((1-bias)*data.numInstances()) / counts[0]; 61 final double weightPositives = (bias*data.numInstances()) / counts[1]; 62 63 64 for( int i=0 ; i<data.numInstances() ; i++ ) { 65 Instance instance = data.instance(i); 66 if( instance.value(classIndex)==0 ) { 67 instance.setWeight(weightNegatives); 68 } 69 if( instance.value(classIndex)==1 ) { 70 instance.setWeight(weightPositives); 71 } 72 } 73 } 38 /** 39 * Sets the bias to be used for weighting. 40 * 41 * @param parameters 42 * string with the bias 43 */ 44 @Override 45 public void setParameter(String parameters) { 46 bias = Double.parseDouble(parameters); 47 } 74 48 75 49 /** 50 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 51 * weka.core.Instances) 52 */ 53 @Override 54 public void apply(Instances testdata, Instances traindata) { 55 // setBiasedWeights(testdata); 56 setBiasedWeights(traindata); 57 } 58 59 /** 60 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 61 * org.apache.commons.collections4.list.SetUniqueList) 62 */ 63 @Override 64 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 65 for (Instances traindata : traindataSet) { 66 setBiasedWeights(traindata); 67 } 68 } 69 70 /** 71 * Helper method that sets the weights for a given data set. 72 * 73 * @param data 74 * data set whose weights are set 75 */ 76 private void setBiasedWeights(Instances data) { 77 final int classIndex = data.classIndex(); 78 79 final int[] counts = data.attributeStats(classIndex).nominalCounts; 80 81 final double weightNegatives = ((1 - bias) * data.numInstances()) / counts[0]; 82 final double weightPositives = (bias * data.numInstances()) / counts[1]; 83 84 for (int i = 0; i < data.numInstances(); i++) { 85 Instance instance = data.instance(i); 86 if (instance.value(classIndex) == 0) { 87 instance.setWeight(weightNegatives); 88 } 89 if (instance.value(classIndex) == 1) { 90 instance.setWeight(weightPositives); 91 } 92 } 93 } 76 94 77 95 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/DataGravitation.java
r10 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Implements an approach for data weighting suggested after Y. Ma, G. Luo, X. Zeng, and A. Chen: Transfer learning for 11 * cross-company software defect prediction. The instances are weighted higher, the more attributes are within the range they are in the training data. 24 * Implements an approach for data weighting suggested after Y. Ma, G. Luo, X. Zeng, and A. Chen: 25 * Transfer learning for cross-company software defect prediction. The instances are weighted 26 * higher, the more attributes are within the range they are in the training data. 27 * 12 28 * @author Steffen Herbold 13 29 */ 14 30 public class DataGravitation implements IProcessesingStrategy, ISetWiseProcessingStrategy { 15 31 16 /** 17 * Does not have parameters. String is ignored. 18 * @param parameters ignored 19 */ 20 @Override 21 public void setParameter(String parameters) { 22 // dummy 23 } 24 25 /* (non-Javadoc) 26 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 27 */ 28 @Override 29 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 30 for( Instances traindata : traindataSet ) { 31 apply(testdata, traindata); 32 } 33 } 32 /** 33 * Does not have parameters. String is ignored. 34 * 35 * @param parameters 36 * ignored 37 */ 38 @Override 39 public void setParameter(String parameters) { 40 // dummy 41 } 34 42 35 /* (non-Javadoc) 36 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 37 */ 38 @Override 39 public void apply(Instances testdata, Instances traindata) { 40 Attribute classAtt = testdata.classAttribute(); 41 42 double[] minAttValues = new double[testdata.numAttributes()]; 43 double[] maxAttValues = new double[testdata.numAttributes()]; 44 double[] weights = new double[traindata.numInstances()]; 45 double weightsum = 0.0; 46 47 for( int j=0; j<testdata.numAttributes(); j++) { 48 if( testdata.attribute(j)!=classAtt ) { 49 minAttValues[j] = testdata.attributeStats(j).numericStats.min; 50 maxAttValues[j] = testdata.attributeStats(j).numericStats.max; 51 } 52 } 53 54 for( int i=0; i<traindata.numInstances(); i++ ) { 55 Instance inst = traindata.instance(i); 56 int similar = 0; 57 for( int j=0; j<testdata.numAttributes(); j++ ) { 58 if( testdata.attribute(j)!=classAtt ) { 59 if( inst.value(j)>=minAttValues[j] && inst.value(j)<=maxAttValues[j] ) { 60 similar++; 61 } 62 } 63 } 64 weights[i] = similar/Math.sqrt(testdata.numAttributes()-similar); 65 weightsum += weights[i]; 66 } 67 for( int i=0; i<traindata.numInstances(); i++ ) { 68 traindata.instance(i).setWeight(weights[i]*traindata.numInstances()/weightsum); 69 } 70 } 43 /* 44 * (non-Javadoc) 45 * 46 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, 47 * org.apache.commons.collections4.list.SetUniqueList) 48 */ 49 @Override 50 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 51 for (Instances traindata : traindataSet) { 52 apply(testdata, traindata); 53 } 54 } 55 56 /* 57 * (non-Javadoc) 58 * 59 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 60 * weka.core.Instances) 61 */ 62 @Override 63 public void apply(Instances testdata, Instances traindata) { 64 Attribute classAtt = testdata.classAttribute(); 65 66 double[] minAttValues = new double[testdata.numAttributes()]; 67 double[] maxAttValues = new double[testdata.numAttributes()]; 68 double[] weights = new double[traindata.numInstances()]; 69 double weightsum = 0.0; 70 71 for (int j = 0; j < testdata.numAttributes(); j++) { 72 if (testdata.attribute(j) != classAtt) { 73 minAttValues[j] = testdata.attributeStats(j).numericStats.min; 74 maxAttValues[j] = testdata.attributeStats(j).numericStats.max; 75 } 76 } 77 78 for (int i = 0; i < traindata.numInstances(); i++) { 79 Instance inst = traindata.instance(i); 80 int similar = 0; 81 for (int j = 0; j < testdata.numAttributes(); j++) { 82 if (testdata.attribute(j) != classAtt) { 83 if (inst.value(j) >= minAttValues[j] && inst.value(j) <= maxAttValues[j]) { 84 similar++; 85 } 86 } 87 } 88 weights[i] = similar / Math.sqrt(testdata.numAttributes() - similar); 89 weightsum += weights[i]; 90 } 91 for (int i = 0; i < traindata.numInstances(); i++) { 92 traindata.instance(i).setWeight(weights[i] * traindata.numInstances() / weightsum); 93 } 94 } 71 95 72 96 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/IProcessesingStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 5 19 6 20 /** 7 * A data processing strategy that is applied to the test data and a single set of training data. 21 * A data processing strategy that is applied to the test data and a single set of training data. 22 * 8 23 * @author Steffen Herbold 9 24 */ 10 25 public interface IProcessesingStrategy extends IParameterizable { 11 12 /** 13 * Applies the processing strategy. 14 * @param testdata test data 15 * @param traindata training data 16 */ 17 void apply(Instances testdata, Instances traindata); 26 27 /** 28 * Applies the processing strategy. 29 * 30 * @param testdata 31 * test data 32 * @param traindata 33 * training data 34 */ 35 void apply(Instances testdata, Instances traindata); 18 36 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ISetWiseProcessingStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * A data processing strategy that is applied to the test data and a multiple sets of training data. 24 * A data processing strategy that is applied to the test data and a multiple sets of training data. 25 * 11 26 * @author Steffen Herbold 12 27 */ 13 28 public interface ISetWiseProcessingStrategy extends IParameterizable { 14 29 15 /** 16 * Applies the processing strategy. 17 * @param testdata test data 18 * @param traindataSet training data sets 19 */ 20 void apply(Instances testdata, SetUniqueList<Instances> traindataSet); 21 30 /** 31 * Applies the processing strategy. 32 * 33 * @param testdata 34 * test data 35 * @param traindataSet 36 * training data sets 37 */ 38 void apply(Instances testdata, SetUniqueList<Instances> traindataSet); 39 22 40 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/InformationGainFilter.java
r10 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 12 26 13 27 /** 14 * Implements an attribute filter that is based on the information gain of each attribute after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on Defect Prediction. 15 * A logistic classifier is trained to separate a random sample of the training data from a random sample of the test data. As standard, the best 50% of attributes are retained. 16 * This ratio can be adjusted using the parameter of the filter (0.5 = 50%). 17 * <br><br> 18 * Best means the least information gain, because this means that the attribute is similar in both test and training data. 28 * Implements an attribute filter that is based on the information gain of each attribute after Z. 29 * He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on 30 * Defect Prediction. A logistic classifier is trained to separate a random sample of the training 31 * data from a random sample of the test data. As standard, the best 50% of attributes are retained. 32 * This ratio can be adjusted using the parameter of the filter (0.5 = 50%). <br> 33 * <br> 34 * Best means the least information gain, because this means that the attribute is similar in both 35 * test and training data. 36 * 19 37 * @author Steffen Herbold 20 38 */ 21 39 public class InformationGainFilter implements ISetWiseProcessingStrategy, IProcessesingStrategy { 22 40 23 /** 24 * size of the random sample that is drawn from both test data and training data 25 */ 26 private final int sampleSize = 500; 27 28 /** 29 * ratio of features that is kept 30 */ 31 private double featureRatio = 0.5; 32 33 /** 34 * Sets the feature ratio. 35 * @param parameters feature ratio 36 */ 37 @Override 38 public void setParameter(String parameters) { 39 if( !"".equals(parameters) ) { 40 featureRatio = Double.parseDouble(parameters); 41 } 42 } 41 /** 42 * size of the random sample that is drawn from both test data and training data 43 */ 44 private final int sampleSize = 500; 43 45 44 /** 45 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 46 */ 47 @Override 48 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 49 for( Instances traindata : traindataSet ) { 50 apply(testdata, traindata, false); 51 } 52 53 } 54 55 /** 56 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 57 */ 58 @Override 59 public void apply(Instances testdata, Instances traindata) { 60 apply(testdata, traindata, true); 61 } 62 63 /** 64 * Internal helper function for the application of the filter to both all data set as well as a single data set. 65 * @param testdata data of the target product 66 * @param traindata data of the training product 67 * @param removeFromTest defines whether the attributes shall be removed from the test data as well or not 68 */ 69 private void apply(Instances testdata, Instances traindata, boolean removeFromTest) { 70 final Random rand = new Random(1); 71 final int removalNumber = (int) (featureRatio*(testdata.numAttributes()-1)); 72 73 final int classIndex = testdata.classIndex(); 74 75 // sample instances 76 final Instances sample = new Instances(testdata); 77 for( int j=0; j<sampleSize; j++ ) { 78 Instance inst = new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 79 inst.setDataset(sample); 80 inst.setClassValue(1.0); 81 sample.add(inst); 82 inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances()))); 83 inst.setDataset(sample); 84 inst.setClassValue(0.0); 85 sample.add(inst); 86 } 87 88 final double[] gain = new double[sample.numAttributes()]; 89 90 final InfoGainAttributeEval gainEval = new InfoGainAttributeEval(); 91 try { 92 gainEval.buildEvaluator(sample); 93 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 94 //if( sample.classAttribute().equals(sample.attribute(i)) ) { 95 // gain[i] = 0.0; 96 //} else { 97 if( !sample.classAttribute().equals(sample.attribute(i)) ) { 98 gain[i] = gainEval.evaluateAttribute(i); 99 } 100 } 101 } catch (Exception e) { 102 //throw new RuntimeException("could not determine information gain for all attributes", e); 103 // ignore exception; it is caused by attributes that are extremely 104 } 105 106 // select best attributes 107 final double[] gainCopy = Arrays.copyOf(gain, gain.length); 108 Arrays.sort(gainCopy); 109 final double cutoffGain = gainCopy[testdata.numAttributes()-removalNumber]; 110 111 for( int i=testdata.numAttributes()-1; i>=0 ; i-- ) { 112 if( gain[i]>=cutoffGain && i!=classIndex) { 113 traindata.deleteAttributeAt(i); 114 if( removeFromTest ) { 115 testdata.deleteAttributeAt(i); 116 } 117 } 118 } 119 } 46 /** 47 * ratio of features that is kept 48 */ 49 private double featureRatio = 0.5; 50 51 /** 52 * Sets the feature ratio. 53 * 54 * @param parameters 55 * feature ratio 56 */ 57 @Override 58 public void setParameter(String parameters) { 59 if (!"".equals(parameters)) { 60 featureRatio = Double.parseDouble(parameters); 61 } 62 } 63 64 /** 65 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 66 * org.apache.commons.collections4.list.SetUniqueList) 67 */ 68 @Override 69 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 70 for (Instances traindata : traindataSet) { 71 apply(testdata, traindata, false); 72 } 73 74 } 75 76 /** 77 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 78 * weka.core.Instances) 79 */ 80 @Override 81 public void apply(Instances testdata, Instances traindata) { 82 apply(testdata, traindata, true); 83 } 84 85 /** 86 * Internal helper function for the application of the filter to both all data set as well as a 87 * single data set. 88 * 89 * @param testdata 90 * data of the target product 91 * @param traindata 92 * data of the training product 93 * @param removeFromTest 94 * defines whether the attributes shall be removed from the test data as well or not 95 */ 96 private void apply(Instances testdata, Instances traindata, boolean removeFromTest) { 97 final Random rand = new Random(1); 98 final int removalNumber = (int) (featureRatio * (testdata.numAttributes() - 1)); 99 100 final int classIndex = testdata.classIndex(); 101 102 // sample instances 103 final Instances sample = new Instances(testdata); 104 for (int j = 0; j < sampleSize; j++) { 105 Instance inst = 106 new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 107 inst.setDataset(sample); 108 inst.setClassValue(1.0); 109 sample.add(inst); 110 inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances()))); 111 inst.setDataset(sample); 112 inst.setClassValue(0.0); 113 sample.add(inst); 114 } 115 116 final double[] gain = new double[sample.numAttributes()]; 117 118 final InfoGainAttributeEval gainEval = new InfoGainAttributeEval(); 119 try { 120 gainEval.buildEvaluator(sample); 121 for (int i = 0; i < testdata.numAttributes(); i++) { 122 // if( sample.classAttribute().equals(sample.attribute(i)) ) { 123 // gain[i] = 0.0; 124 // } else { 125 if (!sample.classAttribute().equals(sample.attribute(i))) { 126 gain[i] = gainEval.evaluateAttribute(i); 127 } 128 } 129 } 130 catch (Exception e) { 131 // throw new RuntimeException("could not determine information gain for all attributes", 132 // e); 133 // ignore exception; it is caused by attributes that are extremely 134 } 135 136 // select best attributes 137 final double[] gainCopy = Arrays.copyOf(gain, gain.length); 138 Arrays.sort(gainCopy); 139 final double cutoffGain = gainCopy[testdata.numAttributes() - removalNumber]; 140 141 for (int i = testdata.numAttributes() - 1; i >= 0; i--) { 142 if (gain[i] >= cutoffGain && i != classIndex) { 143 traindata.deleteAttributeAt(i); 144 if (removeFromTest) { 145 testdata.deleteAttributeAt(i); 146 } 147 } 148 } 149 } 120 150 121 151 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/LogarithmTransform.java
r40 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Logarithm transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression Models for Predicting Fault-prone Code across Software Projects. 11 * <br><br> 12 * Transform each attribute value x into log(x+1). 24 * Logarithm transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression Models for 25 * Predicting Fault-prone Code across Software Projects. <br> 26 * <br> 27 * Transform each attribute value x into log(x+1). 28 * 13 29 * @author Steffen Herbold 14 30 */ 15 31 public class LogarithmTransform implements ISetWiseProcessingStrategy, IProcessesingStrategy { 16 32 17 /** 18 * Does not have parameters. String is ignored. 19 * @param parameters ignored 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 // dummy 24 } 33 /** 34 * Does not have parameters. String is ignored. 35 * 36 * @param parameters 37 * ignored 38 */ 39 @Override 40 public void setParameter(String parameters) { 41 // dummy 42 } 25 43 26 /** 27 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 28 */ 29 @Override 30 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 31 final Attribute classAttribute = testdata.classAttribute(); 32 33 // preprocess testdata 34 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 35 Instance instance = testdata.instance(i); 36 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 37 if( testdata.attribute(j)!=classAttribute && testdata.attribute(j).isNumeric() ) { 38 if( instance.value(j) < 0 ) { 39 instance.setValue(j, (-1*(Math.log(-1*instance.value(j))))); 40 }else { 41 instance.setValue(j, Math.log(1+instance.value(j))); 42 } 43 } 44 } 45 } 46 47 // preprocess training data 48 for( Instances traindata : traindataSet ) { 49 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 50 Instance instance = traindata.instance(i); 51 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 52 if( traindata.attribute(j)!=classAttribute && traindata.attribute(j).isNumeric() ) { 53 if( instance.value(j) < 0 ) { 54 instance.setValue(j, (-1*(Math.log(-1*instance.value(j))))); 55 }else { 56 instance.setValue(j, Math.log(1+instance.value(j))); 57 } 58 } 59 } 60 } 61 } 62 } 44 /** 45 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 46 * org.apache.commons.collections4.list.SetUniqueList) 47 */ 48 @Override 49 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 50 final Attribute classAttribute = testdata.classAttribute(); 63 51 64 /** 65 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 66 */ 67 @Override 68 public void apply(Instances testdata, Instances traindata) { 69 final Attribute classAttribute = testdata.classAttribute(); 70 71 // preprocess testdata 72 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 73 Instance instance = testdata.instance(i); 74 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 75 if( testdata.attribute(j)!=classAttribute && testdata.attribute(j).isNumeric() ) { 76 if( instance.value(j) < 0 ) { 77 instance.setValue(j, (-1*(Math.log(-1*instance.value(j))))); 78 }else { 79 instance.setValue(j, Math.log(1+instance.value(j))); 80 } 81 } 82 } 83 } 84 85 // preprocess training data 86 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 87 Instance instance = traindata.instance(i); 88 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 89 if( traindata.attribute(j)!=classAttribute && traindata.attribute(j).isNumeric() ) { 90 if( instance.value(j) < 0 ) { 91 instance.setValue(j, (-1*(Math.log(-1*instance.value(j))))); 92 }else { 93 instance.setValue(j, Math.log(1+instance.value(j))); 94 } 95 } 96 } 97 } 98 } 52 // preprocess testdata 53 for (int i = 0; i < testdata.numInstances(); i++) { 54 Instance instance = testdata.instance(i); 55 for (int j = 0; j < testdata.numAttributes(); j++) { 56 if (testdata.attribute(j) != classAttribute && testdata.attribute(j).isNumeric()) { 57 if (instance.value(j) < 0) { 58 instance.setValue(j, (-1 * (Math.log(-1 * instance.value(j))))); 59 } 60 else { 61 instance.setValue(j, Math.log(1 + instance.value(j))); 62 } 63 } 64 } 65 } 66 67 // preprocess training data 68 for (Instances traindata : traindataSet) { 69 for (int i = 0; i < traindata.numInstances(); i++) { 70 Instance instance = traindata.instance(i); 71 for (int j = 0; j < testdata.numAttributes(); j++) { 72 if (traindata.attribute(j) != classAttribute && 73 traindata.attribute(j).isNumeric()) 74 { 75 if (instance.value(j) < 0) { 76 instance.setValue(j, (-1 * (Math.log(-1 * instance.value(j))))); 77 } 78 else { 79 instance.setValue(j, Math.log(1 + instance.value(j))); 80 } 81 } 82 } 83 } 84 } 85 } 86 87 /** 88 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 89 * weka.core.Instances) 90 */ 91 @Override 92 public void apply(Instances testdata, Instances traindata) { 93 final Attribute classAttribute = testdata.classAttribute(); 94 95 // preprocess testdata 96 for (int i = 0; i < testdata.numInstances(); i++) { 97 Instance instance = testdata.instance(i); 98 for (int j = 0; j < testdata.numAttributes(); j++) { 99 if (testdata.attribute(j) != classAttribute && testdata.attribute(j).isNumeric()) { 100 if (instance.value(j) < 0) { 101 instance.setValue(j, (-1 * (Math.log(-1 * instance.value(j))))); 102 } 103 else { 104 instance.setValue(j, Math.log(1 + instance.value(j))); 105 } 106 } 107 } 108 } 109 110 // preprocess training data 111 for (int i = 0; i < traindata.numInstances(); i++) { 112 Instance instance = traindata.instance(i); 113 for (int j = 0; j < testdata.numAttributes(); j++) { 114 if (traindata.attribute(j) != classAttribute && traindata.attribute(j).isNumeric()) 115 { 116 if (instance.value(j) < 0) { 117 instance.setValue(j, (-1 * (Math.log(-1 * instance.value(j))))); 118 } 119 else { 120 instance.setValue(j, Math.log(1 + instance.value(j))); 121 } 122 } 123 } 124 } 125 } 99 126 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/MedianAsReference.java
r40 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Median as reference transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression Models for Predicting Fault-prone Code across Software Projects 11 * <br><br> 12 * For each attribute value x, the new value is x + (median of the test data - median of the current project) 24 * Median as reference transformation after Carmargo Cruz and Ochimizu: Towards Logistic Regression 25 * Models for Predicting Fault-prone Code across Software Projects <br> 26 * <br> 27 * For each attribute value x, the new value is x + (median of the test data - median of the current 28 * project) 29 * 13 30 * @author Steffen Herbold 14 31 */ 15 32 public class MedianAsReference implements ISetWiseProcessingStrategy, IProcessesingStrategy { 16 33 17 /** 18 * Does not have parameters. String is ignored. 19 * @param parameters ignored 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 // dummy 24 } 34 /** 35 * Does not have parameters. String is ignored. 36 * 37 * @param parameters 38 * ignored 39 */ 40 @Override 41 public void setParameter(String parameters) { 42 // dummy 43 } 25 44 26 /** 27 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 28 */ 29 @Override 30 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 31 final Attribute classAttribute = testdata.classAttribute(); 32 final double[] median = new double[testdata.numAttributes()]; 33 34 // test and train have the same number of attributes 35 Attribute traindataClassAttribute; 36 double[] currentmedian = new double[testdata.numAttributes()]; 37 38 // get medians 39 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 40 if( testdata.attribute(j)!=classAttribute ) { 41 median[j] = testdata.kthSmallestValue(j, (testdata.numInstances()+1)>>1); // (>>2 -> /2) 42 } 43 } 44 45 // preprocess training data 46 for( Instances traindata : traindataSet ) { 47 // get median of current training set 48 traindataClassAttribute = traindata.classAttribute(); 49 for( int j=0 ; j<traindata.numAttributes() ; j++ ) { 50 if( traindata.attribute(j)!=traindataClassAttribute && traindata.attribute(j).isNumeric()) { 51 currentmedian[j] = traindata.kthSmallestValue(j, (traindata.numInstances()+1)>>1); // (>>2 -> /2) 52 } 53 } 54 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 55 Instance instance = traindata.instance(i); 56 for( int j=0 ; j<traindata.numAttributes() ; j++ ) { 57 if( traindata.attribute(j)!=classAttribute && traindata.attribute(j).isNumeric() ) { 58 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); 59 } 60 } 61 } 62 } 63 } 45 /** 46 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 47 * org.apache.commons.collections4.list.SetUniqueList) 48 */ 49 @Override 50 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 51 final Attribute classAttribute = testdata.classAttribute(); 52 final double[] median = new double[testdata.numAttributes()]; 64 53 65 /** 66 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 67 */ 68 @Override 69 public void apply(Instances testdata, Instances traindata) { 70 final Attribute classAttribute = testdata.classAttribute(); 71 final Attribute traindataClassAttribute = traindata.classAttribute(); 72 final double[] median = new double[testdata.numAttributes()]; 54 // test and train have the same number of attributes 55 Attribute traindataClassAttribute; 56 double[] currentmedian = new double[testdata.numAttributes()]; 73 57 74 // test and train have the same number of attributes 75 double[] currentmedian = new double[testdata.numAttributes()]; 76 77 // get medians 78 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 79 if( testdata.attribute(j)!=classAttribute ) { 80 median[j] = testdata.kthSmallestValue(j, (testdata.numInstances()+1)>>1); // (>>2 -> /2) 81 } 82 } 58 // get medians 59 for (int j = 0; j < testdata.numAttributes(); j++) { 60 if (testdata.attribute(j) != classAttribute) { 61 median[j] = testdata.kthSmallestValue(j, (testdata.numInstances() + 1) >> 1); // (>>2 62 // -> 63 // /2) 64 } 65 } 83 66 84 // get median of current training set 85 for( int j=0 ; j<traindata.numAttributes() ; j++ ) { 86 if( traindata.attribute(j)!=traindataClassAttribute && traindata.attribute(j).isNumeric() ) { 87 currentmedian[j] = traindata.kthSmallestValue(j, (traindata.numInstances()+1)>>1); // (>>2 -> /2) 88 } 89 } 90 91 // preprocess training data 92 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 93 Instance instance = traindata.instance(i); 94 for( int j=0 ; j<traindata.numAttributes() ; j++ ) { 95 if( traindata.attribute(j)!=classAttribute && traindata.attribute(j).isNumeric() ) { 96 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); 97 } 98 } 99 } 100 } 67 // preprocess training data 68 for (Instances traindata : traindataSet) { 69 // get median of current training set 70 traindataClassAttribute = traindata.classAttribute(); 71 for (int j = 0; j < traindata.numAttributes(); j++) { 72 if (traindata.attribute(j) != traindataClassAttribute && 73 traindata.attribute(j).isNumeric()) 74 { 75 currentmedian[j] = 76 traindata.kthSmallestValue(j, (traindata.numInstances() + 1) >> 1); // (>>2 77 // -> 78 // /2) 79 } 80 } 81 for (int i = 0; i < traindata.numInstances(); i++) { 82 Instance instance = traindata.instance(i); 83 for (int j = 0; j < traindata.numAttributes(); j++) { 84 if (traindata.attribute(j) != classAttribute && 85 traindata.attribute(j).isNumeric()) 86 { 87 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); 88 } 89 } 90 } 91 } 92 } 93 94 /** 95 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 96 * weka.core.Instances) 97 */ 98 @Override 99 public void apply(Instances testdata, Instances traindata) { 100 final Attribute classAttribute = testdata.classAttribute(); 101 final Attribute traindataClassAttribute = traindata.classAttribute(); 102 final double[] median = new double[testdata.numAttributes()]; 103 104 // test and train have the same number of attributes 105 double[] currentmedian = new double[testdata.numAttributes()]; 106 107 // get medians 108 for (int j = 0; j < testdata.numAttributes(); j++) { 109 if (testdata.attribute(j) != classAttribute) { 110 median[j] = testdata.kthSmallestValue(j, (testdata.numInstances() + 1) >> 1); // (>>2 111 // -> 112 // /2) 113 } 114 } 115 116 // get median of current training set 117 for (int j = 0; j < traindata.numAttributes(); j++) { 118 if (traindata.attribute(j) != traindataClassAttribute && 119 traindata.attribute(j).isNumeric()) 120 { 121 currentmedian[j] = 122 traindata.kthSmallestValue(j, (traindata.numInstances() + 1) >> 1); // (>>2 -> 123 // /2) 124 } 125 } 126 127 // preprocess training data 128 for (int i = 0; i < traindata.numInstances(); i++) { 129 Instance instance = traindata.instance(i); 130 for (int j = 0; j < traindata.numAttributes(); j++) { 131 if (traindata.attribute(j) != classAttribute && traindata.attribute(j).isNumeric()) 132 { 133 instance.setValue(j, instance.value(j) + (median[j] - currentmedian[j])); 134 } 135 } 136 } 137 } 101 138 102 139 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/NominalAttributeFilter.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 10 24 11 25 /** 12 * Filters the given dataset for an nominal attribute. 13 * Every instance that has a value of thedefined values of the given nominal attribute is removed.26 * Filters the given dataset for an nominal attribute. Every instance that has a value of the 27 * defined values of the given nominal attribute is removed. 14 28 * 15 * 16 * (e.g. param="CONFIDECNE low middle"; all instances where the "CONFIDENCE" attribute 17 * value is"low" or "middle" are removed from the dataset)29 * 30 * (e.g. param="CONFIDECNE low middle"; all instances where the "CONFIDENCE" attribute value is 31 * "low" or "middle" are removed from the dataset) 18 32 */ 19 33 20 public class NominalAttributeFilter implements IProcessesingStrategy {34 public class NominalAttributeFilter implements IProcessesingStrategy { 21 35 22 private String nominalAttributeName = ""; 23 private String[] nominalAttributeValues = new String[]{}; 24 25 /** 26 * Sets the nominal attribute name (first parameter) and the nominal attribute values (other 27 * parameters), which should be removed from the dataset. 28 * 29 * @param parameters string with the blank-separated parameters (first parameter 30 * is the name of the nominal attribute, everything else are the values) 31 */ 32 @Override 33 public void setParameter(String parameters) { 34 if( parameters!=null ) { 35 String[] parameter = parameters.split(" "); 36 nominalAttributeName = parameter[0]; 37 nominalAttributeValues = Arrays.copyOfRange(parameter, 1, parameter.length); 38 } 39 } 40 41 /* (non-Javadoc) 42 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 43 */ 44 @Override 45 public void apply(Instances testdata, Instances traindata) { 46 int indexOfConfidenceAttribute = -1; 47 48 // Find index of the named confidence attribute to filter for 49 for(int i=0; i<traindata.numAttributes(); i++) { 50 if(traindata.attribute(i).name().equals(nominalAttributeName)) { 51 indexOfConfidenceAttribute = i; 52 } 53 } 54 55 // if it was not found return 56 if(indexOfConfidenceAttribute == -1) { 57 return; 58 } 59 60 // Find index of nominal values 61 Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute); 62 ArrayList<Object> nominalValuesOfConfidenceAttribute = Collections.list(confidenceAttribute.enumerateValues()); 63 ArrayList<Double> indexOfnominalAttributeValues = new ArrayList<Double>(); 64 65 66 for(int k=0; k<nominalValuesOfConfidenceAttribute.size(); k++) { 67 for(String attributeValue : nominalAttributeValues) { 68 if(((String)nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) { 69 indexOfnominalAttributeValues.add((double) k); 70 } 71 } 72 } 36 private String nominalAttributeName = ""; 37 private String[] nominalAttributeValues = new String[] { }; 73 38 74 75 76 77 // Go through all instances and check if nominal attribute equals 78 for(int j=traindata.numInstances()-1; j>=0; j--) { 79 Instance wekaInstance = traindata.get(j); 80 81 // delete all instances where nominal attribute has the value of one of the parameter 82 if(indexOfnominalAttributeValues.contains(wekaInstance.value(indexOfConfidenceAttribute))) { 83 traindata.delete(j); 84 } 85 } 86 } 39 /** 40 * Sets the nominal attribute name (first parameter) and the nominal attribute values (other 41 * parameters), which should be removed from the dataset. 42 * 43 * @param parameters 44 * string with the blank-separated parameters (first parameter is the name of the 45 * nominal attribute, everything else are the values) 46 */ 47 @Override 48 public void setParameter(String parameters) { 49 if (parameters != null) { 50 String[] parameter = parameters.split(" "); 51 nominalAttributeName = parameter[0]; 52 nominalAttributeValues = Arrays.copyOfRange(parameter, 1, parameter.length); 53 } 54 } 55 56 /* 57 * (non-Javadoc) 58 * 59 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 60 * weka.core.Instances) 61 */ 62 @Override 63 public void apply(Instances testdata, Instances traindata) { 64 int indexOfConfidenceAttribute = -1; 65 66 // Find index of the named confidence attribute to filter for 67 for (int i = 0; i < traindata.numAttributes(); i++) { 68 if (traindata.attribute(i).name().equals(nominalAttributeName)) { 69 indexOfConfidenceAttribute = i; 70 } 71 } 72 73 // if it was not found return 74 if (indexOfConfidenceAttribute == -1) { 75 return; 76 } 77 78 // Find index of nominal values 79 Attribute confidenceAttribute = traindata.attribute(indexOfConfidenceAttribute); 80 ArrayList<Object> nominalValuesOfConfidenceAttribute = 81 Collections.list(confidenceAttribute.enumerateValues()); 82 ArrayList<Double> indexOfnominalAttributeValues = new ArrayList<Double>(); 83 84 for (int k = 0; k < nominalValuesOfConfidenceAttribute.size(); k++) { 85 for (String attributeValue : nominalAttributeValues) { 86 if (((String) nominalValuesOfConfidenceAttribute.get(k)).equals(attributeValue)) { 87 indexOfnominalAttributeValues.add((double) k); 88 } 89 } 90 } 91 92 // Go through all instances and check if nominal attribute equals 93 for (int j = traindata.numInstances() - 1; j >= 0; j--) { 94 Instance wekaInstance = traindata.get(j); 95 96 // delete all instances where nominal attribute has the value of one of the parameter 97 if (indexOfnominalAttributeValues.contains(wekaInstance 98 .value(indexOfConfidenceAttribute))) 99 { 100 traindata.delete(j); 101 } 102 } 103 } 87 104 88 105 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Normalization.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 9 23 10 24 /** 11 * Normalizes each attribute of each data set separately. 25 * Normalizes each attribute of each data set separately. 26 * 12 27 * @author Steffen Herbold 13 28 */ 14 29 public class Normalization implements ISetWiseProcessingStrategy, IProcessesingStrategy { 15 30 16 /** 17 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 18 */ 19 @Override 20 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 21 final Attribute classAtt = testdata.classAttribute(); 22 23 for( int i=0 ; i<testdata.numAttributes(); i++) { 24 if( !testdata.attribute(i).equals(classAtt) ) { 25 Stats teststats = testdata.attributeStats(i).numericStats; 26 27 double minVal = teststats.min; 28 double maxVal = teststats.max; 29 30 for( Instances traindata : traindataSet ) { 31 Stats trainstats = traindata.attributeStats(i).numericStats; 32 if( minVal>trainstats.min ) { 33 minVal = trainstats.min; 34 } 35 if( maxVal<trainstats.max ) { 36 maxVal = trainstats.max; 37 } 38 } 39 40 for( int j=0 ; j<testdata.numInstances() ; j++ ) { 41 Instance inst = testdata.instance(j); 42 double newValue = (inst.value(i)-minVal)/(maxVal-minVal); 43 inst.setValue(i, newValue); 44 } 45 46 for( Instances traindata : traindataSet ) { 47 for( int j=0 ; j<traindata.numInstances() ; j++ ) { 48 Instance inst = traindata.instance(j); 49 double newValue = (inst.value(i)-minVal)/(maxVal-minVal); 50 inst.setValue(i, newValue); 51 } 52 } 53 } 54 } 55 56 } 57 58 /** 59 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 60 */ 61 @Override 62 public void apply(Instances testdata, Instances traindata) { 63 final Attribute classAtt = testdata.classAttribute(); 64 65 for( int i=0 ; i<testdata.numAttributes(); i++) { 66 if( !testdata.attribute(i).equals(classAtt) ) { 67 Stats teststats = testdata.attributeStats(i).numericStats; 68 69 double minVal = teststats.min; 70 double maxVal = teststats.max; 71 72 Stats trainstats = traindata.attributeStats(i).numericStats; 73 if( minVal>trainstats.min ) { 74 minVal = trainstats.min; 75 } 76 if( maxVal<trainstats.max ) { 77 maxVal = trainstats.max; 78 } 79 80 for( int j=0 ; j<testdata.numInstances() ; j++ ) { 81 Instance inst = testdata.instance(j); 82 double newValue = (inst.value(i)-minVal)/(maxVal-minVal); 83 inst.setValue(i, newValue); 84 } 85 86 for( int j=0 ; j<traindata.numInstances() ; j++ ) { 87 Instance inst = traindata.instance(j); 88 double newValue = (inst.value(i)-minVal)/(maxVal-minVal); 89 inst.setValue(i, newValue); 90 } 91 } 92 } 93 } 31 /** 32 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 33 * org.apache.commons.collections4.list.SetUniqueList) 34 */ 35 @Override 36 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 37 final Attribute classAtt = testdata.classAttribute(); 94 38 95 /** 96 * Does not have parameters. String is ignored. 97 * @param parameters ignored 98 */ 99 @Override 100 public void setParameter(String parameters) { 101 // no parameters 102 } 39 for (int i = 0; i < testdata.numAttributes(); i++) { 40 if (!testdata.attribute(i).equals(classAtt)) { 41 Stats teststats = testdata.attributeStats(i).numericStats; 42 43 double minVal = teststats.min; 44 double maxVal = teststats.max; 45 46 for (Instances traindata : traindataSet) { 47 Stats trainstats = traindata.attributeStats(i).numericStats; 48 if (minVal > trainstats.min) { 49 minVal = trainstats.min; 50 } 51 if (maxVal < trainstats.max) { 52 maxVal = trainstats.max; 53 } 54 } 55 56 for (int j = 0; j < testdata.numInstances(); j++) { 57 Instance inst = testdata.instance(j); 58 double newValue = (inst.value(i) - minVal) / (maxVal - minVal); 59 inst.setValue(i, newValue); 60 } 61 62 for (Instances traindata : traindataSet) { 63 for (int j = 0; j < traindata.numInstances(); j++) { 64 Instance inst = traindata.instance(j); 65 double newValue = (inst.value(i) - minVal) / (maxVal - minVal); 66 inst.setValue(i, newValue); 67 } 68 } 69 } 70 } 71 72 } 73 74 /** 75 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 76 * weka.core.Instances) 77 */ 78 @Override 79 public void apply(Instances testdata, Instances traindata) { 80 final Attribute classAtt = testdata.classAttribute(); 81 82 for (int i = 0; i < testdata.numAttributes(); i++) { 83 if (!testdata.attribute(i).equals(classAtt)) { 84 Stats teststats = testdata.attributeStats(i).numericStats; 85 86 double minVal = teststats.min; 87 double maxVal = teststats.max; 88 89 Stats trainstats = traindata.attributeStats(i).numericStats; 90 if (minVal > trainstats.min) { 91 minVal = trainstats.min; 92 } 93 if (maxVal < trainstats.max) { 94 maxVal = trainstats.max; 95 } 96 97 for (int j = 0; j < testdata.numInstances(); j++) { 98 Instance inst = testdata.instance(j); 99 double newValue = (inst.value(i) - minVal) / (maxVal - minVal); 100 inst.setValue(i, newValue); 101 } 102 103 for (int j = 0; j < traindata.numInstances(); j++) { 104 Instance inst = traindata.instance(j); 105 double newValue = (inst.value(i) - minVal) / (maxVal - minVal); 106 inst.setValue(i, newValue); 107 } 108 } 109 } 110 } 111 112 /** 113 * Does not have parameters. String is ignored. 114 * 115 * @param parameters 116 * ignored 117 */ 118 @Override 119 public void setParameter(String parameters) { 120 // no parameters 121 } 103 122 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Oversampling.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Implements oversampling, a strategy for 11 * handling bias in data. In case there are less positive samples (i.e. 12 * defect-prone) samples in the data than negative samples (i.e. 13 * non-defect-prone), the defect-prone entities are over-sampled such that the 14 * number of defect-prone and non-defect-prone instances is the same afterwards. 15 * This means, that some of the defect-prone entities will be more than once 16 * within the data. 24 * Implements oversampling, a strategy for handling bias in data. In case there are less positive 25 * samples (i.e. defect-prone) samples in the data than negative samples (i.e. non-defect-prone), 26 * the defect-prone entities are over-sampled such that the number of defect-prone and 27 * non-defect-prone instances is the same afterwards. This means, that some of the defect-prone 28 * entities will be more than once within the data. 17 29 * 18 30 * @author Steffen Herbold 19 31 */ 20 public class Oversampling implements IProcessesingStrategy, 21 ISetWiseProcessingStrategy { 32 public class Oversampling implements IProcessesingStrategy, ISetWiseProcessingStrategy { 22 33 23 24 25 26 27 28 29 30 31 32 34 /** 35 * Does not have parameters. String is ignored. 36 * 37 * @param parameters 38 * ignored 39 */ 40 @Override 41 public void setParameter(String parameters) { 42 // dummy 43 } 33 44 34 /* 35 * (non-Javadoc) 36 * 37 * @see 38 * de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka. 39 * core.Instances, org.apache.commons.collections4.list.SetUniqueList) 40 */ 41 @Override 42 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 43 for (Instances traindata : traindataSet) { 44 apply(testdata, traindata); 45 } 46 } 45 /* 46 * (non-Javadoc) 47 * 48 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka. core.Instances, 49 * org.apache.commons.collections4.list.SetUniqueList) 50 */ 51 @Override 52 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 53 for (Instances traindata : traindataSet) { 54 apply(testdata, traindata); 55 } 56 } 47 57 48 /* 49 * (non-Javadoc) 50 * 51 * @see 52 * de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. 53 * Instances, weka.core.Instances) 54 */ 55 @Override 56 public void apply(Instances testdata, Instances traindata) { 58 /* 59 * (non-Javadoc) 60 * 61 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. Instances, 62 * weka.core.Instances) 63 */ 64 @Override 65 public void apply(Instances testdata, Instances traindata) { 57 66 58 59 60 61 67 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 68 if (counts[1] < counts[0]) { 69 Instances negatives = new Instances(traindata); 70 Instances positives = new Instances(traindata); 62 71 63 64 65 66 67 68 69 70 72 for (int i = traindata.size() - 1; i >= 0; i--) { 73 if (Double.compare(1.0, negatives.get(i).classValue()) == 0) { 74 negatives.remove(i); 75 } 76 if (Double.compare(0.0, positives.get(i).classValue()) == 0) { 77 positives.remove(i); 78 } 79 } 71 80 72 Resample resample = new Resample(); 73 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 74 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative 75 // weniger zurückgegeben 76 resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]); 77 try { 78 resample.setInputFormat(traindata); 79 positives = Filter.useFilter(positives, resample); 80 } catch (Exception e) { 81 throw new RuntimeException(e); 82 } 83 traindata.clear(); 84 for (int i = 0; i < negatives.size(); i++) { 85 traindata.add(negatives.get(i)); 86 } 87 for (int i = 0; i < positives.size(); i++) { 88 traindata.add(positives.get(i)); 89 } 90 } 91 } 81 Resample resample = new Resample(); 82 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 83 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative 84 // weniger zurückgegeben 85 resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]); 86 try { 87 resample.setInputFormat(traindata); 88 positives = Filter.useFilter(positives, resample); 89 } 90 catch (Exception e) { 91 throw new RuntimeException(e); 92 } 93 traindata.clear(); 94 for (int i = 0; i < negatives.size(); i++) { 95 traindata.add(negatives.get(i)); 96 } 97 for (int i = 0; i < positives.size(); i++) { 98 traindata.add(positives.get(i)); 99 } 100 } 101 } 92 102 93 103 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Resampling.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Resamples the data with WEKA {@link Resample} to have a uniform distribution among all classes. 24 * Resamples the data with WEKA {@link Resample} to have a uniform distribution among all classes. 25 * 11 26 * @author Steffen Herbold 12 27 */ 13 public class Resampling implements IProcessesingStrategy, 14 ISetWiseProcessingStrategy { 28 public class Resampling implements IProcessesingStrategy, ISetWiseProcessingStrategy { 15 29 16 17 /** 18 * Does not have parameters. String is ignored. 19 * @param parameters ignored 20 */ 21 @Override 22 public void setParameter(String parameters) { 23 // dummy 24 } 30 /** 31 * Does not have parameters. String is ignored. 32 * 33 * @param parameters 34 * ignored 35 */ 36 @Override 37 public void setParameter(String parameters) { 38 // dummy 39 } 25 40 26 /* 27 * (non-Javadoc) 28 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 29 */ 30 @Override 31 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 32 for( Instances traindata : traindataSet ) { 33 apply(testdata, traindata); 34 } 35 } 41 /* 42 * (non-Javadoc) 43 * 44 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, 45 * org.apache.commons.collections4.list.SetUniqueList) 46 */ 47 @Override 48 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 49 for (Instances traindata : traindataSet) { 50 apply(testdata, traindata); 51 } 52 } 36 53 37 /* 38 * (non-Javadoc) 39 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 40 */ 41 @Override 42 public void apply(Instances testdata, Instances traindata) { 43 Resample resample = new Resample(); 44 resample.setSampleSizePercent(100); 45 resample.setBiasToUniformClass(1.0); 46 47 Instances traindataSample; 48 try { 49 resample.setInputFormat(traindata); 50 traindataSample = Filter.useFilter(traindata, resample); 51 } catch (Exception e) { 52 throw new RuntimeException(e); 53 } 54 traindata.clear(); 55 for( int i=0 ; i<traindataSample.size() ; i++ ) { 56 traindata.add(traindataSample.get(i)); 57 } 58 } 54 /* 55 * (non-Javadoc) 56 * 57 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 58 * weka.core.Instances) 59 */ 60 @Override 61 public void apply(Instances testdata, Instances traindata) { 62 Resample resample = new Resample(); 63 resample.setSampleSizePercent(100); 64 resample.setBiasToUniformClass(1.0); 65 66 Instances traindataSample; 67 try { 68 resample.setInputFormat(traindata); 69 traindataSample = Filter.useFilter(traindata, resample); 70 } 71 catch (Exception e) { 72 throw new RuntimeException(e); 73 } 74 traindata.clear(); 75 for (int i = 0; i < traindataSample.size(); i++) { 76 traindata.add(traindataSample.get(i)); 77 } 78 } 59 79 60 80 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/SimulationFilter.java
r32 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 12 26 * Filter for the Repast Simulation of Software Projects. 13 27 * 14 * Filters the training dataset in the following way: If 0 is no bug 15 * and 1 means there is a bug in this artifact, then this filter 16 * filters the dataset in this way: 28 * Filters the training dataset in the following way: If 0 is no bug and 1 means there is a bug in 29 * this artifact, then this filter filters the dataset in this way: 17 30 * 18 * 10010111000101110101111011101 19 * x--x-x-----x-x---x-x----x---x 31 * 10010111000101110101111011101 x--x-x-----x-x---x-x----x---x 20 32 * 21 * The instances, which are marked with x in this graphic are included 22 * in the newly created datasetand form the trainingsdataset.33 * The instances, which are marked with x in this graphic are included in the newly created dataset 34 * and form the trainingsdataset. 23 35 * 24 36 * @author Fabian Trautsch 25 * 37 * 26 38 */ 27 39 28 public class SimulationFilter implements IProcessesingStrategy {40 public class SimulationFilter implements IProcessesingStrategy { 29 41 30 31 32 * @param parameters ignored 33 */ 34 @Override 35 public void setParameter(String parameters) { 36 // dummy 37 38 } 42 /** 43 * Does not have parameters. String is ignored. 44 * 45 * @param parameters 46 * ignored 47 */ 48 @Override 49 public void setParameter(String parameters) { 50 // dummy 39 51 40 41 /* 42 * (non-Javadoc) 43 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 44 */ 45 @Override 46 public void apply(Instances testdata, Instances traindata) { 47 Instances newDataSet = new Instances(traindata); 48 traindata.delete(); 49 50 HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>(); 51 52 // This is to add all data, where the first occurence of the file has a bug 53 ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>(); 54 55 // Sort dataset (StateID is connected to the date of commit: Lower StateID 56 // means earlier commit than a higher stateID) 57 Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID"); 58 newDataSet.sort(wekaAttribute); 59 60 61 /* 62 * Logical summary: 63 * If there is an instance that dont have a bug, put it into the hashmap (only unique values in there) 64 * 65 * If there is an instance, that hava a bug look up if it is in the hashmap already (this means: 66 * it does not had a bug before!): If this is true add it to a new dataset and remove it from 67 * the hashmap, so that new changes from "nonBug" -> "bug" for this file can be found. 68 * 69 * If the instance has a bug and is not in the hashmap (this means: The file has a bug with its 70 * first occurence or this file only has bugs and not an instance with no bug), then (if it is 71 * not in the arrayList above) add it to the new dataset. This way it is possible to get 72 * the first occurence of a file, which has a bug 73 * 74 */ 75 for(int i=0; i<newDataSet.numInstances(); i++) { 76 Instance wekaInstance = newDataSet.instance(i); 52 } 77 53 78 double newBugLabel = wekaInstance.classValue(); 79 Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name"); 80 Double artifactName = wekaInstance.value(wekaArtifactName); 81 82 if(newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) { 83 artifactNames.put(artifactName, wekaInstance); 84 } else if(newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) { 85 artifactNames.put(artifactName, wekaInstance); 86 } else if(newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) { 87 traindata.add(wekaInstance); 88 artifactNames.remove(artifactName); 89 } else if(newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) { 90 if(!firstOccurenceArtifactNames.contains(artifactName)) { 91 traindata.add(wekaInstance); 92 firstOccurenceArtifactNames.add(artifactName); 93 } 94 } 95 } 96 97 98 // If we have a file, that never had a bug (this is, when it is NOT in the 99 // new created dataset, but it is in the HashMap from above) add it to 100 // the new dataset 101 102 double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0); 103 HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames); 104 105 106 for(Double artifactName : artifactNames.keySet()) { 107 108 for(int i=0; i<artifactNamesinNewDataSet.length; i++) { 109 if(artifactNamesinNewDataSet[i] == artifactName) { 110 artifactNamesCopy.remove(artifactName); 111 } 112 } 113 } 114 115 for(Double artifact: artifactNamesCopy.keySet()) { 116 traindata.add(artifactNamesCopy.get(artifact)); 117 } 118 119 } 54 /* 55 * (non-Javadoc) 56 * 57 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 58 * weka.core.Instances) 59 */ 60 @Override 61 public void apply(Instances testdata, Instances traindata) { 62 Instances newDataSet = new Instances(traindata); 63 traindata.delete(); 64 65 HashMap<Double, Instance> artifactNames = new HashMap<Double, Instance>(); 66 67 // This is to add all data, where the first occurence of the file has a bug 68 ArrayList<Double> firstOccurenceArtifactNames = new ArrayList<Double>(); 69 70 // Sort dataset (StateID is connected to the date of commit: Lower StateID 71 // means earlier commit than a higher stateID) 72 Attribute wekaAttribute = newDataSet.attribute("Artifact.Target.StateID"); 73 newDataSet.sort(wekaAttribute); 74 75 /* 76 * Logical summary: If there is an instance that dont have a bug, put it into the hashmap 77 * (only unique values in there) 78 * 79 * If there is an instance, that hava a bug look up if it is in the hashmap already (this 80 * means: it does not had a bug before!): If this is true add it to a new dataset and remove 81 * it from the hashmap, so that new changes from "nonBug" -> "bug" for this file can be 82 * found. 83 * 84 * If the instance has a bug and is not in the hashmap (this means: The file has a bug with 85 * its first occurence or this file only has bugs and not an instance with no bug), then (if 86 * it is not in the arrayList above) add it to the new dataset. This way it is possible to 87 * get the first occurence of a file, which has a bug 88 */ 89 for (int i = 0; i < newDataSet.numInstances(); i++) { 90 Instance wekaInstance = newDataSet.instance(i); 91 92 double newBugLabel = wekaInstance.classValue(); 93 Attribute wekaArtifactName = newDataSet.attribute("Artifact.Name"); 94 Double artifactName = wekaInstance.value(wekaArtifactName); 95 96 if (newBugLabel == 0.0 && artifactNames.keySet().contains(artifactName)) { 97 artifactNames.put(artifactName, wekaInstance); 98 } 99 else if (newBugLabel == 0.0 && !artifactNames.keySet().contains(artifactName)) { 100 artifactNames.put(artifactName, wekaInstance); 101 } 102 else if (newBugLabel == 1.0 && artifactNames.keySet().contains(artifactName)) { 103 traindata.add(wekaInstance); 104 artifactNames.remove(artifactName); 105 } 106 else if (newBugLabel == 1.0 && !artifactNames.keySet().contains(artifactName)) { 107 if (!firstOccurenceArtifactNames.contains(artifactName)) { 108 traindata.add(wekaInstance); 109 firstOccurenceArtifactNames.add(artifactName); 110 } 111 } 112 } 113 114 // If we have a file, that never had a bug (this is, when it is NOT in the 115 // new created dataset, but it is in the HashMap from above) add it to 116 // the new dataset 117 118 double[] artifactNamesinNewDataSet = traindata.attributeToDoubleArray(0); 119 HashMap<Double, Instance> artifactNamesCopy = new HashMap<Double, Instance>(artifactNames); 120 121 for (Double artifactName : artifactNames.keySet()) { 122 123 for (int i = 0; i < artifactNamesinNewDataSet.length; i++) { 124 if (artifactNamesinNewDataSet[i] == artifactName) { 125 artifactNamesCopy.remove(artifactName); 126 } 127 } 128 } 129 130 for (Double artifact : artifactNamesCopy.keySet()) { 131 traindata.add(artifactNamesCopy.get(artifact)); 132 } 133 134 } 120 135 121 136 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/Undersampling.java
r18 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 8 22 9 23 /** 10 * Implements undersampling, a strategy for handling bias in data. In case there are less positive samples (i.e. defect-prone) samples in the 11 * data than negative samples (i.e. non-defect-prone), the non-defect-prone entities are sampled such thatthe number of defect-prone and non-defect-prone instances is the same afterwards. 24 * Implements undersampling, a strategy for handling bias in data. In case there are less positive 25 * samples (i.e. defect-prone) samples in the data than negative samples (i.e. non-defect-prone), 26 * the non-defect-prone entities are sampled such thatthe number of defect-prone and 27 * non-defect-prone instances is the same afterwards. 28 * 12 29 * @author Steffen Herbold 13 30 */ 14 public class Undersampling implements IProcessesingStrategy, 15 ISetWiseProcessingStrategy { 31 public class Undersampling implements IProcessesingStrategy, ISetWiseProcessingStrategy { 16 32 17 18 /** 19 * Does not have parameters. String is ignored. 20 * @param parameters ignored 21 */ 22 @Override 23 public void setParameter(String parameters) { 24 // dummy 25 } 33 /** 34 * Does not have parameters. String is ignored. 35 * 36 * @param parameters 37 * ignored 38 */ 39 @Override 40 public void setParameter(String parameters) { 41 // dummy 42 } 26 43 27 /* 28 * (non-Javadoc) 29 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 30 */ 31 @Override 32 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 33 for( Instances traindata : traindataSet ) { 34 apply(testdata, traindata); 35 } 36 } 44 /* 45 * (non-Javadoc) 46 * 47 * @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, 48 * org.apache.commons.collections4.list.SetUniqueList) 49 */ 50 @Override 51 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 52 for (Instances traindata : traindataSet) { 53 apply(testdata, traindata); 54 } 55 } 37 56 38 /* 39 * (non-Javadoc) 40 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 41 */ 42 @Override 43 public void apply(Instances testdata, Instances traindata) { 44 45 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 46 47 if( counts[1]<counts[0] ) { 48 Instances negatives = new Instances(traindata); 49 Instances positives = new Instances(traindata); 50 51 for( int i=traindata.size()-1 ; i>=0 ; i-- ) { 52 if( Double.compare(1.0, negatives.get(i).classValue())==0 ) { 53 negatives.remove(i); 54 } 55 if( Double.compare(0.0, positives.get(i).classValue())==0 ) { 56 positives.remove(i); 57 } 58 } 59 60 Resample resample = new Resample(); 61 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 62 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger zurückgegeben 63 resample.setSampleSizePercent((100.0* counts[1])/counts[0]); 64 try { 65 resample.setInputFormat(traindata); 66 negatives = Filter.useFilter(negatives, resample); 67 } catch (Exception e) { 68 throw new RuntimeException(e); 69 } 70 traindata.clear(); 71 for( int i=0 ; i<negatives.size() ; i++ ) { 72 traindata.add(negatives.get(i)); 73 } 74 for( int i=0 ; i<positives.size() ; i++ ) { 75 traindata.add(positives.get(i)); 76 } 77 } 78 } 57 /* 58 * (non-Javadoc) 59 * 60 * @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, 61 * weka.core.Instances) 62 */ 63 @Override 64 public void apply(Instances testdata, Instances traindata) { 65 66 final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts; 67 68 if (counts[1] < counts[0]) { 69 Instances negatives = new Instances(traindata); 70 Instances positives = new Instances(traindata); 71 72 for (int i = traindata.size() - 1; i >= 0; i--) { 73 if (Double.compare(1.0, negatives.get(i).classValue()) == 0) { 74 negatives.remove(i); 75 } 76 if (Double.compare(0.0, positives.get(i).classValue()) == 0) { 77 positives.remove(i); 78 } 79 } 80 81 Resample resample = new Resample(); 82 // TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01); 83 // Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger 84 // zurückgegeben 85 resample.setSampleSizePercent((100.0 * counts[1]) / counts[0]); 86 try { 87 resample.setInputFormat(traindata); 88 negatives = Filter.useFilter(negatives, resample); 89 } 90 catch (Exception e) { 91 throw new RuntimeException(e); 92 } 93 traindata.clear(); 94 for (int i = 0; i < negatives.size(); i++) { 95 traindata.add(negatives.get(i)); 96 } 97 for (int i = 0; i < positives.size(); i++) { 98 traindata.add(positives.get(i)); 99 } 100 } 101 } 79 102 80 103 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ZScoreNormalization.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 14 28 public class ZScoreNormalization implements ISetWiseProcessingStrategy, IProcessesingStrategy { 15 29 16 /** 17 * Does not have parameters. String is ignored. 18 * @param parameters ignored 19 */ 20 @Override 21 public void setParameter(String parameters) { 22 // dummy 23 } 30 /** 31 * Does not have parameters. String is ignored. 32 * 33 * @param parameters 34 * ignored 35 */ 36 @Override 37 public void setParameter(String parameters) { 38 // dummy 39 } 24 40 25 /** 26 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 27 */ 28 @Override 29 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 30 normalize(testdata); 31 for( Instances instances : traindataSet ) { 32 normalize(instances); 33 } 34 } 41 /** 42 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 43 * org.apache.commons.collections4.list.SetUniqueList) 44 */ 45 @Override 46 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 47 normalize(testdata); 48 for (Instances instances : traindataSet) { 49 normalize(instances); 50 } 51 } 35 52 36 /** 37 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 38 */ 39 @Override 40 public void apply(Instances testdata, Instances traindata) { 41 normalize(testdata); 42 normalize(traindata); 43 } 44 45 private void normalize(Instances instances) { 46 instances.toString(); 47 final Attribute classAttribute = instances.classAttribute(); 48 49 final double[] means = new double[instances.numAttributes()]; 50 final double[] stddevs = new double[instances.numAttributes()]; 51 52 // get means and stddevs of data 53 for( int j=0 ; j<instances.numAttributes() ; j++ ) { 54 if( instances.attribute(j)!=classAttribute ) { 55 means[j] = instances.meanOrMode(j); 56 stddevs[j] = Math.sqrt(instances.variance(j)); 57 } 58 } 59 for( int i=0 ; i<instances.numAttributes(); i++) { 60 if( !instances.attribute(i).equals(classAttribute) ) { 61 for( int j=0 ; j<instances.numInstances() ; j++ ) { 62 Instance inst = instances.get(i); 63 double newValue = (inst.value(i)-means[i])/stddevs[i]; 64 if( newValue==Double.NaN ) { 65 System.out.println("foooooo"); 66 } 67 inst.setValue(i, newValue); 68 } 69 } 70 } 71 } 53 /** 54 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 55 * weka.core.Instances) 56 */ 57 @Override 58 public void apply(Instances testdata, Instances traindata) { 59 normalize(testdata); 60 normalize(traindata); 61 } 62 63 private void normalize(Instances instances) { 64 instances.toString(); 65 final Attribute classAttribute = instances.classAttribute(); 66 67 final double[] means = new double[instances.numAttributes()]; 68 final double[] stddevs = new double[instances.numAttributes()]; 69 70 // get means and stddevs of data 71 for (int j = 0; j < instances.numAttributes(); j++) { 72 if (instances.attribute(j) != classAttribute) { 73 means[j] = instances.meanOrMode(j); 74 stddevs[j] = Math.sqrt(instances.variance(j)); 75 } 76 } 77 for (int i = 0; i < instances.numAttributes(); i++) { 78 if (!instances.attribute(i).equals(classAttribute)) { 79 for (int j = 0; j < instances.numInstances(); j++) { 80 Instance inst = instances.get(i); 81 double newValue = (inst.value(i) - means[i]) / stddevs[i]; 82 if (newValue == Double.NaN) { 83 System.out.println("foooooo"); 84 } 85 inst.setValue(i, newValue); 86 } 87 } 88 } 89 } 72 90 73 91 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/ZScoreTargetNormalization.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataprocessing; 2 16 … … 12 26 * @author Steffen Herbold 13 27 */ 14 public class ZScoreTargetNormalization implements ISetWiseProcessingStrategy, IProcessesingStrategy { 28 public class ZScoreTargetNormalization implements ISetWiseProcessingStrategy, IProcessesingStrategy 29 { 15 30 16 /** 17 * Does not have parameters. String is ignored. 18 * @param parameters ignored 19 */ 20 @Override 21 public void setParameter(String parameters) { 22 // dummy 23 } 31 /** 32 * Does not have parameters. String is ignored. 33 * 34 * @param parameters 35 * ignored 36 */ 37 @Override 38 public void setParameter(String parameters) { 39 // dummy 40 } 24 41 25 /** 26 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 27 */ 28 @Override 29 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 30 final Attribute classAttribute = testdata.classAttribute(); 31 32 final double[] meanTest = new double[testdata.numAttributes()]; 33 final double[] stddevTest = new double[testdata.numAttributes()]; 34 35 // get means of testdata 36 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 37 if( testdata.attribute(j)!=classAttribute ) { 38 meanTest[j] = testdata.meanOrMode(j); 39 stddevTest[j] = Math.sqrt(testdata.variance(j)); 40 } 41 } 42 43 // preprocess test data 44 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 45 Instance instance = testdata.instance(i); 46 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 47 if( testdata.attribute(j)!=classAttribute ) { 48 instance.setValue(j, instance.value(j)-meanTest[j]/stddevTest[j]); 49 } 50 } 51 } 52 53 // preprocess training data 54 for( Instances traindata : traindataSet ) { 55 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 56 Instance instance = traindata.instance(i); 57 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 58 if( testdata.attribute(j)!=classAttribute ) { 59 instance.setValue(j, instance.value(j)-meanTest[j]/stddevTest[j]); 60 } 61 } 62 } 63 } 64 } 42 /** 43 * @see de.ugoe.cs.cpdp.dataprocessing.SetWiseProcessingStrategy#apply(weka.core.Instances, 44 * org.apache.commons.collections4.list.SetUniqueList) 45 */ 46 @Override 47 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 48 final Attribute classAttribute = testdata.classAttribute(); 65 49 66 /** 67 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances) 68 */ 69 @Override 70 public void apply(Instances testdata, Instances traindata) { 71 final Attribute classAttribute = testdata.classAttribute(); 72 73 final double[] meanTest = new double[testdata.numAttributes()]; 74 final double[] stddevTest = new double[testdata.numAttributes()]; 75 76 // get means of testdata 77 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 78 if( testdata.attribute(j)!=classAttribute ) { 79 meanTest[j] = testdata.meanOrMode(j); 80 stddevTest[j] = Math.sqrt(testdata.variance(j)); 81 } 82 } 83 84 // preprocess test data 85 for( int i=0 ; i<testdata.numInstances() ; i++ ) { 86 Instance instance = testdata.instance(i); 87 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 88 if( testdata.attribute(j)!=classAttribute ) { 89 instance.setValue(j, instance.value(j)-meanTest[j]/stddevTest[j]); 90 } 91 } 92 } 93 94 // preprocess training data 95 for( int i=0 ; i<traindata.numInstances() ; i++ ) { 96 Instance instance = traindata.instance(i); 97 for( int j=0 ; j<testdata.numAttributes() ; j++ ) { 98 if( testdata.attribute(j)!=classAttribute ) { 99 instance.setValue(j, instance.value(j)-meanTest[j]/stddevTest[j]); 100 } 101 } 102 } 103 } 50 final double[] meanTest = new double[testdata.numAttributes()]; 51 final double[] stddevTest = new double[testdata.numAttributes()]; 52 53 // get means of testdata 54 for (int j = 0; j < testdata.numAttributes(); j++) { 55 if (testdata.attribute(j) != classAttribute) { 56 meanTest[j] = testdata.meanOrMode(j); 57 stddevTest[j] = Math.sqrt(testdata.variance(j)); 58 } 59 } 60 61 // preprocess test data 62 for (int i = 0; i < testdata.numInstances(); i++) { 63 Instance instance = testdata.instance(i); 64 for (int j = 0; j < testdata.numAttributes(); j++) { 65 if (testdata.attribute(j) != classAttribute) { 66 instance.setValue(j, instance.value(j) - meanTest[j] / stddevTest[j]); 67 } 68 } 69 } 70 71 // preprocess training data 72 for (Instances traindata : traindataSet) { 73 for (int i = 0; i < traindata.numInstances(); i++) { 74 Instance instance = traindata.instance(i); 75 for (int j = 0; j < testdata.numAttributes(); j++) { 76 if (testdata.attribute(j) != classAttribute) { 77 instance.setValue(j, instance.value(j) - meanTest[j] / stddevTest[j]); 78 } 79 } 80 } 81 } 82 } 83 84 /** 85 * @see de.ugoe.cs.cpdp.dataprocessing.ProcessesingStrategy#apply(weka.core.Instances, 86 * weka.core.Instances) 87 */ 88 @Override 89 public void apply(Instances testdata, Instances traindata) { 90 final Attribute classAttribute = testdata.classAttribute(); 91 92 final double[] meanTest = new double[testdata.numAttributes()]; 93 final double[] stddevTest = new double[testdata.numAttributes()]; 94 95 // get means of testdata 96 for (int j = 0; j < testdata.numAttributes(); j++) { 97 if (testdata.attribute(j) != classAttribute) { 98 meanTest[j] = testdata.meanOrMode(j); 99 stddevTest[j] = Math.sqrt(testdata.variance(j)); 100 } 101 } 102 103 // preprocess test data 104 for (int i = 0; i < testdata.numInstances(); i++) { 105 Instance instance = testdata.instance(i); 106 for (int j = 0; j < testdata.numAttributes(); j++) { 107 if (testdata.attribute(j) != classAttribute) { 108 instance.setValue(j, instance.value(j) - meanTest[j] / stddevTest[j]); 109 } 110 } 111 } 112 113 // preprocess training data 114 for (int i = 0; i < traindata.numInstances(); i++) { 115 Instance instance = traindata.instance(i); 116 for (int j = 0; j < testdata.numAttributes(); j++) { 117 if (testdata.attribute(j) != classAttribute) { 118 instance.setValue(j, instance.value(j) - meanTest[j] / stddevTest[j]); 119 } 120 } 121 } 122 } 104 123 }
Note: See TracChangeset
for help on using the changeset viewer.