Changeset 41 for trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection
- Timestamp:
- 09/24/15 10:59:05 (9 years ago)
- Location:
- trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection
- Files:
-
- 11 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/AbstractCharacteristicSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 14 28 15 29 /** 16 * Abstract class that implements the foundation of setwise data selection strategies using distributional characteristics. 17 * This class provides the means to transform the data sets into their characteristic vectors. 30 * Abstract class that implements the foundation of setwise data selection strategies using 31 * distributional characteristics. This class provides the means to transform the data sets into 32 * their characteristic vectors. 33 * 18 34 * @author Steffen Herbold 19 35 */ 20 public abstract class AbstractCharacteristicSelection implements 21 ISetWiseDataselectionStrategy { 36 public abstract class AbstractCharacteristicSelection implements ISetWiseDataselectionStrategy { 22 37 23 /** 24 * vector with the distributional characteristics 25 */ 26 private String[] characteristics = new String[]{"mean","stddev"}; 27 28 /** 29 * Sets the distributional characteristics. The names of the characteristics are separated by blanks. 30 */ 31 @Override 32 public void setParameter(String parameters) { 33 if( !"".equals(parameters) ) { 34 characteristics = parameters.split(" "); 35 } 36 } 37 38 /** 39 * Transforms the data into the distributional characteristics. The first instance is the test data, followed by the training data. 40 * @param testdata test data 41 * @param traindataSet training data sets 42 * @return distributional characteristics of the data 43 */ 44 protected Instances characteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) { 45 // setup weka Instances for clustering 46 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 47 48 final Attribute classAtt = testdata.classAttribute(); 49 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 50 Attribute dataAtt = testdata.attribute(i); 51 if( !dataAtt.equals(classAtt) ) { 52 for( String characteristic : characteristics ) { 53 atts.add(new Attribute(dataAtt.name() + "_" + characteristic)); 54 } 55 } 56 } 57 final Instances data = new Instances("distributional_characteristics", atts, 0); 58 59 // setup data for clustering 60 double[] instanceValues = new double[atts.size()]; 61 for( int i=0 ; i<testdata.numAttributes() ; i++ ) { 62 Attribute dataAtt = testdata.attribute(i); 63 if( !dataAtt.equals(classAtt) ) { 64 Stats stats = testdata.attributeStats(i).numericStats; 65 for( int j=0; j<characteristics.length; j++ ) { 66 if( "mean".equals(characteristics[j]) ) { 67 instanceValues[i*characteristics.length+j] = stats.mean; 68 } else if( "stddev".equals(characteristics[j])) { 69 instanceValues[i*characteristics.length+j] = stats.stdDev; 70 } else if( "var".equals(characteristics[j])) { 71 instanceValues[i*characteristics.length+j] = testdata.variance(j); 72 } else { 73 throw new RuntimeException("Unkown distributional characteristic: " + characteristics[j]); 74 } 75 } 76 } 77 } 78 data.add(new DenseInstance(1.0, instanceValues)); 79 80 for( Instances traindata : traindataSet ) { 81 instanceValues = new double[atts.size()]; 82 for( int i=0 ; i<traindata.numAttributes() ; i++ ) { 83 Attribute dataAtt = traindata.attribute(i); 84 if( !dataAtt.equals(classAtt) ) { 85 Stats stats = traindata.attributeStats(i).numericStats; 86 for( int j=0; j<characteristics.length; j++ ) { 87 if( "mean".equals(characteristics[j]) ) { 88 instanceValues[i*characteristics.length+j] = stats.mean; 89 } else if( "stddev".equals(characteristics[j])) { 90 instanceValues[i*characteristics.length+j] = stats.stdDev; 91 } else if( "var".equals(characteristics[j])) { 92 instanceValues[i*characteristics.length+j] = testdata.variance(j); 93 } else { 94 throw new RuntimeException("Unkown distributional characteristic: " + characteristics[j]); 95 } 96 } 97 } 98 } 99 Instance instance = new DenseInstance(1.0, instanceValues); 100 101 data.add(instance); 102 } 103 return data; 104 } 105 106 /** 107 * Returns the normalized distributional characteristics of the training data. 108 * @param testdata test data 109 * @param traindataSet training data sets 110 * @return normalized distributional characteristics of the data 111 */ 112 protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) { 113 Instances data = characteristicInstances(testdata, traindataSet); 114 try { 115 final Normalize normalizer = new Normalize(); 116 normalizer.setInputFormat(data); 117 data = Filter.useFilter(data, normalizer); 118 } catch (Exception e) { 119 throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e); 120 } 121 return data; 122 } 38 /** 39 * vector with the distributional characteristics 40 */ 41 private String[] characteristics = new String[] 42 { "mean", "stddev" }; 43 44 /** 45 * Sets the distributional characteristics. The names of the characteristics are separated by 46 * blanks. 47 */ 48 @Override 49 public void setParameter(String parameters) { 50 if (!"".equals(parameters)) { 51 characteristics = parameters.split(" "); 52 } 53 } 54 55 /** 56 * Transforms the data into the distributional characteristics. The first instance is the test 57 * data, followed by the training data. 58 * 59 * @param testdata 60 * test data 61 * @param traindataSet 62 * training data sets 63 * @return distributional characteristics of the data 64 */ 65 protected Instances characteristicInstances(Instances testdata, 66 SetUniqueList<Instances> traindataSet) 67 { 68 // setup weka Instances for clustering 69 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 70 71 final Attribute classAtt = testdata.classAttribute(); 72 for (int i = 0; i < testdata.numAttributes(); i++) { 73 Attribute dataAtt = testdata.attribute(i); 74 if (!dataAtt.equals(classAtt)) { 75 for (String characteristic : characteristics) { 76 atts.add(new Attribute(dataAtt.name() + "_" + characteristic)); 77 } 78 } 79 } 80 final Instances data = new Instances("distributional_characteristics", atts, 0); 81 82 // setup data for clustering 83 double[] instanceValues = new double[atts.size()]; 84 for (int i = 0; i < testdata.numAttributes(); i++) { 85 Attribute dataAtt = testdata.attribute(i); 86 if (!dataAtt.equals(classAtt)) { 87 Stats stats = testdata.attributeStats(i).numericStats; 88 for (int j = 0; j < characteristics.length; j++) { 89 if ("mean".equals(characteristics[j])) { 90 instanceValues[i * characteristics.length + j] = stats.mean; 91 } 92 else if ("stddev".equals(characteristics[j])) { 93 instanceValues[i * characteristics.length + j] = stats.stdDev; 94 } 95 else if ("var".equals(characteristics[j])) { 96 instanceValues[i * characteristics.length + j] = testdata.variance(j); 97 } 98 else { 99 throw new RuntimeException("Unkown distributional characteristic: " + 100 characteristics[j]); 101 } 102 } 103 } 104 } 105 data.add(new DenseInstance(1.0, instanceValues)); 106 107 for (Instances traindata : traindataSet) { 108 instanceValues = new double[atts.size()]; 109 for (int i = 0; i < traindata.numAttributes(); i++) { 110 Attribute dataAtt = traindata.attribute(i); 111 if (!dataAtt.equals(classAtt)) { 112 Stats stats = traindata.attributeStats(i).numericStats; 113 for (int j = 0; j < characteristics.length; j++) { 114 if ("mean".equals(characteristics[j])) { 115 instanceValues[i * characteristics.length + j] = stats.mean; 116 } 117 else if ("stddev".equals(characteristics[j])) { 118 instanceValues[i * characteristics.length + j] = stats.stdDev; 119 } 120 else if ("var".equals(characteristics[j])) { 121 instanceValues[i * characteristics.length + j] = testdata.variance(j); 122 } 123 else { 124 throw new RuntimeException("Unkown distributional characteristic: " + 125 characteristics[j]); 126 } 127 } 128 } 129 } 130 Instance instance = new DenseInstance(1.0, instanceValues); 131 132 data.add(instance); 133 } 134 return data; 135 } 136 137 /** 138 * Returns the normalized distributional characteristics of the training data. 139 * 140 * @param testdata 141 * test data 142 * @param traindataSet 143 * training data sets 144 * @return normalized distributional characteristics of the data 145 */ 146 protected Instances normalizedCharacteristicInstances(Instances testdata, 147 SetUniqueList<Instances> traindataSet) 148 { 149 Instances data = characteristicInstances(testdata, traindataSet); 150 try { 151 final Normalize normalizer = new Normalize(); 152 normalizer.setInputFormat(data); 153 data = Filter.useFilter(data, normalizer); 154 } 155 catch (Exception e) { 156 throw new RuntimeException( 157 "Unexpected exception during normalization of distributional characteristics.", 158 e); 159 } 160 return data; 161 } 123 162 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/IPointWiseDataselectionStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 6 20 7 21 /** 8 * Interface for pointwise data selection strategies. 22 * Interface for pointwise data selection strategies. 23 * 9 24 * @author Steffen Herbold 10 25 */ 11 26 public interface IPointWiseDataselectionStrategy extends IParameterizable { 12 27 13 /** 14 * Applies the data selection strategy. 15 * @param testdata test data 16 * @param traindata candidate training data 17 * @return the selected training data 18 */ 19 Instances apply(Instances testdata, Instances traindata); 28 /** 29 * Applies the data selection strategy. 30 * 31 * @param testdata 32 * test data 33 * @param traindata 34 * candidate training data 35 * @return the selected training data 36 */ 37 Instances apply(Instances testdata, Instances traindata); 20 38 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/ISetWiseDataselectionStrategy.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 9 23 /** 10 24 * Interface for setwise data selection strategies. 25 * 11 26 * @author Steffen Herbold 12 27 */ 13 28 public interface ISetWiseDataselectionStrategy extends IParameterizable { 14 29 15 /** 16 * Applies a setwise data selection strategy. 17 * @param testdata test data for which the training data is selected 18 * @param traindataSet candidate training data 19 */ 20 void apply(Instances testdata, SetUniqueList<Instances> traindataSet); 30 /** 31 * Applies a setwise data selection strategy. 32 * 33 * @param testdata 34 * test data for which the training data is selected 35 * @param traindataSet 36 * candidate training data 37 */ 38 void apply(Instances testdata, SetUniqueList<Instances> traindataSet); 21 39 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PetersFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 13 27 14 28 /** 15 * Filter according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction 16 * <br><br> 17 * This filter does not work, the paper has been withdrawn. 29 * Filter according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction <br> 30 * <br> 31 * This filter does not work, the paper has been withdrawn. 32 * 18 33 * @author Steffen Herbold 19 34 */ … … 21 36 public class PetersFilter implements IPointWiseDataselectionStrategy { 22 37 23 24 /** 25 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 26 */ 27 @Override 28 public void setParameter(String parameters) { 29 // dummy 30 } 38 /** 39 * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) 40 */ 41 @Override 42 public void setParameter(String parameters) { 43 // dummy 44 } 31 45 32 /** 33 * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances, weka.core.Instances) 34 */ 35 @Override 36 public Instances apply(Instances testdata, Instances traindata) { 37 final Attribute classAttribute = testdata.classAttribute(); 38 39 final double[][] testDoubles = new double[testdata.numInstances()][testdata.numAttributes()]; 40 for( int i=0; i<testdata.numInstances() ; i++ ) { 41 Instance instance = testdata.instance(i); 42 int tmp = 0; 43 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 44 if( testdata.attribute(j)!=classAttribute ) { 45 testDoubles[i][tmp++] = instance.value(j); 46 } 47 } 48 } 49 50 final double[][] trainDoubles = new double[traindata.numInstances()][testdata.numAttributes()]; 51 for( int i=0; i<traindata.numInstances() ; i++ ) { 52 Instance instance = traindata.instance(i); 53 int tmp = 0; 54 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 55 if( testdata.attribute(j)!=classAttribute ) { 56 trainDoubles[i][tmp++] = instance.value(j); 57 } 58 } 59 } 60 61 final List<List<Integer>> fanList = new ArrayList<List<Integer>>(testdata.numInstances()); 62 for( int i=0; i<testdata.numInstances(); i++ ) { 63 fanList.add(new LinkedList<Integer>()); 64 } 65 66 for( int i=0; i<traindata.numInstances(); i++ ) { 67 double minDistance = Double.MAX_VALUE; 68 int minIndex = 0; 69 for( int j=0; j<testdata.numInstances(); j++ ) { 70 double distance = MathArrays.distance(trainDoubles[i], testDoubles[j]); 71 if( distance<minDistance ) { 72 minDistance = distance; 73 minIndex = j; 74 } 75 } 76 fanList.get(minIndex).add(i); 77 } 78 79 final SetUniqueList<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 80 for( int i=0; i<testdata.numInstances(); i++ ) { 81 double minDistance = Double.MAX_VALUE; 82 int minIndex = -1; 83 for( Integer j : fanList.get(i) ) { 84 double distance = MathArrays.distance(testDoubles[i], trainDoubles[j]); 85 if( distance<minDistance && distance>0.0d ) { 86 minDistance = distance; 87 minIndex = j; 88 } 89 } 90 if( minIndex!=-1 ) { 91 selectedIndex.add(minIndex); 92 } 93 } 94 95 final Instances selected = new Instances(testdata); 96 selected.delete(); 97 for( Integer i : selectedIndex) { 98 selected.add(traindata.instance(i)); 99 } 100 return selected; 101 } 46 /** 47 * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances, 48 * weka.core.Instances) 49 */ 50 @Override 51 public Instances apply(Instances testdata, Instances traindata) { 52 final Attribute classAttribute = testdata.classAttribute(); 53 54 final double[][] testDoubles = 55 new double[testdata.numInstances()][testdata.numAttributes()]; 56 for (int i = 0; i < testdata.numInstances(); i++) { 57 Instance instance = testdata.instance(i); 58 int tmp = 0; 59 for (int j = 0; j < testdata.numAttributes(); j++) { 60 if (testdata.attribute(j) != classAttribute) { 61 testDoubles[i][tmp++] = instance.value(j); 62 } 63 } 64 } 65 66 final double[][] trainDoubles = 67 new double[traindata.numInstances()][testdata.numAttributes()]; 68 for (int i = 0; i < traindata.numInstances(); i++) { 69 Instance instance = traindata.instance(i); 70 int tmp = 0; 71 for (int j = 0; j < testdata.numAttributes(); j++) { 72 if (testdata.attribute(j) != classAttribute) { 73 trainDoubles[i][tmp++] = instance.value(j); 74 } 75 } 76 } 77 78 final List<List<Integer>> fanList = new ArrayList<List<Integer>>(testdata.numInstances()); 79 for (int i = 0; i < testdata.numInstances(); i++) { 80 fanList.add(new LinkedList<Integer>()); 81 } 82 83 for (int i = 0; i < traindata.numInstances(); i++) { 84 double minDistance = Double.MAX_VALUE; 85 int minIndex = 0; 86 for (int j = 0; j < testdata.numInstances(); j++) { 87 double distance = MathArrays.distance(trainDoubles[i], testDoubles[j]); 88 if (distance < minDistance) { 89 minDistance = distance; 90 minIndex = j; 91 } 92 } 93 fanList.get(minIndex).add(i); 94 } 95 96 final SetUniqueList<Integer> selectedIndex = 97 SetUniqueList.setUniqueList(new LinkedList<Integer>()); 98 for (int i = 0; i < testdata.numInstances(); i++) { 99 double minDistance = Double.MAX_VALUE; 100 int minIndex = -1; 101 for (Integer j : fanList.get(i)) { 102 double distance = MathArrays.distance(testDoubles[i], trainDoubles[j]); 103 if (distance < minDistance && distance > 0.0d) { 104 minDistance = distance; 105 minIndex = j; 106 } 107 } 108 if (minIndex != -1) { 109 selectedIndex.add(minIndex); 110 } 111 } 112 113 final Instances selected = new Instances(testdata); 114 selected.delete(); 115 for (Integer i : selectedIndex) { 116 selected.add(traindata.instance(i)); 117 } 118 return selected; 119 } 102 120 103 121 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PointWiseEMClusterSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 14 28 import de.ugoe.cs.util.console.Console; 15 29 16 17 30 /** 18 31 * Use in Config: 19 32 * 20 * Specify number of clusters 21 * -N = Num Clusters 22 * <pointwiseselector name="PointWiseEMClusterSelection" param="-N 10"/> 23 * 24 * Try to determine the number of clusters: 25 * -I 10 = max iterations 26 * -X 5 = 5 folds for cross evaluation 27 * -max = max number of clusters 28 * <pointwiseselector name="PointWiseEMClusterSelection" param="-I 10 -X 5 -max 300"/> 33 * Specify number of clusters -N = Num Clusters <pointwiseselector 34 * name="PointWiseEMClusterSelection" param="-N 10"/> 29 35 * 30 * Don't forget to add: 31 * <preprocessor name="Normalization" param=""/> 36 * Try to determine the number of clusters: -I 10 = max iterations -X 5 = 5 folds for cross 37 * evaluation -max = max number of clusters <pointwiseselector name="PointWiseEMClusterSelection" 38 * param="-I 10 -X 5 -max 300"/> 39 * 40 * Don't forget to add: <preprocessor name="Normalization" param=""/> 32 41 */ 33 42 public class PointWiseEMClusterSelection implements IPointWiseDataselectionStrategy { 34 35 private String[] params;36 37 @Override38 public void setParameter(String parameters) {39 params = parameters.split(" ");40 }41 43 42 43 /** 44 * 1. Cluster the traindata 45 * 2. for each instance in the testdata find the assigned cluster 46 * 3. select only traindata from the clusters we found in our testdata 47 * 48 * @returns the selected training data 49 */ 50 @Override 51 public Instances apply(Instances testdata, Instances traindata) { 52 //final Attribute classAttribute = testdata.classAttribute(); 53 54 final List<Integer> selectedCluster = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 44 private String[] params; 55 45 56 // 1. copy train- and testdata 57 Instances train = new Instances(traindata); 58 Instances test = new Instances(testdata); 59 60 Instances selected = null; 61 62 try { 63 // remove class attribute from traindata 64 Remove filter = new Remove(); 65 filter.setAttributeIndices("" + (train.classIndex() + 1)); 66 filter.setInputFormat(train); 67 train = Filter.useFilter(train, filter); 68 69 Console.traceln(Level.INFO, String.format("starting clustering")); 70 71 // 3. cluster data 72 EM clusterer = new EM(); 73 clusterer.setOptions(params); 74 clusterer.buildClusterer(train); 75 int numClusters = clusterer.getNumClusters(); 76 if ( numClusters == -1) { 77 Console.traceln(Level.INFO, String.format("we have unlimited clusters")); 78 }else { 79 Console.traceln(Level.INFO, String.format("we have: "+numClusters+" clusters")); 80 } 81 82 83 // 4. classify testdata, save cluster int 84 85 // remove class attribute from testdata? 86 Remove filter2 = new Remove(); 87 filter2.setAttributeIndices("" + (test.classIndex() + 1)); 88 filter2.setInputFormat(test); 89 test = Filter.useFilter(test, filter2); 90 91 int cnum; 92 for( int i=0; i < test.numInstances(); i++ ) { 93 cnum = ((EM)clusterer).clusterInstance(test.get(i)); 46 @Override 47 public void setParameter(String parameters) { 48 params = parameters.split(" "); 49 } 94 50 95 // we dont want doubles (maybe use a hashset instead of list?) 96 if ( !selectedCluster.contains(cnum) ) { 97 selectedCluster.add(cnum); 98 //Console.traceln(Level.INFO, String.format("assigned to cluster: "+cnum)); 99 } 100 } 101 102 Console.traceln(Level.INFO, String.format("our testdata is in: "+selectedCluster.size()+" different clusters")); 103 104 // 5. get cluster membership of our traindata 105 AddCluster cfilter = new AddCluster(); 106 cfilter.setClusterer(clusterer); 107 cfilter.setInputFormat(train); 108 Instances ctrain = Filter.useFilter(train, cfilter); 109 110 111 // 6. for all traindata get the cluster int, if it is in our list of testdata cluster int add the traindata 112 // of this cluster to our returned traindata 113 int cnumber; 114 selected = new Instances(traindata); 115 selected.delete(); 116 117 for ( int j=0; j < ctrain.numInstances(); j++ ) { 118 // get the cluster number from the attributes 119 cnumber = Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes()-1).replace("cluster", "")); 120 121 //Console.traceln(Level.INFO, String.format("instance "+j+" is in cluster: "+cnumber)); 122 if ( selectedCluster.contains(cnumber) ) { 123 // this only works if the index does not change 124 selected.add(traindata.get(j)); 125 // check for differences, just one attribute, we are pretty sure the index does not change 126 if ( traindata.get(j).value(3) != ctrain.get(j).value(3) ) { 127 Console.traceln(Level.WARNING, String.format("we have a difference between train an ctrain!")); 128 } 129 } 130 } 131 132 Console.traceln(Level.INFO, String.format("that leaves us with: "+selected.numInstances()+" traindata instances from "+traindata.numInstances())); 133 }catch( Exception e ) { 134 Console.traceln(Level.WARNING, String.format("ERROR")); 135 throw new RuntimeException("error in pointwise em", e); 136 } 137 138 return selected; 139 } 51 /** 52 * 1. Cluster the traindata 2. for each instance in the testdata find the assigned cluster 3. 53 * select only traindata from the clusters we found in our testdata 54 * 55 * @returns the selected training data 56 */ 57 @Override 58 public Instances apply(Instances testdata, Instances traindata) { 59 // final Attribute classAttribute = testdata.classAttribute(); 60 61 final List<Integer> selectedCluster = 62 SetUniqueList.setUniqueList(new LinkedList<Integer>()); 63 64 // 1. copy train- and testdata 65 Instances train = new Instances(traindata); 66 Instances test = new Instances(testdata); 67 68 Instances selected = null; 69 70 try { 71 // remove class attribute from traindata 72 Remove filter = new Remove(); 73 filter.setAttributeIndices("" + (train.classIndex() + 1)); 74 filter.setInputFormat(train); 75 train = Filter.useFilter(train, filter); 76 77 Console.traceln(Level.INFO, String.format("starting clustering")); 78 79 // 3. cluster data 80 EM clusterer = new EM(); 81 clusterer.setOptions(params); 82 clusterer.buildClusterer(train); 83 int numClusters = clusterer.getNumClusters(); 84 if (numClusters == -1) { 85 Console.traceln(Level.INFO, String.format("we have unlimited clusters")); 86 } 87 else { 88 Console.traceln(Level.INFO, String.format("we have: " + numClusters + " clusters")); 89 } 90 91 // 4. classify testdata, save cluster int 92 93 // remove class attribute from testdata? 94 Remove filter2 = new Remove(); 95 filter2.setAttributeIndices("" + (test.classIndex() + 1)); 96 filter2.setInputFormat(test); 97 test = Filter.useFilter(test, filter2); 98 99 int cnum; 100 for (int i = 0; i < test.numInstances(); i++) { 101 cnum = ((EM) clusterer).clusterInstance(test.get(i)); 102 103 // we dont want doubles (maybe use a hashset instead of list?) 104 if (!selectedCluster.contains(cnum)) { 105 selectedCluster.add(cnum); 106 // Console.traceln(Level.INFO, String.format("assigned to cluster: "+cnum)); 107 } 108 } 109 110 Console.traceln(Level.INFO, 111 String.format("our testdata is in: " + selectedCluster.size() + 112 " different clusters")); 113 114 // 5. get cluster membership of our traindata 115 AddCluster cfilter = new AddCluster(); 116 cfilter.setClusterer(clusterer); 117 cfilter.setInputFormat(train); 118 Instances ctrain = Filter.useFilter(train, cfilter); 119 120 // 6. for all traindata get the cluster int, if it is in our list of testdata cluster 121 // int add the traindata 122 // of this cluster to our returned traindata 123 int cnumber; 124 selected = new Instances(traindata); 125 selected.delete(); 126 127 for (int j = 0; j < ctrain.numInstances(); j++) { 128 // get the cluster number from the attributes 129 cnumber = 130 Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes() - 1) 131 .replace("cluster", "")); 132 133 // Console.traceln(Level.INFO, 134 // String.format("instance "+j+" is in cluster: "+cnumber)); 135 if (selectedCluster.contains(cnumber)) { 136 // this only works if the index does not change 137 selected.add(traindata.get(j)); 138 // check for differences, just one attribute, we are pretty sure the index does 139 // not change 140 if (traindata.get(j).value(3) != ctrain.get(j).value(3)) { 141 Console.traceln(Level.WARNING, String 142 .format("we have a difference between train an ctrain!")); 143 } 144 } 145 } 146 147 Console.traceln(Level.INFO, 148 String.format("that leaves us with: " + selected.numInstances() + 149 " traindata instances from " + traindata.numInstances())); 150 } 151 catch (Exception e) { 152 Console.traceln(Level.WARNING, String.format("ERROR")); 153 throw new RuntimeException("error in pointwise em", e); 154 } 155 156 return selected; 157 } 140 158 141 159 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SeparatabilitySelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 13 27 14 28 /** 15 * A setwise data selection strategy based on the separatability of the training data from the test data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on Defect Prediction. 16 * <br><br> 17 * This is calculated through the error of a logistic regression classifier that tries to separate the sets. 29 * A setwise data selection strategy based on the separatability of the training data from the test 30 * data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An 31 * Empirical Study on Defect Prediction. <br> 32 * <br> 33 * This is calculated through the error of a logistic regression classifier that tries to separate 34 * the sets. 35 * 18 36 * @author Steffen Herbold 19 37 */ 20 38 public class SeparatabilitySelection implements ISetWiseDataselectionStrategy { 21 39 22 /** 23 * size of the random sample that is drawn from both test data and training data 24 */ 25 private final int sampleSize = 500; 26 27 /** 28 * number of repetitions of the sample drawing 29 */ 30 private final int maxRep = 10; 31 32 /** 33 * number of neighbors that are selected 34 */ 35 private int neighbors = 10; 36 37 /** 38 * Sets the number of neighbors that are selected. 39 */ 40 @Override 41 public void setParameter(String parameters) { 42 if( !"".equals(parameters) ) { 43 neighbors = Integer.parseInt(parameters); 44 } 45 } 40 /** 41 * size of the random sample that is drawn from both test data and training data 42 */ 43 private final int sampleSize = 500; 46 44 47 /** 48 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 49 */ 50 @Override 51 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 52 final Random rand = new Random(1); 53 54 // calculate distances between testdata and traindata 55 final double[] distances = new double[traindataSet.size()]; 56 57 int i=0; 58 for( Instances traindata : traindataSet ) { 59 double distance = 0.0; 60 for( int rep=0; rep<maxRep ; rep++ ) { 61 // sample instances 62 Instances sample = new Instances(testdata); 63 for( int j=0; j<sampleSize; j++ ) { 64 Instance inst = new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 65 inst.setDataset(sample); 66 inst.setClassValue(1.0); 67 sample.add(inst); 68 inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances()))); 69 inst.setDataset(sample); 70 inst.setClassValue(0.0); 71 sample.add(inst); 72 } 73 74 // calculate separation 75 Evaluation eval; 76 try { 77 eval = new Evaluation(sample); 78 eval.crossValidateModel(new Logistic(), sample, 5, rand); 79 } catch (Exception e) { 80 throw new RuntimeException("cross-validation during calculation of separatability failed", e); 81 } 82 distance += eval.pctCorrect()/100.0; 83 } 84 distances[i++] = 2*((distance/maxRep)-0.5); 85 } 86 87 // select closest neighbors 88 final double[] distancesCopy = Arrays.copyOf(distances, distances.length); 89 Arrays.sort(distancesCopy); 90 final double cutoffDistance = distancesCopy[neighbors]; 91 92 for( i=traindataSet.size()-1; i>=0 ; i-- ) { 93 if( distances[i]>cutoffDistance ) { 94 traindataSet.remove(i); 95 } 96 } 97 } 45 /** 46 * number of repetitions of the sample drawing 47 */ 48 private final int maxRep = 10; 49 50 /** 51 * number of neighbors that are selected 52 */ 53 private int neighbors = 10; 54 55 /** 56 * Sets the number of neighbors that are selected. 57 */ 58 @Override 59 public void setParameter(String parameters) { 60 if (!"".equals(parameters)) { 61 neighbors = Integer.parseInt(parameters); 62 } 63 } 64 65 /** 66 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 67 * org.apache.commons.collections4.list.SetUniqueList) 68 */ 69 @Override 70 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 71 final Random rand = new Random(1); 72 73 // calculate distances between testdata and traindata 74 final double[] distances = new double[traindataSet.size()]; 75 76 int i = 0; 77 for (Instances traindata : traindataSet) { 78 double distance = 0.0; 79 for (int rep = 0; rep < maxRep; rep++) { 80 // sample instances 81 Instances sample = new Instances(testdata); 82 for (int j = 0; j < sampleSize; j++) { 83 Instance inst = 84 new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances()))); 85 inst.setDataset(sample); 86 inst.setClassValue(1.0); 87 sample.add(inst); 88 inst = 89 new DenseInstance( 90 traindata.instance(rand.nextInt(traindata.numInstances()))); 91 inst.setDataset(sample); 92 inst.setClassValue(0.0); 93 sample.add(inst); 94 } 95 96 // calculate separation 97 Evaluation eval; 98 try { 99 eval = new Evaluation(sample); 100 eval.crossValidateModel(new Logistic(), sample, 5, rand); 101 } 102 catch (Exception e) { 103 throw new RuntimeException( 104 "cross-validation during calculation of separatability failed", 105 e); 106 } 107 distance += eval.pctCorrect() / 100.0; 108 } 109 distances[i++] = 2 * ((distance / maxRep) - 0.5); 110 } 111 112 // select closest neighbors 113 final double[] distancesCopy = Arrays.copyOf(distances, distances.length); 114 Arrays.sort(distancesCopy); 115 final double cutoffDistance = distancesCopy[neighbors]; 116 117 for (i = traindataSet.size() - 1; i >= 0; i--) { 118 if (distances[i] > cutoffDistance) { 119 traindataSet.remove(i); 120 } 121 } 122 } 98 123 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMClusterSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 11 25 12 26 /** 13 * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect prediction 27 * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect 28 * prediction 29 * 14 30 * @author Steffen Herbold 15 31 */ 16 32 public class SetWiseEMClusterSelection extends AbstractCharacteristicSelection { 17 18 /** 19 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 20 */ 21 @Override 22 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 23 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 24 final Instance targetInstance = data.instance(0); 25 final List<Instance> candidateInstances = new LinkedList<Instance>(); 26 for( int i=1; i<data.numInstances(); i++ ) { 27 candidateInstances.add(data.instance(i)); 28 } 29 30 // cluster and select 31 try { 32 final EM emeans = new EM(); 33 boolean onlyTarget = true; 34 int targetCluster; 35 int maxNumClusters = candidateInstances.size(); 36 do { // while(onlyTarget) 37 emeans.setMaximumNumberOfClusters(maxNumClusters); 38 emeans.buildClusterer(data); 39 40 targetCluster = emeans.clusterInstance(targetInstance); 41 42 // check if cluster only contains target project 43 for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) { 44 onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster); 45 } 46 maxNumClusters = emeans.numberOfClusters()-1; 47 } while(onlyTarget); 48 49 int numRemoved = 0; 50 for( int i=0 ; i<candidateInstances.size() ; i++ ) { 51 if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) { 52 traindataSet.remove(i-numRemoved++); 53 } 54 } 55 } catch(Exception e) { 56 throw new RuntimeException("error applying setwise EM clustering training data selection", e); 57 } 58 } 33 34 /** 35 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 36 * org.apache.commons.collections4.list.SetUniqueList) 37 */ 38 @Override 39 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 40 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 41 final Instance targetInstance = data.instance(0); 42 final List<Instance> candidateInstances = new LinkedList<Instance>(); 43 for (int i = 1; i < data.numInstances(); i++) { 44 candidateInstances.add(data.instance(i)); 45 } 46 47 // cluster and select 48 try { 49 final EM emeans = new EM(); 50 boolean onlyTarget = true; 51 int targetCluster; 52 int maxNumClusters = candidateInstances.size(); 53 do { // while(onlyTarget) 54 emeans.setMaximumNumberOfClusters(maxNumClusters); 55 emeans.buildClusterer(data); 56 57 targetCluster = emeans.clusterInstance(targetInstance); 58 59 // check if cluster only contains target project 60 for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) { 61 onlyTarget &= 62 !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster); 63 } 64 maxNumClusters = emeans.numberOfClusters() - 1; 65 } 66 while (onlyTarget); 67 68 int numRemoved = 0; 69 for (int i = 0; i < candidateInstances.size(); i++) { 70 if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) { 71 traindataSet.remove(i - numRemoved++); 72 } 73 } 74 } 75 catch (Exception e) { 76 throw new RuntimeException( 77 "error applying setwise EM clustering training data selection", 78 e); 79 } 80 } 59 81 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 22 36 * Selects training data by clustering project context factors. 23 37 * 24 * The project context factors used for the clustering are configured in 25 * the XML param attribute, Example: 26 * <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" /> 38 * The project context factors used for the clustering are configured in the XML param attribute, 39 * Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" /> 27 40 */ 28 41 public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy { 29 30 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 31 32 @Override 33 public void setParameter(String parameters) { 34 if( parameters!=null ) { 35 project_context_factors = parameters.split(" "); 36 } 37 } 38 39 /** 40 * Uses the Weka EM-Clustering algorithm to cluster the projects 41 * by their project context factors. 42 * The project context factors are first normalized and then used for clustering. 43 * They can be configured in the configuration param. 44 * 45 * @param testdata 46 * @param traindataSet 47 */ 48 protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) { 49 // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf 50 final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); 51 52 final Instance targetInstance = data.instance(0); 53 final List<Instance> candidateInstances = new LinkedList<Instance>(); 54 for( int i=1; i<data.numInstances(); i++ ) { 55 candidateInstances.add(data.instance(i)); 56 } 57 58 // cluster and select 59 try { 60 final EM emeans = new EM(); 61 boolean onlyTarget = true; 62 int targetCluster; 63 int maxNumClusters = candidateInstances.size(); 64 65 do { // while(onlyTarget) 66 emeans.setMaximumNumberOfClusters(maxNumClusters); 67 emeans.buildClusterer(data); 68 69 targetCluster = emeans.clusterInstance(targetInstance); 70 71 // check if cluster only contains target project 72 for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) { 73 onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster); 74 } 75 maxNumClusters = emeans.numberOfClusters()-1; 76 77 //Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); 78 } while(onlyTarget); 79 80 Console.traceln(Level.INFO, "clusters: " + maxNumClusters); 81 Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); 82 int numRemoved = 0; 83 for( int i=0 ; i<candidateInstances.size() ; i++ ) { 84 if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) { 85 traindataSet.remove(i-numRemoved++); 86 } 87 } 88 Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); 89 } catch(Exception e) { 90 throw new RuntimeException("error applying setwise EM clustering training data selection", e); 91 } 92 } 93 94 @Override 95 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 96 // issuetracking und pl muss passen 97 /* 98 int s = traindataSet.size(); 99 Console.traceln(Level.INFO, "remove non matching PL and IssueTracking projects, size now: " + s); 100 this.removeWrongContext(testdata, traindataSet, "PL"); 101 this.removeWrongContext(testdata, traindataSet, "IssueTracking"); 102 s = traindataSet.size(); 103 Console.traceln(Level.INFO, "size after removal: " + s); 104 */ 105 // now cluster 106 this.cluster(testdata, traindataSet); 107 } 108 109 /** 110 * Returns test- and training data with only the project context factors 111 * which were chosen in the configuration. 112 * This is later used for clustering. 113 * 114 * @param testdata 115 * @param traindataSet 116 * @return 117 */ 118 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) { 119 // setup weka Instances for clustering 120 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 121 122 // we only want the project context factors 123 for( String pcf : this.project_context_factors ) { 124 atts.add(new Attribute(pcf)); 125 } 126 127 // set up the data 128 final Instances data = new Instances("project_context_factors", atts, 0); 129 double[] instanceValues = new double[atts.size()]; 130 131 // only project context factors + only one instance per project needed 132 int i = 0; 133 for( String pcf : this.project_context_factors ) { 134 instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); 135 //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]); 136 i++; 137 } 138 data.add(new DenseInstance(1.0, instanceValues)); 139 140 // now for the projects of the training stet 141 for( Instances traindata : traindataSet ) { 142 instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! 143 i = 0; 144 for( String pcf : this.project_context_factors ) { 145 instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); 146 //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]); 147 i++; 148 } 149 150 data.add(new DenseInstance(1.0, instanceValues)); 151 } 152 153 return data; 154 } 155 156 /** 157 * Delete projects where the project context does not match the training project 158 * 159 * @param testdata 160 * @param traindataSet 161 * @param attribute 162 */ 163 protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet, String attribute) { 164 Set<Instances> remove = new HashSet<Instances>(); 165 for( Instances traindata : traindataSet ) { 166 if( traindata.firstInstance().value(traindata.attribute(attribute)) != testdata.firstInstance().value(testdata.attribute(attribute)) ) { 167 remove.add(traindata); 168 //Console.traceln(Level.WARNING, "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 169 } 170 } 171 172 // now delete the projects from set 173 for( Instances i : remove ) { 174 traindataSet.remove(i); 175 //Console.traceln(Level.INFO, "removing training project from set"); 176 } 177 } 178 179 /** 180 * Normalizes the data before it gets used for clustering 181 * 182 * @param testdata 183 * @param traindataSet 184 * @return 185 */ 186 protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) { 187 Instances data = this.getContextFactors(testdata, traindataSet); 188 try { 189 final Normalize normalizer = new Normalize(); 190 normalizer.setInputFormat(data); 191 data = Filter.useFilter(data, normalizer); 192 } catch (Exception e) { 193 throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e); 194 } 195 return data; 196 } 42 43 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 44 45 @Override 46 public void setParameter(String parameters) { 47 if (parameters != null) { 48 project_context_factors = parameters.split(" "); 49 } 50 } 51 52 /** 53 * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context 54 * factors. The project context factors are first normalized and then used for clustering. They 55 * can be configured in the configuration param. 56 * 57 * @param testdata 58 * @param traindataSet 59 */ 60 protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) { 61 // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf 62 final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); 63 64 final Instance targetInstance = data.instance(0); 65 final List<Instance> candidateInstances = new LinkedList<Instance>(); 66 for (int i = 1; i < data.numInstances(); i++) { 67 candidateInstances.add(data.instance(i)); 68 } 69 70 // cluster and select 71 try { 72 final EM emeans = new EM(); 73 boolean onlyTarget = true; 74 int targetCluster; 75 int maxNumClusters = candidateInstances.size(); 76 77 do { // while(onlyTarget) 78 emeans.setMaximumNumberOfClusters(maxNumClusters); 79 emeans.buildClusterer(data); 80 81 targetCluster = emeans.clusterInstance(targetInstance); 82 83 // check if cluster only contains target project 84 for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) { 85 onlyTarget &= 86 !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster); 87 } 88 maxNumClusters = emeans.numberOfClusters() - 1; 89 90 // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); 91 } 92 while (onlyTarget); 93 94 Console.traceln(Level.INFO, "clusters: " + maxNumClusters); 95 Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); 96 int numRemoved = 0; 97 for (int i = 0; i < candidateInstances.size(); i++) { 98 if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) { 99 traindataSet.remove(i - numRemoved++); 100 } 101 } 102 Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); 103 } 104 catch (Exception e) { 105 throw new RuntimeException( 106 "error applying setwise EM clustering training data selection", 107 e); 108 } 109 } 110 111 @Override 112 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 113 // issuetracking und pl muss passen 114 /* 115 * int s = traindataSet.size(); Console.traceln(Level.INFO, 116 * "remove non matching PL and IssueTracking projects, size now: " + s); 117 * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata, 118 * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO, 119 * "size after removal: " + s); 120 */ 121 // now cluster 122 this.cluster(testdata, traindataSet); 123 } 124 125 /** 126 * Returns test- and training data with only the project context factors which were chosen in 127 * the configuration. This is later used for clustering. 128 * 129 * @param testdata 130 * @param traindataSet 131 * @return 132 */ 133 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) 134 { 135 // setup weka Instances for clustering 136 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 137 138 // we only want the project context factors 139 for (String pcf : this.project_context_factors) { 140 atts.add(new Attribute(pcf)); 141 } 142 143 // set up the data 144 final Instances data = new Instances("project_context_factors", atts, 0); 145 double[] instanceValues = new double[atts.size()]; 146 147 // only project context factors + only one instance per project needed 148 int i = 0; 149 for (String pcf : this.project_context_factors) { 150 instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); 151 // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + 152 // instanceValues[i]); 153 i++; 154 } 155 data.add(new DenseInstance(1.0, instanceValues)); 156 157 // now for the projects of the training stet 158 for (Instances traindata : traindataSet) { 159 instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! 160 i = 0; 161 for (String pcf : this.project_context_factors) { 162 instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); 163 // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + 164 // instanceValues[i]); 165 i++; 166 } 167 168 data.add(new DenseInstance(1.0, instanceValues)); 169 } 170 171 return data; 172 } 173 174 /** 175 * Delete projects where the project context does not match the training project 176 * 177 * @param testdata 178 * @param traindataSet 179 * @param attribute 180 */ 181 protected void removeWrongContext(Instances testdata, 182 SetUniqueList<Instances> traindataSet, 183 String attribute) 184 { 185 Set<Instances> remove = new HashSet<Instances>(); 186 for (Instances traindata : traindataSet) { 187 if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata 188 .firstInstance().value(testdata.attribute(attribute))) 189 { 190 remove.add(traindata); 191 // Console.traceln(Level.WARNING, 192 // "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 193 } 194 } 195 196 // now delete the projects from set 197 for (Instances i : remove) { 198 traindataSet.remove(i); 199 // Console.traceln(Level.INFO, "removing training project from set"); 200 } 201 } 202 203 /** 204 * Normalizes the data before it gets used for clustering 205 * 206 * @param testdata 207 * @param traindataSet 208 * @return 209 */ 210 protected Instances normalizedCharacteristicInstances(Instances testdata, 211 SetUniqueList<Instances> traindataSet) 212 { 213 Instances data = this.getContextFactors(testdata, traindataSet); 214 try { 215 final Normalize normalizer = new Normalize(); 216 normalizer.setInputFormat(data); 217 data = Filter.useFilter(data, normalizer); 218 } 219 catch (Exception e) { 220 throw new RuntimeException( 221 "Unexpected exception during normalization of distributional characteristics.", 222 e); 223 } 224 return data; 225 } 197 226 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseKNNSelection.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 10 24 11 25 /** 12 * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for cross-project defect prediction 26 * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for 27 * cross-project defect prediction 28 * 13 29 * @author Steffen Herbold 14 30 */ 15 31 public class SetWiseKNNSelection extends AbstractCharacteristicSelection { 16 17 /**18 * number of neighbors selected19 */20 private int k = 1;21 22 /**23 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)24 */25 @Override26 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {27 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);28 29 final Set<Integer> selected = new HashSet<Integer>();30 for( int i=0 ; i<k ; i++ ) {31 int closestIndex = getClosest(data);32 33 selected.add(closestIndex);34 data.delete(closestIndex);35 }36 37 for( int i=traindataSet.size()-1; i>=0 ; i-- ) {38 if( selected.contains(i) ) {39 traindataSet.remove(i);40 }41 }42 }43 44 /**45 * Helper method that determines the index of the instance with the smallest distance to the first instance (index 0).46 * @param data data set47 * @return index of the closest instance48 */49 private int getClosest(Instances data) {50 double closestDistance = Double.MAX_VALUE;51 int closestIndex = 1;52 for( int i=1 ; i<data.numInstances() ; i++ ) {53 double distance = MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i).toDoubleArray());54 if( distance < closestDistance) {55 closestDistance = distance;56 closestIndex = i;57 }58 }59 return closestIndex;60 }61 32 62 /** 63 * Sets the number of neighbors followed by the distributional characteristics, the values are separated by blanks. 64 * @see AbstractCharacteristicSelection#setParameter(String) 65 */ 66 @Override 67 public void setParameter(String parameters) { 68 if( !"".equals(parameters) ) { 69 final String[] split = parameters.split(" "); 70 k = Integer.parseInt(split[0]); 71 String str = ""; 72 for( int i=1 ; i<split.length; i++ ) { 73 str += split[i]; 74 if( i<split.length-1 ) { 75 str += " "; 76 } 77 } 78 super.setParameter(str); 79 } 80 } 33 /** 34 * number of neighbors selected 35 */ 36 private int k = 1; 37 38 /** 39 * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, 40 * org.apache.commons.collections4.list.SetUniqueList) 41 */ 42 @Override 43 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 44 final Instances data = normalizedCharacteristicInstances(testdata, traindataSet); 45 46 final Set<Integer> selected = new HashSet<Integer>(); 47 for (int i = 0; i < k; i++) { 48 int closestIndex = getClosest(data); 49 50 selected.add(closestIndex); 51 data.delete(closestIndex); 52 } 53 54 for (int i = traindataSet.size() - 1; i >= 0; i--) { 55 if (selected.contains(i)) { 56 traindataSet.remove(i); 57 } 58 } 59 } 60 61 /** 62 * Helper method that determines the index of the instance with the smallest distance to the 63 * first instance (index 0). 64 * 65 * @param data 66 * data set 67 * @return index of the closest instance 68 */ 69 private int getClosest(Instances data) { 70 double closestDistance = Double.MAX_VALUE; 71 int closestIndex = 1; 72 for (int i = 1; i < data.numInstances(); i++) { 73 double distance = 74 MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i) 75 .toDoubleArray()); 76 if (distance < closestDistance) { 77 closestDistance = distance; 78 closestIndex = i; 79 } 80 } 81 return closestIndex; 82 } 83 84 /** 85 * Sets the number of neighbors followed by the distributional characteristics, the values are 86 * separated by blanks. 87 * 88 * @see AbstractCharacteristicSelection#setParameter(String) 89 */ 90 @Override 91 public void setParameter(String parameters) { 92 if (!"".equals(parameters)) { 93 final String[] split = parameters.split(" "); 94 k = Integer.parseInt(split[0]); 95 String str = ""; 96 for (int i = 1; i < split.length; i++) { 97 str += split[i]; 98 if (i < split.length - 1) { 99 str += " "; 100 } 101 } 102 super.setParameter(str); 103 } 104 } 81 105 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/TestAsTraining.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 7 21 /** 8 22 * Uses the test data as training data. 23 * 9 24 * @author Steffen Herbold 10 * 25 * 11 26 */ 12 27 public class TestAsTraining implements ISetWiseDataselectionStrategy { 13 28 14 15 16 17 18 19 20 29 /** 30 * no parameters 31 */ 32 @Override 33 public void setParameter(String parameters) { 34 // dummy 35 } 21 36 22 /**(non-Javadoc) 23 * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList) 24 */ 25 @Override 26 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 27 traindataSet.clear(); 28 traindataSet.add(new Instances(testdata)); 29 } 37 /** 38 * (non-Javadoc) 39 * 40 * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, 41 * org.apache.commons.collections4.list.SetUniqueList) 42 */ 43 @Override 44 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 45 traindataSet.clear(); 46 traindataSet.add(new Instances(testdata)); 47 } 30 48 31 49 } -
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/TurhanFilter.java
r2 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 13 27 14 28 /** 15 * Filter according to B. Turhan, T. Menzies, A. Bener, and J. Die Stefano: On the relative value of cross-company and within company defect prediction 29 * Filter according to B. Turhan, T. Menzies, A. Bener, and J. Die Stefano: On the relative value of 30 * cross-company and within company defect prediction 31 * 16 32 * @author Steffen Herbold 17 33 */ 18 34 public class TurhanFilter implements IPointWiseDataselectionStrategy { 19 35 20 /** 21 * number of neighbors that are selected 22 */ 23 private int k = 10; 24 25 /** 26 * Sets the number of neighbors. 27 * @param parameters number of neighbors 28 */ 29 @Override 30 public void setParameter(String parameters) { 31 k = Integer.parseInt(parameters); 32 } 36 /** 37 * number of neighbors that are selected 38 */ 39 private int k = 10; 33 40 34 /** 35 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, weka.core.Instances) 36 */ 37 @Override 38 public Instances apply(Instances testdata, Instances traindata) { 39 final Attribute classAttribute = testdata.classAttribute(); 40 41 final List<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 42 43 final double[][] trainDoubles = new double[traindata.numInstances()][testdata.numAttributes()]; 44 45 for( int i=0; i<traindata.numInstances() ; i++ ) { 46 Instance instance = traindata.instance(i); 47 int tmp = 0; 48 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 49 if( testdata.attribute(j)!=classAttribute ) { 50 trainDoubles[i][tmp++] = instance.value(j); 51 } 52 } 53 } 54 55 for( int i=0; i<testdata.numInstances() ; i++ ) { 56 Instance testIntance = testdata.instance(i); 57 double[] targetVector = new double[testdata.numAttributes()-1]; 58 int tmp = 0; 59 for( int j=0 ; j<testdata.numAttributes(); j++ ) { 60 if( testdata.attribute(j)!=classAttribute ) { 61 targetVector[tmp++] = testIntance.value(j); 62 } 63 } 64 65 double farthestClosestDistance = Double.MAX_VALUE; 66 int farthestClosestIndex = 0; 67 double[] closestDistances = new double[k]; 68 for( int m=0 ; m<closestDistances.length ; m++ ) { 69 closestDistances[m] = Double.MAX_VALUE; 70 } 71 int[] closestIndex = new int[k]; 72 73 for( int n=0; n<traindata.numInstances() ; n++ ) { 74 double distance = MathArrays.distance(targetVector, trainDoubles[n]); 75 76 if( distance<farthestClosestDistance ) { 77 closestIndex[farthestClosestIndex] = n; 78 closestDistances[farthestClosestIndex] = distance; 79 80 farthestClosestIndex = ArrayTools.findMax(closestDistances); 81 farthestClosestDistance = closestDistances[farthestClosestIndex]; 82 } 83 } 84 for( int index : closestIndex ) { 85 selectedIndex.add(index); 86 } 87 } 88 89 final Instances selected = new Instances(testdata); 90 selected.delete(); 91 for( Integer i : selectedIndex) { 92 selected.add(traindata.instance(i)); 93 } 94 return selected; 95 } 41 /** 42 * Sets the number of neighbors. 43 * 44 * @param parameters 45 * number of neighbors 46 */ 47 @Override 48 public void setParameter(String parameters) { 49 k = Integer.parseInt(parameters); 50 } 51 52 /** 53 * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, 54 * weka.core.Instances) 55 */ 56 @Override 57 public Instances apply(Instances testdata, Instances traindata) { 58 final Attribute classAttribute = testdata.classAttribute(); 59 60 final List<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>()); 61 62 final double[][] trainDoubles = 63 new double[traindata.numInstances()][testdata.numAttributes()]; 64 65 for (int i = 0; i < traindata.numInstances(); i++) { 66 Instance instance = traindata.instance(i); 67 int tmp = 0; 68 for (int j = 0; j < testdata.numAttributes(); j++) { 69 if (testdata.attribute(j) != classAttribute) { 70 trainDoubles[i][tmp++] = instance.value(j); 71 } 72 } 73 } 74 75 for (int i = 0; i < testdata.numInstances(); i++) { 76 Instance testIntance = testdata.instance(i); 77 double[] targetVector = new double[testdata.numAttributes() - 1]; 78 int tmp = 0; 79 for (int j = 0; j < testdata.numAttributes(); j++) { 80 if (testdata.attribute(j) != classAttribute) { 81 targetVector[tmp++] = testIntance.value(j); 82 } 83 } 84 85 double farthestClosestDistance = Double.MAX_VALUE; 86 int farthestClosestIndex = 0; 87 double[] closestDistances = new double[k]; 88 for (int m = 0; m < closestDistances.length; m++) { 89 closestDistances[m] = Double.MAX_VALUE; 90 } 91 int[] closestIndex = new int[k]; 92 93 for (int n = 0; n < traindata.numInstances(); n++) { 94 double distance = MathArrays.distance(targetVector, trainDoubles[n]); 95 96 if (distance < farthestClosestDistance) { 97 closestIndex[farthestClosestIndex] = n; 98 closestDistances[farthestClosestIndex] = distance; 99 100 farthestClosestIndex = ArrayTools.findMax(closestDistances); 101 farthestClosestDistance = closestDistances[farthestClosestIndex]; 102 } 103 } 104 for (int index : closestIndex) { 105 selectedIndex.add(index); 106 } 107 } 108 109 final Instances selected = new Instances(testdata); 110 selected.delete(); 111 for (Integer i : selectedIndex) { 112 selected.add(traindata.instance(i)); 113 } 114 return selected; 115 } 96 116 97 117 }
Note: See TracChangeset
for help on using the changeset viewer.