- Timestamp:
- 09/24/15 10:59:05 (9 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java
r38 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.dataselection; 2 16 … … 22 36 * Selects training data by clustering project context factors. 23 37 * 24 * The project context factors used for the clustering are configured in 25 * the XML param attribute, Example: 26 * <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" /> 38 * The project context factors used for the clustering are configured in the XML param attribute, 39 * Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" /> 27 40 */ 28 41 public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy { 29 30 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 31 32 @Override 33 public void setParameter(String parameters) { 34 if( parameters!=null ) { 35 project_context_factors = parameters.split(" "); 36 } 37 } 38 39 /** 40 * Uses the Weka EM-Clustering algorithm to cluster the projects 41 * by their project context factors. 42 * The project context factors are first normalized and then used for clustering. 43 * They can be configured in the configuration param. 44 * 45 * @param testdata 46 * @param traindataSet 47 */ 48 protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) { 49 // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf 50 final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); 51 52 final Instance targetInstance = data.instance(0); 53 final List<Instance> candidateInstances = new LinkedList<Instance>(); 54 for( int i=1; i<data.numInstances(); i++ ) { 55 candidateInstances.add(data.instance(i)); 56 } 57 58 // cluster and select 59 try { 60 final EM emeans = new EM(); 61 boolean onlyTarget = true; 62 int targetCluster; 63 int maxNumClusters = candidateInstances.size(); 64 65 do { // while(onlyTarget) 66 emeans.setMaximumNumberOfClusters(maxNumClusters); 67 emeans.buildClusterer(data); 68 69 targetCluster = emeans.clusterInstance(targetInstance); 70 71 // check if cluster only contains target project 72 for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) { 73 onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster); 74 } 75 maxNumClusters = emeans.numberOfClusters()-1; 76 77 //Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); 78 } while(onlyTarget); 79 80 Console.traceln(Level.INFO, "clusters: " + maxNumClusters); 81 Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); 82 int numRemoved = 0; 83 for( int i=0 ; i<candidateInstances.size() ; i++ ) { 84 if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) { 85 traindataSet.remove(i-numRemoved++); 86 } 87 } 88 Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); 89 } catch(Exception e) { 90 throw new RuntimeException("error applying setwise EM clustering training data selection", e); 91 } 92 } 93 94 @Override 95 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 96 // issuetracking und pl muss passen 97 /* 98 int s = traindataSet.size(); 99 Console.traceln(Level.INFO, "remove non matching PL and IssueTracking projects, size now: " + s); 100 this.removeWrongContext(testdata, traindataSet, "PL"); 101 this.removeWrongContext(testdata, traindataSet, "IssueTracking"); 102 s = traindataSet.size(); 103 Console.traceln(Level.INFO, "size after removal: " + s); 104 */ 105 // now cluster 106 this.cluster(testdata, traindataSet); 107 } 108 109 /** 110 * Returns test- and training data with only the project context factors 111 * which were chosen in the configuration. 112 * This is later used for clustering. 113 * 114 * @param testdata 115 * @param traindataSet 116 * @return 117 */ 118 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) { 119 // setup weka Instances for clustering 120 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 121 122 // we only want the project context factors 123 for( String pcf : this.project_context_factors ) { 124 atts.add(new Attribute(pcf)); 125 } 126 127 // set up the data 128 final Instances data = new Instances("project_context_factors", atts, 0); 129 double[] instanceValues = new double[atts.size()]; 130 131 // only project context factors + only one instance per project needed 132 int i = 0; 133 for( String pcf : this.project_context_factors ) { 134 instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); 135 //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]); 136 i++; 137 } 138 data.add(new DenseInstance(1.0, instanceValues)); 139 140 // now for the projects of the training stet 141 for( Instances traindata : traindataSet ) { 142 instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! 143 i = 0; 144 for( String pcf : this.project_context_factors ) { 145 instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); 146 //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]); 147 i++; 148 } 149 150 data.add(new DenseInstance(1.0, instanceValues)); 151 } 152 153 return data; 154 } 155 156 /** 157 * Delete projects where the project context does not match the training project 158 * 159 * @param testdata 160 * @param traindataSet 161 * @param attribute 162 */ 163 protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet, String attribute) { 164 Set<Instances> remove = new HashSet<Instances>(); 165 for( Instances traindata : traindataSet ) { 166 if( traindata.firstInstance().value(traindata.attribute(attribute)) != testdata.firstInstance().value(testdata.attribute(attribute)) ) { 167 remove.add(traindata); 168 //Console.traceln(Level.WARNING, "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 169 } 170 } 171 172 // now delete the projects from set 173 for( Instances i : remove ) { 174 traindataSet.remove(i); 175 //Console.traceln(Level.INFO, "removing training project from set"); 176 } 177 } 178 179 /** 180 * Normalizes the data before it gets used for clustering 181 * 182 * @param testdata 183 * @param traindataSet 184 * @return 185 */ 186 protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) { 187 Instances data = this.getContextFactors(testdata, traindataSet); 188 try { 189 final Normalize normalizer = new Normalize(); 190 normalizer.setInputFormat(data); 191 data = Filter.useFilter(data, normalizer); 192 } catch (Exception e) { 193 throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e); 194 } 195 return data; 196 } 42 43 private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; 44 45 @Override 46 public void setParameter(String parameters) { 47 if (parameters != null) { 48 project_context_factors = parameters.split(" "); 49 } 50 } 51 52 /** 53 * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context 54 * factors. The project context factors are first normalized and then used for clustering. They 55 * can be configured in the configuration param. 56 * 57 * @param testdata 58 * @param traindataSet 59 */ 60 protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) { 61 // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf 62 final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); 63 64 final Instance targetInstance = data.instance(0); 65 final List<Instance> candidateInstances = new LinkedList<Instance>(); 66 for (int i = 1; i < data.numInstances(); i++) { 67 candidateInstances.add(data.instance(i)); 68 } 69 70 // cluster and select 71 try { 72 final EM emeans = new EM(); 73 boolean onlyTarget = true; 74 int targetCluster; 75 int maxNumClusters = candidateInstances.size(); 76 77 do { // while(onlyTarget) 78 emeans.setMaximumNumberOfClusters(maxNumClusters); 79 emeans.buildClusterer(data); 80 81 targetCluster = emeans.clusterInstance(targetInstance); 82 83 // check if cluster only contains target project 84 for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) { 85 onlyTarget &= 86 !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster); 87 } 88 maxNumClusters = emeans.numberOfClusters() - 1; 89 90 // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); 91 } 92 while (onlyTarget); 93 94 Console.traceln(Level.INFO, "clusters: " + maxNumClusters); 95 Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); 96 int numRemoved = 0; 97 for (int i = 0; i < candidateInstances.size(); i++) { 98 if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) { 99 traindataSet.remove(i - numRemoved++); 100 } 101 } 102 Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); 103 } 104 catch (Exception e) { 105 throw new RuntimeException( 106 "error applying setwise EM clustering training data selection", 107 e); 108 } 109 } 110 111 @Override 112 public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) { 113 // issuetracking und pl muss passen 114 /* 115 * int s = traindataSet.size(); Console.traceln(Level.INFO, 116 * "remove non matching PL and IssueTracking projects, size now: " + s); 117 * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata, 118 * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO, 119 * "size after removal: " + s); 120 */ 121 // now cluster 122 this.cluster(testdata, traindataSet); 123 } 124 125 /** 126 * Returns test- and training data with only the project context factors which were chosen in 127 * the configuration. This is later used for clustering. 128 * 129 * @param testdata 130 * @param traindataSet 131 * @return 132 */ 133 protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) 134 { 135 // setup weka Instances for clustering 136 final ArrayList<Attribute> atts = new ArrayList<Attribute>(); 137 138 // we only want the project context factors 139 for (String pcf : this.project_context_factors) { 140 atts.add(new Attribute(pcf)); 141 } 142 143 // set up the data 144 final Instances data = new Instances("project_context_factors", atts, 0); 145 double[] instanceValues = new double[atts.size()]; 146 147 // only project context factors + only one instance per project needed 148 int i = 0; 149 for (String pcf : this.project_context_factors) { 150 instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); 151 // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + 152 // instanceValues[i]); 153 i++; 154 } 155 data.add(new DenseInstance(1.0, instanceValues)); 156 157 // now for the projects of the training stet 158 for (Instances traindata : traindataSet) { 159 instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! 160 i = 0; 161 for (String pcf : this.project_context_factors) { 162 instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); 163 // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + 164 // instanceValues[i]); 165 i++; 166 } 167 168 data.add(new DenseInstance(1.0, instanceValues)); 169 } 170 171 return data; 172 } 173 174 /** 175 * Delete projects where the project context does not match the training project 176 * 177 * @param testdata 178 * @param traindataSet 179 * @param attribute 180 */ 181 protected void removeWrongContext(Instances testdata, 182 SetUniqueList<Instances> traindataSet, 183 String attribute) 184 { 185 Set<Instances> remove = new HashSet<Instances>(); 186 for (Instances traindata : traindataSet) { 187 if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata 188 .firstInstance().value(testdata.attribute(attribute))) 189 { 190 remove.add(traindata); 191 // Console.traceln(Level.WARNING, 192 // "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute))); 193 } 194 } 195 196 // now delete the projects from set 197 for (Instances i : remove) { 198 traindataSet.remove(i); 199 // Console.traceln(Level.INFO, "removing training project from set"); 200 } 201 } 202 203 /** 204 * Normalizes the data before it gets used for clustering 205 * 206 * @param testdata 207 * @param traindataSet 208 * @return 209 */ 210 protected Instances normalizedCharacteristicInstances(Instances testdata, 211 SetUniqueList<Instances> traindataSet) 212 { 213 Instances data = this.getContextFactors(testdata, traindataSet); 214 try { 215 final Normalize normalizer = new Normalize(); 216 normalizer.setInputFormat(data); 217 data = Filter.useFilter(data, normalizer); 218 } 219 catch (Exception e) { 220 throw new RuntimeException( 221 "Unexpected exception during normalization of distributional characteristics.", 222 e); 223 } 224 return data; 225 } 197 226 }
Note: See TracChangeset
for help on using the changeset viewer.