- Timestamp:
- 09/24/15 10:59:05 (10 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/NasaARFFLoader.java
r4 r41 1 // Copyright 2015 Georg-August-Universität Göttingen, Germany 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 1 15 package de.ugoe.cs.cpdp.loader; 2 16 … … 16 30 17 31 /** 18 * Loads the instances for a software version from an ARFF file of the 19 * NASA/SOFTLAB/MDP data. 32 * Loads the instances for a software version from an ARFF file of the NASA/SOFTLAB/MDP data. 20 33 * 21 34 * @author Steffen Herbold … … 23 36 public class NasaARFFLoader implements SingleVersionLoader { 24 37 25 /**26 * used to map attributes the same attribute with different names to each 27 * other 28 */ 29 Map<String, String> attributeNameMap; 30 31 /** 32 * used to ensure that the attribute order is the same after loading 33 */ 34 List<String> attributeOrder; 35 36 /** 37 * Constructor. Creates a new NasaARFFLoader. 38 */ 39 public NasaARFFLoader() { 40 attributeNameMap = new HashMap<>(); 41 42 // Map entries for ar project 43 attributeNameMap.put("total_loc", "LOC_TOTAL");44 attributeNameMap.put("comment_loc", "LOC_COMMENTS");45 attributeNameMap.put("code_and_comment_loc", "LOC_CODE_AND_COMMENT");46 attributeNameMap.put("executable_loc", "LOC_EXECUTABLE");47 attributeNameMap.put("unique_operands", "NUM_UNIQUE_OPERANDS");48 attributeNameMap.put("unique_operators", "NUM_UNIQUE_OPERATORS");49 attributeNameMap.put("total_operands", "NUM_OPERANDS");50 attributeNameMap.put("total_operators", "NUM_OPERATORS");51 attributeNameMap.put("halstead_length", "HALSTEAD_LENGTH");52 attributeNameMap.put("halstead_volume", "HALSTEAD_VOLUME");53 attributeNameMap.put("halstead_difficulty", "HALSTEAD_DIFFICULTY");54 attributeNameMap.put("halstead_effort", "HALSTEAD_EFFORT");55 attributeNameMap.put("halstead_error", "HALSTEAD_ERROR_EST");56 attributeNameMap.put("halstead_time", "HALSTEAD_PROG_TIME");57 attributeNameMap.put("branch_count", "BRANCH_COUNT");58 attributeNameMap.put("cyclomatic_complexity", "CYCLOMATIC_COMPLEXITY");59 attributeNameMap.put("design_complexity", "DESIGN_COMPLEXITY"); 60 61 // Map entries for KC2 62 attributeNameMap.put("loc", "LOC_TOTAL");63 attributeNameMap.put("lOCode", "LOC_EXECUTABLE");64 attributeNameMap.put("lOComment", "LOC_COMMENTS");65 attributeNameMap.put("lOCodeAndComment", "LOC_CODE_AND_COMMENT");66 attributeNameMap.put("uniq_Op", "NUM_UNIQUE_OPERATORS");67 attributeNameMap.put("uniq_Opnd", "NUM_UNIQUE_OPERANDS");68 attributeNameMap.put("total_Op", "NUM_OPERATORS");69 attributeNameMap.put("total_Opnd", "NUM_OPERANDS");70 attributeNameMap.put("v", "HALSTEAD_VOLUME");71 attributeNameMap.put("l", "HALSTEAD_LENGTH");72 attributeNameMap.put("d", "HALSTEAD_DIFFICULTY");73 attributeNameMap.put("e", "HALSTEAD_EFFORT");74 attributeNameMap.put("b", "HALSTEAD_ERROR_EST");75 attributeNameMap.put("t", "HALSTEAD_PROG_TIME");76 attributeNameMap.put("branchCount", "BRANCH_COUNT");77 attributeNameMap.put("v(g)", "CYCLOMATIC_COMPLEXITY");78 attributeNameMap.put("iv(g)", "DESIGN_COMPLEXITY"); 79 80 attributeNameMap.put("defects", "bug");81 attributeNameMap.put("Defective", "bug");82 attributeNameMap.put("problems", "bug");83 attributeNameMap.put("label", "bug"); 84 85 // build list with normalized attribute order 86 attributeOrder = new LinkedList<>(); 87 88 attributeOrder.add("LOC_TOTAL");89 attributeOrder.add("LOC_EXECUTABLE");90 attributeOrder.add("LOC_COMMENTS");91 attributeOrder.add("LOC_CODE_AND_COMMENT");92 attributeOrder.add("NUM_UNIQUE_OPERATORS");93 attributeOrder.add("NUM_UNIQUE_OPERANDS");94 attributeOrder.add("NUM_OPERATORS");95 attributeOrder.add("NUM_OPERANDS");96 attributeOrder.add("HALSTEAD_VOLUME");97 attributeOrder.add("HALSTEAD_LENGTH");98 attributeOrder.add("HALSTEAD_DIFFICULTY");99 attributeOrder.add("HALSTEAD_EFFORT");100 attributeOrder.add("HALSTEAD_ERROR_EST");101 attributeOrder.add("HALSTEAD_PROG_TIME");102 attributeOrder.add("BRANCH_COUNT");103 attributeOrder.add("CYCLOMATIC_COMPLEXITY");104 attributeOrder.add("DESIGN_COMPLEXITY");105 attributeOrder.add("bug"); 106 } 107 108 /* 109 * (non-Javadoc) 110 * 111 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 112 */ 113 @Override 114 public Instances load(File file) { 115 BufferedReader reader;116 Instances data; 117 try { 118 reader = new BufferedReader(new FileReader(file));119 data = new Instances(reader);120 reader.close(); 121 }catch (IOException e) {122 throw new RuntimeException("Error reading data", e);123 }124 125 // setting class attribute126 data.setClassIndex(data.numAttributes() - 1);127 128 // normalize attribute names129 for (int i = 0; i < data.numAttributes(); i++) {130 String mapValue = attributeNameMap.get(data.attribute(i).name());131 if (mapValue != null) {132 data.renameAttribute(i, mapValue);133 }134 }135 136 // determine new attribute order (unwanted attributes are implicitly137 // removed138 String orderString = "";139 for (String attName : attributeOrder) {140 for (int i = 0; i < data.numAttributes(); i++) {141 if (attName.equals(data.attribute(i).name())) {142 orderString += (i + 1) + ",";143 }144 }145 }146 orderString = orderString.substring(0, orderString.length() - 1);147 148 String relationName = data.relationName();149 String[] options = new String[2];150 options[0] = "-R";151 options[1] = orderString;152 Reorder reorder = new Reorder();153 try {154 reorder.setOptions(options);155 reorder.setInputFormat(data);156 data = Filter.useFilter(data, reorder);157 } catch (Exception e) { 158 throw new RuntimeException("Error while reordering the data", e); 159 } 160 if (data.numAttributes() != attributeOrder.size()) { 161 throw new RuntimeException( 162 "Invalid number of attributes; filename: " + file.getName());163 }164 165 // normalize bug nominal values166 Add add = new Add();167 add.setAttributeIndex("last");168 add.setNominalLabels("0,1");169 add.setAttributeName("bug-new");170 try {171 add.setInputFormat(data);172 data = Filter.useFilter(data, add);173 } catch (Exception e) { 174 throw new RuntimeException( 175 "Error while normalizing the bug nonminal values", e);176 }177 data.setRelationName(relationName);178 179 double classValue;180 181 String firstValue = data.classAttribute().enumerateValues() 182 .nextElement().toString(); 183 if (firstValue.equals("Y") || firstValue.equals("yes") 184 || firstValue.equals("true")) { 185 classValue = 0.0; 186 } else { 187 classValue = 1.0; 188 } 189 190 for (int i = 0; i < data.numInstances(); i++) {191 if (data.instance(i).classValue() == classValue) { 192 data.instance(i).setValue(data.classIndex() + 1, 1.0); 193 }else {194 data.instance(i).setValue(data.classIndex() + 1, 0.0);195 }196 }197 198 int oldClassIndex = data.classIndex();199 data.setClassIndex(oldClassIndex + 1);200 data.deleteAttributeAt(oldClassIndex);201 202 return data;203 }204 205 /*206 * (non-Javadoc)207 *208 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#209 * filenameFilter(java.lang.String)210 */211 @Override212 public boolean filenameFilter(String filename) {213 return filename.endsWith(".arff");214 }38 /** 39 * used to map attributes the same attribute with different names to each other 40 */ 41 Map<String, String> attributeNameMap; 42 43 /** 44 * used to ensure that the attribute order is the same after loading 45 */ 46 List<String> attributeOrder; 47 48 /** 49 * Constructor. Creates a new NasaARFFLoader. 50 */ 51 public NasaARFFLoader() { 52 attributeNameMap = new HashMap<>(); 53 54 // Map entries for ar project 55 attributeNameMap.put("total_loc", "LOC_TOTAL"); 56 attributeNameMap.put("comment_loc", "LOC_COMMENTS"); 57 attributeNameMap.put("code_and_comment_loc", "LOC_CODE_AND_COMMENT"); 58 attributeNameMap.put("executable_loc", "LOC_EXECUTABLE"); 59 attributeNameMap.put("unique_operands", "NUM_UNIQUE_OPERANDS"); 60 attributeNameMap.put("unique_operators", "NUM_UNIQUE_OPERATORS"); 61 attributeNameMap.put("total_operands", "NUM_OPERANDS"); 62 attributeNameMap.put("total_operators", "NUM_OPERATORS"); 63 attributeNameMap.put("halstead_length", "HALSTEAD_LENGTH"); 64 attributeNameMap.put("halstead_volume", "HALSTEAD_VOLUME"); 65 attributeNameMap.put("halstead_difficulty", "HALSTEAD_DIFFICULTY"); 66 attributeNameMap.put("halstead_effort", "HALSTEAD_EFFORT"); 67 attributeNameMap.put("halstead_error", "HALSTEAD_ERROR_EST"); 68 attributeNameMap.put("halstead_time", "HALSTEAD_PROG_TIME"); 69 attributeNameMap.put("branch_count", "BRANCH_COUNT"); 70 attributeNameMap.put("cyclomatic_complexity", "CYCLOMATIC_COMPLEXITY"); 71 attributeNameMap.put("design_complexity", "DESIGN_COMPLEXITY"); 72 73 // Map entries for KC2 74 attributeNameMap.put("loc", "LOC_TOTAL"); 75 attributeNameMap.put("lOCode", "LOC_EXECUTABLE"); 76 attributeNameMap.put("lOComment", "LOC_COMMENTS"); 77 attributeNameMap.put("lOCodeAndComment", "LOC_CODE_AND_COMMENT"); 78 attributeNameMap.put("uniq_Op", "NUM_UNIQUE_OPERATORS"); 79 attributeNameMap.put("uniq_Opnd", "NUM_UNIQUE_OPERANDS"); 80 attributeNameMap.put("total_Op", "NUM_OPERATORS"); 81 attributeNameMap.put("total_Opnd", "NUM_OPERANDS"); 82 attributeNameMap.put("v", "HALSTEAD_VOLUME"); 83 attributeNameMap.put("l", "HALSTEAD_LENGTH"); 84 attributeNameMap.put("d", "HALSTEAD_DIFFICULTY"); 85 attributeNameMap.put("e", "HALSTEAD_EFFORT"); 86 attributeNameMap.put("b", "HALSTEAD_ERROR_EST"); 87 attributeNameMap.put("t", "HALSTEAD_PROG_TIME"); 88 attributeNameMap.put("branchCount", "BRANCH_COUNT"); 89 attributeNameMap.put("v(g)", "CYCLOMATIC_COMPLEXITY"); 90 attributeNameMap.put("iv(g)", "DESIGN_COMPLEXITY"); 91 92 attributeNameMap.put("defects", "bug"); 93 attributeNameMap.put("Defective", "bug"); 94 attributeNameMap.put("problems", "bug"); 95 attributeNameMap.put("label", "bug"); 96 97 // build list with normalized attribute order 98 attributeOrder = new LinkedList<>(); 99 100 attributeOrder.add("LOC_TOTAL"); 101 attributeOrder.add("LOC_EXECUTABLE"); 102 attributeOrder.add("LOC_COMMENTS"); 103 attributeOrder.add("LOC_CODE_AND_COMMENT"); 104 attributeOrder.add("NUM_UNIQUE_OPERATORS"); 105 attributeOrder.add("NUM_UNIQUE_OPERANDS"); 106 attributeOrder.add("NUM_OPERATORS"); 107 attributeOrder.add("NUM_OPERANDS"); 108 attributeOrder.add("HALSTEAD_VOLUME"); 109 attributeOrder.add("HALSTEAD_LENGTH"); 110 attributeOrder.add("HALSTEAD_DIFFICULTY"); 111 attributeOrder.add("HALSTEAD_EFFORT"); 112 attributeOrder.add("HALSTEAD_ERROR_EST"); 113 attributeOrder.add("HALSTEAD_PROG_TIME"); 114 attributeOrder.add("BRANCH_COUNT"); 115 attributeOrder.add("CYCLOMATIC_COMPLEXITY"); 116 attributeOrder.add("DESIGN_COMPLEXITY"); 117 attributeOrder.add("bug"); 118 } 119 120 /* 121 * (non-Javadoc) 122 * 123 * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File) 124 */ 125 @Override 126 public Instances load(File file) { 127 BufferedReader reader; 128 Instances data; 129 try { 130 reader = new BufferedReader(new FileReader(file)); 131 data = new Instances(reader); 132 reader.close(); 133 } 134 catch (IOException e) { 135 throw new RuntimeException("Error reading data", e); 136 } 137 138 // setting class attribute 139 data.setClassIndex(data.numAttributes() - 1); 140 141 // normalize attribute names 142 for (int i = 0; i < data.numAttributes(); i++) { 143 String mapValue = attributeNameMap.get(data.attribute(i).name()); 144 if (mapValue != null) { 145 data.renameAttribute(i, mapValue); 146 } 147 } 148 149 // determine new attribute order (unwanted attributes are implicitly 150 // removed 151 String orderString = ""; 152 for (String attName : attributeOrder) { 153 for (int i = 0; i < data.numAttributes(); i++) { 154 if (attName.equals(data.attribute(i).name())) { 155 orderString += (i + 1) + ","; 156 } 157 } 158 } 159 orderString = orderString.substring(0, orderString.length() - 1); 160 161 String relationName = data.relationName(); 162 String[] options = new String[2]; 163 options[0] = "-R"; 164 options[1] = orderString; 165 Reorder reorder = new Reorder(); 166 try { 167 reorder.setOptions(options); 168 reorder.setInputFormat(data); 169 data = Filter.useFilter(data, reorder); 170 } 171 catch (Exception e) { 172 throw new RuntimeException("Error while reordering the data", e); 173 } 174 if (data.numAttributes() != attributeOrder.size()) { 175 throw new RuntimeException("Invalid number of attributes; filename: " + file.getName()); 176 } 177 178 // normalize bug nominal values 179 Add add = new Add(); 180 add.setAttributeIndex("last"); 181 add.setNominalLabels("0,1"); 182 add.setAttributeName("bug-new"); 183 try { 184 add.setInputFormat(data); 185 data = Filter.useFilter(data, add); 186 } 187 catch (Exception e) { 188 throw new RuntimeException("Error while normalizing the bug nonminal values", e); 189 } 190 data.setRelationName(relationName); 191 192 double classValue; 193 194 String firstValue = data.classAttribute().enumerateValues().nextElement().toString(); 195 if (firstValue.equals("Y") || firstValue.equals("yes") || firstValue.equals("true")) { 196 classValue = 0.0; 197 } 198 else { 199 classValue = 1.0; 200 } 201 202 for (int i = 0; i < data.numInstances(); i++) { 203 if (data.instance(i).classValue() == classValue) { 204 data.instance(i).setValue(data.classIndex() + 1, 1.0); 205 } 206 else { 207 data.instance(i).setValue(data.classIndex() + 1, 0.0); 208 } 209 } 210 211 int oldClassIndex = data.classIndex(); 212 data.setClassIndex(oldClassIndex + 1); 213 data.deleteAttributeAt(oldClassIndex); 214 215 return data; 216 } 217 218 /* 219 * (non-Javadoc) 220 * 221 * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader# 222 * filenameFilter(java.lang.String) 223 */ 224 @Override 225 public boolean filenameFilter(String filename) { 226 return filename.endsWith(".arff"); 227 } 215 228 216 229 }
Note: See TracChangeset
for help on using the changeset viewer.