[86] | 1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
---|
[41] | 2 | //
|
---|
| 3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
---|
| 4 | // you may not use this file except in compliance with the License.
|
---|
| 5 | // You may obtain a copy of the License at
|
---|
| 6 | //
|
---|
| 7 | // http://www.apache.org/licenses/LICENSE-2.0
|
---|
| 8 | //
|
---|
| 9 | // Unless required by applicable law or agreed to in writing, software
|
---|
| 10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
---|
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
| 12 | // See the License for the specific language governing permissions and
|
---|
| 13 | // limitations under the License.
|
---|
| 14 |
|
---|
[38] | 15 | package de.ugoe.cs.cpdp.loader;
|
---|
| 16 |
|
---|
| 17 | import java.io.File;
|
---|
| 18 | import java.io.IOException;
|
---|
| 19 | import java.util.ArrayList;
|
---|
| 20 | import java.util.Map.Entry;
|
---|
| 21 | import java.util.SortedMap;
|
---|
| 22 | import java.util.TreeMap;
|
---|
| 23 |
|
---|
| 24 | import weka.core.Attribute;
|
---|
| 25 | import weka.core.DenseInstance;
|
---|
| 26 | import weka.core.Instances;
|
---|
| 27 | import de.ugoe.cs.util.FileTools;
|
---|
| 28 |
|
---|
| 29 | /**
|
---|
[135] | 30 | * <p>
|
---|
| 31 | * Loads data from the automotive defect data set from Audi Electronic Ventures donated by Altinger
|
---|
| 32 | * et al. at the MSR 2015. This loader contains the changes per commit, i.e., it is for JIT defect
|
---|
| 33 | * prediction.
|
---|
| 34 | * </p>
|
---|
[41] | 35 | *
|
---|
[135] | 36 | * @author Steffen Herbold
|
---|
[38] | 37 | */
|
---|
| 38 | class AUDIChangeLoader implements SingleVersionLoader {
|
---|
| 39 |
|
---|
[135] | 40 | /**
|
---|
| 41 | * <p>
|
---|
| 42 | * Internal helper class.
|
---|
| 43 | * </p>
|
---|
| 44 | *
|
---|
| 45 | * @author Steffen Herbold
|
---|
| 46 | */
|
---|
[41] | 47 | private class EntityRevisionPair implements Comparable<EntityRevisionPair> {
|
---|
[135] | 48 |
|
---|
| 49 | /**
|
---|
| 50 | * string that defines an entity
|
---|
| 51 | */
|
---|
[41] | 52 | private final String entity;
|
---|
[135] | 53 |
|
---|
| 54 | /**
|
---|
| 55 | * revision number of the entity
|
---|
| 56 | */
|
---|
[41] | 57 | private final int revision;
|
---|
[38] | 58 |
|
---|
[135] | 59 | /**
|
---|
| 60 | * <p>
|
---|
| 61 | * Constructor. Creates a new EntityRevisionPair.
|
---|
| 62 | * </p>
|
---|
| 63 | *
|
---|
| 64 | * @param entity
|
---|
| 65 | * the entity
|
---|
| 66 | * @param revision
|
---|
| 67 | * the revision
|
---|
| 68 | */
|
---|
[41] | 69 | public EntityRevisionPair(String entity, int revision) {
|
---|
| 70 | this.entity = entity;
|
---|
| 71 | this.revision = revision;
|
---|
| 72 | }
|
---|
[38] | 73 |
|
---|
[135] | 74 | /*
|
---|
| 75 | * (non-Javadoc)
|
---|
| 76 | *
|
---|
| 77 | * @see java.lang.Object#equals(java.lang.Object)
|
---|
| 78 | */
|
---|
[41] | 79 | @Override
|
---|
| 80 | public boolean equals(Object other) {
|
---|
| 81 | if (!(other instanceof EntityRevisionPair)) {
|
---|
| 82 | return false;
|
---|
| 83 | }
|
---|
| 84 | else {
|
---|
| 85 | return compareTo((EntityRevisionPair) other) == 0;
|
---|
| 86 | }
|
---|
| 87 | }
|
---|
[38] | 88 |
|
---|
[135] | 89 | /*
|
---|
| 90 | * (non-Javadoc)
|
---|
| 91 | *
|
---|
| 92 | * @see java.lang.Object#hashCode()
|
---|
| 93 | */
|
---|
[41] | 94 | @Override
|
---|
| 95 | public int hashCode() {
|
---|
| 96 | return entity.hashCode() + revision;
|
---|
| 97 | }
|
---|
[38] | 98 |
|
---|
[135] | 99 | /*
|
---|
| 100 | * (non-Javadoc)
|
---|
| 101 | *
|
---|
| 102 | * @see java.lang.Comparable#compareTo(java.lang.Object)
|
---|
| 103 | */
|
---|
[41] | 104 | @Override
|
---|
| 105 | public int compareTo(EntityRevisionPair other) {
|
---|
| 106 | int strCmp = this.entity.compareTo(other.entity);
|
---|
| 107 | if (strCmp != 0) {
|
---|
| 108 | return strCmp;
|
---|
| 109 | }
|
---|
| 110 | return Integer.compare(revision, other.revision);
|
---|
| 111 | }
|
---|
[38] | 112 |
|
---|
[135] | 113 | /*
|
---|
| 114 | * (non-Javadoc)
|
---|
| 115 | *
|
---|
| 116 | * @see java.lang.Object#toString()
|
---|
| 117 | */
|
---|
[41] | 118 | @Override
|
---|
| 119 | public String toString() {
|
---|
| 120 | return entity + "@" + revision;
|
---|
| 121 | }
|
---|
| 122 | }
|
---|
[38] | 123 |
|
---|
[135] | 124 | /*
|
---|
| 125 | * (non-Javadoc)
|
---|
| 126 | *
|
---|
| 127 | * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File)
|
---|
| 128 | */
|
---|
[41] | 129 | @Override
|
---|
| 130 | public Instances load(File file) {
|
---|
| 131 | final String[] lines;
|
---|
| 132 | String[] lineSplit;
|
---|
| 133 | String[] lineSplitBug;
|
---|
[38] | 134 |
|
---|
[41] | 135 | try {
|
---|
| 136 | lines = FileTools.getLinesFromFile(file.getAbsolutePath());
|
---|
| 137 | }
|
---|
| 138 | catch (IOException e) {
|
---|
| 139 | throw new RuntimeException(e);
|
---|
| 140 | }
|
---|
| 141 |
|
---|
| 142 | // information about bugs are in another file
|
---|
| 143 | String path = file.getAbsolutePath();
|
---|
| 144 | path = path.substring(0, path.length() - 14) + "repro.csv";
|
---|
| 145 | final String[] linesBug;
|
---|
| 146 | try {
|
---|
| 147 | linesBug = FileTools.getLinesFromFile(path);
|
---|
| 148 | }
|
---|
| 149 | catch (IOException e) {
|
---|
| 150 | throw new RuntimeException(e);
|
---|
| 151 | }
|
---|
| 152 |
|
---|
| 153 | int revisionIndex = -1;
|
---|
| 154 | int bugIndex = -1;
|
---|
| 155 | lineSplitBug = linesBug[0].split(";");
|
---|
| 156 | for (int j = 0; j < lineSplitBug.length; j++) {
|
---|
| 157 | if (lineSplitBug[j].equals("svnrev")) {
|
---|
| 158 | revisionIndex = j;
|
---|
| 159 | }
|
---|
| 160 | if (lineSplitBug[j].equals("num_bugs_trace")) {
|
---|
| 161 | bugIndex = j;
|
---|
| 162 | }
|
---|
| 163 | }
|
---|
| 164 | if (revisionIndex < 0) {
|
---|
| 165 | throw new RuntimeException("could not find SVN revisions");
|
---|
| 166 | }
|
---|
| 167 | if (bugIndex < 0) {
|
---|
| 168 | throw new RuntimeException("could not find bug information");
|
---|
| 169 | }
|
---|
| 170 |
|
---|
| 171 | int metricsStartIndex = -1;
|
---|
| 172 | int metricsEndIndex = -1;
|
---|
| 173 | lineSplit = lines[0].split(";");
|
---|
| 174 | for (int j = 0; j < lineSplit.length; j++) {
|
---|
| 175 | if (lineSplit[j].equals("lm_LOC")) {
|
---|
| 176 | metricsStartIndex = j;
|
---|
| 177 | }
|
---|
| 178 | if (lineSplit[j].equals("h_E")) {
|
---|
| 179 | metricsEndIndex = j;
|
---|
| 180 | }
|
---|
| 181 | }
|
---|
| 182 | if (metricsStartIndex < 0) {
|
---|
| 183 | throw new RuntimeException("could not find first metric, i.e., lm_LOC");
|
---|
| 184 | }
|
---|
| 185 | if (metricsEndIndex < 0) {
|
---|
| 186 | throw new RuntimeException("could not find last metric, i.e., h_E");
|
---|
| 187 | }
|
---|
| 188 | int numMetrics = metricsEndIndex - metricsStartIndex + 1;
|
---|
| 189 |
|
---|
| 190 | // create sets of all filenames and revisions
|
---|
| 191 | SortedMap<EntityRevisionPair, Integer> entityRevisionPairs = new TreeMap<>();
|
---|
| 192 | for (int i = 1; i < linesBug.length; i++) {
|
---|
| 193 | lineSplitBug = linesBug[i].split(";");
|
---|
[135] | 194 | entityRevisionPairs.put(
|
---|
| 195 | new EntityRevisionPair(lineSplitBug[0],
|
---|
| 196 | Integer
|
---|
| 197 | .parseInt(lineSplitBug[revisionIndex])),
|
---|
| 198 | i);
|
---|
[41] | 199 | }
|
---|
| 200 |
|
---|
| 201 | // prepare weka instances
|
---|
| 202 | final ArrayList<Attribute> atts = new ArrayList<Attribute>();
|
---|
| 203 | lineSplit = lines[0].split(";");
|
---|
| 204 | for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
|
---|
| 205 | atts.add(new Attribute(lineSplit[j] + "_delta"));
|
---|
| 206 | }
|
---|
| 207 | for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
|
---|
| 208 | atts.add(new Attribute(lineSplit[j] + "_abs"));
|
---|
| 209 | }
|
---|
| 210 | final ArrayList<String> classAttVals = new ArrayList<String>();
|
---|
| 211 | classAttVals.add("0");
|
---|
| 212 | classAttVals.add("1");
|
---|
| 213 | final Attribute classAtt = new Attribute("bug", classAttVals);
|
---|
| 214 | atts.add(classAtt);
|
---|
| 215 |
|
---|
| 216 | final Instances data = new Instances(file.getName(), atts, 0);
|
---|
| 217 | data.setClass(classAtt);
|
---|
| 218 |
|
---|
| 219 | // create data
|
---|
| 220 | String lastFile = null;
|
---|
| 221 | double[] lastValues = null;
|
---|
| 222 | int lastNumBugs = 0;
|
---|
| 223 | for (Entry<EntityRevisionPair, Integer> entry : entityRevisionPairs.entrySet()) {
|
---|
| 224 | try {
|
---|
| 225 | // first get values
|
---|
| 226 | lineSplit = lines[entry.getValue()].split(";");
|
---|
| 227 | lineSplitBug = linesBug[entry.getValue()].split(";");
|
---|
| 228 | int i = 0;
|
---|
| 229 | double[] values = new double[numMetrics];
|
---|
| 230 | for (int j = metricsStartIndex; j <= metricsEndIndex; j++) {
|
---|
| 231 | values[i] = Double.parseDouble(lineSplit[j]);
|
---|
| 232 | i++;
|
---|
| 233 | }
|
---|
| 234 | int numBugs = Integer.parseInt(lineSplitBug[bugIndex]);
|
---|
| 235 |
|
---|
| 236 | // then check if an entity must be created
|
---|
| 237 | if (entry.getKey().entity.equals(lastFile)) {
|
---|
| 238 | // create new instance
|
---|
| 239 | double[] instanceValues = new double[2 * numMetrics + 1];
|
---|
| 240 | for (int j = 0; j < numMetrics; j++) {
|
---|
| 241 | instanceValues[j] = values[j] - lastValues[j];
|
---|
| 242 | instanceValues[j + numMetrics] = values[j];
|
---|
| 243 | }
|
---|
| 244 | // check if any value>0
|
---|
| 245 | boolean changeOccured = false;
|
---|
| 246 | for (int j = 0; j < numMetrics; j++) {
|
---|
| 247 | if (instanceValues[j] > 0) {
|
---|
| 248 | changeOccured = true;
|
---|
| 249 | }
|
---|
| 250 | }
|
---|
| 251 | if (changeOccured) {
|
---|
| 252 | instanceValues[instanceValues.length - 1] = numBugs <= lastNumBugs ? 0 : 1;
|
---|
| 253 | data.add(new DenseInstance(1.0, instanceValues));
|
---|
| 254 | }
|
---|
| 255 | }
|
---|
| 256 | lastFile = entry.getKey().entity;
|
---|
| 257 | lastValues = values;
|
---|
| 258 | lastNumBugs = numBugs;
|
---|
| 259 | }
|
---|
| 260 | catch (IllegalArgumentException e) {
|
---|
| 261 | System.err.println("error in line " + entry.getValue() + ": " + e.getMessage());
|
---|
| 262 | System.err.println("metrics line: " + lines[entry.getValue()]);
|
---|
| 263 | System.err.println("bugs line: " + linesBug[entry.getValue()]);
|
---|
| 264 | System.err.println("line is ignored");
|
---|
| 265 | }
|
---|
| 266 | }
|
---|
| 267 |
|
---|
| 268 | return data;
|
---|
| 269 | }
|
---|
| 270 |
|
---|
| 271 | /*
|
---|
| 272 | * (non-Javadoc)
|
---|
| 273 | *
|
---|
| 274 | * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#load( java.io.File)
|
---|
| 275 | */
|
---|
| 276 |
|
---|
| 277 | public Instances load(File file, String dummy) {
|
---|
| 278 | final String[] lines;
|
---|
| 279 | try {
|
---|
| 280 | lines = FileTools.getLinesFromFile(file.getAbsolutePath());
|
---|
| 281 | }
|
---|
| 282 | catch (IOException e) {
|
---|
| 283 | throw new RuntimeException(e);
|
---|
| 284 | }
|
---|
| 285 |
|
---|
| 286 | // information about bugs are in another file
|
---|
| 287 | String path = file.getAbsolutePath();
|
---|
| 288 | path = path.substring(0, path.length() - 14) + "repro.csv";
|
---|
| 289 | final String[] linesBug;
|
---|
| 290 | try {
|
---|
| 291 | linesBug = FileTools.getLinesFromFile(path);
|
---|
| 292 | }
|
---|
| 293 | catch (IOException e) {
|
---|
| 294 | throw new RuntimeException(e);
|
---|
| 295 | }
|
---|
| 296 |
|
---|
| 297 | // configure Instances
|
---|
| 298 | final ArrayList<Attribute> atts = new ArrayList<Attribute>();
|
---|
| 299 |
|
---|
| 300 | String[] lineSplit = lines[0].split(";");
|
---|
| 301 | // ignore first three/four and last two columns
|
---|
| 302 | int offset;
|
---|
| 303 | if (lineSplit[3].equals("project_rev")) {
|
---|
| 304 | offset = 4;
|
---|
| 305 | }
|
---|
| 306 | else {
|
---|
| 307 | offset = 3;
|
---|
| 308 | }
|
---|
| 309 | for (int j = 0; j < lineSplit.length - (offset + 2); j++) {
|
---|
| 310 | atts.add(new Attribute(lineSplit[j + offset]));
|
---|
| 311 | }
|
---|
| 312 | final ArrayList<String> classAttVals = new ArrayList<String>();
|
---|
| 313 | classAttVals.add("0");
|
---|
| 314 | classAttVals.add("1");
|
---|
| 315 | final Attribute classAtt = new Attribute("bug", classAttVals);
|
---|
| 316 | atts.add(classAtt);
|
---|
| 317 |
|
---|
| 318 | final Instances data = new Instances(file.getName(), atts, 0);
|
---|
| 319 | data.setClass(classAtt);
|
---|
| 320 |
|
---|
| 321 | // fetch data
|
---|
| 322 | for (int i = 1; i < lines.length; i++) {
|
---|
| 323 | boolean validInstance = true;
|
---|
| 324 | lineSplit = lines[i].split(";");
|
---|
| 325 | String[] lineSplitBug = linesBug[i].split(";");
|
---|
| 326 | double[] values = new double[data.numAttributes()];
|
---|
| 327 | for (int j = 0; validInstance && j < values.length - 1; j++) {
|
---|
| 328 | if (lineSplit[j + offset].trim().isEmpty()) {
|
---|
| 329 | validInstance = false;
|
---|
| 330 | }
|
---|
| 331 | else {
|
---|
| 332 | values[j] = Double.parseDouble(lineSplit[j + offset].trim());
|
---|
| 333 | }
|
---|
| 334 | }
|
---|
| 335 | if (offset == 3) {
|
---|
| 336 | values[values.length - 1] = lineSplitBug[7].equals("0") ? 0 : 1;
|
---|
| 337 | }
|
---|
| 338 | else {
|
---|
| 339 | values[values.length - 1] = lineSplitBug[8].equals("0") ? 0 : 1;
|
---|
| 340 | }
|
---|
| 341 |
|
---|
| 342 | if (validInstance) {
|
---|
| 343 | data.add(new DenseInstance(1.0, values));
|
---|
| 344 | }
|
---|
| 345 | else {
|
---|
| 346 | System.out.println("instance " + i + " is invalid");
|
---|
| 347 | }
|
---|
| 348 | }
|
---|
| 349 | return data;
|
---|
| 350 | }
|
---|
| 351 |
|
---|
| 352 | /*
|
---|
| 353 | * (non-Javadoc)
|
---|
| 354 | *
|
---|
| 355 | * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#
|
---|
| 356 | * filenameFilter(java.lang.String)
|
---|
| 357 | */
|
---|
| 358 | @Override
|
---|
| 359 | public boolean filenameFilter(String filename) {
|
---|
| 360 | return filename.endsWith("src.csv");
|
---|
| 361 | }
|
---|
| 362 |
|
---|
[38] | 363 | }
|
---|