[124] | 1 | // Copyright 2016 Georg-August-Universität Göttingen, Germany
|
---|
| 2 | //
|
---|
| 3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
---|
| 4 | // you may not use this file except in compliance with the License.
|
---|
| 5 | // You may obtain a copy of the License at
|
---|
| 6 | //
|
---|
| 7 | // http://www.apache.org/licenses/LICENSE-2.0
|
---|
| 8 | //
|
---|
| 9 | // Unless required by applicable law or agreed to in writing, software
|
---|
| 10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
---|
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
| 12 | // See the License for the specific language governing permissions and
|
---|
| 13 | // limitations under the License.
|
---|
| 14 |
|
---|
| 15 | package de.ugoe.cs.cpdp.loader;
|
---|
| 16 |
|
---|
| 17 | import java.io.File;
|
---|
| 18 | import java.io.IOException;
|
---|
| 19 | import java.util.ArrayList;
|
---|
| 20 | import java.util.HashMap;
|
---|
| 21 | import java.util.Map;
|
---|
| 22 | import java.util.logging.Level;
|
---|
| 23 |
|
---|
| 24 | import de.ugoe.cs.util.console.Console;
|
---|
| 25 | import weka.core.Attribute;
|
---|
| 26 | import weka.core.Instance;
|
---|
| 27 | import weka.core.Instances;
|
---|
| 28 | import weka.core.converters.CSVLoader;
|
---|
| 29 |
|
---|
| 30 | /**
|
---|
| 31 | * <p>
|
---|
| 32 | * Loads the genealogy data published by Herzig et al.
|
---|
| 33 | * </p>
|
---|
| 34 | *
|
---|
| 35 | * @author Steffen Herbold
|
---|
| 36 | */
|
---|
| 37 | public class NetgeneLoader implements SingleVersionLoader {
|
---|
| 38 |
|
---|
| 39 | /*
|
---|
| 40 | * (non-Javadoc)
|
---|
| 41 | *
|
---|
| 42 | * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File)
|
---|
| 43 | */
|
---|
| 44 | @Override
|
---|
| 45 | public Instances load(File fileMetricsFile) {
|
---|
| 46 | // first determine all files
|
---|
| 47 | String path = fileMetricsFile.getParentFile().getAbsolutePath();
|
---|
| 48 | String project = fileMetricsFile.getName().split("_")[0];
|
---|
| 49 | File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
|
---|
| 50 | File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
|
---|
| 51 | Instances metricsData = null;
|
---|
| 52 |
|
---|
| 53 | try {
|
---|
| 54 | CSVLoader wekaCsvLoader = new CSVLoader();
|
---|
| 55 | wekaCsvLoader.setSource(fileMetricsFile);
|
---|
| 56 | metricsData = wekaCsvLoader.getDataSet();
|
---|
| 57 | wekaCsvLoader.setSource(bugsFile);
|
---|
| 58 | Instances bugsData = wekaCsvLoader.getDataSet();
|
---|
| 59 | wekaCsvLoader.setSource(networkMetrics);
|
---|
| 60 | Instances networkData = wekaCsvLoader.getDataSet();
|
---|
| 61 |
|
---|
| 62 | metricsData.setRelationName(project);
|
---|
| 63 |
|
---|
| 64 | // fix nominal attributes (i.e., NA values)
|
---|
| 65 | for (int j = 2; j < networkData.numAttributes(); j++) {
|
---|
| 66 | if (networkData.attribute(j).isNominal()) {
|
---|
| 67 | String attributeName = networkData.attribute(j).name();
|
---|
| 68 | double[] tmpVals = new double[networkData.size()];
|
---|
| 69 | // get temporary values
|
---|
| 70 | for (int i = 0; i < networkData.size(); i++) {
|
---|
| 71 | Instance inst = networkData.instance(i);
|
---|
| 72 | if (!inst.isMissing(j)) {
|
---|
| 73 | String val = networkData.instance(i).stringValue(j);
|
---|
| 74 | try {
|
---|
| 75 | tmpVals[i] = Double.parseDouble(val);
|
---|
| 76 | }
|
---|
| 77 | catch (NumberFormatException e) {
|
---|
| 78 | // not a number, using 0.0;
|
---|
| 79 | tmpVals[i] = 0.0;
|
---|
| 80 | }
|
---|
| 81 | }
|
---|
| 82 | else {
|
---|
| 83 | tmpVals[i] = 0.0;
|
---|
| 84 | }
|
---|
| 85 | }
|
---|
| 86 | // replace attribute
|
---|
| 87 | networkData.deleteAttributeAt(j);
|
---|
| 88 | networkData.insertAttributeAt(new Attribute(attributeName), j);
|
---|
| 89 | for (int i = 0; i < networkData.size(); i++) {
|
---|
| 90 | networkData.instance(i).setValue(j, tmpVals[i]);
|
---|
| 91 | }
|
---|
| 92 | }
|
---|
| 93 | }
|
---|
| 94 | // fix string attributes
|
---|
| 95 | for (int j = 2; j < networkData.numAttributes(); j++) {
|
---|
| 96 | if (networkData.attribute(j).isString()) {
|
---|
| 97 | String attributeName = networkData.attribute(j).name();
|
---|
| 98 | double[] tmpVals = new double[networkData.size()];
|
---|
| 99 | // get temporary values
|
---|
| 100 | for (int i = 0; i < networkData.size(); i++) {
|
---|
| 101 | Instance inst = networkData.instance(i);
|
---|
| 102 | if (!inst.isMissing(j)) {
|
---|
| 103 | String val = networkData.instance(i).stringValue(j);
|
---|
| 104 | try {
|
---|
| 105 | tmpVals[i] = Double.parseDouble(val);
|
---|
| 106 | }
|
---|
| 107 | catch (NumberFormatException e) {
|
---|
| 108 | // not a number, using 0.0;
|
---|
| 109 | tmpVals[i] = 0.0;
|
---|
| 110 | }
|
---|
| 111 | }
|
---|
| 112 | else {
|
---|
| 113 | tmpVals[i] = 0.0;
|
---|
| 114 | }
|
---|
| 115 | }
|
---|
| 116 | // replace attribute
|
---|
| 117 | networkData.deleteAttributeAt(j);
|
---|
| 118 | networkData.insertAttributeAt(new Attribute(attributeName), j);
|
---|
| 119 | for (int i = 0; i < networkData.size(); i++) {
|
---|
| 120 | networkData.instance(i).setValue(j, tmpVals[i]);
|
---|
| 121 | }
|
---|
| 122 | }
|
---|
| 123 | }
|
---|
| 124 |
|
---|
| 125 | Map<String, Integer> filenames = new HashMap<>();
|
---|
| 126 | for (int j = 0; j < metricsData.size(); j++) {
|
---|
| 127 | filenames.put(metricsData.instance(j).stringValue(0), j);
|
---|
| 128 | }
|
---|
| 129 | // merge with network data
|
---|
| 130 | int attributeIndex;
|
---|
| 131 | for (int j = 2; j < networkData.numAttributes(); j++) {
|
---|
| 132 | attributeIndex = metricsData.numAttributes();
|
---|
| 133 | metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
|
---|
| 134 | for (int i = 0; i < networkData.size(); i++) {
|
---|
| 135 | Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
|
---|
| 136 | if (instanceIndex != null) {
|
---|
| 137 | metricsData.instance(instanceIndex)
|
---|
| 138 | .setValue(attributeIndex, networkData.instance(i).value(j));
|
---|
| 139 | }
|
---|
| 140 | }
|
---|
| 141 | }
|
---|
| 142 |
|
---|
| 143 | // add bug information
|
---|
| 144 | attributeIndex = metricsData.numAttributes();
|
---|
| 145 | final ArrayList<String> classAttVals = new ArrayList<String>();
|
---|
| 146 | classAttVals.add("0");
|
---|
| 147 | classAttVals.add("1");
|
---|
| 148 | final Attribute classAtt = new Attribute("bug", classAttVals);
|
---|
| 149 | metricsData.insertAttributeAt(classAtt, attributeIndex);
|
---|
| 150 | for (int i = 0; i < bugsData.size(); i++) {
|
---|
| 151 | if (bugsData.instance(i).value(2) > 0.0d) {
|
---|
| 152 | Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
|
---|
| 153 | if (instanceIndex != null) {
|
---|
| 154 | metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
|
---|
| 155 | }
|
---|
| 156 | }
|
---|
| 157 | }
|
---|
| 158 |
|
---|
| 159 | // remove filenames
|
---|
| 160 | metricsData.deleteAttributeAt(0);
|
---|
| 161 | Attribute eigenvector = metricsData.attribute("eigenvector");
|
---|
| 162 | if (eigenvector != null) {
|
---|
| 163 | for (int j = 0; j < metricsData.numAttributes(); j++) {
|
---|
| 164 | if (metricsData.attribute(j) == eigenvector) {
|
---|
| 165 | metricsData.deleteAttributeAt(j);
|
---|
| 166 | }
|
---|
| 167 | }
|
---|
| 168 | }
|
---|
| 169 |
|
---|
| 170 | metricsData.setClassIndex(metricsData.numAttributes() - 1);
|
---|
| 171 |
|
---|
| 172 | // set all missing values to 0
|
---|
| 173 | for (int i = 0; i < metricsData.size(); i++) {
|
---|
| 174 | for (int j = 0; j < metricsData.numAttributes(); j++) {
|
---|
| 175 | if (metricsData.instance(i).isMissing(j)) {
|
---|
| 176 | metricsData.instance(i).setValue(j, 0.0d);
|
---|
| 177 | }
|
---|
| 178 | }
|
---|
| 179 | }
|
---|
| 180 | }
|
---|
| 181 | catch (IOException e) {
|
---|
| 182 | Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
|
---|
| 183 | metricsData = null;
|
---|
| 184 | }
|
---|
| 185 | return metricsData;
|
---|
| 186 | }
|
---|
| 187 |
|
---|
| 188 | /*
|
---|
| 189 | * (non-Javadoc)
|
---|
| 190 | *
|
---|
| 191 | * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#filenameFilter(java.lang.String)
|
---|
| 192 | */
|
---|
| 193 | @Override
|
---|
| 194 | public boolean filenameFilter(String filename) {
|
---|
| 195 | return filename.endsWith("fileMetrics.csv");
|
---|
| 196 | }
|
---|
| 197 |
|
---|
| 198 | }
|
---|