1 | // Copyright 2016 Georg-August-Universität Göttingen, Germany
|
---|
2 | //
|
---|
3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
---|
4 | // you may not use this file except in compliance with the License.
|
---|
5 | // You may obtain a copy of the License at
|
---|
6 | //
|
---|
7 | // http://www.apache.org/licenses/LICENSE-2.0
|
---|
8 | //
|
---|
9 | // Unless required by applicable law or agreed to in writing, software
|
---|
10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
---|
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
12 | // See the License for the specific language governing permissions and
|
---|
13 | // limitations under the License.
|
---|
14 |
|
---|
15 | package de.ugoe.cs.cpdp.loader;
|
---|
16 |
|
---|
17 | import java.io.File;
|
---|
18 | import java.io.IOException;
|
---|
19 | import java.util.ArrayList;
|
---|
20 | import java.util.HashMap;
|
---|
21 | import java.util.Map;
|
---|
22 | import java.util.logging.Level;
|
---|
23 |
|
---|
24 | import de.ugoe.cs.util.console.Console;
|
---|
25 | import weka.core.Attribute;
|
---|
26 | import weka.core.Instance;
|
---|
27 | import weka.core.Instances;
|
---|
28 | import weka.core.converters.CSVLoader;
|
---|
29 |
|
---|
30 | /**
|
---|
31 | * <p>
|
---|
32 | * Loads the genealogy data published by Herzig et al.
|
---|
33 | * </p>
|
---|
34 | *
|
---|
35 | * @author Steffen Herbold
|
---|
36 | */
|
---|
37 | public class NetgeneLoader implements SingleVersionLoader {
|
---|
38 |
|
---|
39 | /*
|
---|
40 | * (non-Javadoc)
|
---|
41 | *
|
---|
42 | * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File)
|
---|
43 | */
|
---|
44 | @Override
|
---|
45 | public Instances load(File fileMetricsFile) {
|
---|
46 | // first determine all files
|
---|
47 | String path = fileMetricsFile.getParentFile().getAbsolutePath();
|
---|
48 | String project = fileMetricsFile.getName().split("_")[0];
|
---|
49 | File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
|
---|
50 | File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
|
---|
51 | Instances metricsData = null;
|
---|
52 |
|
---|
53 | try {
|
---|
54 | CSVLoader wekaCsvLoader = new CSVLoader();
|
---|
55 | wekaCsvLoader.setSource(fileMetricsFile);
|
---|
56 | metricsData = wekaCsvLoader.getDataSet();
|
---|
57 | wekaCsvLoader.setSource(bugsFile);
|
---|
58 | Instances bugsData = wekaCsvLoader.getDataSet();
|
---|
59 | wekaCsvLoader.setSource(networkMetrics);
|
---|
60 | Instances networkData = wekaCsvLoader.getDataSet();
|
---|
61 |
|
---|
62 | metricsData.setRelationName(project);
|
---|
63 |
|
---|
64 | // fix nominal attributes (i.e., NA values)
|
---|
65 | for (int j = 2; j < networkData.numAttributes(); j++) {
|
---|
66 | if (networkData.attribute(j).isNominal()) {
|
---|
67 | String attributeName = networkData.attribute(j).name();
|
---|
68 | double[] tmpVals = new double[networkData.size()];
|
---|
69 | // get temporary values
|
---|
70 | for (int i = 0; i < networkData.size(); i++) {
|
---|
71 | Instance inst = networkData.instance(i);
|
---|
72 | if (!inst.isMissing(j)) {
|
---|
73 | String val = networkData.instance(i).stringValue(j);
|
---|
74 | try {
|
---|
75 | tmpVals[i] = Double.parseDouble(val);
|
---|
76 | }
|
---|
77 | catch (NumberFormatException e) {
|
---|
78 | // not a number, using 0.0;
|
---|
79 | tmpVals[i] = 0.0;
|
---|
80 | }
|
---|
81 | }
|
---|
82 | else {
|
---|
83 | tmpVals[i] = 0.0;
|
---|
84 | }
|
---|
85 | }
|
---|
86 | // replace attribute
|
---|
87 | networkData.deleteAttributeAt(j);
|
---|
88 | networkData.insertAttributeAt(new Attribute(attributeName), j);
|
---|
89 | for (int i = 0; i < networkData.size(); i++) {
|
---|
90 | networkData.instance(i).setValue(j, tmpVals[i]);
|
---|
91 | }
|
---|
92 | }
|
---|
93 | }
|
---|
94 | // fix string attributes
|
---|
95 | for (int j = 2; j < networkData.numAttributes(); j++) {
|
---|
96 | if (networkData.attribute(j).isString()) {
|
---|
97 | String attributeName = networkData.attribute(j).name();
|
---|
98 | double[] tmpVals = new double[networkData.size()];
|
---|
99 | // get temporary values
|
---|
100 | for (int i = 0; i < networkData.size(); i++) {
|
---|
101 | Instance inst = networkData.instance(i);
|
---|
102 | if (!inst.isMissing(j)) {
|
---|
103 | String val = networkData.instance(i).stringValue(j);
|
---|
104 | try {
|
---|
105 | tmpVals[i] = Double.parseDouble(val);
|
---|
106 | }
|
---|
107 | catch (NumberFormatException e) {
|
---|
108 | // not a number, using 0.0;
|
---|
109 | tmpVals[i] = 0.0;
|
---|
110 | }
|
---|
111 | }
|
---|
112 | else {
|
---|
113 | tmpVals[i] = 0.0;
|
---|
114 | }
|
---|
115 | }
|
---|
116 | // replace attribute
|
---|
117 | networkData.deleteAttributeAt(j);
|
---|
118 | networkData.insertAttributeAt(new Attribute(attributeName), j);
|
---|
119 | for (int i = 0; i < networkData.size(); i++) {
|
---|
120 | networkData.instance(i).setValue(j, tmpVals[i]);
|
---|
121 | }
|
---|
122 | }
|
---|
123 | }
|
---|
124 |
|
---|
125 | Map<String, Integer> filenames = new HashMap<>();
|
---|
126 | for (int j = 0; j < metricsData.size(); j++) {
|
---|
127 | filenames.put(metricsData.instance(j).stringValue(0), j);
|
---|
128 | }
|
---|
129 | // merge with network data
|
---|
130 | int attributeIndex;
|
---|
131 | for (int j = 2; j < networkData.numAttributes(); j++) {
|
---|
132 | attributeIndex = metricsData.numAttributes();
|
---|
133 | metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
|
---|
134 | for (int i = 0; i < networkData.size(); i++) {
|
---|
135 | Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
|
---|
136 | if (instanceIndex != null) {
|
---|
137 | metricsData.instance(instanceIndex)
|
---|
138 | .setValue(attributeIndex, networkData.instance(i).value(j));
|
---|
139 | }
|
---|
140 | }
|
---|
141 | }
|
---|
142 |
|
---|
143 | // add bug information
|
---|
144 | attributeIndex = metricsData.numAttributes();
|
---|
145 | final ArrayList<String> classAttVals = new ArrayList<String>();
|
---|
146 | classAttVals.add("0");
|
---|
147 | classAttVals.add("1");
|
---|
148 | final Attribute classAtt = new Attribute("bug", classAttVals);
|
---|
149 | metricsData.insertAttributeAt(classAtt, attributeIndex);
|
---|
150 | for (int i = 0; i < bugsData.size(); i++) {
|
---|
151 | if (bugsData.instance(i).value(2) > 0.0d) {
|
---|
152 | Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
|
---|
153 | if (instanceIndex != null) {
|
---|
154 | metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
|
---|
155 | }
|
---|
156 | }
|
---|
157 | }
|
---|
158 |
|
---|
159 | // remove filenames
|
---|
160 | metricsData.deleteAttributeAt(0);
|
---|
161 | Attribute eigenvector = metricsData.attribute("eigenvector");
|
---|
162 | if (eigenvector != null) {
|
---|
163 | for (int j = 0; j < metricsData.numAttributes(); j++) {
|
---|
164 | if (metricsData.attribute(j) == eigenvector) {
|
---|
165 | metricsData.deleteAttributeAt(j);
|
---|
166 | }
|
---|
167 | }
|
---|
168 | }
|
---|
169 |
|
---|
170 | metricsData.setClassIndex(metricsData.numAttributes() - 1);
|
---|
171 |
|
---|
172 | // set all missing values to 0
|
---|
173 | for (int i = 0; i < metricsData.size(); i++) {
|
---|
174 | for (int j = 0; j < metricsData.numAttributes(); j++) {
|
---|
175 | if (metricsData.instance(i).isMissing(j)) {
|
---|
176 | metricsData.instance(i).setValue(j, 0.0d);
|
---|
177 | }
|
---|
178 | }
|
---|
179 | }
|
---|
180 | }
|
---|
181 | catch (IOException e) {
|
---|
182 | Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
|
---|
183 | metricsData = null;
|
---|
184 | }
|
---|
185 | return metricsData;
|
---|
186 | }
|
---|
187 |
|
---|
188 | /*
|
---|
189 | * (non-Javadoc)
|
---|
190 | *
|
---|
191 | * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#filenameFilter(java.lang.String)
|
---|
192 | */
|
---|
193 | @Override
|
---|
194 | public boolean filenameFilter(String filename) {
|
---|
195 | return filename.endsWith("fileMetrics.csv");
|
---|
196 | }
|
---|
197 |
|
---|
198 | }
|
---|