source: trunk/CrossPare/src/de/ugoe/cs/cpdp/loader/NasaARFFLoader.java @ 86

Last change on this file since 86 was 86, checked in by sherbold, 8 years ago
  • switched workspace encoding to UTF-8 and fixed broken characters
  • Property svn:mime-type set to text/plain
File size: 8.6 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.loader;
16
17import java.io.BufferedReader;
18import java.io.File;
19import java.io.FileReader;
20import java.io.IOException;
21import java.util.HashMap;
22import java.util.LinkedList;
23import java.util.List;
24import java.util.Map;
25
26import weka.core.Instances;
27import weka.filters.Filter;
28import weka.filters.unsupervised.attribute.Add;
29import weka.filters.unsupervised.attribute.Reorder;
30
31/**
32 * Loads the instances for a software version from an ARFF file of the NASA/SOFTLAB/MDP data.
33 *
34 * @author Steffen Herbold
35 */
36public class NasaARFFLoader implements SingleVersionLoader {
37
38    /**
39     * used to map attributes the same attribute with different names to each other
40     */
41    Map<String, String> attributeNameMap;
42
43    /**
44     * used to ensure that the attribute order is the same after loading
45     */
46    List<String> attributeOrder;
47
48    /**
49     * Constructor. Creates a new NasaARFFLoader.
50     */
51    public NasaARFFLoader() {
52        attributeNameMap = new HashMap<>();
53
54        // Map entries for ar project
55        attributeNameMap.put("total_loc", "LOC_TOTAL");
56        attributeNameMap.put("comment_loc", "LOC_COMMENTS");
57        attributeNameMap.put("code_and_comment_loc", "LOC_CODE_AND_COMMENT");
58        attributeNameMap.put("executable_loc", "LOC_EXECUTABLE");
59        attributeNameMap.put("unique_operands", "NUM_UNIQUE_OPERANDS");
60        attributeNameMap.put("unique_operators", "NUM_UNIQUE_OPERATORS");
61        attributeNameMap.put("total_operands", "NUM_OPERANDS");
62        attributeNameMap.put("total_operators", "NUM_OPERATORS");
63        attributeNameMap.put("halstead_length", "HALSTEAD_LENGTH");
64        attributeNameMap.put("halstead_volume", "HALSTEAD_VOLUME");
65        attributeNameMap.put("halstead_difficulty", "HALSTEAD_DIFFICULTY");
66        attributeNameMap.put("halstead_effort", "HALSTEAD_EFFORT");
67        attributeNameMap.put("halstead_error", "HALSTEAD_ERROR_EST");
68        attributeNameMap.put("halstead_time", "HALSTEAD_PROG_TIME");
69        attributeNameMap.put("branch_count", "BRANCH_COUNT");
70        attributeNameMap.put("cyclomatic_complexity", "CYCLOMATIC_COMPLEXITY");
71        attributeNameMap.put("design_complexity", "DESIGN_COMPLEXITY");
72
73        // Map entries for KC2
74        attributeNameMap.put("loc", "LOC_TOTAL");
75        attributeNameMap.put("lOCode", "LOC_EXECUTABLE");
76        attributeNameMap.put("lOComment", "LOC_COMMENTS");
77        attributeNameMap.put("lOCodeAndComment", "LOC_CODE_AND_COMMENT");
78        attributeNameMap.put("uniq_Op", "NUM_UNIQUE_OPERATORS");
79        attributeNameMap.put("uniq_Opnd", "NUM_UNIQUE_OPERANDS");
80        attributeNameMap.put("total_Op", "NUM_OPERATORS");
81        attributeNameMap.put("total_Opnd", "NUM_OPERANDS");
82        attributeNameMap.put("v", "HALSTEAD_VOLUME");
83        attributeNameMap.put("l", "HALSTEAD_LENGTH");
84        attributeNameMap.put("d", "HALSTEAD_DIFFICULTY");
85        attributeNameMap.put("e", "HALSTEAD_EFFORT");
86        attributeNameMap.put("b", "HALSTEAD_ERROR_EST");
87        attributeNameMap.put("t", "HALSTEAD_PROG_TIME");
88        attributeNameMap.put("branchCount", "BRANCH_COUNT");
89        attributeNameMap.put("v(g)", "CYCLOMATIC_COMPLEXITY");
90        attributeNameMap.put("iv(g)", "DESIGN_COMPLEXITY");
91
92        attributeNameMap.put("defects", "bug");
93        attributeNameMap.put("Defective", "bug");
94        attributeNameMap.put("problems", "bug");
95        attributeNameMap.put("label", "bug");
96
97        // build list with normalized attribute order
98        attributeOrder = new LinkedList<>();
99
100        attributeOrder.add("LOC_TOTAL");
101        attributeOrder.add("LOC_EXECUTABLE");
102        attributeOrder.add("LOC_COMMENTS");
103        attributeOrder.add("LOC_CODE_AND_COMMENT");
104        attributeOrder.add("NUM_UNIQUE_OPERATORS");
105        attributeOrder.add("NUM_UNIQUE_OPERANDS");
106        attributeOrder.add("NUM_OPERATORS");
107        attributeOrder.add("NUM_OPERANDS");
108        attributeOrder.add("HALSTEAD_VOLUME");
109        attributeOrder.add("HALSTEAD_LENGTH");
110        attributeOrder.add("HALSTEAD_DIFFICULTY");
111        attributeOrder.add("HALSTEAD_EFFORT");
112        attributeOrder.add("HALSTEAD_ERROR_EST");
113        attributeOrder.add("HALSTEAD_PROG_TIME");
114        attributeOrder.add("BRANCH_COUNT");
115        attributeOrder.add("CYCLOMATIC_COMPLEXITY");
116        attributeOrder.add("DESIGN_COMPLEXITY");
117        attributeOrder.add("bug");
118    }
119
120    /*
121     * (non-Javadoc)
122     *
123     * @see de.ugoe.cs.cpdp.loader.SingleVersionLoader#load(java.io.File)
124     */
125    @Override
126    public Instances load(File file) {
127        BufferedReader reader;
128        Instances data;
129        try {
130            reader = new BufferedReader(new FileReader(file));
131            data = new Instances(reader);
132            reader.close();
133        }
134        catch (IOException e) {
135            throw new RuntimeException("Error reading data", e);
136        }
137
138        // setting class attribute
139        data.setClassIndex(data.numAttributes() - 1);
140
141        // normalize attribute names
142        for (int i = 0; i < data.numAttributes(); i++) {
143            String mapValue = attributeNameMap.get(data.attribute(i).name());
144            if (mapValue != null) {
145                data.renameAttribute(i, mapValue);
146            }
147        }
148
149        // determine new attribute order (unwanted attributes are implicitly
150        // removed
151        String orderString = "";
152        for (String attName : attributeOrder) {
153            for (int i = 0; i < data.numAttributes(); i++) {
154                if (attName.equals(data.attribute(i).name())) {
155                    orderString += (i + 1) + ",";
156                }
157            }
158        }
159        orderString = orderString.substring(0, orderString.length() - 1);
160
161        String relationName = data.relationName();
162        String[] options = new String[2];
163        options[0] = "-R";
164        options[1] = orderString;
165        Reorder reorder = new Reorder();
166        try {
167            reorder.setOptions(options);
168            reorder.setInputFormat(data);
169            data = Filter.useFilter(data, reorder);
170        }
171        catch (Exception e) {
172            throw new RuntimeException("Error while reordering the data", e);
173        }
174        if (data.numAttributes() != attributeOrder.size()) {
175            throw new RuntimeException("Invalid number of attributes; filename: " + file.getName());
176        }
177
178        // normalize bug nominal values
179        Add add = new Add();
180        add.setAttributeIndex("last");
181        add.setNominalLabels("0,1");
182        add.setAttributeName("bug-new");
183        try {
184            add.setInputFormat(data);
185            data = Filter.useFilter(data, add);
186        }
187        catch (Exception e) {
188            throw new RuntimeException("Error while normalizing the bug nonminal values", e);
189        }
190        data.setRelationName(relationName);
191
192        double classValue;
193
194        String firstValue = data.classAttribute().enumerateValues().nextElement().toString();
195        if (firstValue.equals("Y") || firstValue.equals("yes") || firstValue.equals("true")) {
196            classValue = 0.0;
197        }
198        else {
199            classValue = 1.0;
200        }
201
202        for (int i = 0; i < data.numInstances(); i++) {
203            if (data.instance(i).classValue() == classValue) {
204                data.instance(i).setValue(data.classIndex() + 1, 1.0);
205            }
206            else {
207                data.instance(i).setValue(data.classIndex() + 1, 0.0);
208            }
209        }
210
211        int oldClassIndex = data.classIndex();
212        data.setClassIndex(oldClassIndex + 1);
213        data.deleteAttributeAt(oldClassIndex);
214
215        return data;
216    }
217
218    /*
219     * (non-Javadoc)
220     *
221     * @see de.ugoe.cs.cpdp.loader.AbstractFolderLoader.SingleVersionLoader#
222     * filenameFilter(java.lang.String)
223     */
224    @Override
225    public boolean filenameFilter(String filename) {
226        return filename.endsWith(".arff");
227    }
228
229}
Note: See TracBrowser for help on using the repository browser.