1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
---|
2 | //
|
---|
3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
---|
4 | // you may not use this file except in compliance with the License.
|
---|
5 | // You may obtain a copy of the License at
|
---|
6 | //
|
---|
7 | // http://www.apache.org/licenses/LICENSE-2.0
|
---|
8 | //
|
---|
9 | // Unless required by applicable law or agreed to in writing, software
|
---|
10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
---|
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
12 | // See the License for the specific language governing permissions and
|
---|
13 | // limitations under the License.
|
---|
14 |
|
---|
15 | package de.ugoe.cs.cpdp.util;
|
---|
16 |
|
---|
17 | import org.apache.commons.math3.ml.distance.EuclideanDistance;
|
---|
18 |
|
---|
19 | import weka.core.Instance;
|
---|
20 | import weka.core.Instances;
|
---|
21 |
|
---|
22 | /**
|
---|
23 | * <p>
|
---|
24 | * Collections of helper functions to work with Weka.
|
---|
25 | * </p>
|
---|
26 | *
|
---|
27 | * @author Steffen Herbold
|
---|
28 | */
|
---|
29 | public class WekaUtils {
|
---|
30 |
|
---|
31 | /**
|
---|
32 | * <p>
|
---|
33 | * Data class for distance between instances within a data set based on their distributional
|
---|
34 | * characteristics.
|
---|
35 | * </p>
|
---|
36 | *
|
---|
37 | * @author Steffen Herbold
|
---|
38 | */
|
---|
39 | public static class DistChar {
|
---|
40 |
|
---|
41 | /**
|
---|
42 | * mean distance
|
---|
43 | */
|
---|
44 | public final double mean;
|
---|
45 |
|
---|
46 | /**
|
---|
47 | * standard deviation
|
---|
48 | */
|
---|
49 | public final double std;
|
---|
50 |
|
---|
51 | /**
|
---|
52 | * minimal value
|
---|
53 | */
|
---|
54 | public final double min;
|
---|
55 |
|
---|
56 | /**
|
---|
57 | * maximal value
|
---|
58 | */
|
---|
59 | public final double max;
|
---|
60 |
|
---|
61 | /**
|
---|
62 | * number of instances
|
---|
63 | */
|
---|
64 | public final int num;
|
---|
65 |
|
---|
66 | /**
|
---|
67 | * <p>
|
---|
68 | * Constructor. Creates a new DistChar object.
|
---|
69 | * </p>
|
---|
70 | *
|
---|
71 | * @param mean mean distance between instances
|
---|
72 | * @param std standard deviation of distances between instances
|
---|
73 | * @param min minimal distance between instances
|
---|
74 | * @param max maximal distance between instances
|
---|
75 | * @param num number of instance
|
---|
76 | */
|
---|
77 | private DistChar(double mean, double std, double min, double max, int num) {
|
---|
78 | this.mean = mean;
|
---|
79 | this.std = std;
|
---|
80 | this.min = min;
|
---|
81 | this.max = max;
|
---|
82 | this.num = num;
|
---|
83 | }
|
---|
84 | }
|
---|
85 |
|
---|
86 | /**
|
---|
87 | * Scaling value that moves the decimal point by 5 digets.
|
---|
88 | */
|
---|
89 | public final static double SCALER = 10000.0d;
|
---|
90 |
|
---|
91 | /**
|
---|
92 | * <p>
|
---|
93 | * Adoption of the Hamming difference to numerical values, i.e., basically a count of different
|
---|
94 | * metric values.
|
---|
95 | * </p>
|
---|
96 | *
|
---|
97 | * @param inst1
|
---|
98 | * first instance to be compared
|
---|
99 | * @param inst2
|
---|
100 | * second instance to be compared
|
---|
101 | * @return the distance
|
---|
102 | */
|
---|
103 | public static double hammingDistance(Instance inst1, Instance inst2) {
|
---|
104 | double distance = 0.0;
|
---|
105 | for (int j = 0; j < inst1.numAttributes(); j++) {
|
---|
106 | if (j != inst1.classIndex()) {
|
---|
107 | if (inst1.value(j) != inst2.value(j)) {
|
---|
108 | distance += 1.0;
|
---|
109 | }
|
---|
110 | }
|
---|
111 | }
|
---|
112 | return distance;
|
---|
113 | }
|
---|
114 |
|
---|
115 | /**
|
---|
116 | * <p>
|
---|
117 | * Returns a double array of the values without the classification.
|
---|
118 | * </p>
|
---|
119 | *
|
---|
120 | * @param instance
|
---|
121 | * the instance
|
---|
122 | * @return double array
|
---|
123 | */
|
---|
124 | public static double[] instanceValues(Instance instance) {
|
---|
125 | double[] values = new double[instance.numAttributes() - 1];
|
---|
126 | int k = 0;
|
---|
127 | for (int j = 0; j < instance.numAttributes(); j++) {
|
---|
128 | if (j != instance.classIndex()) {
|
---|
129 | values[k] = instance.value(j);
|
---|
130 | k++;
|
---|
131 | }
|
---|
132 | }
|
---|
133 | return values;
|
---|
134 | }
|
---|
135 |
|
---|
136 | /**
|
---|
137 | * <p>
|
---|
138 | * Calculates the distributional characteristics of the distances the instances within a data
|
---|
139 | * set have to each other.
|
---|
140 | * </p>
|
---|
141 | *
|
---|
142 | * @param data
|
---|
143 | * data for which the instances are characterized
|
---|
144 | * @return characteristics
|
---|
145 | */
|
---|
146 | public static DistChar datasetDistance(Instances data) {
|
---|
147 | double distance;
|
---|
148 | double sumAll = 0.0;
|
---|
149 | double sumAllQ = 0.0;
|
---|
150 | double min = Double.MAX_VALUE;
|
---|
151 | double max = Double.MIN_VALUE;
|
---|
152 | int numCmp = 0;
|
---|
153 | int l = 0;
|
---|
154 | double[] inst1 = new double[data.numAttributes() - 1];
|
---|
155 | double[] inst2 = new double[data.numAttributes() - 1];
|
---|
156 | EuclideanDistance euclideanDistance = new EuclideanDistance();
|
---|
157 | for (int i = 0; i < data.numInstances(); i++) {
|
---|
158 | l = 0;
|
---|
159 | for (int k = 0; k < data.numAttributes(); k++) {
|
---|
160 | if (k != data.classIndex()) {
|
---|
161 | inst1[l] = data.instance(i).value(k);
|
---|
162 | }
|
---|
163 | }
|
---|
164 | for (int j = 0; j < data.numInstances(); j++) {
|
---|
165 | if (j != i) {
|
---|
166 | l = 0;
|
---|
167 | for (int k = 0; k < data.numAttributes(); k++) {
|
---|
168 | if (k != data.classIndex()) {
|
---|
169 | inst2[l] = data.instance(j).value(k);
|
---|
170 | }
|
---|
171 | }
|
---|
172 | distance = euclideanDistance.compute(inst1, inst2);
|
---|
173 | sumAll += distance;
|
---|
174 | sumAllQ += distance * distance;
|
---|
175 | numCmp++;
|
---|
176 | if (distance < min) {
|
---|
177 | min = distance;
|
---|
178 | }
|
---|
179 | if (distance > max) {
|
---|
180 | max = distance;
|
---|
181 | }
|
---|
182 | }
|
---|
183 | }
|
---|
184 | }
|
---|
185 | double mean = sumAll / numCmp;
|
---|
186 | double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
|
---|
187 | return new DistChar(mean, std, min, max, data.numInstances());
|
---|
188 | }
|
---|
189 |
|
---|
190 | /**
|
---|
191 | * <p>
|
---|
192 | * Calculates the distributional characteristics of the distances of a single attribute the
|
---|
193 | * instances within a data set have to each other.
|
---|
194 | * </p>
|
---|
195 | *
|
---|
196 | * @param data
|
---|
197 | * data for which the instances are characterized
|
---|
198 | * @param index
|
---|
199 | * attribute for which the distances are characterized
|
---|
200 | * @return characteristics
|
---|
201 | */
|
---|
202 | public static DistChar attributeDistance(Instances data, int index) {
|
---|
203 | double distance;
|
---|
204 | double sumAll = 0.0;
|
---|
205 | double sumAllQ = 0.0;
|
---|
206 | double min = Double.MAX_VALUE;
|
---|
207 | double max = Double.MIN_VALUE;
|
---|
208 | int numCmp = 0;
|
---|
209 | double value1, value2;
|
---|
210 | for (int i = 0; i < data.numInstances(); i++) {
|
---|
211 | value1 = data.instance(i).value(index);
|
---|
212 | for (int j = 0; j < data.numInstances(); j++) {
|
---|
213 | if (j != i) {
|
---|
214 | value2 = data.instance(j).value(index);
|
---|
215 | distance = Math.abs(value1 - value2);
|
---|
216 | sumAll += distance;
|
---|
217 | sumAllQ += distance * distance;
|
---|
218 | numCmp++;
|
---|
219 | if (distance < min) {
|
---|
220 | min = distance;
|
---|
221 | }
|
---|
222 | if (distance > max) {
|
---|
223 | max = distance;
|
---|
224 | }
|
---|
225 | }
|
---|
226 | }
|
---|
227 | }
|
---|
228 | double mean = sumAll / numCmp;
|
---|
229 | double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
|
---|
230 | return new DistChar(mean, std, min, max, data.numInstances());
|
---|
231 | }
|
---|
232 |
|
---|
233 | /**
|
---|
234 | * <p>
|
---|
235 | * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
|
---|
236 | * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
|
---|
237 | * </p>
|
---|
238 | *
|
---|
239 | * @param traindata
|
---|
240 | * data from which the attribute is upscaled.
|
---|
241 | * @param attributeIndex
|
---|
242 | * index of the attribute
|
---|
243 | * @return data with upscaled attribute
|
---|
244 | */
|
---|
245 | public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
|
---|
246 | Instances traindataCopy = new Instances(traindata);
|
---|
247 | for (int i = 0; i < traindata.size(); i++) {
|
---|
248 | traindataCopy.get(i).setValue(attributeIndex,
|
---|
249 | traindata.get(i).value(attributeIndex) * SCALER);
|
---|
250 | }
|
---|
251 | return traindataCopy;
|
---|
252 | }
|
---|
253 | }
|
---|