// Copyright 2015 Georg-August-Universität Göttingen, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14 |
|
---|
15 | package de.ugoe.cs.cpdp.util;
|
---|
16 |
|
---|
17 | // TODO comment
|
---|
18 | import org.apache.commons.math3.ml.distance.EuclideanDistance;
|
---|
19 |
|
---|
20 | import weka.core.Instance;
|
---|
21 | import weka.core.Instances;
|
---|
22 |
|
---|
23 | public class WekaUtils {
|
---|
24 |
|
---|
25 | public static class DistChar {
|
---|
26 | public final double mean;
|
---|
27 | public final double std;
|
---|
28 | public final double min;
|
---|
29 | public final double max;
|
---|
30 | public final int num;
|
---|
31 | private DistChar(double mean, double std, double min, double max, int num) {
|
---|
32 | this.mean = mean;
|
---|
33 | this.std = std;
|
---|
34 | this.min = min;
|
---|
35 | this.max = max;
|
---|
36 | this.num = num;
|
---|
37 | }
|
---|
38 | }
|
---|
39 |
|
---|
40 | /**
|
---|
41 | * Scaling value that moves the decimal point by 5 digets.
|
---|
42 | */
|
---|
43 | public final static double SCALER = 10000.0d;
|
---|
44 |
|
---|
45 | /**
|
---|
46 | * <p>
|
---|
47 | * Adoption of the Hamming difference to numerical values, i.e., basically a count of different
|
---|
48 | * metric values.
|
---|
49 | * </p>
|
---|
50 | *
|
---|
51 | * @param inst1
|
---|
52 | * first instance to be compared
|
---|
53 | * @param inst2
|
---|
54 | * second instance to be compared
|
---|
55 | * @return the distance
|
---|
56 | */
|
---|
57 | public static double hammingDistance(Instance inst1, Instance inst2) {
|
---|
58 | double distance = 0.0;
|
---|
59 | for (int j = 0; j < inst1.numAttributes(); j++) {
|
---|
60 | if (j != inst1.classIndex()) {
|
---|
61 | if (inst1.value(j) != inst2.value(j)) {
|
---|
62 | distance += 1.0;
|
---|
63 | }
|
---|
64 | }
|
---|
65 | }
|
---|
66 | return distance;
|
---|
67 | }
|
---|
68 |
|
---|
69 | public static double[] instanceValues(Instance instance) {
|
---|
70 | double[] values = new double[instance.numAttributes()-1];
|
---|
71 | int k=0;
|
---|
72 | for( int j=0; j<instance.numAttributes() ; j++ ) {
|
---|
73 | if( j!= instance.classIndex() ) {
|
---|
74 | values[k] = instance.value(j);
|
---|
75 | k++;
|
---|
76 | }
|
---|
77 | }
|
---|
78 | return values;
|
---|
79 | }
|
---|
80 |
|
---|
81 | public static DistChar datasetDistance(Instances data) {
|
---|
82 | double distance;
|
---|
83 | double sumAll = 0.0;
|
---|
84 | double sumAllQ = 0.0;
|
---|
85 | double min = Double.MAX_VALUE;
|
---|
86 | double max = Double.MIN_VALUE;
|
---|
87 | int numCmp = 0;
|
---|
88 | int l = 0;
|
---|
89 | double[] inst1 = new double[data.numAttributes()-1];
|
---|
90 | double[] inst2 = new double[data.numAttributes()-1];
|
---|
91 | EuclideanDistance euclideanDistance = new EuclideanDistance();
|
---|
92 | for( int i=0; i<data.numInstances(); i++ ) {
|
---|
93 | l=0;
|
---|
94 | for( int k=0; k<data.numAttributes(); k++ ) {
|
---|
95 | if( k!=data.classIndex() ) {
|
---|
96 | inst1[l] = data.instance(i).value(k);
|
---|
97 | }
|
---|
98 | }
|
---|
99 | for( int j=0; j<data.numInstances(); j++ ) {
|
---|
100 | if( j!=i ) {
|
---|
101 | l=0;
|
---|
102 | for( int k=0; k<data.numAttributes(); k++ ) {
|
---|
103 | if( k!=data.classIndex() ) {
|
---|
104 | inst2[l] = data.instance(j).value(k);
|
---|
105 | }
|
---|
106 | }
|
---|
107 | distance = euclideanDistance.compute(inst1, inst2);
|
---|
108 | sumAll += distance;
|
---|
109 | sumAllQ += distance*distance;
|
---|
110 | numCmp++;
|
---|
111 | if( distance < min ) {
|
---|
112 | min = distance;
|
---|
113 | }
|
---|
114 | if( distance > max ) {
|
---|
115 | max = distance;
|
---|
116 | }
|
---|
117 | }
|
---|
118 | }
|
---|
119 | }
|
---|
120 | double mean = sumAll / numCmp;
|
---|
121 | double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
|
---|
122 | (1.0d / (numCmp - 1)));
|
---|
123 | return new DistChar(mean, std, min, max, data.numInstances());
|
---|
124 | }
|
---|
125 |
|
---|
126 | // like above, but for single attribute
|
---|
127 | public static DistChar attributeDistance(Instances data, int index) {
|
---|
128 | double distance;
|
---|
129 | double sumAll = 0.0;
|
---|
130 | double sumAllQ = 0.0;
|
---|
131 | double min = Double.MAX_VALUE;
|
---|
132 | double max = Double.MIN_VALUE;
|
---|
133 | int numCmp = 0;
|
---|
134 | double value1, value2;
|
---|
135 | for( int i=0; i<data.numInstances(); i++ ) {
|
---|
136 | value1 = data.instance(i).value(index);
|
---|
137 | for( int j=0; j<data.numInstances(); j++ ) {
|
---|
138 | if( j!=i ) {
|
---|
139 | value2 = data.instance(j).value(index);
|
---|
140 | distance = Math.abs(value1-value2);
|
---|
141 | sumAll += distance;
|
---|
142 | sumAllQ += distance*distance;
|
---|
143 | numCmp++;
|
---|
144 | if( distance < min ) {
|
---|
145 | min = distance;
|
---|
146 | }
|
---|
147 | if( distance > max ) {
|
---|
148 | max = distance;
|
---|
149 | }
|
---|
150 | }
|
---|
151 | }
|
---|
152 | }
|
---|
153 | double mean = sumAll / numCmp;
|
---|
154 | double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
|
---|
155 | (1.0d / (numCmp - 1)));
|
---|
156 | return new DistChar(mean, std, min, max, data.numInstances());
|
---|
157 | }
|
---|
158 |
|
---|
159 | /**
|
---|
160 | * <p>
|
---|
161 | * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
|
---|
162 | * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
|
---|
163 | * </p>
|
---|
164 | *
|
---|
165 | * @param traindata
|
---|
166 | * data from which the attribute is upscaled.
|
---|
167 | * @param attributeIndex
|
---|
168 | * index of the attribute
|
---|
169 | * @return data with upscaled attribute
|
---|
170 | */
|
---|
171 | public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
|
---|
172 | Instances traindataCopy = new Instances(traindata);
|
---|
173 | for (int i = 0; i < traindata.size(); i++) {
|
---|
174 | traindataCopy.get(i).setValue(attributeIndex,
|
---|
175 | traindata.get(i).value(attributeIndex) * SCALER);
|
---|
176 | }
|
---|
177 | return traindataCopy;
|
---|
178 | }
|
---|
179 | }
|
---|