1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
---|
2 | //
|
---|
3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
---|
4 | // you may not use this file except in compliance with the License.
|
---|
5 | // You may obtain a copy of the License at
|
---|
6 | //
|
---|
7 | // http://www.apache.org/licenses/LICENSE-2.0
|
---|
8 | //
|
---|
9 | // Unless required by applicable law or agreed to in writing, software
|
---|
10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
---|
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
---|
12 | // See the License for the specific language governing permissions and
|
---|
13 | // limitations under the License.
|
---|
14 |
|
---|
15 | package de.ugoe.cs.cpdp.util;
|
---|
16 |
|
---|
17 | // TODO comment
|
---|
18 | import org.apache.commons.math3.ml.distance.EuclideanDistance;
|
---|
19 |
|
---|
20 | import weka.core.Instance;
|
---|
21 | import weka.core.Instances;
|
---|
22 |
|
---|
23 | public class WekaUtils {
|
---|
24 |
|
---|
25 | public static class DistChar {
|
---|
26 | public final double mean;
|
---|
27 | public final double std;
|
---|
28 | public final double min;
|
---|
29 | public final double max;
|
---|
30 | public final int num;
|
---|
31 | private DistChar(double mean, double std, double min, double max, int num) {
|
---|
32 | this.mean = mean;
|
---|
33 | this.std = std;
|
---|
34 | this.min = min;
|
---|
35 | this.max = max;
|
---|
36 | this.num = num;
|
---|
37 | }
|
---|
38 | }
|
---|
39 |
|
---|
40 | /**
|
---|
41 | * <p>
|
---|
42 | * Adoption of the Hamming difference to numerical values, i.e., basically a count of different
|
---|
43 | * metric values.
|
---|
44 | * </p>
|
---|
45 | *
|
---|
46 | * @param inst1
|
---|
47 | * first instance to be compared
|
---|
48 | * @param inst2
|
---|
49 | * second instance to be compared
|
---|
50 | * @return the distance
|
---|
51 | */
|
---|
52 | public static double hammingDistance(Instance inst1, Instance inst2) {
|
---|
53 | double distance = 0.0;
|
---|
54 | for (int j = 0; j < inst1.numAttributes(); j++) {
|
---|
55 | if (j != inst1.classIndex()) {
|
---|
56 | if (inst1.value(j) != inst2.value(j)) {
|
---|
57 | distance += 1.0;
|
---|
58 | }
|
---|
59 | }
|
---|
60 | }
|
---|
61 | return distance;
|
---|
62 | }
|
---|
63 |
|
---|
64 | public static double[] instanceValues(Instance instance) {
|
---|
65 | double[] values = new double[instance.numAttributes()-1];
|
---|
66 | int k=0;
|
---|
67 | for( int j=0; j<instance.numAttributes() ; j++ ) {
|
---|
68 | if( j!= instance.classIndex() ) {
|
---|
69 | values[k] = instance.value(j);
|
---|
70 | k++;
|
---|
71 | }
|
---|
72 | }
|
---|
73 | return values;
|
---|
74 | }
|
---|
75 |
|
---|
76 | public static DistChar datasetDistance(Instances data) {
|
---|
77 | double distance;
|
---|
78 | double sumAll = 0.0;
|
---|
79 | double sumAllQ = 0.0;
|
---|
80 | double min = Double.MAX_VALUE;
|
---|
81 | double max = Double.MIN_VALUE;
|
---|
82 | int numCmp = 0;
|
---|
83 | int l = 0;
|
---|
84 | double[] inst1 = new double[data.numAttributes()-1];
|
---|
85 | double[] inst2 = new double[data.numAttributes()-1];
|
---|
86 | EuclideanDistance euclideanDistance = new EuclideanDistance();
|
---|
87 | for( int i=0; i<data.numInstances(); i++ ) {
|
---|
88 | l=0;
|
---|
89 | for( int k=0; k<data.numAttributes(); k++ ) {
|
---|
90 | if( k!=data.classIndex() ) {
|
---|
91 | inst1[l] = data.instance(i).value(k);
|
---|
92 | }
|
---|
93 | }
|
---|
94 | for( int j=0; j<data.numInstances(); j++ ) {
|
---|
95 | if( j!=i ) {
|
---|
96 | l=0;
|
---|
97 | for( int k=0; k<data.numAttributes(); k++ ) {
|
---|
98 | if( k!=data.classIndex() ) {
|
---|
99 | inst2[l] = data.instance(j).value(k);
|
---|
100 | }
|
---|
101 | }
|
---|
102 | distance = euclideanDistance.compute(inst1, inst2);
|
---|
103 | sumAll += distance;
|
---|
104 | sumAllQ += distance*distance;
|
---|
105 | numCmp++;
|
---|
106 | if( distance < min ) {
|
---|
107 | min = distance;
|
---|
108 | }
|
---|
109 | if( distance > max ) {
|
---|
110 | max = distance;
|
---|
111 | }
|
---|
112 | }
|
---|
113 | }
|
---|
114 | }
|
---|
115 | double mean = sumAll / numCmp;
|
---|
116 | double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
|
---|
117 | (1.0d / (numCmp - 1)));
|
---|
118 | return new DistChar(mean, std, min, max, data.numInstances());
|
---|
119 | }
|
---|
120 |
|
---|
121 | // like above, but for single attribute
|
---|
122 | public static DistChar attributeDistance(Instances data, int index) {
|
---|
123 | double distance;
|
---|
124 | double sumAll = 0.0;
|
---|
125 | double sumAllQ = 0.0;
|
---|
126 | double min = Double.MAX_VALUE;
|
---|
127 | double max = Double.MIN_VALUE;
|
---|
128 | int numCmp = 0;
|
---|
129 | double value1, value2;
|
---|
130 | for( int i=0; i<data.numInstances(); i++ ) {
|
---|
131 | value1 = data.instance(i).value(index);
|
---|
132 | for( int j=0; j<data.numInstances(); j++ ) {
|
---|
133 | if( j!=i ) {
|
---|
134 | value2 = data.instance(j).value(index);
|
---|
135 | distance = Math.abs(value1-value2);
|
---|
136 | sumAll += distance;
|
---|
137 | sumAllQ += distance*distance;
|
---|
138 | numCmp++;
|
---|
139 | if( distance < min ) {
|
---|
140 | min = distance;
|
---|
141 | }
|
---|
142 | if( distance > max ) {
|
---|
143 | max = distance;
|
---|
144 | }
|
---|
145 | }
|
---|
146 | }
|
---|
147 | }
|
---|
148 | double mean = sumAll / numCmp;
|
---|
149 | double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
|
---|
150 | (1.0d / (numCmp - 1)));
|
---|
151 | return new DistChar(mean, std, min, max, data.numInstances());
|
---|
152 | }
|
---|
153 | }
|
---|