| 1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
|---|
| 2 | //
|
|---|
| 3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
|---|
| 4 | // you may not use this file except in compliance with the License.
|
|---|
| 5 | // You may obtain a copy of the License at
|
|---|
| 6 | //
|
|---|
| 7 | // http://www.apache.org/licenses/LICENSE-2.0
|
|---|
| 8 | //
|
|---|
| 9 | // Unless required by applicable law or agreed to in writing, software
|
|---|
| 10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
|---|
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|---|
| 12 | // See the License for the specific language governing permissions and
|
|---|
| 13 | // limitations under the License.
|
|---|
| 14 |
|
|---|
| 15 | package de.ugoe.cs.cpdp.util;
|
|---|
| 16 |
|
|---|
| 17 | // TODO comment
|
|---|
| 18 | import org.apache.commons.math3.ml.distance.EuclideanDistance;
|
|---|
| 19 |
|
|---|
| 20 | import weka.core.Instance;
|
|---|
| 21 | import weka.core.Instances;
|
|---|
| 22 |
|
|---|
| 23 | public class WekaUtils {
|
|---|
| 24 |
|
|---|
| 25 | public static class DistChar {
|
|---|
| 26 | public final double mean;
|
|---|
| 27 | public final double std;
|
|---|
| 28 | public final double min;
|
|---|
| 29 | public final double max;
|
|---|
| 30 | public final int num;
|
|---|
| 31 | private DistChar(double mean, double std, double min, double max, int num) {
|
|---|
| 32 | this.mean = mean;
|
|---|
| 33 | this.std = std;
|
|---|
| 34 | this.min = min;
|
|---|
| 35 | this.max = max;
|
|---|
| 36 | this.num = num;
|
|---|
| 37 | }
|
|---|
| 38 | }
|
|---|
| 39 |
|
|---|
| 40 | /**
|
|---|
| 41 | * Scaling value that moves the decimal point by 5 digets.
|
|---|
| 42 | */
|
|---|
| 43 | public final static double SCALER = 10000.0d;
|
|---|
| 44 |
|
|---|
| 45 | /**
|
|---|
| 46 | * <p>
|
|---|
| 47 | * Adoption of the Hamming difference to numerical values, i.e., basically a count of different
|
|---|
| 48 | * metric values.
|
|---|
| 49 | * </p>
|
|---|
| 50 | *
|
|---|
| 51 | * @param inst1
|
|---|
| 52 | * first instance to be compared
|
|---|
| 53 | * @param inst2
|
|---|
| 54 | * second instance to be compared
|
|---|
| 55 | * @return the distance
|
|---|
| 56 | */
|
|---|
| 57 | public static double hammingDistance(Instance inst1, Instance inst2) {
|
|---|
| 58 | double distance = 0.0;
|
|---|
| 59 | for (int j = 0; j < inst1.numAttributes(); j++) {
|
|---|
| 60 | if (j != inst1.classIndex()) {
|
|---|
| 61 | if (inst1.value(j) != inst2.value(j)) {
|
|---|
| 62 | distance += 1.0;
|
|---|
| 63 | }
|
|---|
| 64 | }
|
|---|
| 65 | }
|
|---|
| 66 | return distance;
|
|---|
| 67 | }
|
|---|
| 68 |
|
|---|
| 69 | public static double[] instanceValues(Instance instance) {
|
|---|
| 70 | double[] values = new double[instance.numAttributes()-1];
|
|---|
| 71 | int k=0;
|
|---|
| 72 | for( int j=0; j<instance.numAttributes() ; j++ ) {
|
|---|
| 73 | if( j!= instance.classIndex() ) {
|
|---|
| 74 | values[k] = instance.value(j);
|
|---|
| 75 | k++;
|
|---|
| 76 | }
|
|---|
| 77 | }
|
|---|
| 78 | return values;
|
|---|
| 79 | }
|
|---|
| 80 |
|
|---|
| 81 | public static DistChar datasetDistance(Instances data) {
|
|---|
| 82 | double distance;
|
|---|
| 83 | double sumAll = 0.0;
|
|---|
| 84 | double sumAllQ = 0.0;
|
|---|
| 85 | double min = Double.MAX_VALUE;
|
|---|
| 86 | double max = Double.MIN_VALUE;
|
|---|
| 87 | int numCmp = 0;
|
|---|
| 88 | int l = 0;
|
|---|
| 89 | double[] inst1 = new double[data.numAttributes()-1];
|
|---|
| 90 | double[] inst2 = new double[data.numAttributes()-1];
|
|---|
| 91 | EuclideanDistance euclideanDistance = new EuclideanDistance();
|
|---|
| 92 | for( int i=0; i<data.numInstances(); i++ ) {
|
|---|
| 93 | l=0;
|
|---|
| 94 | for( int k=0; k<data.numAttributes(); k++ ) {
|
|---|
| 95 | if( k!=data.classIndex() ) {
|
|---|
| 96 | inst1[l] = data.instance(i).value(k);
|
|---|
| 97 | }
|
|---|
| 98 | }
|
|---|
| 99 | for( int j=0; j<data.numInstances(); j++ ) {
|
|---|
| 100 | if( j!=i ) {
|
|---|
| 101 | l=0;
|
|---|
| 102 | for( int k=0; k<data.numAttributes(); k++ ) {
|
|---|
| 103 | if( k!=data.classIndex() ) {
|
|---|
| 104 | inst2[l] = data.instance(j).value(k);
|
|---|
| 105 | }
|
|---|
| 106 | }
|
|---|
| 107 | distance = euclideanDistance.compute(inst1, inst2);
|
|---|
| 108 | sumAll += distance;
|
|---|
| 109 | sumAllQ += distance*distance;
|
|---|
| 110 | numCmp++;
|
|---|
| 111 | if( distance < min ) {
|
|---|
| 112 | min = distance;
|
|---|
| 113 | }
|
|---|
| 114 | if( distance > max ) {
|
|---|
| 115 | max = distance;
|
|---|
| 116 | }
|
|---|
| 117 | }
|
|---|
| 118 | }
|
|---|
| 119 | }
|
|---|
| 120 | double mean = sumAll / numCmp;
|
|---|
| 121 | double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
|
|---|
| 122 | (1.0d / (numCmp - 1)));
|
|---|
| 123 | return new DistChar(mean, std, min, max, data.numInstances());
|
|---|
| 124 | }
|
|---|
| 125 |
|
|---|
| 126 | // like above, but for single attribute
|
|---|
| 127 | public static DistChar attributeDistance(Instances data, int index) {
|
|---|
| 128 | double distance;
|
|---|
| 129 | double sumAll = 0.0;
|
|---|
| 130 | double sumAllQ = 0.0;
|
|---|
| 131 | double min = Double.MAX_VALUE;
|
|---|
| 132 | double max = Double.MIN_VALUE;
|
|---|
| 133 | int numCmp = 0;
|
|---|
| 134 | double value1, value2;
|
|---|
| 135 | for( int i=0; i<data.numInstances(); i++ ) {
|
|---|
| 136 | value1 = data.instance(i).value(index);
|
|---|
| 137 | for( int j=0; j<data.numInstances(); j++ ) {
|
|---|
| 138 | if( j!=i ) {
|
|---|
| 139 | value2 = data.instance(j).value(index);
|
|---|
| 140 | distance = Math.abs(value1-value2);
|
|---|
| 141 | sumAll += distance;
|
|---|
| 142 | sumAllQ += distance*distance;
|
|---|
| 143 | numCmp++;
|
|---|
| 144 | if( distance < min ) {
|
|---|
| 145 | min = distance;
|
|---|
| 146 | }
|
|---|
| 147 | if( distance > max ) {
|
|---|
| 148 | max = distance;
|
|---|
| 149 | }
|
|---|
| 150 | }
|
|---|
| 151 | }
|
|---|
| 152 | }
|
|---|
| 153 | double mean = sumAll / numCmp;
|
|---|
| 154 | double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
|
|---|
| 155 | (1.0d / (numCmp - 1)));
|
|---|
| 156 | return new DistChar(mean, std, min, max, data.numInstances());
|
|---|
| 157 | }
|
|---|
| 158 |
|
|---|
| 159 | /**
|
|---|
| 160 | * <p>
|
|---|
| 161 | * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
|
|---|
| 162 | * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
|
|---|
| 163 | * </p>
|
|---|
| 164 | *
|
|---|
| 165 | * @param traindata
|
|---|
| 166 | * data from which the attribute is upscaled.
|
|---|
| 167 | * @param attributeIndex
|
|---|
| 168 | * index of the attribute
|
|---|
| 169 | * @return data with upscaled attribute
|
|---|
| 170 | */
|
|---|
| 171 | public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
|
|---|
| 172 | Instances traindataCopy = new Instances(traindata);
|
|---|
| 173 | for (int i = 0; i < traindata.size(); i++) {
|
|---|
| 174 | traindataCopy.get(i).setValue(attributeIndex,
|
|---|
| 175 | traindata.get(i).value(attributeIndex) * SCALER);
|
|---|
| 176 | }
|
|---|
| 177 | return traindataCopy;
|
|---|
| 178 | }
|
|---|
| 179 | }
|
|---|