| 1 | // Copyright 2015 Georg-August-Universität Göttingen, Germany
|
|---|
| 2 | //
|
|---|
| 3 | // Licensed under the Apache License, Version 2.0 (the "License");
|
|---|
| 4 | // you may not use this file except in compliance with the License.
|
|---|
| 5 | // You may obtain a copy of the License at
|
|---|
| 6 | //
|
|---|
| 7 | // http://www.apache.org/licenses/LICENSE-2.0
|
|---|
| 8 | //
|
|---|
| 9 | // Unless required by applicable law or agreed to in writing, software
|
|---|
| 10 | // distributed under the License is distributed on an "AS IS" BASIS,
|
|---|
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|---|
| 12 | // See the License for the specific language governing permissions and
|
|---|
| 13 | // limitations under the License.
|
|---|
| 14 |
|
|---|
| 15 | package de.ugoe.cs.cpdp.util;
|
|---|
| 16 |
|
|---|
| 17 | import org.apache.commons.math3.ml.distance.EuclideanDistance;
|
|---|
| 18 |
|
|---|
| 19 | import weka.core.Instance;
|
|---|
| 20 | import weka.core.Instances;
|
|---|
| 21 |
|
|---|
| 22 | /**
|
|---|
| 23 | * <p>
|
|---|
| 24 | * Collections of helper functions to work with Weka.
|
|---|
| 25 | * </p>
|
|---|
| 26 | *
|
|---|
| 27 | * @author Steffen Herbold
|
|---|
| 28 | */
|
|---|
| 29 | public class WekaUtils {
|
|---|
| 30 |
|
|---|
| 31 | /**
|
|---|
| 32 | * <p>
|
|---|
| 33 | * Data class for distance between instances within a data set based on their distributional
|
|---|
| 34 | * characteristics.
|
|---|
| 35 | * </p>
|
|---|
| 36 | *
|
|---|
| 37 | * @author Steffen Herbold
|
|---|
| 38 | */
|
|---|
| 39 | public static class DistChar {
|
|---|
| 40 |
|
|---|
| 41 | /**
|
|---|
| 42 | * mean distance
|
|---|
| 43 | */
|
|---|
| 44 | public final double mean;
|
|---|
| 45 |
|
|---|
| 46 | /**
|
|---|
| 47 | * standard deviation
|
|---|
| 48 | */
|
|---|
| 49 | public final double std;
|
|---|
| 50 |
|
|---|
| 51 | /**
|
|---|
| 52 | * minimal value
|
|---|
| 53 | */
|
|---|
| 54 | public final double min;
|
|---|
| 55 |
|
|---|
| 56 | /**
|
|---|
| 57 | * maximal value
|
|---|
| 58 | */
|
|---|
| 59 | public final double max;
|
|---|
| 60 |
|
|---|
| 61 | /**
|
|---|
| 62 | * number of instances
|
|---|
| 63 | */
|
|---|
| 64 | public final int num;
|
|---|
| 65 |
|
|---|
| 66 | /**
|
|---|
| 67 | * <p>
|
|---|
| 68 | * Constructor. Creates a new DistChar object.
|
|---|
| 69 | * </p>
|
|---|
| 70 | *
|
|---|
| 71 | * @param mean mean distance between instances
|
|---|
| 72 | * @param std standard deviation of distances between instances
|
|---|
| 73 | * @param min minimal distance between instances
|
|---|
| 74 | * @param max maximal distance between instances
|
|---|
| 75 | * @param num number of instance
|
|---|
| 76 | */
|
|---|
| 77 | private DistChar(double mean, double std, double min, double max, int num) {
|
|---|
| 78 | this.mean = mean;
|
|---|
| 79 | this.std = std;
|
|---|
| 80 | this.min = min;
|
|---|
| 81 | this.max = max;
|
|---|
| 82 | this.num = num;
|
|---|
| 83 | }
|
|---|
| 84 | }
|
|---|
| 85 |
|
|---|
| 86 | /**
|
|---|
| 87 | * Scaling value that moves the decimal point by 5 digets.
|
|---|
| 88 | */
|
|---|
| 89 | public final static double SCALER = 10000.0d;
|
|---|
| 90 |
|
|---|
| 91 | /**
|
|---|
| 92 | * <p>
|
|---|
| 93 | * Adoption of the Hamming difference to numerical values, i.e., basically a count of different
|
|---|
| 94 | * metric values.
|
|---|
| 95 | * </p>
|
|---|
| 96 | *
|
|---|
| 97 | * @param inst1
|
|---|
| 98 | * first instance to be compared
|
|---|
| 99 | * @param inst2
|
|---|
| 100 | * second instance to be compared
|
|---|
| 101 | * @return the distance
|
|---|
| 102 | */
|
|---|
| 103 | public static double hammingDistance(Instance inst1, Instance inst2) {
|
|---|
| 104 | double distance = 0.0;
|
|---|
| 105 | for (int j = 0; j < inst1.numAttributes(); j++) {
|
|---|
| 106 | if (j != inst1.classIndex()) {
|
|---|
| 107 | if (inst1.value(j) != inst2.value(j)) {
|
|---|
| 108 | distance += 1.0;
|
|---|
| 109 | }
|
|---|
| 110 | }
|
|---|
| 111 | }
|
|---|
| 112 | return distance;
|
|---|
| 113 | }
|
|---|
| 114 |
|
|---|
| 115 | /**
|
|---|
| 116 | * <p>
|
|---|
| 117 | * Returns a double array of the values without the classification.
|
|---|
| 118 | * </p>
|
|---|
| 119 | *
|
|---|
| 120 | * @param instance
|
|---|
| 121 | * the instance
|
|---|
| 122 | * @return double array
|
|---|
| 123 | */
|
|---|
| 124 | public static double[] instanceValues(Instance instance) {
|
|---|
| 125 | double[] values = new double[instance.numAttributes() - 1];
|
|---|
| 126 | int k = 0;
|
|---|
| 127 | for (int j = 0; j < instance.numAttributes(); j++) {
|
|---|
| 128 | if (j != instance.classIndex()) {
|
|---|
| 129 | values[k] = instance.value(j);
|
|---|
| 130 | k++;
|
|---|
| 131 | }
|
|---|
| 132 | }
|
|---|
| 133 | return values;
|
|---|
| 134 | }
|
|---|
| 135 |
|
|---|
| 136 | /**
|
|---|
| 137 | * <p>
|
|---|
| 138 | * Calculates the distributional characteristics of the distances the instances within a data
|
|---|
| 139 | * set have to each other.
|
|---|
| 140 | * </p>
|
|---|
| 141 | *
|
|---|
| 142 | * @param data
|
|---|
| 143 | * data for which the instances are characterized
|
|---|
| 144 | * @return characteristics
|
|---|
| 145 | */
|
|---|
| 146 | public static DistChar datasetDistance(Instances data) {
|
|---|
| 147 | double distance;
|
|---|
| 148 | double sumAll = 0.0;
|
|---|
| 149 | double sumAllQ = 0.0;
|
|---|
| 150 | double min = Double.MAX_VALUE;
|
|---|
| 151 | double max = Double.MIN_VALUE;
|
|---|
| 152 | int numCmp = 0;
|
|---|
| 153 | int l = 0;
|
|---|
| 154 | double[] inst1 = new double[data.numAttributes() - 1];
|
|---|
| 155 | double[] inst2 = new double[data.numAttributes() - 1];
|
|---|
| 156 | EuclideanDistance euclideanDistance = new EuclideanDistance();
|
|---|
| 157 | for (int i = 0; i < data.numInstances(); i++) {
|
|---|
| 158 | l = 0;
|
|---|
| 159 | for (int k = 0; k < data.numAttributes(); k++) {
|
|---|
| 160 | if (k != data.classIndex()) {
|
|---|
| 161 | inst1[l] = data.instance(i).value(k);
|
|---|
| 162 | }
|
|---|
| 163 | }
|
|---|
| 164 | for (int j = 0; j < data.numInstances(); j++) {
|
|---|
| 165 | if (j != i) {
|
|---|
| 166 | l = 0;
|
|---|
| 167 | for (int k = 0; k < data.numAttributes(); k++) {
|
|---|
| 168 | if (k != data.classIndex()) {
|
|---|
| 169 | inst2[l] = data.instance(j).value(k);
|
|---|
| 170 | }
|
|---|
| 171 | }
|
|---|
| 172 | distance = euclideanDistance.compute(inst1, inst2);
|
|---|
| 173 | sumAll += distance;
|
|---|
| 174 | sumAllQ += distance * distance;
|
|---|
| 175 | numCmp++;
|
|---|
| 176 | if (distance < min) {
|
|---|
| 177 | min = distance;
|
|---|
| 178 | }
|
|---|
| 179 | if (distance > max) {
|
|---|
| 180 | max = distance;
|
|---|
| 181 | }
|
|---|
| 182 | }
|
|---|
| 183 | }
|
|---|
| 184 | }
|
|---|
| 185 | double mean = sumAll / numCmp;
|
|---|
| 186 | double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
|
|---|
| 187 | return new DistChar(mean, std, min, max, data.numInstances());
|
|---|
| 188 | }
|
|---|
| 189 |
|
|---|
| 190 | /**
|
|---|
| 191 | * <p>
|
|---|
| 192 | * Calculates the distributional characteristics of the distances of a single attribute the
|
|---|
| 193 | * instances within a data set have to each other.
|
|---|
| 194 | * </p>
|
|---|
| 195 | *
|
|---|
| 196 | * @param data
|
|---|
| 197 | * data for which the instances are characterized
|
|---|
| 198 | * @param index
|
|---|
| 199 | * attribute for which the distances are characterized
|
|---|
| 200 | * @return characteristics
|
|---|
| 201 | */
|
|---|
| 202 | public static DistChar attributeDistance(Instances data, int index) {
|
|---|
| 203 | double distance;
|
|---|
| 204 | double sumAll = 0.0;
|
|---|
| 205 | double sumAllQ = 0.0;
|
|---|
| 206 | double min = Double.MAX_VALUE;
|
|---|
| 207 | double max = Double.MIN_VALUE;
|
|---|
| 208 | int numCmp = 0;
|
|---|
| 209 | double value1, value2;
|
|---|
| 210 | for (int i = 0; i < data.numInstances(); i++) {
|
|---|
| 211 | value1 = data.instance(i).value(index);
|
|---|
| 212 | for (int j = 0; j < data.numInstances(); j++) {
|
|---|
| 213 | if (j != i) {
|
|---|
| 214 | value2 = data.instance(j).value(index);
|
|---|
| 215 | distance = Math.abs(value1 - value2);
|
|---|
| 216 | sumAll += distance;
|
|---|
| 217 | sumAllQ += distance * distance;
|
|---|
| 218 | numCmp++;
|
|---|
| 219 | if (distance < min) {
|
|---|
| 220 | min = distance;
|
|---|
| 221 | }
|
|---|
| 222 | if (distance > max) {
|
|---|
| 223 | max = distance;
|
|---|
| 224 | }
|
|---|
| 225 | }
|
|---|
| 226 | }
|
|---|
| 227 | }
|
|---|
| 228 | double mean = sumAll / numCmp;
|
|---|
| 229 | double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
|
|---|
| 230 | return new DistChar(mean, std, min, max, data.numInstances());
|
|---|
| 231 | }
|
|---|
| 232 |
|
|---|
| 233 | /**
|
|---|
| 234 | * <p>
|
|---|
| 235 | * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
|
|---|
| 236 | * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
|
|---|
| 237 | * </p>
|
|---|
| 238 | *
|
|---|
| 239 | * @param traindata
|
|---|
| 240 | * data from which the attribute is upscaled.
|
|---|
| 241 | * @param attributeIndex
|
|---|
| 242 | * index of the attribute
|
|---|
| 243 | * @return data with upscaled attribute
|
|---|
| 244 | */
|
|---|
| 245 | public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
|
|---|
| 246 | Instances traindataCopy = new Instances(traindata);
|
|---|
| 247 | for (int i = 0; i < traindata.size(); i++) {
|
|---|
| 248 | traindataCopy.get(i).setValue(attributeIndex,
|
|---|
| 249 | traindata.get(i).value(attributeIndex) * SCALER);
|
|---|
| 250 | }
|
|---|
| 251 | return traindataCopy;
|
|---|
| 252 | }
|
|---|
| 253 | }
|
|---|