source: trunk/CrossPare/src/de/ugoe/cs/cpdp/util/WekaUtils.java

Last change on this file was 136, checked in by sherbold, 8 years ago
  • more code documentation
  • Property svn:mime-type set to text/plain
File size: 8.2 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.util;
16
17import org.apache.commons.math3.ml.distance.EuclideanDistance;
18
19import weka.core.Instance;
20import weka.core.Instances;
21
22/**
23 * <p>
24 * Collections of helper functions to work with Weka.
25 * </p>
26 *
27 * @author Steffen Herbold
28 */
29public class WekaUtils {
30
31    /**
32     * <p>
33     * Data class for distance between instances within a data set based on their distributional
34     * characteristics.
35     * </p>
36     *
37     * @author Steffen Herbold
38     */
39    public static class DistChar {
40       
41        /**
42         * mean distance
43         */
44        public final double mean;
45       
46        /**
47         * standard deviation
48         */
49        public final double std;
50       
51        /**
52         * minimal value
53         */
54        public final double min;
55       
56        /**
57         * maximal value
58         */
59        public final double max;
60       
61        /**
62         * number of instances
63         */
64        public final int num;
65
66        /**
67         * <p>
68         * Constructor. Creates a new DistChar object.
69         * </p>
70         *
71         * @param mean mean distance between instances
72         * @param std standard deviation of distances between instances
73         * @param min minimal distance between instances
74         * @param max maximal distance between instances
75         * @param num number of instance
76         */
77        private DistChar(double mean, double std, double min, double max, int num) {
78            this.mean = mean;
79            this.std = std;
80            this.min = min;
81            this.max = max;
82            this.num = num;
83        }
84    }
85
86    /**
87     * Scaling value that moves the decimal point by 5 digets.
88     */
89    public final static double SCALER = 10000.0d;
90
91    /**
92     * <p>
93     * Adoption of the Hamming difference to numerical values, i.e., basically a count of different
94     * metric values.
95     * </p>
96     *
97     * @param inst1
98     *            first instance to be compared
99     * @param inst2
100     *            second instance to be compared
101     * @return the distance
102     */
103    public static double hammingDistance(Instance inst1, Instance inst2) {
104        double distance = 0.0;
105        for (int j = 0; j < inst1.numAttributes(); j++) {
106            if (j != inst1.classIndex()) {
107                if (inst1.value(j) != inst2.value(j)) {
108                    distance += 1.0;
109                }
110            }
111        }
112        return distance;
113    }
114
115    /**
116     * <p>
117     * Returns a double array of the values without the classification.
118     * </p>
119     *
120     * @param instance
121     *            the instance
122     * @return double array
123     */
124    public static double[] instanceValues(Instance instance) {
125        double[] values = new double[instance.numAttributes() - 1];
126        int k = 0;
127        for (int j = 0; j < instance.numAttributes(); j++) {
128            if (j != instance.classIndex()) {
129                values[k] = instance.value(j);
130                k++;
131            }
132        }
133        return values;
134    }
135
136    /**
137     * <p>
138     * Calculates the distributional characteristics of the distances the instances within a data
139     * set have to each other.
140     * </p>
141     *
142     * @param data
143     *            data for which the instances are characterized
144     * @return characteristics
145     */
146    public static DistChar datasetDistance(Instances data) {
147        double distance;
148        double sumAll = 0.0;
149        double sumAllQ = 0.0;
150        double min = Double.MAX_VALUE;
151        double max = Double.MIN_VALUE;
152        int numCmp = 0;
153        int l = 0;
154        double[] inst1 = new double[data.numAttributes() - 1];
155        double[] inst2 = new double[data.numAttributes() - 1];
156        EuclideanDistance euclideanDistance = new EuclideanDistance();
157        for (int i = 0; i < data.numInstances(); i++) {
158            l = 0;
159            for (int k = 0; k < data.numAttributes(); k++) {
160                if (k != data.classIndex()) {
161                    inst1[l] = data.instance(i).value(k);
162                }
163            }
164            for (int j = 0; j < data.numInstances(); j++) {
165                if (j != i) {
166                    l = 0;
167                    for (int k = 0; k < data.numAttributes(); k++) {
168                        if (k != data.classIndex()) {
169                            inst2[l] = data.instance(j).value(k);
170                        }
171                    }
172                    distance = euclideanDistance.compute(inst1, inst2);
173                    sumAll += distance;
174                    sumAllQ += distance * distance;
175                    numCmp++;
176                    if (distance < min) {
177                        min = distance;
178                    }
179                    if (distance > max) {
180                        max = distance;
181                    }
182                }
183            }
184        }
185        double mean = sumAll / numCmp;
186        double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
187        return new DistChar(mean, std, min, max, data.numInstances());
188    }
189
190    /**
191     * <p>
192     * Calculates the distributional characteristics of the distances of a single attribute the
193     * instances within a data set have to each other.
194     * </p>
195     *
196     * @param data
197     *            data for which the instances are characterized
198     * @param index
199     *            attribute for which the distances are characterized
200     * @return characteristics
201     */
202    public static DistChar attributeDistance(Instances data, int index) {
203        double distance;
204        double sumAll = 0.0;
205        double sumAllQ = 0.0;
206        double min = Double.MAX_VALUE;
207        double max = Double.MIN_VALUE;
208        int numCmp = 0;
209        double value1, value2;
210        for (int i = 0; i < data.numInstances(); i++) {
211            value1 = data.instance(i).value(index);
212            for (int j = 0; j < data.numInstances(); j++) {
213                if (j != i) {
214                    value2 = data.instance(j).value(index);
215                    distance = Math.abs(value1 - value2);
216                    sumAll += distance;
217                    sumAllQ += distance * distance;
218                    numCmp++;
219                    if (distance < min) {
220                        min = distance;
221                    }
222                    if (distance > max) {
223                        max = distance;
224                    }
225                }
226            }
227        }
228        double mean = sumAll / numCmp;
229        double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
230        return new DistChar(mean, std, min, max, data.numInstances());
231    }
232
233    /**
234     * <p>
235     * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
236     * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
237     * </p>
238     *
239     * @param traindata
240     *            data from which the attribute is upscaled.
241     * @param attributeIndex
242     *            index of the attribute
243     * @return data with upscaled attribute
244     */
245    public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
246        Instances traindataCopy = new Instances(traindata);
247        for (int i = 0; i < traindata.size(); i++) {
248            traindataCopy.get(i).setValue(attributeIndex,
249                                          traindata.get(i).value(attributeIndex) * SCALER);
250        }
251        return traindataCopy;
252    }
253}
Note: See TracBrowser for help on using the repository browser.