source: trunk/CrossPare/src/de/ugoe/cs/cpdp/util/WekaUtils.java @ 130

Last change on this file since 130 was 129, checked in by sherbold, 8 years ago
  • Added the same workaround for the problem with Discretize to the TopMetricFilter. We slightly refactored the implementation within AbstractCODEP by moving the rescaling of sets into WekaUtils to facilitate better re-use.
  • Property svn:mime-type set to text/plain
File size: 6.4 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.util;
16
17// TODO comment
18import org.apache.commons.math3.ml.distance.EuclideanDistance;
19
20import weka.core.Instance;
21import weka.core.Instances;
22
23public class WekaUtils {
24
25    public static class DistChar {
26        public final double mean;
27        public final double std;
28        public final double min;
29        public final double max;
30        public final int num;
31        private DistChar(double mean, double std, double min, double max, int num) {
32            this.mean = mean;
33            this.std = std;
34            this.min = min;
35            this.max = max;
36            this.num = num;
37        }
38    }
39   
40    /**
41     * Scaling value that moves the decimal point by 5 digets.
42     */
43    public final static double SCALER = 10000.0d;
44   
45    /**
46     * <p>
47     * Adoption of the Hamming difference to numerical values, i.e., basically a count of different
48     * metric values.
49     * </p>
50     *
51     * @param inst1
52     *            first instance to be compared
53     * @param inst2
54     *            second instance to be compared
55     * @return the distance
56     */
57    public static double hammingDistance(Instance inst1, Instance inst2) {
58        double distance = 0.0;
59        for (int j = 0; j < inst1.numAttributes(); j++) {
60            if (j != inst1.classIndex()) {
61                if (inst1.value(j) != inst2.value(j)) {
62                    distance += 1.0;
63                }
64            }
65        }
66        return distance;
67    }
68   
69    public static double[] instanceValues(Instance instance) {
70        double[] values = new double[instance.numAttributes()-1];
71        int k=0;
72        for( int j=0; j<instance.numAttributes() ; j++ ) {
73            if( j!= instance.classIndex() ) {
74                values[k] = instance.value(j);
75                k++;
76            }
77        }
78        return values;
79    }
80   
81    public static DistChar datasetDistance(Instances data) {
82        double distance;
83        double sumAll = 0.0;
84        double sumAllQ = 0.0;
85        double min = Double.MAX_VALUE;
86        double max = Double.MIN_VALUE;
87        int numCmp = 0;
88        int l = 0;
89        double[] inst1 = new double[data.numAttributes()-1];
90        double[] inst2 = new double[data.numAttributes()-1];
91        EuclideanDistance euclideanDistance = new EuclideanDistance();
92        for( int i=0; i<data.numInstances(); i++ ) {
93            l=0;
94            for( int k=0; k<data.numAttributes(); k++ ) {
95                if( k!=data.classIndex() ) {
96                    inst1[l] = data.instance(i).value(k);
97                }
98            }
99            for( int j=0; j<data.numInstances(); j++ ) {
100                if( j!=i ) {
101                    l=0;
102                    for( int k=0; k<data.numAttributes(); k++ ) {
103                        if( k!=data.classIndex() ) {
104                            inst2[l] = data.instance(j).value(k);
105                        }
106                    }
107                    distance = euclideanDistance.compute(inst1, inst2);
108                    sumAll += distance;
109                    sumAllQ += distance*distance;
110                    numCmp++;
111                    if( distance < min ) {
112                        min = distance;
113                    }
114                    if( distance > max ) {
115                        max = distance;
116                    }
117                }
118            }
119        }
120        double mean = sumAll / numCmp;
121        double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
122                                  (1.0d / (numCmp - 1)));
123        return new DistChar(mean, std, min, max, data.numInstances());
124    }
125   
126    // like above, but for single attribute
127    public static DistChar attributeDistance(Instances data, int index) {
128        double distance;
129        double sumAll = 0.0;
130        double sumAllQ = 0.0;
131        double min = Double.MAX_VALUE;
132        double max = Double.MIN_VALUE;
133        int numCmp = 0;
134        double value1, value2;
135        for( int i=0; i<data.numInstances(); i++ ) {
136            value1 = data.instance(i).value(index);
137            for( int j=0; j<data.numInstances(); j++ ) {
138                if( j!=i ) {
139                    value2 = data.instance(j).value(index);
140                    distance = Math.abs(value1-value2);
141                    sumAll += distance;
142                    sumAllQ += distance*distance;
143                    numCmp++;
144                    if( distance < min ) {
145                        min = distance;
146                    }
147                    if( distance > max ) {
148                        max = distance;
149                    }
150                }
151            }
152        }
153        double mean = sumAll / numCmp;
154        double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
155                                  (1.0d / (numCmp - 1)));
156        return new DistChar(mean, std, min, max, data.numInstances());
157    }
158   
159    /**
160     * <p>
161     * Upscales the value of a single attribute. This is a workaround to get BayesNet running for
162     * all data. Works on a copy of the training data, i.e., leaves the original data untouched.
163     * </p>
164     *
165     * @param traindata
166     *            data from which the attribute is upscaled.
167     * @param attributeIndex
168     *            index of the attribute
169     * @return data with upscaled attribute
170     */
171    public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
172        Instances traindataCopy = new Instances(traindata);
173        for (int i = 0; i < traindata.size(); i++) {
174            traindataCopy.get(i).setValue(attributeIndex,
175                                          traindata.get(i).value(attributeIndex) * SCALER);
176        }
177        return traindataCopy;
178    }
179}
Note: See TracBrowser for help on using the repository browser.