Context Navigation

WekaUtils.java @ 130

Last change on this file since 130 was 129, checked in by sherbold, 9 years ago
Added the same workaround for the problem with Discretize to the TopMetricFilter. We slightly refactored the implementation within AbstractCODEP by moving the rescaling of sets into WekaUtils to facilitate better re-use.
Property svn:mime-type set to `text/plain`
File size: 6.4 KB

Line
1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	package de.ugoe.cs.cpdp.util;
16
17	// TODO comment
18	import org.apache.commons.math3.ml.distance.EuclideanDistance;
19
20	import weka.core.Instance;
21	import weka.core.Instances;
22
23	public class WekaUtils {
24
25	public static class DistChar {
26	public final double mean;
27	public final double std;
28	public final double min;
29	public final double max;
30	public final int num;
31	private DistChar(double mean, double std, double min, double max, int num) {
32	this.mean = mean;
33	this.std = std;
34	this.min = min;
35	this.max = max;
36	this.num = num;
37	}
38	}
39
40	/**
41	* Scaling value that moves the decimal point by 5 digets.
42	*/
43	public final static double SCALER = 10000.0d;
44
45	/**
46	* <p>
47	* Adoption of the Hamming difference to numerical values, i.e., basically a count of different
48	* metric values.
49	* </p>
50	*
51	* @param inst1
52	* first instance to be compared
53	* @param inst2
54	* second instance to be compared
55	* @return the distance
56	*/
57	public static double hammingDistance(Instance inst1, Instance inst2) {
58	double distance = 0.0;
59	for (int j = 0; j < inst1.numAttributes(); j++) {
60	if (j != inst1.classIndex()) {
61	if (inst1.value(j) != inst2.value(j)) {
62	distance += 1.0;
63	}
64	}
65	}
66	return distance;
67	}
68
69	public static double[] instanceValues(Instance instance) {
70	double[] values = new double[instance.numAttributes()-1];
71	int k=0;
72	for( int j=0; j<instance.numAttributes() ; j++ ) {
73	if( j!= instance.classIndex() ) {
74	values[k] = instance.value(j);
75	k++;
76	}
77	}
78	return values;
79	}
80
81	public static DistChar datasetDistance(Instances data) {
82	double distance;
83	double sumAll = 0.0;
84	double sumAllQ = 0.0;
85	double min = Double.MAX_VALUE;
86	double max = Double.MIN_VALUE;
87	int numCmp = 0;
88	int l = 0;
89	double[] inst1 = new double[data.numAttributes()-1];
90	double[] inst2 = new double[data.numAttributes()-1];
91	EuclideanDistance euclideanDistance = new EuclideanDistance();
92	for( int i=0; i<data.numInstances(); i++ ) {
93	l=0;
94	for( int k=0; k<data.numAttributes(); k++ ) {
95	if( k!=data.classIndex() ) {
96	inst1[l] = data.instance(i).value(k);
97	}
98	}
99	for( int j=0; j<data.numInstances(); j++ ) {
100	if( j!=i ) {
101	l=0;
102	for( int k=0; k<data.numAttributes(); k++ ) {
103	if( k!=data.classIndex() ) {
104	inst2[l] = data.instance(j).value(k);
105	}
106	}
107	distance = euclideanDistance.compute(inst1, inst2);
108	sumAll += distance;
109	sumAllQ += distance*distance;
110	numCmp++;
111	if( distance < min ) {
112	min = distance;
113	}
114	if( distance > max ) {
115	max = distance;
116	}
117	}
118	}
119	}
120	double mean = sumAll / numCmp;
121	double std = Math.sqrt((sumAllQ-(sumAllsumAll)/numCmp)
122	(1.0d / (numCmp - 1)));
123	return new DistChar(mean, std, min, max, data.numInstances());
124	}
125
126	// like above, but for single attribute
127	public static DistChar attributeDistance(Instances data, int index) {
128	double distance;
129	double sumAll = 0.0;
130	double sumAllQ = 0.0;
131	double min = Double.MAX_VALUE;
132	double max = Double.MIN_VALUE;
133	int numCmp = 0;
134	double value1, value2;
135	for( int i=0; i<data.numInstances(); i++ ) {
136	value1 = data.instance(i).value(index);
137	for( int j=0; j<data.numInstances(); j++ ) {
138	if( j!=i ) {
139	value2 = data.instance(j).value(index);
140	distance = Math.abs(value1-value2);
141	sumAll += distance;
142	sumAllQ += distance*distance;
143	numCmp++;
144	if( distance < min ) {
145	min = distance;
146	}
147	if( distance > max ) {
148	max = distance;
149	}
150	}
151	}
152	}
153	double mean = sumAll / numCmp;
154	double std = Math.sqrt((sumAllQ-(sumAllsumAll)/numCmp)
155	(1.0d / (numCmp - 1)));
156	return new DistChar(mean, std, min, max, data.numInstances());
157	}
158
159	/**
160	* <p>
161	* Upscales the value of a single attribute. This is a workaround to get BayesNet running for
162	* all data. Works on a copy of the training data, i.e., leaves the original data untouched.
163	* </p>
164	*
165	* @param traindata
166	* data from which the attribute is upscaled.
167	* @param attributeIndex
168	* index of the attribute
169	* @return data with upscaled attribute
170	*/
171	public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
172	Instances traindataCopy = new Instances(traindata);
173	for (int i = 0; i < traindata.size(); i++) {
174	traindataCopy.get(i).setValue(attributeIndex,
175	traindata.get(i).value(attributeIndex) * SCALER);
176	}
177	return traindataCopy;
178	}
179	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format