Context Navigation

WekaUtils.java @ 140

Last change on this file since 140 was 136, checked in by sherbold, 8 years ago
more code documentation
Property svn:mime-type set to `text/plain`
File size: 8.2 KB

Line
1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	package de.ugoe.cs.cpdp.util;
16
17	import org.apache.commons.math3.ml.distance.EuclideanDistance;
18
19	import weka.core.Instance;
20	import weka.core.Instances;
21
22	/**
23	* <p>
24	* Collections of helper functions to work with Weka.
25	* </p>
26	*
27	* @author Steffen Herbold
28	*/
29	public class WekaUtils {
30
31	/**
32	* <p>
33	* Data class for distance between instances within a data set based on their distributional
34	* characteristics.
35	* </p>
36	*
37	* @author Steffen Herbold
38	*/
39	public static class DistChar {
40
41	/**
42	* mean distance
43	*/
44	public final double mean;
45
46	/**
47	* standard deviation
48	*/
49	public final double std;
50
51	/**
52	* minimal value
53	*/
54	public final double min;
55
56	/**
57	* maximal value
58	*/
59	public final double max;
60
61	/**
62	* number of instances
63	*/
64	public final int num;
65
66	/**
67	* <p>
68	* Constructor. Creates a new DistChar object.
69	* </p>
70	*
71	* @param mean mean distance between instances
72	* @param std standard deviation of distances between instances
73	* @param min minimal distance between instances
74	* @param max maximal distance between instances
75	* @param num number of instance
76	*/
77	private DistChar(double mean, double std, double min, double max, int num) {
78	this.mean = mean;
79	this.std = std;
80	this.min = min;
81	this.max = max;
82	this.num = num;
83	}
84	}
85
86	/**
87	* Scaling value that moves the decimal point by 5 digets.
88	*/
89	public final static double SCALER = 10000.0d;
90
91	/**
92	* <p>
93	* Adoption of the Hamming difference to numerical values, i.e., basically a count of different
94	* metric values.
95	* </p>
96	*
97	* @param inst1
98	* first instance to be compared
99	* @param inst2
100	* second instance to be compared
101	* @return the distance
102	*/
103	public static double hammingDistance(Instance inst1, Instance inst2) {
104	double distance = 0.0;
105	for (int j = 0; j < inst1.numAttributes(); j++) {
106	if (j != inst1.classIndex()) {
107	if (inst1.value(j) != inst2.value(j)) {
108	distance += 1.0;
109	}
110	}
111	}
112	return distance;
113	}
114
115	/**
116	* <p>
117	* Returns a double array of the values without the classification.
118	* </p>
119	*
120	* @param instance
121	* the instance
122	* @return double array
123	*/
124	public static double[] instanceValues(Instance instance) {
125	double[] values = new double[instance.numAttributes() - 1];
126	int k = 0;
127	for (int j = 0; j < instance.numAttributes(); j++) {
128	if (j != instance.classIndex()) {
129	values[k] = instance.value(j);
130	k++;
131	}
132	}
133	return values;
134	}
135
136	/**
137	* <p>
138	* Calculates the distributional characteristics of the distances the instances within a data
139	* set have to each other.
140	* </p>
141	*
142	* @param data
143	* data for which the instances are characterized
144	* @return characteristics
145	*/
146	public static DistChar datasetDistance(Instances data) {
147	double distance;
148	double sumAll = 0.0;
149	double sumAllQ = 0.0;
150	double min = Double.MAX_VALUE;
151	double max = Double.MIN_VALUE;
152	int numCmp = 0;
153	int l = 0;
154	double[] inst1 = new double[data.numAttributes() - 1];
155	double[] inst2 = new double[data.numAttributes() - 1];
156	EuclideanDistance euclideanDistance = new EuclideanDistance();
157	for (int i = 0; i < data.numInstances(); i++) {
158	l = 0;
159	for (int k = 0; k < data.numAttributes(); k++) {
160	if (k != data.classIndex()) {
161	inst1[l] = data.instance(i).value(k);
162	}
163	}
164	for (int j = 0; j < data.numInstances(); j++) {
165	if (j != i) {
166	l = 0;
167	for (int k = 0; k < data.numAttributes(); k++) {
168	if (k != data.classIndex()) {
169	inst2[l] = data.instance(j).value(k);
170	}
171	}
172	distance = euclideanDistance.compute(inst1, inst2);
173	sumAll += distance;
174	sumAllQ += distance * distance;
175	numCmp++;
176	if (distance < min) {
177	min = distance;
178	}
179	if (distance > max) {
180	max = distance;
181	}
182	}
183	}
184	}
185	double mean = sumAll / numCmp;
186	double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
187	return new DistChar(mean, std, min, max, data.numInstances());
188	}
189
190	/**
191	* <p>
192	* Calculates the distributional characteristics of the distances of a single attribute the
193	* instances within a data set have to each other.
194	* </p>
195	*
196	* @param data
197	* data for which the instances are characterized
198	* @param index
199	* attribute for which the distances are characterized
200	* @return characteristics
201	*/
202	public static DistChar attributeDistance(Instances data, int index) {
203	double distance;
204	double sumAll = 0.0;
205	double sumAllQ = 0.0;
206	double min = Double.MAX_VALUE;
207	double max = Double.MIN_VALUE;
208	int numCmp = 0;
209	double value1, value2;
210	for (int i = 0; i < data.numInstances(); i++) {
211	value1 = data.instance(i).value(index);
212	for (int j = 0; j < data.numInstances(); j++) {
213	if (j != i) {
214	value2 = data.instance(j).value(index);
215	distance = Math.abs(value1 - value2);
216	sumAll += distance;
217	sumAllQ += distance * distance;
218	numCmp++;
219	if (distance < min) {
220	min = distance;
221	}
222	if (distance > max) {
223	max = distance;
224	}
225	}
226	}
227	}
228	double mean = sumAll / numCmp;
229	double std = Math.sqrt((sumAllQ - (sumAll * sumAll) / numCmp) * (1.0d / (numCmp - 1)));
230	return new DistChar(mean, std, min, max, data.numInstances());
231	}
232
233	/**
234	* <p>
235	* Upscales the value of a single attribute. This is a workaround to get BayesNet running for
236	* all data. Works on a copy of the training data, i.e., leaves the original data untouched.
237	* </p>
238	*
239	* @param traindata
240	* data from which the attribute is upscaled.
241	* @param attributeIndex
242	* index of the attribute
243	* @return data with upscaled attribute
244	*/
245	public static Instances upscaleAttribute(Instances traindata, int attributeIndex) {
246	Instances traindataCopy = new Instances(traindata);
247	for (int i = 0; i < traindata.size(); i++) {
248	traindataCopy.get(i).setValue(attributeIndex,
249	traindata.get(i).value(attributeIndex) * SCALER);
250	}
251	return traindataCopy;
252	}
253	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format