Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

CLIFF.java

Last change on this file was 135, checked in by sherbold, 8 years ago
code documentation and formatting
Property svn:mime-type set to `text/plain`
File size: 6.9 KB

Line
1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	package de.ugoe.cs.cpdp.dataselection;
16
17	import java.util.Arrays;
18
19	import org.apache.commons.collections4.list.SetUniqueList;
20
21	import weka.core.Instances;
22
23	/**
24	* Implements CLIFF data pruning.
25	*
26	* @author Steffen Herbold
27	*/
28	public class CLIFF implements IPointWiseDataselectionStrategy, ISetWiseDataselectionStrategy {
29
30	/**
31	* percentage of data selected
32	*/
33	private double percentage = 0.10;
34
35	/**
36	* number of ranges considered
37	*/
38	private final int numRanges = 10;
39
40	/**
41	* Sets the number of neighbors.
42	*
43	* @param parameters
44	* number of neighbors
45	*/
46	@Override
47	public void setParameter(String parameters) {
48	if (parameters != null) {
49	percentage = Double.parseDouble(parameters);
50	}
51	}
52
53	/*
54	* @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
55	* org.apache.commons.collections4.list.SetUniqueList)
56	*/
57	@Override
58	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
59	for (Instances traindata : traindataSet) {
60	applyCLIFF(traindata);
61	}
62	}
63
64	/*
65	* @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
66	* weka.core.Instances)
67	*/
68	@Override
69	public Instances apply(Instances testdata, Instances traindata) {
70	return applyCLIFF(traindata);
71	}
72
73	/**
74	* <p>
75	* Applies the CLIFF relevancy filter to the data.
76	* </p>
77	*
78	* @param data
79	* the data
80	* @return CLIFF-filtered data
81	*/
82	protected Instances applyCLIFF(Instances data) {
83	final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
84	final double[] powerEntity = new double[data.size()];
85
86	final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
87	final double probDefect = data.numInstances() / (double) counts[1];
88
89	for (int j = 0; j < data.numAttributes(); j++) {
90	if (data.attribute(j) != data.classAttribute()) {
91	final double[] ranges = getRanges(data, j);
92	final double[] probDefectRange = getRangeProbabilities(data, j, ranges);
93
94	for (int i = 0; i < data.numInstances(); i++) {
95	final double value = data.instance(i).value(j);
96	final int range = determineRange(ranges, value);
97	double probClass, probNotClass, probRangeClass, probRangeNotClass;
98	if (data.instance(i).classValue() == 1) {
99	probClass = probDefect;
100	probNotClass = 1.0 - probDefect;
101	probRangeClass = probDefectRange[range];
102	probRangeNotClass = 1.0 - probDefectRange[range];
103	}
104	else {
105	probClass = 1.0 - probDefect;
106	probNotClass = probDefect;
107	probRangeClass = 1.0 - probDefectRange[range];
108	probRangeNotClass = probDefectRange[range];
109	}
110	powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) /
111	(probRangeClass * probClass + probRangeNotClass * probNotClass);
112	}
113	}
114	}
115
116	for (int i = 0; i < data.numInstances(); i++) {
117	powerEntity[i] = 1.0;
118	for (int j = 0; j < data.numAttributes(); j++) {
119	powerEntity[i] *= powerAttributes[i][j];
120	}
121	}
122	double[] sortedPower = powerEntity.clone();
123	Arrays.sort(sortedPower);
124	double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];
125
126	final Instances selected = new Instances(data);
127	selected.delete();
128	for (int i = 0; i < data.numInstances(); i++) {
129	if (powerEntity[i] >= cutOff) {
130	selected.add(data.instance(i));
131	}
132	}
133	return selected;
134	}
135
136	/**
137	* <p>
138	* Gets an array with the ranges from the data for a given attribute
139	* </p>
140	*
141	* @param data
142	* the data
143	* @param j
144	* index of the attribute
145	* @return the ranges for the attribute
146	*/
147	private double[] getRanges(Instances data, int j) {
148	double[] values = new double[numRanges + 1];
149	for (int k = 0; k < numRanges; k++) {
150	values[k] = data.kthSmallestValue(j, (int) (data.size() * (k + 1.0) / numRanges));
151	}
152	values[numRanges] = data.attributeStats(j).numericStats.max;
153	return values;
154	}
155
156	/**
157	* <p>
158	* Gets the probabilities of a positive prediction for each range for a given attribute
159	* </p>
160	*
161	* @param data
162	* the data
163	* @param j
164	* index of the attribute
165	* @param ranges
166	* the ranges
167	* @return probabilities for each range
168	*/
169	private double[] getRangeProbabilities(Instances data, int j, double[] ranges) {
170	double[] probDefectRange = new double[numRanges];
171	int[] countRange = new int[numRanges];
172	int[] countDefect = new int[numRanges];
173	for (int i = 0; i < data.numInstances(); i++) {
174	int range = determineRange(ranges, data.instance(i).value(j));
175	countRange[range]++;
176	if (data.instance(i).classValue() == 1) {
177	countDefect[range]++;
178	}
179
180	}
181	for (int k = 0; k < numRanges; k++) {
182	probDefectRange[k] = ((double) countDefect[k]) / countRange[k];
183	}
184	return probDefectRange;
185	}
186
187	/**
188	* <p>
189	* Determines the range of a give value
190	* </p>
191	*
192	* @param ranges
193	* the possible ranges
194	* @param value
195	* the value
196	* @return index of the range
197	*/
198	private int determineRange(double[] ranges, double value) {
199	for (int k = 0; k < numRanges; k++) {
200	if (value <= ranges[k + 1]) {
201	return k;
202	}
203	}
204	throw new RuntimeException("invalid range or value");
205	}
206	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format