Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

CLIFF.java

Last change on this file was 135, checked in by sherbold, 8 years ago
code documentation and formatting
Property svn:mime-type set to `text/plain`
File size: 6.9 KB

Rev	Line
[86]	1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
[50]	2	//
	3	// Licensed under the Apache License, Version 2.0 (the "License");
	4	// you may not use this file except in compliance with the License.
	5	// You may obtain a copy of the License at
	6	//
	7	// http://www.apache.org/licenses/LICENSE-2.0
	8	//
	9	// Unless required by applicable law or agreed to in writing, software
	10	// distributed under the License is distributed on an "AS IS" BASIS,
	11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	12	// See the License for the specific language governing permissions and
	13	// limitations under the License.
	14
	15	package de.ugoe.cs.cpdp.dataselection;
	16
	17	import java.util.Arrays;
	18
	19	import org.apache.commons.collections4.list.SetUniqueList;
	20
	21	import weka.core.Instances;
	22
	23	/**
	24	* Implements CLIFF data pruning.
	25	*
	26	* @author Steffen Herbold
	27	*/
	28	public class CLIFF implements IPointWiseDataselectionStrategy, ISetWiseDataselectionStrategy {
	29
[135]	30	/**
	31	* percentage of data selected
	32	*/
[50]	33	private double percentage = 0.10;
[135]	34
	35	/**
	36	* number of ranges considered
	37	*/
[50]	38	private final int numRanges = 10;
	39
	40	/**
	41	* Sets the number of neighbors.
	42	*
	43	* @param parameters
	44	* number of neighbors
	45	*/
	46	@Override
	47	public void setParameter(String parameters) {
[135]	48	if (parameters != null) {
[50]	49	percentage = Double.parseDouble(parameters);
	50	}
	51	}
[135]	52
	53	/*
[50]	54	* @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
[135]	55	* org.apache.commons.collections4.list.SetUniqueList)
[50]	56	*/
	57	@Override
	58	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
[135]	59	for (Instances traindata : traindataSet) {
[50]	60	applyCLIFF(traindata);
	61	}
	62	}
	63
[135]	64	/*
[50]	65	* @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
[135]	66	* weka.core.Instances)
[50]	67	*/
	68	@Override
	69	public Instances apply(Instances testdata, Instances traindata) {
	70	return applyCLIFF(traindata);
	71	}
	72
[135]	73	/**
	74	* <p>
	75	* Applies the CLIFF relevancy filter to the data.
	76	* </p>
	77	*
	78	* @param data
	79	* the data
	80	* @return CLIFF-filtered data
	81	*/
[120]	82	protected Instances applyCLIFF(Instances data) {
[50]	83	final double[][] powerAttributes = new double[data.size()][data.numAttributes()];
	84	final double[] powerEntity = new double[data.size()];
[135]	85
[50]	86	final int[] counts = data.attributeStats(data.classIndex()).nominalCounts;
	87	final double probDefect = data.numInstances() / (double) counts[1];
[135]	88
	89	for (int j = 0; j < data.numAttributes(); j++) {
	90	if (data.attribute(j) != data.classAttribute()) {
[50]	91	final double[] ranges = getRanges(data, j);
	92	final double[] probDefectRange = getRangeProbabilities(data, j, ranges);
[135]	93
	94	for (int i = 0; i < data.numInstances(); i++) {
[50]	95	final double value = data.instance(i).value(j);
	96	final int range = determineRange(ranges, value);
	97	double probClass, probNotClass, probRangeClass, probRangeNotClass;
[135]	98	if (data.instance(i).classValue() == 1) {
[50]	99	probClass = probDefect;
[135]	100	probNotClass = 1.0 - probDefect;
[50]	101	probRangeClass = probDefectRange[range];
[135]	102	probRangeNotClass = 1.0 - probDefectRange[range];
	103	}
	104	else {
	105	probClass = 1.0 - probDefect;
[50]	106	probNotClass = probDefect;
[135]	107	probRangeClass = 1.0 - probDefectRange[range];
[50]	108	probRangeNotClass = probDefectRange[range];
	109	}
[135]	110	powerAttributes[i][j] = Math.pow(probRangeClass, 2.0) /
	111	(probRangeClass * probClass + probRangeNotClass * probNotClass);
[50]	112	}
	113	}
	114	}
[135]	115
	116	for (int i = 0; i < data.numInstances(); i++) {
[50]	117	powerEntity[i] = 1.0;
[135]	118	for (int j = 0; j < data.numAttributes(); j++) {
[50]	119	powerEntity[i] *= powerAttributes[i][j];
	120	}
	121	}
	122	double[] sortedPower = powerEntity.clone();
	123	Arrays.sort(sortedPower);
[135]	124	double cutOff = sortedPower[(int) (data.numInstances() * (1 - percentage))];
[50]	125
	126	final Instances selected = new Instances(data);
	127	selected.delete();
[135]	128	for (int i = 0; i < data.numInstances(); i++) {
	129	if (powerEntity[i] >= cutOff) {
[50]	130	selected.add(data.instance(i));
	131	}
	132	}
	133	return selected;
	134	}
[135]	135
	136	/**
	137	* <p>
	138	* Gets an array with the ranges from the data for a given attribute
	139	* </p>
	140	*
	141	* @param data
	142	* the data
	143	* @param j
	144	* index of the attribute
	145	* @return the ranges for the attribute
	146	*/
[50]	147	private double[] getRanges(Instances data, int j) {
[135]	148	double[] values = new double[numRanges + 1];
	149	for (int k = 0; k < numRanges; k++) {
	150	values[k] = data.kthSmallestValue(j, (int) (data.size() * (k + 1.0) / numRanges));
[50]	151	}
	152	values[numRanges] = data.attributeStats(j).numericStats.max;
	153	return values;
	154	}
[135]	155
	156	/**
	157	* <p>
	158	* Gets the probabilities of a positive prediction for each range for a given attribute
	159	* </p>
	160	*
	161	* @param data
	162	* the data
	163	* @param j
	164	* index of the attribute
	165	* @param ranges
	166	* the ranges
	167	* @return probabilities for each range
	168	*/
[50]	169	private double[] getRangeProbabilities(Instances data, int j, double[] ranges) {
	170	double[] probDefectRange = new double[numRanges];
	171	int[] countRange = new int[numRanges];
	172	int[] countDefect = new int[numRanges];
[135]	173	for (int i = 0; i < data.numInstances(); i++) {
	174	int range = determineRange(ranges, data.instance(i).value(j));
[50]	175	countRange[range]++;
[135]	176	if (data.instance(i).classValue() == 1) {
[50]	177	countDefect[range]++;
	178	}
	179
	180	}
[135]	181	for (int k = 0; k < numRanges; k++) {
[50]	182	probDefectRange[k] = ((double) countDefect[k]) / countRange[k];
	183	}
	184	return probDefectRange;
	185	}
[135]	186
	187	/**
	188	* <p>
	189	* Determines the range of a give value
	190	* </p>
	191	*
	192	* @param ranges
	193	* the possible ranges
	194	* @param value
	195	* the value
	196	* @return index of the range
	197	*/
[50]	198	private int determineRange(double[] ranges, double value) {
[135]	199	for (int k = 0; k < numRanges; k++) {
	200	if (value <= ranges[k + 1]) {
[50]	201	return k;
	202	}
	203	}
	204	throw new RuntimeException("invalid range or value");
	205	}
	206	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format