Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

Oversampling.java

Last change on this file was 135, checked in by sherbold, 8 years ago
code documentation and formatting
Property svn:mime-type set to `text/plain`
File size: 3.6 KB

Rev	Line
[86]	1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
[41]	2	//
	3	// Licensed under the Apache License, Version 2.0 (the "License");
	4	// you may not use this file except in compliance with the License.
	5	// You may obtain a copy of the License at
	6	//
	7	// http://www.apache.org/licenses/LICENSE-2.0
	8	//
	9	// Unless required by applicable law or agreed to in writing, software
	10	// distributed under the License is distributed on an "AS IS" BASIS,
	11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	12	// See the License for the specific language governing permissions and
	13	// limitations under the License.
	14
[38]	15	package de.ugoe.cs.cpdp.dataprocessing;
	16
	17	import org.apache.commons.collections4.list.SetUniqueList;
	18
	19	import weka.core.Instances;
	20	import weka.filters.Filter;
	21	import weka.filters.supervised.instance.Resample;
	22
	23	/**
[41]	24	* Implements oversampling, a strategy for handling bias in data. In case there are fewer positive
	25	* (i.e. defect-prone) samples in the data than negative (i.e. non-defect-prone) samples,
	26	* the defect-prone entities are over-sampled such that the number of defect-prone and
	27	* non-defect-prone instances is the same afterwards. This means that some of the defect-prone
	28	* entities will occur more than once within the data.
[38]	29	*
	30	* @author Steffen Herbold
	31	*/
[41]	32	public class Oversampling implements IProcessesingStrategy, ISetWiseProcessingStrategy {
[38]	33
[41]	34	/**
	35	* Does not have parameters. String is ignored.
	36	*
	37	* @param parameters
	38	* ignored
	39	*/
	40	@Override
	41	public void setParameter(String parameters) {
	42	// dummy
	43	}
[38]	44
[41]	45	/*
	46	* (non-Javadoc)
	47	*
	48	* @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka. core.Instances,
	49	* org.apache.commons.collections4.list.SetUniqueList)
	50	*/
	51	@Override
	52	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
	53	for (Instances traindata : traindataSet) {
	54	apply(testdata, traindata);
	55	}
	56	}
[38]	57
[41]	58	/*
	59	* (non-Javadoc)
	60	*
	61	* @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core. Instances,
	62	* weka.core.Instances)
	63	*/
	64	@Override
	65	public void apply(Instances testdata, Instances traindata) {
[38]	66
[41]	67	final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
	68	if (counts[1] < counts[0]) {
	69	Instances negatives = new Instances(traindata);
	70	Instances positives = new Instances(traindata);
[38]	71
[41]	72	for (int i = traindata.size() - 1; i >= 0; i--) {
	73	if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
	74	negatives.remove(i);
	75	}
	76	if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
	77	positives.remove(i);
	78	}
	79	}
[38]	80
[41]	81	Resample resample = new Resample();
	82	resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
	83	try {
	84	resample.setInputFormat(traindata);
	85	positives = Filter.useFilter(positives, resample);
	86	}
	87	catch (Exception e) {
	88	throw new RuntimeException(e);
	89	}
	90	traindata.clear();
	91	for (int i = 0; i < negatives.size(); i++) {
	92	traindata.add(negatives.get(i));
	93	}
	94	for (int i = 0; i < positives.size(); i++) {
	95	traindata.add(positives.get(i));
	96	}
	97	}
	98	}
[38]	99
	100	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format