Context Navigation

Undersampling.java @ 38

Last change on this file since 38 was 18, checked in by atrautsch, 10 years ago
TODO Kommentar fuer Undersampling Bug.
Property svn:mime-type set to `text/plain`
File size: 2.5 KB

Line
1	package de.ugoe.cs.cpdp.dataprocessing;
2
3	import org.apache.commons.collections4.list.SetUniqueList;
4
5	import weka.core.Instances;
6	import weka.filters.Filter;
7	import weka.filters.supervised.instance.Resample;
8
9	/**
10	* Implements undersampling, a strategy for handling bias in data. In case there are less positive samples (i.e. defect-prone) samples in the
11	* data than negative samples (i.e. non-defect-prone), the non-defect-prone entities are sampled such thatthe number of defect-prone and non-defect-prone instances is the same afterwards.
12	* @author Steffen Herbold
13	*/
14	public class Undersampling implements IProcessesingStrategy,
15	ISetWiseProcessingStrategy {
16
17
18	/**
19	* Does not have parameters. String is ignored.
20	* @param parameters ignored
21	*/
22	@Override
23	public void setParameter(String parameters) {
24	// dummy
25	}
26
27	/*
28	* (non-Javadoc)
29	* @see de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)
30	*/
31	@Override
32	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
33	for( Instances traindata : traindataSet ) {
34	apply(testdata, traindata);
35	}
36	}
37
38	/*
39	* (non-Javadoc)
40	* @see de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.Instances, weka.core.Instances)
41	*/
42	@Override
43	public void apply(Instances testdata, Instances traindata) {
44
45	final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
46
47	if( counts[1]<counts[0] ) {
48	Instances negatives = new Instances(traindata);
49	Instances positives = new Instances(traindata);
50
51	for( int i=traindata.size()-1 ; i>=0 ; i-- ) {
52	if( Double.compare(1.0, negatives.get(i).classValue())==0 ) {
53	negatives.remove(i);
54	}
55	if( Double.compare(0.0, positives.get(i).classValue())==0 ) {
56	positives.remove(i);
57	}
58	}
59
60	Resample resample = new Resample();
61	// TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01);
62	// Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative weniger zurückgegeben
63	resample.setSampleSizePercent((100.0* counts[1])/counts[0]);
64	try {
65	resample.setInputFormat(traindata);
66	negatives = Filter.useFilter(negatives, resample);
67	} catch (Exception e) {
68	throw new RuntimeException(e);
69	}
70	traindata.clear();
71	for( int i=0 ; i<negatives.size() ; i++ ) {
72	traindata.add(negatives.get(i));
73	}
74	for( int i=0 ; i<positives.size() ; i++ ) {
75	traindata.add(positives.get(i));
76	}
77	}
78	}
79
80	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format