Context Navigation

Oversampling.java @ 39

Last change on this file since 39 was 38, checked in by sherbold, 11 years ago
added Oversampling and Resampling processors fixed bug in ZScoreNormalizations added new load for the Audi data set that is based on changes minor changes to remove warnings
Property svn:mime-type set to `text/plain`
File size: 2.7 KB

Line
1	package de.ugoe.cs.cpdp.dataprocessing;
2
3	import org.apache.commons.collections4.list.SetUniqueList;
4
5	import weka.core.Instances;
6	import weka.filters.Filter;
7	import weka.filters.supervised.instance.Resample;
8
9	/**
10	* Implements oversampling, a strategy for
11	* handling bias in data. In case there are fewer positive samples (i.e.
12	* defect-prone) in the data than negative samples (i.e.
13	* non-defect-prone), the defect-prone entities are over-sampled such that the
14	* number of defect-prone and non-defect-prone instances is the same afterwards.
15	* This means that some of the defect-prone entities will occur more than once
16	* within the data.
17	*
18	* @author Steffen Herbold
19	*/
20	public class Oversampling implements IProcessesingStrategy,
21	ISetWiseProcessingStrategy {
22
23	/**
24	* Does not have parameters. String is ignored.
25	*
26	* @param parameters
27	* ignored
28	*/
29	@Override
30	public void setParameter(String parameters) {
31	// dummy
32	}
33
34	/*
35	* (non-Javadoc)
36	*
37	* @see
38	* de.ugoe.cs.cpdp.dataprocessing.ISetWiseProcessingStrategy#apply(weka.
39	* core.Instances, org.apache.commons.collections4.list.SetUniqueList)
40	*/
41	@Override
42	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
43	for (Instances traindata : traindataSet) {
44	apply(testdata, traindata);
45	}
46	}
47
48	/*
49	* (non-Javadoc)
50	*
51	* @see
52	* de.ugoe.cs.cpdp.dataprocessing.IProcessesingStrategy#apply(weka.core.
53	* Instances, weka.core.Instances)
54	*/
55	@Override
56	public void apply(Instances testdata, Instances traindata) {
57
58	final int[] counts = traindata.attributeStats(traindata.classIndex()).nominalCounts;
59	if (counts[1] < counts[0]) {
60	Instances negatives = new Instances(traindata);
61	Instances positives = new Instances(traindata);
62
63	for (int i = traindata.size() - 1; i >= 0; i--) {
64	if (Double.compare(1.0, negatives.get(i).classValue()) == 0) {
65	negatives.remove(i);
66	}
67	if (Double.compare(0.0, positives.get(i).classValue()) == 0) {
68	positives.remove(i);
69	}
70	}
71
72	Resample resample = new Resample();
73	// TODO: resample.setSampleSizePercent((100.0*counts[1])/100+0.01);
74	// Ohne +0.01 wird bei tomcat, xerces-1.2 und jedit-4.0 ein negative
75	// weniger zurückgegeben
76	resample.setSampleSizePercent((100.0 * counts[0]) / counts[1]);
77	try {
78	resample.setInputFormat(traindata);
79	positives = Filter.useFilter(positives, resample);
80	} catch (Exception e) {
81	throw new RuntimeException(e);
82	}
83	traindata.clear();
84	for (int i = 0; i < negatives.size(); i++) {
85	traindata.add(negatives.get(i));
86	}
87	for (int i = 0; i < positives.size(); i++) {
88	traindata.add(positives.get(i));
89	}
90	}
91	}
92
93	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format