Context Navigation

source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java @ 39

Last change on this file since 39 was 38, checked in by sherbold, 9 years ago
added Oversampling and Resampling processors; fixed bug in ZScoreNormalizations; added new load for the Audi data set that is based on changes; minor changes to remove warnings
File size: 6.8 KB

Line
1	package de.ugoe.cs.cpdp.dataselection;
2
3	import java.util.ArrayList;
4	import java.util.HashSet;
5	import java.util.LinkedList;
6	import java.util.List;
7	import java.util.Set;
8	import java.util.logging.Level;
9
10	import org.apache.commons.collections4.list.SetUniqueList;
11
12	import de.ugoe.cs.util.console.Console;
13	import weka.clusterers.EM;
14	import weka.core.Attribute;
15	import weka.core.DenseInstance;
16	import weka.core.Instance;
17	import weka.core.Instances;
18	import weka.filters.Filter;
19	import weka.filters.unsupervised.attribute.Normalize;
20
21	/**
22	* Selects training data by clustering project context factors.
23	*
24	* The project context factors used for the clustering are configured in
25	* the XML param attribute, Example:
26	* <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
27	*/
28	public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
29
30	private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
31
32	@Override
33	public void setParameter(String parameters) {
34	if( parameters!=null ) {
35	project_context_factors = parameters.split(" ");
36	}
37	}
38
39	/**
40	* Uses the Weka EM-Clustering algorithm to cluster the projects
41	* by their project context factors.
42	* The project context factors are first normalized and then used for clustering.
43	* They can be configured in the configuration param.
44	*
45	* @param testdata
46	* @param traindataSet
47	*/
48	protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
49	// now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
50	final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
51
52	final Instance targetInstance = data.instance(0);
53	final List<Instance> candidateInstances = new LinkedList<Instance>();
54	for( int i=1; i<data.numInstances(); i++ ) {
55	candidateInstances.add(data.instance(i));
56	}
57
58	// cluster and select
59	try {
60	final EM emeans = new EM();
61	boolean onlyTarget = true;
62	int targetCluster;
63	int maxNumClusters = candidateInstances.size();
64
65	do { // while(onlyTarget)
66	emeans.setMaximumNumberOfClusters(maxNumClusters);
67	emeans.buildClusterer(data);
68
69	targetCluster = emeans.clusterInstance(targetInstance);
70
71	// check if cluster only contains target project
72	for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) {
73	onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster);
74	}
75	maxNumClusters = emeans.numberOfClusters()-1;
76
77	//Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
78	} while(onlyTarget);
79
80	Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
81	Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
82	int numRemoved = 0;
83	for( int i=0 ; i<candidateInstances.size() ; i++ ) {
84	if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) {
85	traindataSet.remove(i-numRemoved++);
86	}
87	}
88	Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
89	} catch(Exception e) {
90	throw new RuntimeException("error applying setwise EM clustering training data selection", e);
91	}
92	}
93
94	@Override
95	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
96	// issuetracking und pl muss passen
97	/*
98	int s = traindataSet.size();
99	Console.traceln(Level.INFO, "remove non matching PL and IssueTracking projects, size now: " + s);
100	this.removeWrongContext(testdata, traindataSet, "PL");
101	this.removeWrongContext(testdata, traindataSet, "IssueTracking");
102	s = traindataSet.size();
103	Console.traceln(Level.INFO, "size after removal: " + s);
104	*/
105	// now cluster
106	this.cluster(testdata, traindataSet);
107	}
108
109	/**
110	* Returns test- and training data with only the project context factors
111	* which were chosen in the configuration.
112	* This is later used for clustering.
113	*
114	* @param testdata
115	* @param traindataSet
116	* @return
117	*/
118	protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
119	// setup weka Instances for clustering
120	final ArrayList<Attribute> atts = new ArrayList<Attribute>();
121
122	// we only want the project context factors
123	for( String pcf : this.project_context_factors ) {
124	atts.add(new Attribute(pcf));
125	}
126
127	// set up the data
128	final Instances data = new Instances("project_context_factors", atts, 0);
129	double[] instanceValues = new double[atts.size()];
130
131	// only project context factors + only one instance per project needed
132	int i = 0;
133	for( String pcf : this.project_context_factors ) {
134	instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
135	//Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
136	i++;
137	}
138	data.add(new DenseInstance(1.0, instanceValues));
139
140	// now for the projects of the training stet
141	for( Instances traindata : traindataSet ) {
142	instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?!
143	i = 0;
144	for( String pcf : this.project_context_factors ) {
145	instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
146	//Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
147	i++;
148	}
149
150	data.add(new DenseInstance(1.0, instanceValues));
151	}
152
153	return data;
154	}
155
156	/**
157	* Delete projects where the project context does not match the training project
158	*
159	* @param testdata
160	* @param traindataSet
161	* @param attribute
162	*/
163	protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet, String attribute) {
164	Set<Instances> remove = new HashSet<Instances>();
165	for( Instances traindata : traindataSet ) {
166	if( traindata.firstInstance().value(traindata.attribute(attribute)) != testdata.firstInstance().value(testdata.attribute(attribute)) ) {
167	remove.add(traindata);
168	//Console.traceln(Level.WARNING, "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute)));
169	}
170	}
171
172	// now delete the projects from set
173	for( Instances i : remove ) {
174	traindataSet.remove(i);
175	//Console.traceln(Level.INFO, "removing training project from set");
176	}
177	}
178
179	/**
180	* Normalizes the data before it gets used for clustering
181	*
182	* @param testdata
183	* @param traindataSet
184	* @return
185	*/
186	protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) {
187	Instances data = this.getContextFactors(testdata, traindataSet);
188	try {
189	final Normalize normalizer = new Normalize();
190	normalizer.setInputFormat(data);
191	data = Filter.useFilter(data, normalizer);
192	} catch (Exception e) {
193	throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e);
194	}
195	return data;
196	}
197	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: