Context Navigation

source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java @ 37

Last change on this file since 37 was 29, checked in by atrautsch, 10 years ago
em cluster project context selection
File size: 6.8 KB

Line
1	package de.ugoe.cs.cpdp.dataselection;
2
3	import java.util.ArrayList;
4	import java.util.HashSet;
5	import java.util.LinkedList;
6	import java.util.List;
7	import java.util.Set;
8	import java.util.logging.Level;
9
10	import org.apache.commons.collections4.list.SetUniqueList;
11
12	import de.ugoe.cs.util.console.Console;
13	import weka.clusterers.EM;
14	import weka.core.Attribute;
15	import weka.core.DenseInstance;
16	import weka.core.Instance;
17	import weka.core.Instances;
18	import weka.filters.Filter;
19	import weka.filters.unsupervised.attribute.Normalize;
20
21	/**
22	* Selects training data by clustering project context factors.
23	*
24	* The project context factors used for the clustering are configured in
25	* the XML param attribute, Example:
26	* <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
27	*/
28	public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
29
30	private String parameters;
31	private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
32
33	@Override
34	public void setParameter(String parameters) {
35	if( parameters!=null ) {
36	project_context_factors = parameters.split(" ");
37	}
38	}
39
40	/**
41	* Uses the Weka EM-Clustering algorithm to cluster the projects
42	* by their project context factors.
43	* The project context factors are first normalized and then used for clustering.
44	* They can be configured in the configuration param.
45	*
46	* @param testdata
47	* @param traindataSet
48	*/
49	protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
50	// now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
51	final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
52
53	final Instance targetInstance = data.instance(0);
54	final List<Instance> candidateInstances = new LinkedList<Instance>();
55	for( int i=1; i<data.numInstances(); i++ ) {
56	candidateInstances.add(data.instance(i));
57	}
58
59	// cluster and select
60	try {
61	final EM emeans = new EM();
62	boolean onlyTarget = true;
63	int targetCluster;
64	int maxNumClusters = candidateInstances.size();
65
66	do { // while(onlyTarget)
67	emeans.setMaximumNumberOfClusters(maxNumClusters);
68	emeans.buildClusterer(data);
69
70	targetCluster = emeans.clusterInstance(targetInstance);
71
72	// check if cluster only contains target project
73	for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) {
74	onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster);
75	}
76	maxNumClusters = emeans.numberOfClusters()-1;
77
78	//Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
79	} while(onlyTarget);
80
81	Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
82	Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
83	int numRemoved = 0;
84	for( int i=0 ; i<candidateInstances.size() ; i++ ) {
85	if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) {
86	traindataSet.remove(i-numRemoved++);
87	}
88	}
89	Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
90	} catch(Exception e) {
91	throw new RuntimeException("error applying setwise EM clustering training data selection", e);
92	}
93	}
94
95	@Override
96	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
97	// issuetracking und pl muss passen
98	/*
99	int s = traindataSet.size();
100	Console.traceln(Level.INFO, "remove non matching PL and IssueTracking projects, size now: " + s);
101	this.removeWrongContext(testdata, traindataSet, "PL");
102	this.removeWrongContext(testdata, traindataSet, "IssueTracking");
103	s = traindataSet.size();
104	Console.traceln(Level.INFO, "size after removal: " + s);
105	*/
106	// now cluster
107	this.cluster(testdata, traindataSet);
108	}
109
110	/**
111	* Returns test- and training data with only the project context factors
112	* which were chosen in the configuration.
113	* This is later used for clustering.
114	*
115	* @param testdata
116	* @param traindataSet
117	* @return
118	*/
119	protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
120	// setup weka Instances for clustering
121	final ArrayList<Attribute> atts = new ArrayList<Attribute>();
122
123	// we only want the project context factors
124	for( String pcf : this.project_context_factors ) {
125	atts.add(new Attribute(pcf));
126	}
127
128	// set up the data
129	final Instances data = new Instances("project_context_factors", atts, 0);
130	double[] instanceValues = new double[atts.size()];
131
132	// only project context factors + only one instance per project needed
133	int i = 0;
134	for( String pcf : this.project_context_factors ) {
135	instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
136	//Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
137	i++;
138	}
139	data.add(new DenseInstance(1.0, instanceValues));
140
141	// now for the projects of the training stet
142	for( Instances traindata : traindataSet ) {
143	instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?!
144	i = 0;
145	for( String pcf : this.project_context_factors ) {
146	instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
147	//Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
148	i++;
149	}
150
151	data.add(new DenseInstance(1.0, instanceValues));
152	}
153
154	return data;
155	}
156
157	/**
158	* Delete projects where the project context does not match the training project
159	*
160	* @param testdata
161	* @param traindataSet
162	* @param attribute
163	*/
164	protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet, String attribute) {
165	Set<Instances> remove = new HashSet<Instances>();
166	for( Instances traindata : traindataSet ) {
167	if( traindata.firstInstance().value(traindata.attribute(attribute)) != testdata.firstInstance().value(testdata.attribute(attribute)) ) {
168	remove.add(traindata);
169	//Console.traceln(Level.WARNING, "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute)));
170	}
171	}
172
173	// now delete the projects from set
174	for( Instances i : remove ) {
175	traindataSet.remove(i);
176	//Console.traceln(Level.INFO, "removing training project from set");
177	}
178	}
179
180	/**
181	* Normalizes the data before it gets used for clustering
182	*
183	* @param testdata
184	* @param traindataSet
185	* @return
186	*/
187	protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) {
188	Instances data = this.getContextFactors(testdata, traindataSet);
189	try {
190	final Normalize normalizer = new Normalize();
191	normalizer.setInputFormat(data);
192	data = Filter.useFilter(data, normalizer);
193	} catch (Exception e) {
194	throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e);
195	}
196	return data;
197	}
198	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: