Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

SetWiseEMContextSelection.java

Last change on this file was 135, checked in by sherbold, 8 years ago
code documentation and formatting
File size: 9.1 KB

Line
1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	package de.ugoe.cs.cpdp.dataselection;
16
17	import java.util.ArrayList;
18	import java.util.HashSet;
19	import java.util.LinkedList;
20	import java.util.List;
21	import java.util.Set;
22	import java.util.logging.Level;
23
24	import org.apache.commons.collections4.list.SetUniqueList;
25
26	import de.ugoe.cs.util.console.Console;
27	import weka.clusterers.EM;
28	import weka.core.Attribute;
29	import weka.core.DenseInstance;
30	import weka.core.Instance;
31	import weka.core.Instances;
32	import weka.filters.Filter;
33	import weka.filters.unsupervised.attribute.Normalize;
34
35	/**
36	* Selects training data by clustering project context factors.
37	*
38	* The project context factors used for the clustering are configured in the XML param attribute,
39	* Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
40	*/
41	public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
42
43	/**
44	* context factors
45	*/
46	private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
47
48	/*
49	* (non-Javadoc)
50	*
51	* @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
52	*/
53	@Override
54	public void setParameter(String parameters) {
55	if (parameters != null) {
56	project_context_factors = parameters.split(" ");
57	}
58	}
59
60	/**
61	* Uses the Weka EM-Clustering algorithm to cluster the projects by their project context
62	* factors. The project context factors are first normalized and then used for clustering. They
63	* can be configured in the configuration param.
64	*
65	* @param testdata
66	* @param traindataSet
67	*/
68	protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
69	// now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
70	final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
71
72	final Instance targetInstance = data.instance(0);
73	final List<Instance> candidateInstances = new LinkedList<Instance>();
74	for (int i = 1; i < data.numInstances(); i++) {
75	candidateInstances.add(data.instance(i));
76	}
77
78	// cluster and select
79	try {
80	final EM emeans = new EM();
81	boolean onlyTarget = true;
82	int targetCluster;
83	int maxNumClusters = candidateInstances.size();
84
85	do { // while(onlyTarget)
86	emeans.setMaximumNumberOfClusters(maxNumClusters);
87	emeans.buildClusterer(data);
88
89	targetCluster = emeans.clusterInstance(targetInstance);
90
91	// check if cluster only contains target project
92	for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
93	onlyTarget &=
94	!(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
95	}
96	maxNumClusters = emeans.numberOfClusters() - 1;
97
98	// Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
99	}
100	while (onlyTarget);
101
102	Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
103	Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
104	int numRemoved = 0;
105	for (int i = 0; i < candidateInstances.size(); i++) {
106	if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
107	traindataSet.remove(i - numRemoved++);
108	}
109	}
110	Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
111	}
112	catch (Exception e) {
113	throw new RuntimeException("error applying setwise EM clustering training data selection",
114	e);
115	}
116	}
117
118	/*
119	* (non-Javadoc)
120	*
121	* @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances,
122	* org.apache.commons.collections4.list.SetUniqueList)
123	*/
124	@Override
125	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
126	// issuetracking und pl muss passen
127	/*
128	* int s = traindataSet.size(); Console.traceln(Level.INFO,
129	* "remove non matching PL and IssueTracking projects, size now: " + s);
130	* this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata,
131	* traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO,
132	* "size after removal: " + s);
133	*/
134	// now cluster
135	this.cluster(testdata, traindataSet);
136	}
137
138	/**
139	* Returns test- and training data with only the project context factors which were chosen in
140	* the configuration. This is later used for clustering.
141	*
142	* @param testdata
143	* @param traindataSet
144	* @return
145	*/
146	protected Instances getContextFactors(Instances testdata,
147	SetUniqueList<Instances> traindataSet)
148	{
149	// setup weka Instances for clustering
150	final ArrayList<Attribute> atts = new ArrayList<Attribute>();
151
152	// we only want the project context factors
153	for (String pcf : this.project_context_factors) {
154	atts.add(new Attribute(pcf));
155	}
156
157	// set up the data
158	final Instances data = new Instances("project_context_factors", atts, 0);
159	double[] instanceValues = new double[atts.size()];
160
161	// only project context factors + only one instance per project needed
162	int i = 0;
163	for (String pcf : this.project_context_factors) {
164	instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
165	// Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
166	// instanceValues[i]);
167	i++;
168	}
169	data.add(new DenseInstance(1.0, instanceValues));
170
171	// now for the projects of the training stet
172	for (Instances traindata : traindataSet) {
173	instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?!
174	i = 0;
175	for (String pcf : this.project_context_factors) {
176	instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
177	// Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
178	// instanceValues[i]);
179	i++;
180	}
181
182	data.add(new DenseInstance(1.0, instanceValues));
183	}
184
185	return data;
186	}
187
188	/**
189	* Delete projects where the project context does not match the training project
190	*
191	* @param testdata
192	* @param traindataSet
193	* @param attribute
194	*/
195	protected void removeWrongContext(Instances testdata,
196	SetUniqueList<Instances> traindataSet,
197	String attribute)
198	{
199	Set<Instances> remove = new HashSet<Instances>();
200	for (Instances traindata : traindataSet) {
201	if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
202	.firstInstance().value(testdata.attribute(attribute)))
203	{
204	remove.add(traindata);
205	// Console.traceln(Level.WARNING,
206	// "rmove attribute "+attribute+" test:
207	// "+testdata.firstInstance().value(testdata.attribute(attribute))+" train:
208	// "+traindata.firstInstance().value(traindata.attribute(attribute)));
209	}
210	}
211
212	// now delete the projects from set
213	for (Instances i : remove) {
214	traindataSet.remove(i);
215	// Console.traceln(Level.INFO, "removing training project from set");
216	}
217	}
218
219	/**
220	* Normalizes the data before it gets used for clustering
221	*
222	* @param testdata
223	* @param traindataSet
224	* @return
225	*/
226	protected Instances normalizedCharacteristicInstances(Instances testdata,
227	SetUniqueList<Instances> traindataSet)
228	{
229	Instances data = this.getContextFactors(testdata, traindataSet);
230	try {
231	final Normalize normalizer = new Normalize();
232	normalizer.setInputFormat(data);
233	data = Filter.useFilter(data, normalizer);
234	}
235	catch (Exception e) {
236	throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.",
237	e);
238	}
239	return data;
240	}
241	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java

Download in other formats: