Context Navigation

source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java @ 50

Last change on this file since 50 was 41, checked in by sherbold, 9 years ago
formatted code and added copyrights
File size: 8.7 KB

Line
1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
2	//
3	// Licensed under the Apache License, Version 2.0 (the "License");
4	// you may not use this file except in compliance with the License.
5	// You may obtain a copy of the License at
6	//
7	// http://www.apache.org/licenses/LICENSE-2.0
8	//
9	// Unless required by applicable law or agreed to in writing, software
10	// distributed under the License is distributed on an "AS IS" BASIS,
11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	// See the License for the specific language governing permissions and
13	// limitations under the License.
14
15	package de.ugoe.cs.cpdp.dataselection;
16
17	import java.util.ArrayList;
18	import java.util.HashSet;
19	import java.util.LinkedList;
20	import java.util.List;
21	import java.util.Set;
22	import java.util.logging.Level;
23
24	import org.apache.commons.collections4.list.SetUniqueList;
25
26	import de.ugoe.cs.util.console.Console;
27	import weka.clusterers.EM;
28	import weka.core.Attribute;
29	import weka.core.DenseInstance;
30	import weka.core.Instance;
31	import weka.core.Instances;
32	import weka.filters.Filter;
33	import weka.filters.unsupervised.attribute.Normalize;
34
35	/**
36	* Selects training data by clustering project context factors.
37	*
38	* The project context factors used for the clustering are configured in the XML param attribute,
39	* Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
40	*/
41	public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
42
43	private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
44
45	@Override
46	public void setParameter(String parameters) {
47	if (parameters != null) {
48	project_context_factors = parameters.split(" ");
49	}
50	}
51
52	/**
53	* Uses the Weka EM-Clustering algorithm to cluster the projects by their project context
54	* factors. The project context factors are first normalized and then used for clustering. They
55	* can be configured in the configuration param.
56	*
57	* @param testdata
58	* @param traindataSet
59	*/
60	protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
61	// now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
62	final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
63
64	final Instance targetInstance = data.instance(0);
65	final List<Instance> candidateInstances = new LinkedList<Instance>();
66	for (int i = 1; i < data.numInstances(); i++) {
67	candidateInstances.add(data.instance(i));
68	}
69
70	// cluster and select
71	try {
72	final EM emeans = new EM();
73	boolean onlyTarget = true;
74	int targetCluster;
75	int maxNumClusters = candidateInstances.size();
76
77	do { // while(onlyTarget)
78	emeans.setMaximumNumberOfClusters(maxNumClusters);
79	emeans.buildClusterer(data);
80
81	targetCluster = emeans.clusterInstance(targetInstance);
82
83	// check if cluster only contains target project
84	for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
85	onlyTarget &=
86	!(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
87	}
88	maxNumClusters = emeans.numberOfClusters() - 1;
89
90	// Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
91	}
92	while (onlyTarget);
93
94	Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
95	Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
96	int numRemoved = 0;
97	for (int i = 0; i < candidateInstances.size(); i++) {
98	if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
99	traindataSet.remove(i - numRemoved++);
100	}
101	}
102	Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
103	}
104	catch (Exception e) {
105	throw new RuntimeException(
106	"error applying setwise EM clustering training data selection",
107	e);
108	}
109	}
110
111	@Override
112	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
113	// issuetracking und pl muss passen
114	/*
115	* int s = traindataSet.size(); Console.traceln(Level.INFO,
116	* "remove non matching PL and IssueTracking projects, size now: " + s);
117	* this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata,
118	* traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO,
119	* "size after removal: " + s);
120	*/
121	// now cluster
122	this.cluster(testdata, traindataSet);
123	}
124
125	/**
126	* Returns test- and training data with only the project context factors which were chosen in
127	* the configuration. This is later used for clustering.
128	*
129	* @param testdata
130	* @param traindataSet
131	* @return
132	*/
133	protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet)
134	{
135	// setup weka Instances for clustering
136	final ArrayList<Attribute> atts = new ArrayList<Attribute>();
137
138	// we only want the project context factors
139	for (String pcf : this.project_context_factors) {
140	atts.add(new Attribute(pcf));
141	}
142
143	// set up the data
144	final Instances data = new Instances("project_context_factors", atts, 0);
145	double[] instanceValues = new double[atts.size()];
146
147	// only project context factors + only one instance per project needed
148	int i = 0;
149	for (String pcf : this.project_context_factors) {
150	instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
151	// Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
152	// instanceValues[i]);
153	i++;
154	}
155	data.add(new DenseInstance(1.0, instanceValues));
156
157	// now for the projects of the training stet
158	for (Instances traindata : traindataSet) {
159	instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?!
160	i = 0;
161	for (String pcf : this.project_context_factors) {
162	instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
163	// Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
164	// instanceValues[i]);
165	i++;
166	}
167
168	data.add(new DenseInstance(1.0, instanceValues));
169	}
170
171	return data;
172	}
173
174	/**
175	* Delete projects where the project context does not match the training project
176	*
177	* @param testdata
178	* @param traindataSet
179	* @param attribute
180	*/
181	protected void removeWrongContext(Instances testdata,
182	SetUniqueList<Instances> traindataSet,
183	String attribute)
184	{
185	Set<Instances> remove = new HashSet<Instances>();
186	for (Instances traindata : traindataSet) {
187	if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
188	.firstInstance().value(testdata.attribute(attribute)))
189	{
190	remove.add(traindata);
191	// Console.traceln(Level.WARNING,
192	// "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute)));
193	}
194	}
195
196	// now delete the projects from set
197	for (Instances i : remove) {
198	traindataSet.remove(i);
199	// Console.traceln(Level.INFO, "removing training project from set");
200	}
201	}
202
203	/**
204	* Normalizes the data before it gets used for clustering
205	*
206	* @param testdata
207	* @param traindataSet
208	* @return
209	*/
210	protected Instances normalizedCharacteristicInstances(Instances testdata,
211	SetUniqueList<Instances> traindataSet)
212	{
213	Instances data = this.getContextFactors(testdata, traindataSet);
214	try {
215	final Normalize normalizer = new Normalize();
216	normalizer.setInputFormat(data);
217	data = Filter.useFilter(data, normalizer);
218	}
219	catch (Exception e) {
220	throw new RuntimeException(
221	"Unexpected exception during normalization of distributional characteristics.",
222	e);
223	}
224	return data;
225	}
226	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: