Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

SetWiseEMContextSelection.java

Last change on this file was 135, checked in by sherbold, 8 years ago
code documentation and formatting
File size: 9.1 KB

Rev	Line
[86]	1	// Copyright 2015 Georg-August-Universität Göttingen, Germany
[41]	2	//
	3	// Licensed under the Apache License, Version 2.0 (the "License");
	4	// you may not use this file except in compliance with the License.
	5	// You may obtain a copy of the License at
	6	//
	7	// http://www.apache.org/licenses/LICENSE-2.0
	8	//
	9	// Unless required by applicable law or agreed to in writing, software
	10	// distributed under the License is distributed on an "AS IS" BASIS,
	11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	12	// See the License for the specific language governing permissions and
	13	// limitations under the License.
	14
[29]	15	package de.ugoe.cs.cpdp.dataselection;
	16
	17	import java.util.ArrayList;
	18	import java.util.HashSet;
	19	import java.util.LinkedList;
	20	import java.util.List;
	21	import java.util.Set;
	22	import java.util.logging.Level;
	23
	24	import org.apache.commons.collections4.list.SetUniqueList;
	25
	26	import de.ugoe.cs.util.console.Console;
	27	import weka.clusterers.EM;
	28	import weka.core.Attribute;
	29	import weka.core.DenseInstance;
	30	import weka.core.Instance;
	31	import weka.core.Instances;
	32	import weka.filters.Filter;
	33	import weka.filters.unsupervised.attribute.Normalize;
	34
	35	/**
	36	* Selects training data by clustering project context factors.
	37	*
[41]	38	* The project context factors used for the clustering are configured in the XML param attribute,
	39	* Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
[29]	40	*/
	41	public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
	42
[135]	43	/**
	44	* context factors
	45	*/
[41]	46	private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
[29]	47
[135]	48	/*
	49	* (non-Javadoc)
	50	*
	51	* @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
	52	*/
[41]	53	@Override
	54	public void setParameter(String parameters) {
	55	if (parameters != null) {
	56	project_context_factors = parameters.split(" ");
	57	}
	58	}
[29]	59
[41]	60	/**
	61	* Uses the Weka EM-Clustering algorithm to cluster the projects by their project context
	62	* factors. The project context factors are first normalized and then used for clustering. They
	63	* can be configured in the configuration param.
	64	*
	65	* @param testdata
	66	* @param traindataSet
	67	*/
	68	protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
	69	// now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
	70	final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
[29]	71
[41]	72	final Instance targetInstance = data.instance(0);
	73	final List<Instance> candidateInstances = new LinkedList<Instance>();
	74	for (int i = 1; i < data.numInstances(); i++) {
	75	candidateInstances.add(data.instance(i));
	76	}
[29]	77
[41]	78	// cluster and select
	79	try {
	80	final EM emeans = new EM();
	81	boolean onlyTarget = true;
	82	int targetCluster;
	83	int maxNumClusters = candidateInstances.size();
[29]	84
[41]	85	do { // while(onlyTarget)
	86	emeans.setMaximumNumberOfClusters(maxNumClusters);
	87	emeans.buildClusterer(data);
	88
	89	targetCluster = emeans.clusterInstance(targetInstance);
	90
	91	// check if cluster only contains target project
	92	for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
	93	onlyTarget &=
	94	!(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
	95	}
	96	maxNumClusters = emeans.numberOfClusters() - 1;
	97
	98	// Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
	99	}
	100	while (onlyTarget);
	101
	102	Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
	103	Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
	104	int numRemoved = 0;
	105	for (int i = 0; i < candidateInstances.size(); i++) {
	106	if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
	107	traindataSet.remove(i - numRemoved++);
	108	}
	109	}
	110	Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
	111	}
	112	catch (Exception e) {
[135]	113	throw new RuntimeException("error applying setwise EM clustering training data selection",
[41]	114	e);
	115	}
	116	}
	117
[135]	118	/*
	119	* (non-Javadoc)
	120	*
	121	* @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances,
	122	* org.apache.commons.collections4.list.SetUniqueList)
	123	*/
[41]	124	@Override
	125	public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
	126	// issuetracking und pl muss passen
	127	/*
	128	* int s = traindataSet.size(); Console.traceln(Level.INFO,
	129	* "remove non matching PL and IssueTracking projects, size now: " + s);
	130	* this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata,
	131	* traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO,
	132	* "size after removal: " + s);
	133	*/
	134	// now cluster
	135	this.cluster(testdata, traindataSet);
	136	}
	137
	138	/**
	139	* Returns test- and training data with only the project context factors which were chosen in
	140	* the configuration. This is later used for clustering.
	141	*
	142	* @param testdata
	143	* @param traindataSet
	144	* @return
	145	*/
[135]	146	protected Instances getContextFactors(Instances testdata,
	147	SetUniqueList<Instances> traindataSet)
[41]	148	{
	149	// setup weka Instances for clustering
	150	final ArrayList<Attribute> atts = new ArrayList<Attribute>();
	151
	152	// we only want the project context factors
	153	for (String pcf : this.project_context_factors) {
	154	atts.add(new Attribute(pcf));
	155	}
	156
	157	// set up the data
	158	final Instances data = new Instances("project_context_factors", atts, 0);
	159	double[] instanceValues = new double[atts.size()];
	160
	161	// only project context factors + only one instance per project needed
	162	int i = 0;
	163	for (String pcf : this.project_context_factors) {
	164	instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
	165	// Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
	166	// instanceValues[i]);
	167	i++;
	168	}
	169	data.add(new DenseInstance(1.0, instanceValues));
	170
	171	// now for the projects of the training stet
	172	for (Instances traindata : traindataSet) {
	173	instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?!
	174	i = 0;
	175	for (String pcf : this.project_context_factors) {
	176	instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
	177	// Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
	178	// instanceValues[i]);
	179	i++;
	180	}
	181
	182	data.add(new DenseInstance(1.0, instanceValues));
	183	}
	184
	185	return data;
	186	}
	187
	188	/**
	189	* Delete projects where the project context does not match the training project
	190	*
	191	* @param testdata
	192	* @param traindataSet
	193	* @param attribute
	194	*/
	195	protected void removeWrongContext(Instances testdata,
	196	SetUniqueList<Instances> traindataSet,
	197	String attribute)
	198	{
	199	Set<Instances> remove = new HashSet<Instances>();
	200	for (Instances traindata : traindataSet) {
	201	if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
	202	.firstInstance().value(testdata.attribute(attribute)))
	203	{
	204	remove.add(traindata);
	205	// Console.traceln(Level.WARNING,
[135]	206	// "rmove attribute "+attribute+" test:
	207	// "+testdata.firstInstance().value(testdata.attribute(attribute))+" train:
	208	// "+traindata.firstInstance().value(traindata.attribute(attribute)));
[41]	209	}
	210	}
	211
	212	// now delete the projects from set
	213	for (Instances i : remove) {
	214	traindataSet.remove(i);
	215	// Console.traceln(Level.INFO, "removing training project from set");
	216	}
	217	}
	218
	219	/**
	220	* Normalizes the data before it gets used for clustering
	221	*
	222	* @param testdata
	223	* @param traindataSet
	224	* @return
	225	*/
	226	protected Instances normalizedCharacteristicInstances(Instances testdata,
	227	SetUniqueList<Instances> traindataSet)
	228	{
	229	Instances data = this.getContextFactors(testdata, traindataSet);
	230	try {
	231	final Normalize normalizer = new Normalize();
	232	normalizer.setInputFormat(data);
	233	data = Filter.useFilter(data, normalizer);
	234	}
	235	catch (Exception e) {
[135]	236	throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.",
[41]	237	e);
	238	}
	239	return data;
	240	}
[29]	241	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java

Download in other formats: