// Copyright 2015 Georg-August-Universität Göttingen, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.ugoe.cs.cpdp.dataselection; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.logging.Level; import org.apache.commons.collections4.list.SetUniqueList; import de.ugoe.cs.util.console.Console; import weka.clusterers.EM; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.Instance; import weka.core.Instances; import weka.filters.Filter; import weka.filters.unsupervised.attribute.Normalize; /** * Selects training data by clustering project context factors. * * The project context factors used for the clustering are configured in the XML param attribute, * Example: */ public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy { /** * context factors */ private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"}; /* * (non-Javadoc) * * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String) */ @Override public void setParameter(String parameters) { if (parameters != null) { project_context_factors = parameters.split(" "); } } /** * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context * factors. The project context factors are first normalized and then used for clustering. They * can be configured in the configuration param. * * @param testdata * @param traindataSet */ protected void cluster(Instances testdata, SetUniqueList traindataSet) { // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet); final Instance targetInstance = data.instance(0); final List candidateInstances = new LinkedList(); for (int i = 1; i < data.numInstances(); i++) { candidateInstances.add(data.instance(i)); } // cluster and select try { final EM emeans = new EM(); boolean onlyTarget = true; int targetCluster; int maxNumClusters = candidateInstances.size(); do { // while(onlyTarget) emeans.setMaximumNumberOfClusters(maxNumClusters); emeans.buildClusterer(data); targetCluster = emeans.clusterInstance(targetInstance); // check if cluster only contains target project for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) { onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster); } maxNumClusters = emeans.numberOfClusters() - 1; // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters()); } while (onlyTarget); Console.traceln(Level.INFO, "clusters: " + maxNumClusters); Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size()); int numRemoved = 0; for (int i = 0; i < candidateInstances.size(); i++) { if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) { traindataSet.remove(i - numRemoved++); } } Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size()); } catch (Exception e) { throw new RuntimeException("error applying setwise EM clustering training data selection", e); } } /* * (non-Javadoc) * * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, * org.apache.commons.collections4.list.SetUniqueList) */ @Override public void apply(Instances testdata, SetUniqueList traindataSet) { // issuetracking und pl muss passen /* * int s = traindataSet.size(); Console.traceln(Level.INFO, * "remove non matching PL and IssueTracking projects, size now: " + s); * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata, * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO, * "size after removal: " + s); */ // now cluster this.cluster(testdata, traindataSet); } /** * Returns test- and training data with only the project context factors which were chosen in * the configuration. This is later used for clustering. * * @param testdata * @param traindataSet * @return */ protected Instances getContextFactors(Instances testdata, SetUniqueList traindataSet) { // setup weka Instances for clustering final ArrayList atts = new ArrayList(); // we only want the project context factors for (String pcf : this.project_context_factors) { atts.add(new Attribute(pcf)); } // set up the data final Instances data = new Instances("project_context_factors", atts, 0); double[] instanceValues = new double[atts.size()]; // only project context factors + only one instance per project needed int i = 0; for (String pcf : this.project_context_factors) { instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf)); // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + // instanceValues[i]); i++; } data.add(new DenseInstance(1.0, instanceValues)); // now for the projects of the training stet for (Instances traindata : traindataSet) { instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?! i = 0; for (String pcf : this.project_context_factors) { instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf)); // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + // instanceValues[i]); i++; } data.add(new DenseInstance(1.0, instanceValues)); } return data; } /** * Delete projects where the project context does not match the training project * * @param testdata * @param traindataSet * @param attribute */ protected void removeWrongContext(Instances testdata, SetUniqueList traindataSet, String attribute) { Set remove = new HashSet(); for (Instances traindata : traindataSet) { if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata .firstInstance().value(testdata.attribute(attribute))) { remove.add(traindata); // Console.traceln(Level.WARNING, // "rmove attribute "+attribute+" test: // "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: // "+traindata.firstInstance().value(traindata.attribute(attribute))); } } // now delete the projects from set for (Instances i : remove) { traindataSet.remove(i); // Console.traceln(Level.INFO, "removing training project from set"); } } /** * Normalizes the data before it gets used for clustering * * @param testdata * @param traindataSet * @return */ protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList traindataSet) { Instances data = this.getContextFactors(testdata, traindataSet); try { final Normalize normalizer = new Normalize(); normalizer.setInputFormat(data); data = Filter.useFilter(data, normalizer); } catch (Exception e) { throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e); } return data; } }