source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java

Last change on this file was 135, checked in by sherbold, 8 years ago
  • code documentation and formatting
File size: 9.1 KB
Line 
// Copyright 2015 Georg-August-Universität Göttingen, Germany
//
//   Licensed under the Apache License, Version 2.0 (the "License");
//   you may not use this file except in compliance with the License.
//   You may obtain a copy of the License at
//
//       http://www.apache.org/licenses/LICENSE-2.0
//
//   Unless required by applicable law or agreed to in writing, software
//   distributed under the License is distributed on an "AS IS" BASIS,
//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//   See the License for the specific language governing permissions and
//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import java.util.ArrayList;
18import java.util.HashSet;
19import java.util.LinkedList;
20import java.util.List;
21import java.util.Set;
22import java.util.logging.Level;
23
24import org.apache.commons.collections4.list.SetUniqueList;
25
26import de.ugoe.cs.util.console.Console;
27import weka.clusterers.EM;
28import weka.core.Attribute;
29import weka.core.DenseInstance;
30import weka.core.Instance;
31import weka.core.Instances;
32import weka.filters.Filter;
33import weka.filters.unsupervised.attribute.Normalize;
34
35/**
36 * Selects training data by clustering project context factors.
37 *
38 * The project context factors used for the clustering are configured in the XML param attribute,
39 * Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
40 */
41public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
42
43    /**
44     * context factors
45     */
46    private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
47
48    /*
49     * (non-Javadoc)
50     *
51     * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
52     */
53    @Override
54    public void setParameter(String parameters) {
55        if (parameters != null) {
56            project_context_factors = parameters.split(" ");
57        }
58    }
59
60    /**
61     * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context
62     * factors. The project context factors are first normalized and then used for clustering. They
63     * can be configured in the configuration param.
64     *
65     * @param testdata
66     * @param traindataSet
67     */
68    protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
69        // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
70        final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
71
72        final Instance targetInstance = data.instance(0);
73        final List<Instance> candidateInstances = new LinkedList<Instance>();
74        for (int i = 1; i < data.numInstances(); i++) {
75            candidateInstances.add(data.instance(i));
76        }
77
78        // cluster and select
79        try {
80            final EM emeans = new EM();
81            boolean onlyTarget = true;
82            int targetCluster;
83            int maxNumClusters = candidateInstances.size();
84
85            do { // while(onlyTarget)
86                emeans.setMaximumNumberOfClusters(maxNumClusters);
87                emeans.buildClusterer(data);
88
89                targetCluster = emeans.clusterInstance(targetInstance);
90
91                // check if cluster only contains target project
92                for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
93                    onlyTarget &=
94                        !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
95                }
96                maxNumClusters = emeans.numberOfClusters() - 1;
97
98                // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
99            }
100            while (onlyTarget);
101
102            Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
103            Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
104            int numRemoved = 0;
105            for (int i = 0; i < candidateInstances.size(); i++) {
106                if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
107                    traindataSet.remove(i - numRemoved++);
108                }
109            }
110            Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
111        }
112        catch (Exception e) {
113            throw new RuntimeException("error applying setwise EM clustering training data selection",
114                                       e);
115        }
116    }
117
118    /*
119     * (non-Javadoc)
120     *
121     * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances,
122     * org.apache.commons.collections4.list.SetUniqueList)
123     */
124    @Override
125    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
126        // issuetracking und pl muss passen
127        /*
128         * int s = traindataSet.size(); Console.traceln(Level.INFO,
129         * "remove non matching PL and IssueTracking projects, size now: " + s);
130         * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata,
131         * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO,
132         * "size after removal: " + s);
133         */
134        // now cluster
135        this.cluster(testdata, traindataSet);
136    }
137
138    /**
139     * Returns test- and training data with only the project context factors which were chosen in
140     * the configuration. This is later used for clustering.
141     *
142     * @param testdata
143     * @param traindataSet
144     * @return
145     */
146    protected Instances getContextFactors(Instances testdata,
147                                          SetUniqueList<Instances> traindataSet)
148    {
149        // setup weka Instances for clustering
150        final ArrayList<Attribute> atts = new ArrayList<Attribute>();
151
152        // we only want the project context factors
153        for (String pcf : this.project_context_factors) {
154            atts.add(new Attribute(pcf));
155        }
156
157        // set up the data
158        final Instances data = new Instances("project_context_factors", atts, 0);
159        double[] instanceValues = new double[atts.size()];
160
161        // only project context factors + only one instance per project needed
162        int i = 0;
163        for (String pcf : this.project_context_factors) {
164            instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
165            // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
166            // instanceValues[i]);
167            i++;
168        }
169        data.add(new DenseInstance(1.0, instanceValues));
170
171        // now for the projects of the training stet
172        for (Instances traindata : traindataSet) {
173            instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?!
174            i = 0;
175            for (String pcf : this.project_context_factors) {
176                instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
177                // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
178                // instanceValues[i]);
179                i++;
180            }
181
182            data.add(new DenseInstance(1.0, instanceValues));
183        }
184
185        return data;
186    }
187
188    /**
189     * Delete projects where the project context does not match the training project
190     *
191     * @param testdata
192     * @param traindataSet
193     * @param attribute
194     */
195    protected void removeWrongContext(Instances testdata,
196                                      SetUniqueList<Instances> traindataSet,
197                                      String attribute)
198    {
199        Set<Instances> remove = new HashSet<Instances>();
200        for (Instances traindata : traindataSet) {
201            if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
202                .firstInstance().value(testdata.attribute(attribute)))
203            {
204                remove.add(traindata);
205                // Console.traceln(Level.WARNING,
206                // "rmove attribute "+attribute+" test:
207                // "+testdata.firstInstance().value(testdata.attribute(attribute))+" train:
208                // "+traindata.firstInstance().value(traindata.attribute(attribute)));
209            }
210        }
211
212        // now delete the projects from set
213        for (Instances i : remove) {
214            traindataSet.remove(i);
215            // Console.traceln(Level.INFO, "removing training project from set");
216        }
217    }
218
219    /**
220     * Normalizes the data before it gets used for clustering
221     *
222     * @param testdata
223     * @param traindataSet
224     * @return
225     */
226    protected Instances normalizedCharacteristicInstances(Instances testdata,
227                                                          SetUniqueList<Instances> traindataSet)
228    {
229        Instances data = this.getContextFactors(testdata, traindataSet);
230        try {
231            final Normalize normalizer = new Normalize();
232            normalizer.setInputFormat(data);
233            data = Filter.useFilter(data, normalizer);
234        }
235        catch (Exception e) {
236            throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.",
237                                       e);
238        }
239        return data;
240    }
241}
Note: See TracBrowser for help on using the repository browser.