source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java @ 122

Last change on this file since 122 was 86, checked in by sherbold, 9 years ago
  • switched workspace encoding to UTF-8 and fixed broken characters
File size: 8.7 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import java.util.ArrayList;
18import java.util.HashSet;
19import java.util.LinkedList;
20import java.util.List;
21import java.util.Set;
22import java.util.logging.Level;
23
24import org.apache.commons.collections4.list.SetUniqueList;
25
26import de.ugoe.cs.util.console.Console;
27import weka.clusterers.EM;
28import weka.core.Attribute;
29import weka.core.DenseInstance;
30import weka.core.Instance;
31import weka.core.Instances;
32import weka.filters.Filter;
33import weka.filters.unsupervised.attribute.Normalize;
34
35/**
36 * Selects training data by clustering project context factors.
37 *
38 * The project context factors used for the clustering are configured in the XML param attribute,
39 * Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
40 */
41public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
42
43    private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
44
45    @Override
46    public void setParameter(String parameters) {
47        if (parameters != null) {
48            project_context_factors = parameters.split(" ");
49        }
50    }
51
52    /**
53     * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context
54     * factors. The project context factors are first normalized and then used for clustering. They
55     * can be configured in the configuration param.
56     *
57     * @param testdata
58     * @param traindataSet
59     */
60    protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
61        // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
62        final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
63
64        final Instance targetInstance = data.instance(0);
65        final List<Instance> candidateInstances = new LinkedList<Instance>();
66        for (int i = 1; i < data.numInstances(); i++) {
67            candidateInstances.add(data.instance(i));
68        }
69
70        // cluster and select
71        try {
72            final EM emeans = new EM();
73            boolean onlyTarget = true;
74            int targetCluster;
75            int maxNumClusters = candidateInstances.size();
76
77            do { // while(onlyTarget)
78                emeans.setMaximumNumberOfClusters(maxNumClusters);
79                emeans.buildClusterer(data);
80
81                targetCluster = emeans.clusterInstance(targetInstance);
82
83                // check if cluster only contains target project
84                for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
85                    onlyTarget &=
86                        !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
87                }
88                maxNumClusters = emeans.numberOfClusters() - 1;
89
90                // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
91            }
92            while (onlyTarget);
93
94            Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
95            Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
96            int numRemoved = 0;
97            for (int i = 0; i < candidateInstances.size(); i++) {
98                if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
99                    traindataSet.remove(i - numRemoved++);
100                }
101            }
102            Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
103        }
104        catch (Exception e) {
105            throw new RuntimeException(
106                                       "error applying setwise EM clustering training data selection",
107                                       e);
108        }
109    }
110
111    @Override
112    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
113        // issuetracking und pl muss passen
114        /*
115         * int s = traindataSet.size(); Console.traceln(Level.INFO,
116         * "remove non matching PL and IssueTracking projects, size now: " + s);
117         * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata,
118         * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO,
119         * "size after removal: " + s);
120         */
121        // now cluster
122        this.cluster(testdata, traindataSet);
123    }
124
125    /**
126     * Returns test- and training data with only the project context factors which were chosen in
127     * the configuration. This is later used for clustering.
128     *
129     * @param testdata
130     * @param traindataSet
131     * @return
132     */
133    protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet)
134    {
135        // setup weka Instances for clustering
136        final ArrayList<Attribute> atts = new ArrayList<Attribute>();
137
138        // we only want the project context factors
139        for (String pcf : this.project_context_factors) {
140            atts.add(new Attribute(pcf));
141        }
142
143        // set up the data
144        final Instances data = new Instances("project_context_factors", atts, 0);
145        double[] instanceValues = new double[atts.size()];
146
147        // only project context factors + only one instance per project needed
148        int i = 0;
149        for (String pcf : this.project_context_factors) {
150            instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
151            // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
152            // instanceValues[i]);
153            i++;
154        }
155        data.add(new DenseInstance(1.0, instanceValues));
156
157        // now for the projects of the training stet
158        for (Instances traindata : traindataSet) {
159            instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?!
160            i = 0;
161            for (String pcf : this.project_context_factors) {
162                instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
163                // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
164                // instanceValues[i]);
165                i++;
166            }
167
168            data.add(new DenseInstance(1.0, instanceValues));
169        }
170
171        return data;
172    }
173
174    /**
175     * Delete projects where the project context does not match the training project
176     *
177     * @param testdata
178     * @param traindataSet
179     * @param attribute
180     */
181    protected void removeWrongContext(Instances testdata,
182                                      SetUniqueList<Instances> traindataSet,
183                                      String attribute)
184    {
185        Set<Instances> remove = new HashSet<Instances>();
186        for (Instances traindata : traindataSet) {
187            if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
188                .firstInstance().value(testdata.attribute(attribute)))
189            {
190                remove.add(traindata);
191                // Console.traceln(Level.WARNING,
192                // "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute)));
193            }
194        }
195
196        // now delete the projects from set
197        for (Instances i : remove) {
198            traindataSet.remove(i);
199            // Console.traceln(Level.INFO, "removing training project from set");
200        }
201    }
202
203    /**
204     * Normalizes the data before it gets used for clustering
205     *
206     * @param testdata
207     * @param traindataSet
208     * @return
209     */
210    protected Instances normalizedCharacteristicInstances(Instances testdata,
211                                                          SetUniqueList<Instances> traindataSet)
212    {
213        Instances data = this.getContextFactors(testdata, traindataSet);
214        try {
215            final Normalize normalizer = new Normalize();
216            normalizer.setInputFormat(data);
217            data = Filter.useFilter(data, normalizer);
218        }
219        catch (Exception e) {
220            throw new RuntimeException(
221                                       "Unexpected exception during normalization of distributional characteristics.",
222                                       e);
223        }
224        return data;
225    }
226}
Note: See TracBrowser for help on using the repository browser.