Context Navigation

← Previous Change
Next Change →

SetWiseEMContextSelection.java

Timestamp:

09/24/15 10:59:05 (9 years ago)

Author:

sherbold

Message:

formatted code and added copyrights

File:

: 1 edited

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java

-                      r38
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
  * Selects training data by clustering project context factors.
+ *
+ * The project context factors used for the clustering are configured in
+ * the XML param attribute, Example:
+ * <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
+ * The project context factors used for the clustering are configured in the XML param attribute,
+ * Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
  */
 public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
+        private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
+        @Override
+        public void setParameter(String parameters) {
+                if( parameters!=null ) {
+                        project_context_factors = parameters.split(" ");
+                }
+        }
+        /**
+         * Uses the Weka EM-Clustering algorithm to cluster the projects
+         * by their project context factors.
+         * The project context factors are first normalized and then used for clustering.
+         * They can be configured in the configuration param.
+         *
+         * @param testdata
+         * @param traindataSet
+         */
+        protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
+                final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
+                final Instance targetInstance = data.instance(0);
+                final List<Instance> candidateInstances = new LinkedList<Instance>();
+                for( int i=1; i<data.numInstances(); i++ ) {
+                        candidateInstances.add(data.instance(i));
+                }
+                // cluster and select
+                try {
+                        final EM emeans = new EM();
+                        boolean onlyTarget = true;
+                        int targetCluster;
+                        int maxNumClusters = candidateInstances.size();
+                        do { // while(onlyTarget)
+                                emeans.setMaximumNumberOfClusters(maxNumClusters);
+                                emeans.buildClusterer(data);
+                                targetCluster = emeans.clusterInstance(targetInstance);
+                                // check if cluster only contains target project
+                                for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) {
+                                        onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster);
+                                }
+                                maxNumClusters = emeans.numberOfClusters()-1;
+                                //Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
+                        } while(onlyTarget);
+                        Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
+                        Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
+                        int numRemoved = 0;
+                        for( int i=0 ; i<candidateInstances.size() ; i++ ) {
+                                if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) {
+                                        traindataSet.remove(i-numRemoved++);
+                                }
+                        }
+                        Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
+                } catch(Exception e) {
+                        throw new RuntimeException("error applying setwise EM clustering training data selection", e);
+                }
+        }
+        @Override
+        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                // issuetracking und pl muss passen
+                /*
+                int s = traindataSet.size();
+                Console.traceln(Level.INFO, "remove non matching PL and IssueTracking projects, size now: " + s);
+                this.removeWrongContext(testdata, traindataSet, "PL");
+                this.removeWrongContext(testdata, traindataSet, "IssueTracking");
+                s = traindataSet.size();
+                Console.traceln(Level.INFO, "size after removal: " + s);
+                */
+                // now cluster
+                this.cluster(testdata, traindataSet);
+        }
+        /**
+         * Returns test- and training data with only the project context factors
+         * which were chosen in the configuration.
+         * This is later used for clustering.
+         *
+         * @param testdata
+         * @param traindataSet
+         * @return
+         */
+        protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                // setup weka Instances for clustering
+                final ArrayList<Attribute> atts = new ArrayList<Attribute>();
+                // we only want the project context factors
+                for( String pcf : this.project_context_factors ) {
+                        atts.add(new Attribute(pcf));
+                }
+                // set up the data
+                final Instances data = new Instances("project_context_factors", atts, 0);
+                double[] instanceValues = new double[atts.size()];
+                // only project context factors + only one instance per project needed
+                int i = 0;
+                for( String pcf : this.project_context_factors ) {
+                        instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
+                        //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
+                        i++;
+                }
+                data.add(new DenseInstance(1.0, instanceValues));
+                // now for the projects of the training stet
+                for( Instances traindata : traindataSet ) {
+                        instanceValues = new double[atts.size()];  // ohne das hier immer dieselben werte?!
+                        i = 0;
+                        for( String pcf : this.project_context_factors ) {
+                                instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
+                                //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
+                                i++;
+                        }
+                        data.add(new DenseInstance(1.0, instanceValues));
+                }
+                return data;
+        }
+        /**
+         * Delete projects where the project context does not match the training project
+         *
+         * @param testdata
+         * @param traindataSet
+         * @param attribute
+         */
+        protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet, String attribute) {
+                Set<Instances> remove = new HashSet<Instances>();
+                for( Instances traindata : traindataSet ) {
+                        if( traindata.firstInstance().value(traindata.attribute(attribute)) != testdata.firstInstance().value(testdata.attribute(attribute)) ) {
+                                remove.add(traindata);
+                                //Console.traceln(Level.WARNING, "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute)));
+                        }
+                }
+                // now delete the projects from set
+                for( Instances i : remove ) {
+                        traindataSet.remove(i);
+                        //Console.traceln(Level.INFO, "removing training project from set");
+                }
+        }
+        /**
+         * Normalizes the data before it gets used for clustering
+         *
+         * @param testdata
+         * @param traindataSet
+         * @return
+         */
+        protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                Instances data = this.getContextFactors(testdata, traindataSet);
+                try {
+                        final Normalize normalizer = new Normalize();
+                        normalizer.setInputFormat(data);
+                        data = Filter.useFilter(data, normalizer);
+                } catch (Exception e) {
+                        throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e);
+                }
+                return data;
+        }
+    private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
+    @Override
+    public void setParameter(String parameters) {
+        if (parameters != null) {
+            project_context_factors = parameters.split(" ");
+        }
+    }
+    /**
+     * Uses the Weka EM-Clustering algorithm to cluster the projects by their project context
+     * factors. The project context factors are first normalized and then used for clustering. They
+     * can be configured in the configuration param.
+     *
+     * @param testdata
+     * @param traindataSet
+     */
+    protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
+        // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
+        final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
+        final Instance targetInstance = data.instance(0);
+        final List<Instance> candidateInstances = new LinkedList<Instance>();
+        for (int i = 1; i < data.numInstances(); i++) {
+            candidateInstances.add(data.instance(i));
+        }
+        // cluster and select
+        try {
+            final EM emeans = new EM();
+            boolean onlyTarget = true;
+            int targetCluster;
+            int maxNumClusters = candidateInstances.size();
+            do { // while(onlyTarget)
+                emeans.setMaximumNumberOfClusters(maxNumClusters);
+                emeans.buildClusterer(data);
+                targetCluster = emeans.clusterInstance(targetInstance);
+                // check if cluster only contains target project
+                for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
+                    onlyTarget &=
+                        !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
+                }
+                maxNumClusters = emeans.numberOfClusters() - 1;
+                // Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
+            }
+            while (onlyTarget);
+            Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
+            Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
+            int numRemoved = 0;
+            for (int i = 0; i < candidateInstances.size(); i++) {
+                if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
+                    traindataSet.remove(i - numRemoved++);
+                }
+            }
+            Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
+        }
+        catch (Exception e) {
+            throw new RuntimeException(
+                                       "error applying setwise EM clustering training data selection",
+                                       e);
+        }
+    }
+    @Override
+    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+        // issuetracking und pl muss passen
+        /*
+         * int s = traindataSet.size(); Console.traceln(Level.INFO,
+         * "remove non matching PL and IssueTracking projects, size now: " + s);
+         * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata,
+         * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO,
+         * "size after removal: " + s);
+         */
+        // now cluster
+        this.cluster(testdata, traindataSet);
+    }
+    /**
+     * Returns test- and training data with only the project context factors which were chosen in
+     * the configuration. This is later used for clustering.
+     *
+     * @param testdata
+     * @param traindataSet
+     * @return
+     */
+    protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet)
+    {
+        // setup weka Instances for clustering
+        final ArrayList<Attribute> atts = new ArrayList<Attribute>();
+        // we only want the project context factors
+        for (String pcf : this.project_context_factors) {
+            atts.add(new Attribute(pcf));
+        }
+        // set up the data
+        final Instances data = new Instances("project_context_factors", atts, 0);
+        double[] instanceValues = new double[atts.size()];
+        // only project context factors + only one instance per project needed
+        int i = 0;
+        for (String pcf : this.project_context_factors) {
+            instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
+            // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
+            // instanceValues[i]);
+            i++;
+        }
+        data.add(new DenseInstance(1.0, instanceValues));
+        // now for the projects of the training stet
+        for (Instances traindata : traindataSet) {
+            instanceValues = new double[atts.size()]; // ohne das hier immer dieselben werte?!
+            i = 0;
+            for (String pcf : this.project_context_factors) {
+                instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
+                // Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " +
+                // instanceValues[i]);
+                i++;
+            }
+            data.add(new DenseInstance(1.0, instanceValues));
+        }
+        return data;
+    }
+    /**
+     * Delete projects where the project context does not match the training project
+     *
+     * @param testdata
+     * @param traindataSet
+     * @param attribute
+     */
+    protected void removeWrongContext(Instances testdata,
+                                      SetUniqueList<Instances> traindataSet,
+                                      String attribute)
+    {
+        Set<Instances> remove = new HashSet<Instances>();
+        for (Instances traindata : traindataSet) {
+            if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
+                .firstInstance().value(testdata.attribute(attribute)))
+            {
+                remove.add(traindata);
+                // Console.traceln(Level.WARNING,
+                // "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute)));
+            }
+        }
+        // now delete the projects from set
+        for (Instances i : remove) {
+            traindataSet.remove(i);
+            // Console.traceln(Level.INFO, "removing training project from set");
+        }
+    }
+    /**
+     * Normalizes the data before it gets used for clustering
+     *
+     * @param testdata
+     * @param traindataSet
+     * @return
+     */
+    protected Instances normalizedCharacteristicInstances(Instances testdata,
+                                                          SetUniqueList<Instances> traindataSet)
+    {
+        Instances data = this.getContextFactors(testdata, traindataSet);
+        try {
+            final Normalize normalizer = new Normalize();
+            normalizer.setInputFormat(data);
+            data = Filter.useFilter(data, normalizer);
+        }
+        catch (Exception e) {
+            throw new RuntimeException(
+                                       "Unexpected exception during normalization of distributional characteristics.",
+                                       e);
+        }
+        return data;
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 41 for trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java

Legend:

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java

Download in other formats: