Context Navigation

← Previous Change
Next Change →

dataselection

Timestamp:

09/24/15 10:59:05 (9 years ago)

Author:

sherbold

Message:

formatted code and added copyrights

Location:

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection

Files:

: 11 edited

AbstractCharacteristicSelection.java (modified) (2 diffs)
IPointWiseDataselectionStrategy.java (modified) (2 diffs)
ISetWiseDataselectionStrategy.java (modified) (2 diffs)
PetersFilter.java (modified) (3 diffs)
PointWiseEMClusterSelection.java (modified) (2 diffs)
SeparatabilitySelection.java (modified) (2 diffs)
SetWiseEMClusterSelection.java (modified) (2 diffs)
SetWiseEMContextSelection.java (modified) (2 diffs)
SetWiseKNNSelection.java (modified) (2 diffs)
TestAsTraining.java (modified) (2 diffs)
TurhanFilter.java (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/AbstractCharacteristicSelection.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
+ * Abstract class that implements the foundation of setwise data selection strategies using distributional characteristics.
+ * This class provides the means to transform the data sets into their characteristic vectors.
+ * Abstract class that implements the foundation of setwise data selection strategies using
+ * distributional characteristics. This class provides the means to transform the data sets into
+ * their characteristic vectors.
+ *
  * @author Steffen Herbold
  */
+public abstract class AbstractCharacteristicSelection implements
+                ISetWiseDataselectionStrategy {
+public abstract class AbstractCharacteristicSelection implements ISetWiseDataselectionStrategy {
+        /**
+         * vector with the distributional characteristics
+         */
+        private String[] characteristics = new String[]{"mean","stddev"};
+        /**
+         * Sets the distributional characteristics. The names of the characteristics are separated by blanks.
+         */
+        @Override
+        public void setParameter(String parameters) {
+                if( !"".equals(parameters) ) {
+                        characteristics = parameters.split(" ");
+                }
+        }
+        /**
+         * Transforms the data into the distributional characteristics. The first instance is the test data, followed by the training data.
+         * @param testdata test data
+         * @param traindataSet training data sets
+         * @return distributional characteristics of the data
+         */
+        protected Instances characteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                // setup weka Instances for clustering
+                final ArrayList<Attribute> atts = new ArrayList<Attribute>();
+                final Attribute classAtt = testdata.classAttribute();
+                for( int i=0 ; i<testdata.numAttributes() ; i++ ) {
+                        Attribute dataAtt = testdata.attribute(i);
+                        if( !dataAtt.equals(classAtt) ) {
+                                for( String characteristic : characteristics ) {
+                                        atts.add(new Attribute(dataAtt.name() + "_" + characteristic));
+                                }
+                        }
+                }
+                final Instances data = new Instances("distributional_characteristics", atts, 0);
+                // setup data for clustering
+                double[] instanceValues = new double[atts.size()];
+                for( int i=0 ; i<testdata.numAttributes() ; i++ ) {
+                        Attribute dataAtt = testdata.attribute(i);
+                        if( !dataAtt.equals(classAtt) ) {
+                                Stats stats = testdata.attributeStats(i).numericStats;
+                                for( int j=0; j<characteristics.length; j++ ) {
+                                        if( "mean".equals(characteristics[j]) ) {
+                                                instanceValues[i*characteristics.length+j] = stats.mean;
+                                        } else if( "stddev".equals(characteristics[j])) {
+                                                instanceValues[i*characteristics.length+j] = stats.stdDev;
+                                        } else if( "var".equals(characteristics[j])) {
+                                                instanceValues[i*characteristics.length+j] = testdata.variance(j);
+                                        } else {
+                                                throw new RuntimeException("Unkown distributional characteristic: " + characteristics[j]);
+                                        }
+                                }
+                        }
+                }
+                data.add(new DenseInstance(1.0, instanceValues));
+                for( Instances traindata : traindataSet ) {
+                        instanceValues = new double[atts.size()];
+                        for( int i=0 ; i<traindata.numAttributes() ; i++ ) {
+                                Attribute dataAtt = traindata.attribute(i);
+                                if( !dataAtt.equals(classAtt) ) {
+                                        Stats stats = traindata.attributeStats(i).numericStats;
+                                        for( int j=0; j<characteristics.length; j++ ) {
+                                                if( "mean".equals(characteristics[j]) ) {
+                                                        instanceValues[i*characteristics.length+j] = stats.mean;
+                                                } else if( "stddev".equals(characteristics[j])) {
+                                                        instanceValues[i*characteristics.length+j] = stats.stdDev;
+                                                } else if( "var".equals(characteristics[j])) {
+                                                        instanceValues[i*characteristics.length+j] = testdata.variance(j);
+                                                } else {
+                                                        throw new RuntimeException("Unkown distributional characteristic: " + characteristics[j]);
+                                                }
+                                        }
+                                }
+                        }
+                        Instance instance = new DenseInstance(1.0, instanceValues);
+                        data.add(instance);
+                }
+                return data;
+        }
+        /**
+         * Returns the normalized distributional characteristics of the training data.
+         * @param testdata test data
+         * @param traindataSet training data sets
+         * @return normalized distributional characteristics of the data
+         */
+        protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                Instances data = characteristicInstances(testdata, traindataSet);
+                try {
+                        final Normalize normalizer = new Normalize();
+                        normalizer.setInputFormat(data);
+                        data = Filter.useFilter(data, normalizer);
+                } catch (Exception e) {
+                        throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e);
+                }
+                return data;
+        }
+    /**
+     * vector with the distributional characteristics
+     */
+    private String[] characteristics = new String[]
+        { "mean", "stddev" };
+    /**
+     * Sets the distributional characteristics. The names of the characteristics are separated by
+     * blanks.
+     */
+    @Override
+    public void setParameter(String parameters) {
+        if (!"".equals(parameters)) {
+            characteristics = parameters.split(" ");
+        }
+    }
+    /**
+     * Transforms the data into the distributional characteristics. The first instance of the
+     * result describes the test data, followed by one instance per training data set in the
+     * iteration order of {@code traindataSet}. For every attribute except the class attribute of
+     * the test data, the result contains one attribute per configured characteristic, named
+     * {@code <attribute name>_<characteristic>}.
+     *
+     * @param testdata
+     *            test data
+     * @param traindataSet
+     *            training data sets
+     * @return distributional characteristics of the data
+     * @throws RuntimeException
+     *             if an unknown distributional characteristic is configured
+     */
+    protected Instances characteristicInstances(Instances testdata,
+                                                SetUniqueList<Instances> traindataSet)
+    {
+        // setup weka Instances for clustering
+        final ArrayList<Attribute> atts = new ArrayList<Attribute>();
+        final Attribute classAtt = testdata.classAttribute();
+        for (int i = 0; i < testdata.numAttributes(); i++) {
+            Attribute dataAtt = testdata.attribute(i);
+            if (!dataAtt.equals(classAtt)) {
+                for (String characteristic : characteristics) {
+                    atts.add(new Attribute(dataAtt.name() + "_" + characteristic));
+                }
+            }
+        }
+        final Instances data = new Instances("distributional_characteristics", atts, 0);
+        // setup data for clustering: the test data first, then the training data sets
+        final int numValues = atts.size();
+        data.add(new DenseInstance(1.0, characteristicValues(testdata, classAtt, numValues)));
+        for (Instances traindata : traindataSet) {
+            final double[] trainValues = characteristicValues(traindata, classAtt, numValues);
+            data.add(new DenseInstance(1.0, trainValues));
+        }
+        return data;
+    }
+    /**
+     * Calculates the characteristic vector of a single data set. The values are written in the
+     * same order in which the attributes of the result are created by
+     * {@link #characteristicInstances(Instances, SetUniqueList)}: attribute by attribute, and
+     * within an attribute in the order of the configured characteristics.
+     *
+     * @param dataset
+     *            data set whose characteristics are calculated
+     * @param classAtt
+     *            class attribute, which is excluded from the characteristics
+     * @param numValues
+     *            length of the characteristic vector
+     * @return characteristic vector of the data set
+     * @throws RuntimeException
+     *             if an unknown distributional characteristic is configured
+     */
+    private double[] characteristicValues(Instances dataset, Attribute classAtt, int numValues) {
+        final double[] values = new double[numValues];
+        // running write position; deriving the position from the attribute index would leave a
+        // gap (and overrun the array) whenever the class attribute is not the last attribute
+        int pos = 0;
+        for (int i = 0; i < dataset.numAttributes(); i++) {
+            if (dataset.attribute(i).equals(classAtt)) {
+                continue;
+            }
+            final Stats stats = dataset.attributeStats(i).numericStats;
+            for (String characteristic : characteristics) {
+                if ("mean".equals(characteristic)) {
+                    values[pos] = stats.mean;
+                }
+                else if ("stddev".equals(characteristic)) {
+                    values[pos] = stats.stdDev;
+                }
+                else if ("var".equals(characteristic)) {
+                    // variance of the current attribute of the data set that is being described
+                    values[pos] = dataset.variance(i);
+                }
+                else {
+                    throw new RuntimeException("Unkown distributional characteristic: " +
+                        characteristic);
+                }
+                pos++;
+            }
+        }
+        return values;
+    }
+    /**
+     * Returns the normalized distributional characteristics of the training data.
+     *
+     * @param testdata
+     *            test data
+     * @param traindataSet
+     *            training data sets
+     * @return normalized distributional characteristics of the data
+     */
+    protected Instances normalizedCharacteristicInstances(Instances testdata,
+                                                          SetUniqueList<Instances> traindataSet)
+    {
+        Instances data = characteristicInstances(testdata, traindataSet);
+        try {
+            final Normalize normalizer = new Normalize();
+            normalizer.setInputFormat(data);
+            data = Filter.useFilter(data, normalizer);
+        }
+        catch (Exception e) {
+            throw new RuntimeException(
+                                       "Unexpected exception during normalization of distributional characteristics.",
+                                       e);
+        }
+        return data;
+    }
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/IPointWiseDataselectionStrategy.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
+ * Interface for pointwise data selection strategies.
+ * Interface for pointwise data selection strategies.
+ *
  * @author Steffen Herbold
  */
 public interface IPointWiseDataselectionStrategy extends IParameterizable {
+        /**
+         * Applies the data selection strategy.
+         * @param testdata test data
+         * @param traindata candidate training data
+         * @return the selected training data
+         */
+        Instances apply(Instances testdata, Instances traindata);
+    /**
+     * Applies the data selection strategy: selects training data from the candidates with
+     * respect to the given test data.
+     *
+     * @param testdata
+     *            test data for which the training data is selected
+     * @param traindata
+     *            candidate training data the selection is made from
+     * @return the selected training data
+     */
+    Instances apply(Instances testdata, Instances traindata);
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/ISetWiseDataselectionStrategy.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
  * Interface for setwise data selection strategies.
+ *
  * @author Steffen Herbold
  */
 public interface ISetWiseDataselectionStrategy extends IParameterizable {
+        /**
+         * Applies a setwise data selection strategy.
+         * @param testdata test data for which the training data is selected
+         * @param traindataSet candidate training data
+         */
+        void apply(Instances testdata, SetUniqueList<Instances> traindataSet);
+    /**
+     * Applies a setwise data selection strategy.
+     * <p>
+     * NOTE(review): the method returns nothing, so the selection is presumably performed by
+     * modifying {@code traindataSet} in place; confirm against the implementations.
+     *
+     * @param testdata
+     *            test data for which the training data is selected
+     * @param traindataSet
+     *            candidate training data
+     */
+    void apply(Instances testdata, SetUniqueList<Instances> traindataSet);
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PetersFilter.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
+ * Filter according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction
+ * <br><br>
+ * This filter does not work, the paper has been withdrawn.
+ * Filter according to F. Peters, T. Menzies, and A. Marcus: Better Cross Company Defect Prediction <br>
+ * <br>
+ * This filter does not work, the paper has been withdrawn.
+ *
  * @author Steffen Herbold
  */
 …
 public class PetersFilter implements IPointWiseDataselectionStrategy {
+        /**
+         * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
+         */
+        @Override
+        public void setParameter(String parameters) {
+                // dummy
+        }
+    /**
+     * Does nothing; the parameter string is ignored by this filter.
+     *
+     * @param parameters
+     *            ignored
+     * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
+     */
+    @Override
+    public void setParameter(String parameters) {
+        // intentionally empty
+    }
+        /**
+         * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances, weka.core.Instances)
+         */
+        @Override
+        public Instances apply(Instances testdata, Instances traindata) {
+                final Attribute classAttribute = testdata.classAttribute();
+                final double[][] testDoubles = new double[testdata.numInstances()][testdata.numAttributes()];
+                for( int i=0; i<testdata.numInstances() ; i++ ) {
+                        Instance instance = testdata.instance(i);
+                        int tmp = 0;
+                        for( int j=0 ; j<testdata.numAttributes(); j++ ) {
+                                if( testdata.attribute(j)!=classAttribute ) {
+                                        testDoubles[i][tmp++] = instance.value(j);
+                                }
+                        }
+                }
+                final double[][] trainDoubles = new double[traindata.numInstances()][testdata.numAttributes()];
+                for( int i=0; i<traindata.numInstances() ; i++ ) {
+                        Instance instance = traindata.instance(i);
+                        int tmp = 0;
+                        for( int j=0 ; j<testdata.numAttributes(); j++ ) {
+                                if( testdata.attribute(j)!=classAttribute ) {
+                                        trainDoubles[i][tmp++] = instance.value(j);
+                                }
+                        }
+                }
+                final List<List<Integer>> fanList = new ArrayList<List<Integer>>(testdata.numInstances());
+                for( int i=0; i<testdata.numInstances(); i++ ) {
+                        fanList.add(new LinkedList<Integer>());
+                }
+                for( int i=0; i<traindata.numInstances(); i++ ) {
+                        double minDistance = Double.MAX_VALUE;
+                        int minIndex = 0;
+                        for( int j=0; j<testdata.numInstances(); j++ ) {
+                                double distance = MathArrays.distance(trainDoubles[i], testDoubles[j]);
+                                if( distance<minDistance ) {
+                                        minDistance = distance;
+                                        minIndex = j;
+                                }
+                        }
+                        fanList.get(minIndex).add(i);
+                }
+                final SetUniqueList<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>());
+                for( int i=0; i<testdata.numInstances(); i++ ) {
+                        double minDistance = Double.MAX_VALUE;
+                        int minIndex = -1;
+                        for( Integer j : fanList.get(i) ) {
+                                double distance = MathArrays.distance(testDoubles[i], trainDoubles[j]);
+                                if( distance<minDistance && distance>0.0d ) {
+                                        minDistance = distance;
+                                        minIndex = j;
+                                }
+                        }
+                        if( minIndex!=-1 ) {
+                                selectedIndex.add(minIndex);
+                        }
+                }
+                final Instances selected = new Instances(testdata);
+                selected.delete();
+                for( Integer i : selectedIndex) {
+                        selected.add(traindata.instance(i));
+                }
+                return selected;
+        }
+    /**
+     * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances,
+     *      weka.core.Instances)
+     */
+    @Override
+    public Instances apply(Instances testdata, Instances traindata) {
+        final Attribute classAttribute = testdata.classAttribute();
+        final double[][] testDoubles =
+            new double[testdata.numInstances()][testdata.numAttributes()];
+        for (int i = 0; i < testdata.numInstances(); i++) {
+            Instance instance = testdata.instance(i);
+            int tmp = 0;
+            for (int j = 0; j < testdata.numAttributes(); j++) {
+                if (testdata.attribute(j) != classAttribute) {
+                    testDoubles[i][tmp++] = instance.value(j);
+                }
+            }
+        }
+        final double[][] trainDoubles =
+            new double[traindata.numInstances()][testdata.numAttributes()];
+        for (int i = 0; i < traindata.numInstances(); i++) {
+            Instance instance = traindata.instance(i);
+            int tmp = 0;
+            for (int j = 0; j < testdata.numAttributes(); j++) {
+                if (testdata.attribute(j) != classAttribute) {
+                    trainDoubles[i][tmp++] = instance.value(j);
+                }
+            }
+        }
+        final List<List<Integer>> fanList = new ArrayList<List<Integer>>(testdata.numInstances());
+        for (int i = 0; i < testdata.numInstances(); i++) {
+            fanList.add(new LinkedList<Integer>());
+        }
+        for (int i = 0; i < traindata.numInstances(); i++) {
+            double minDistance = Double.MAX_VALUE;
+            int minIndex = 0;
+            for (int j = 0; j < testdata.numInstances(); j++) {
+                double distance = MathArrays.distance(trainDoubles[i], testDoubles[j]);
+                if (distance < minDistance) {
+                    minDistance = distance;
+                    minIndex = j;
+                }
+            }
+            fanList.get(minIndex).add(i);
+        }
+        final SetUniqueList<Integer> selectedIndex =
+            SetUniqueList.setUniqueList(new LinkedList<Integer>());
+        for (int i = 0; i < testdata.numInstances(); i++) {
+            double minDistance = Double.MAX_VALUE;
+            int minIndex = -1;
+            for (Integer j : fanList.get(i)) {
+                double distance = MathArrays.distance(testDoubles[i], trainDoubles[j]);
+                if (distance < minDistance && distance > 0.0d) {
+                    minDistance = distance;
+                    minIndex = j;
+                }
+            }
+            if (minIndex != -1) {
+                selectedIndex.add(minIndex);
+            }
+        }
+        final Instances selected = new Instances(testdata);
+        selected.delete();
+        for (Integer i : selectedIndex) {
+            selected.add(traindata.instance(i));
+        }
+        return selected;
+    }
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/PointWiseEMClusterSelection.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 import de.ugoe.cs.util.console.Console;
 /**
  * Use in Config:
+ *
+ * Specify number of clusters
+ * -N = Num Clusters
+ * <pointwiseselector name="PointWiseEMClusterSelection" param="-N 10"/>
+ *
+ * Try to determine the number of clusters:
+ * -I 10 = max iterations
+ * -X 5 = 5 folds for cross evaluation
+ * -max = max number of clusters
+ * <pointwiseselector name="PointWiseEMClusterSelection" param="-I 10 -X 5 -max 300"/>
+ * Specify number of clusters -N = Num Clusters <pointwiseselector
+ * name="PointWiseEMClusterSelection" param="-N 10"/>
+ *
+ * Don't forget to add:
+ * <preprocessor name="Normalization" param=""/>
+ * Try to determine the number of clusters: -I 10 = max iterations -X 5 = 5 folds for cross
+ * evaluation -max = max number of clusters <pointwiseselector name="PointWiseEMClusterSelection"
+ * param="-I 10 -X 5 -max 300"/>
+ *
+ * Don't forget to add: <preprocessor name="Normalization" param=""/>
  */
 public class PointWiseEMClusterSelection implements IPointWiseDataselectionStrategy {
-        private String[] params;
-        @Override
-        public void setParameter(String parameters) {
-                params = parameters.split(" ");
+        }
+        /**
+         * 1. Cluster the traindata
+         * 2. for each instance in the testdata find the assigned cluster
+         * 3. select only traindata from the clusters we found in our testdata
+         *
+         * @returns the selected training data
+         */
+        @Override
+        public Instances apply(Instances testdata, Instances traindata) {
+                //final Attribute classAttribute = testdata.classAttribute();
+                final List<Integer> selectedCluster = SetUniqueList.setUniqueList(new LinkedList<Integer>());
+    private String[] params;
+                // 1. copy train- and testdata
+                Instances train = new Instances(traindata);
+                Instances test = new Instances(testdata);
+                Instances selected = null;
+                try {
+                        // remove class attribute from traindata
+                        Remove filter = new Remove();
+                        filter.setAttributeIndices("" + (train.classIndex() + 1));
+                        filter.setInputFormat(train);
+                        train = Filter.useFilter(train, filter);
+                        Console.traceln(Level.INFO, String.format("starting clustering"));
+                        // 3. cluster data
+                        EM clusterer = new EM();
+                        clusterer.setOptions(params);
+                        clusterer.buildClusterer(train);
+                        int numClusters = clusterer.getNumClusters();
+                        if ( numClusters == -1) {
+                                Console.traceln(Level.INFO, String.format("we have unlimited clusters"));
+                        }else {
+                                Console.traceln(Level.INFO, String.format("we have: "+numClusters+" clusters"));
+                        }
+                        // 4. classify testdata, save cluster int
+                        // remove class attribute from testdata?
+                        Remove filter2 = new Remove();
+                        filter2.setAttributeIndices("" + (test.classIndex() + 1));
+                        filter2.setInputFormat(test);
+                        test = Filter.useFilter(test, filter2);
+                        int cnum;
+                        for( int i=0; i < test.numInstances(); i++ ) {
+                                cnum = ((EM)clusterer).clusterInstance(test.get(i));
+    @Override
+    public void setParameter(String parameters) {
+        params = parameters.split(" ");
+    }
+                                // we dont want doubles (maybe use a hashset instead of list?)
+                                if ( !selectedCluster.contains(cnum) ) {
+                                        selectedCluster.add(cnum);
+                                        //Console.traceln(Level.INFO, String.format("assigned to cluster: "+cnum));
+                                }
+                        }
+                        Console.traceln(Level.INFO, String.format("our testdata is in: "+selectedCluster.size()+" different clusters"));
+                        // 5. get cluster membership of our traindata
+                        AddCluster cfilter = new AddCluster();
+                        cfilter.setClusterer(clusterer);
+                        cfilter.setInputFormat(train);
+                        Instances ctrain = Filter.useFilter(train, cfilter);
+                        // 6. for all traindata get the cluster int, if it is in our list of testdata cluster int add the traindata
+                        // of this cluster to our returned traindata
+                        int cnumber;
+                        selected = new Instances(traindata);
+                        selected.delete();
+                        for ( int j=0; j < ctrain.numInstances(); j++ ) {
+                                // get the cluster number from the attributes
+                                cnumber = Integer.parseInt(ctrain.get(j).stringValue(ctrain.get(j).numAttributes()-1).replace("cluster", ""));
+                                //Console.traceln(Level.INFO, String.format("instance "+j+" is in cluster: "+cnumber));
+                                if ( selectedCluster.contains(cnumber) ) {
+                                        // this only works if the index does not change
+                                        selected.add(traindata.get(j));
+                                        // check for differences, just one attribute, we are pretty sure the index does not change
+                                        if ( traindata.get(j).value(3) != ctrain.get(j).value(3) ) {
+                                                Console.traceln(Level.WARNING, String.format("we have a difference between train an ctrain!"));
+                                        }
+                                }
+                        }
+                        Console.traceln(Level.INFO, String.format("that leaves us with: "+selected.numInstances()+" traindata instances from "+traindata.numInstances()));
+                }catch( Exception e ) {
+                        Console.traceln(Level.WARNING, String.format("ERROR"));
+                        throw new RuntimeException("error in pointwise em", e);
+                }
+                return selected;
+        }
+    /**
+     * Selects the training instances that share an EM cluster with the test data.
+     * <ol>
+     * <li>cluster the training data (class attribute removed) with EM,</li>
+     * <li>determine the cluster of every test instance (class attribute removed),</li>
+     * <li>keep only the training instances whose cluster also contains test data.</li>
+     * </ol>
+     *
+     * @param testdata
+     *            test data that determines which clusters are relevant
+     * @param traindata
+     *            candidate training data; the selection is returned as a new data set
+     * @return the selected training data
+     * @throws RuntimeException
+     *             if filtering or clustering fails
+     */
+    @Override
+    public Instances apply(Instances testdata, Instances traindata) {
+        // the SetUniqueList decorator silently ignores duplicates on add()
+        final List<Integer> selectedCluster =
+            SetUniqueList.setUniqueList(new LinkedList<Integer>());
+        // 1. copy train- and testdata, the filters below must not touch the originals
+        Instances train = new Instances(traindata);
+        Instances test = new Instances(testdata);
+        Instances selected = null;
+        try {
+            // 2. remove class attribute from traindata (Remove uses 1-based indices)
+            Remove filter = new Remove();
+            filter.setAttributeIndices("" + (train.classIndex() + 1));
+            filter.setInputFormat(train);
+            train = Filter.useFilter(train, filter);
+            Console.traceln(Level.INFO, "starting clustering");
+            // 3. cluster data
+            EM clusterer = new EM();
+            clusterer.setOptions(params);
+            clusterer.buildClusterer(train);
+            int numClusters = clusterer.getNumClusters();
+            if (numClusters == -1) {
+                Console.traceln(Level.INFO, "we have unlimited clusters");
+            }
+            else {
+                Console.traceln(Level.INFO, "we have: " + numClusters + " clusters");
+            }
+            // 4. remove class attribute from testdata and collect the (0-based) cluster index of
+            // every test instance
+            Remove filter2 = new Remove();
+            filter2.setAttributeIndices("" + (test.classIndex() + 1));
+            filter2.setInputFormat(test);
+            test = Filter.useFilter(test, filter2);
+            for (int i = 0; i < test.numInstances(); i++) {
+                selectedCluster.add(clusterer.clusterInstance(test.get(i)));
+            }
+            Console.traceln(Level.INFO, "our testdata is in: " + selectedCluster.size() +
+                " different clusters");
+            // 5. get cluster membership of our traindata
+            AddCluster cfilter = new AddCluster();
+            cfilter.setClusterer(clusterer);
+            cfilter.setInputFormat(train);
+            Instances ctrain = Filter.useFilter(train, cfilter);
+            final int clusterAttribute = ctrain.numAttributes() - 1;
+            // attribute 3 of traindata and of ctrain (class attribute removed) only denote the same
+            // attribute if the class attribute comes after it
+            final boolean canCheckOrder = traindata.classIndex() > 3;
+            // 6. keep every training instance whose cluster also contains test data
+            selected = new Instances(traindata, 0);
+            for (int j = 0; j < ctrain.numInstances(); j++) {
+                if (ctrain.get(j).isMissing(clusterAttribute)) {
+                    // no cluster could be assigned to this instance
+                    continue;
+                }
+                // Use the index of the nominal value instead of parsing its label: AddCluster
+                // labels the clusters "cluster1", "cluster2", ..., i.e. 1-based, whereas
+                // clusterInstance() above returns 0-based indices.
+                final int clusterIndex = (int) ctrain.get(j).value(clusterAttribute);
+                if (selectedCluster.contains(clusterIndex)) {
+                    // this only works if the instance order of traindata and ctrain is the same
+                    selected.add(traindata.get(j));
+                    // spot check of the order using a single attribute
+                    if (canCheckOrder && traindata.get(j).value(3) != ctrain.get(j).value(3)) {
+                        Console.traceln(Level.WARNING,
+                                        "we have a difference between train an ctrain!");
+                    }
+                }
+            }
+            Console.traceln(Level.INFO, "that leaves us with: " + selected.numInstances() +
+                " traindata instances from " + traindata.numInstances());
+        }
+        catch (Exception e) {
+            Console.traceln(Level.WARNING, "ERROR");
+            throw new RuntimeException("error in pointwise em", e);
+        }
+        return selected;
+    }
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SeparatabilitySelection.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
+ * A setwise data selection strategy based on the separatability of the training data from the test data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An Empirical Study on Defect Prediction.
+ * <br><br>
+ * This is calculated through the error of a logistic regression classifier that tries to separate the sets.
+ * A setwise data selection strategy based on the separatability of the training data from the test
+ * data after Z. He, F. Peters, T. Menzies, Y. Yang: Learning from Open-Source Projects: An
+ * Empirical Study on Defect Prediction. <br>
+ * <br>
+ * This is calculated through the error of a logistic regression classifier that tries to separate
+ * the sets.
+ *
  * @author Steffen Herbold
  */
 public class SeparatabilitySelection implements ISetWiseDataselectionStrategy {
+        /**
+         * size of the random sample that is drawn from both test data and training data
+         */
+        private final int sampleSize = 500;
+        /**
+         * number of repetitions of the sample drawing
+         */
+        private final int maxRep = 10;
+        /**
+         * number of neighbors that are selected
+         */
+        private int neighbors = 10;
+        /**
+         * Sets the number of neighbors that are selected.
+         */
+        @Override
+        public void setParameter(String parameters) {
+                if( !"".equals(parameters) ) {
+                        neighbors = Integer.parseInt(parameters);
+                }
+        }
+    /**
+     * size of the random sample that is drawn from both test data and training data
+     */
+    private final int sampleSize = 500;
+        /**
+         * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)
+         */
+        @Override
+        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                final Random rand = new Random(1);
+                // calculate distances between testdata and traindata
+                final double[] distances = new double[traindataSet.size()];
+                int i=0;
+                for( Instances traindata : traindataSet ) {
+                        double distance = 0.0;
+                        for( int rep=0; rep<maxRep ; rep++ ) {
+                                // sample instances
+                                Instances sample = new Instances(testdata);
+                                for( int j=0; j<sampleSize; j++ ) {
+                                        Instance inst = new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances())));
+                                        inst.setDataset(sample);
+                                        inst.setClassValue(1.0);
+                                        sample.add(inst);
+                                        inst = new DenseInstance(traindata.instance(rand.nextInt(traindata.numInstances())));
+                                        inst.setDataset(sample);
+                                        inst.setClassValue(0.0);
+                                        sample.add(inst);
+                                }
+                                // calculate separation
+                                Evaluation eval;
+                                try {
+                                        eval = new Evaluation(sample);
+                                        eval.crossValidateModel(new Logistic(), sample, 5, rand);
+                                } catch (Exception e) {
+                                        throw new RuntimeException("cross-validation during calculation of separatability failed", e);
+                                }
+                                distance += eval.pctCorrect()/100.0;
+                        }
+                        distances[i++] = 2*((distance/maxRep)-0.5);
+                }
+                // select closest neighbors
+                final double[] distancesCopy = Arrays.copyOf(distances, distances.length);
+                Arrays.sort(distancesCopy);
+                final double cutoffDistance = distancesCopy[neighbors];
+                for( i=traindataSet.size()-1; i>=0 ; i-- ) {
+                        if( distances[i]>cutoffDistance ) {
+                                traindataSet.remove(i);
+                        }
+                }
+        }
+    /**
+     * number of repetitions of the sample drawing
+     */
+    private final int maxRep = 10;
+    /**
+     * number of neighbors that are selected
+     */
+    private int neighbors = 10;
+    /**
+     * Sets the number of neighbors that are selected.
+     */
+    @Override
+    public void setParameter(String parameters) {
+        if (!"".equals(parameters)) {
+            neighbors = Integer.parseInt(parameters);
+        }
+    }
+    /**
+     * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
+     *      org.apache.commons.collections4.list.SetUniqueList)
+     */
+    @Override
+    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+        final Random rand = new Random(1);
+        // calculate distances between testdata and traindata
+        final double[] distances = new double[traindataSet.size()];
+        int i = 0;
+        for (Instances traindata : traindataSet) {
+            double distance = 0.0;
+            for (int rep = 0; rep < maxRep; rep++) {
+                // sample instances
+                Instances sample = new Instances(testdata);
+                for (int j = 0; j < sampleSize; j++) {
+                    Instance inst =
+                        new DenseInstance(testdata.instance(rand.nextInt(testdata.numInstances())));
+                    inst.setDataset(sample);
+                    inst.setClassValue(1.0);
+                    sample.add(inst);
+                    inst =
+                        new DenseInstance(
+                                          traindata.instance(rand.nextInt(traindata.numInstances())));
+                    inst.setDataset(sample);
+                    inst.setClassValue(0.0);
+                    sample.add(inst);
+                }
+                // calculate separation
+                Evaluation eval;
+                try {
+                    eval = new Evaluation(sample);
+                    eval.crossValidateModel(new Logistic(), sample, 5, rand);
+                }
+                catch (Exception e) {
+                    throw new RuntimeException(
+                                               "cross-validation during calculation of separatability failed",
+                                               e);
+                }
+                distance += eval.pctCorrect() / 100.0;
+            }
+            distances[i++] = 2 * ((distance / maxRep) - 0.5);
+        }
+        // select closest neighbors
+        final double[] distancesCopy = Arrays.copyOf(distances, distances.length);
+        Arrays.sort(distancesCopy);
+        final double cutoffDistance = distancesCopy[neighbors];
+        for (i = traindataSet.size() - 1; i >= 0; i--) {
+            if (distances[i] > cutoffDistance) {
+                traindataSet.remove(i);
+            }
+        }
+    }
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMClusterSelection.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
+ * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect prediction
+ * Filter based on EM clustering after S. Herbold: Training data selection for cross-project defect
+ * prediction
+ *
  * @author Steffen Herbold
  */
 public class SetWiseEMClusterSelection extends AbstractCharacteristicSelection {
+        /**
+         * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)
+         */
+        @Override
+        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
+                final Instance targetInstance = data.instance(0);
+                final List<Instance> candidateInstances = new LinkedList<Instance>();
+                for( int i=1; i<data.numInstances(); i++ ) {
+                        candidateInstances.add(data.instance(i));
+                }
+                // cluster and select
+                try {
+                        final EM emeans = new EM();
+                        boolean onlyTarget = true;
+                        int targetCluster;
+                        int maxNumClusters = candidateInstances.size();
+                        do { // while(onlyTarget)
+                                emeans.setMaximumNumberOfClusters(maxNumClusters);
+                                emeans.buildClusterer(data);
+                                targetCluster = emeans.clusterInstance(targetInstance);
+                                // check if cluster only contains target project
+                                for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) {
+                                        onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster);
+                                }
+                                maxNumClusters = emeans.numberOfClusters()-1;
+                        } while(onlyTarget);
+                        int numRemoved = 0;
+                        for( int i=0 ; i<candidateInstances.size() ; i++ ) {
+                                if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) {
+                                        traindataSet.remove(i-numRemoved++);
+                                }
+                        }
+                } catch(Exception e) {
+                        throw new RuntimeException("error applying setwise EM clustering training data selection", e);
+                }
+        }
+    /**
+     * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
+     *      org.apache.commons.collections4.list.SetUniqueList)
+     */
+    @Override
+    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+        final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
+        final Instance targetInstance = data.instance(0);
+        final List<Instance> candidateInstances = new LinkedList<Instance>();
+        for (int i = 1; i < data.numInstances(); i++) {
+            candidateInstances.add(data.instance(i));
+        }
+        // cluster and select
+        try {
+            final EM emeans = new EM();
+            boolean onlyTarget = true;
+            int targetCluster;
+            int maxNumClusters = candidateInstances.size();
+            do { // while(onlyTarget)
+                emeans.setMaximumNumberOfClusters(maxNumClusters);
+                emeans.buildClusterer(data);
+                targetCluster = emeans.clusterInstance(targetInstance);
+                // check if cluster only contains target project
+                for (int i = 0; i < candidateInstances.size() && onlyTarget; i++) {
+                    onlyTarget &=
+                        !(emeans.clusterInstance(candidateInstances.get(i)) == targetCluster);
+                }
+                maxNumClusters = emeans.numberOfClusters() - 1;
+            }
+            while (onlyTarget);
+            int numRemoved = 0;
+            for (int i = 0; i < candidateInstances.size(); i++) {
+                if (emeans.clusterInstance(candidateInstances.get(i)) != targetCluster) {
+                    traindataSet.remove(i - numRemoved++);
+                }
+            }
+        }
+        catch (Exception e) {
+            throw new RuntimeException(
+                                       "error applying setwise EM clustering training data selection",
+                                       e);
+        }
+    }
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseEMContextSelection.java

-                      r38
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
  * Selects training data by clustering project context factors.
+ *
+ * The project context factors used for the clustering are configured in
+ * the XML param attribute, Example:
+ * <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
+ * The project context factors used for the clustering are configured in the XML param attribute,
+ * Example: <setwiseselector name="SetWiseEMContextSelection" param="AFS TND TNC" />
  */
 public class SetWiseEMContextSelection implements ISetWiseDataselectionStrategy {
+        private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
+        @Override
+        public void setParameter(String parameters) {
+                if( parameters!=null ) {
+                        project_context_factors = parameters.split(" ");
+                }
+        }
+        /**
+         * Uses the Weka EM-Clustering algorithm to cluster the projects
+         * by their project context factors.
+         * The project context factors are first normalized and then used for clustering.
+         * They can be configured in the configuration param.
+         *
+         * @param testdata
+         * @param traindataSet
+         */
+        protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                // now do the clustering, normalizedCharacteristicInstances ruft getContextFactors auf
+                final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
+                final Instance targetInstance = data.instance(0);
+                final List<Instance> candidateInstances = new LinkedList<Instance>();
+                for( int i=1; i<data.numInstances(); i++ ) {
+                        candidateInstances.add(data.instance(i));
+                }
+                // cluster and select
+                try {
+                        final EM emeans = new EM();
+                        boolean onlyTarget = true;
+                        int targetCluster;
+                        int maxNumClusters = candidateInstances.size();
+                        do { // while(onlyTarget)
+                                emeans.setMaximumNumberOfClusters(maxNumClusters);
+                                emeans.buildClusterer(data);
+                                targetCluster = emeans.clusterInstance(targetInstance);
+                                // check if cluster only contains target project
+                                for( int i=0 ; i<candidateInstances.size() && onlyTarget; i++ ) {
+                                        onlyTarget &= !(emeans.clusterInstance(candidateInstances.get(i))==targetCluster);
+                                }
+                                maxNumClusters = emeans.numberOfClusters()-1;
+                                //Console.traceln(Level.INFO, "number of clusters: " + emeans.numberOfClusters());
+                        } while(onlyTarget);
+                        Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
+                        Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
+                        int numRemoved = 0;
+                        for( int i=0 ; i<candidateInstances.size() ; i++ ) {
+                                if( emeans.clusterInstance(candidateInstances.get(i))!=targetCluster ) {
+                                        traindataSet.remove(i-numRemoved++);
+                                }
+                        }
+                        Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
+                } catch(Exception e) {
+                        throw new RuntimeException("error applying setwise EM clustering training data selection", e);
+                }
+        }
+        @Override
+        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                // issuetracking und pl muss passen
+                /*
+                int s = traindataSet.size();
+                Console.traceln(Level.INFO, "remove non matching PL and IssueTracking projects, size now: " + s);
+                this.removeWrongContext(testdata, traindataSet, "PL");
+                this.removeWrongContext(testdata, traindataSet, "IssueTracking");
+                s = traindataSet.size();
+                Console.traceln(Level.INFO, "size after removal: " + s);
+                */
+                // now cluster
+                this.cluster(testdata, traindataSet);
+        }
+        /**
+         * Returns test- and training data with only the project context factors
+         * which were chosen in the configuration.
+         * This is later used for clustering.
+         *
+         * @param testdata
+         * @param traindataSet
+         * @return
+         */
+        protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                // setup weka Instances for clustering
+                final ArrayList<Attribute> atts = new ArrayList<Attribute>();
+                // we only want the project context factors
+                for( String pcf : this.project_context_factors ) {
+                        atts.add(new Attribute(pcf));
+                }
+                // set up the data
+                final Instances data = new Instances("project_context_factors", atts, 0);
+                double[] instanceValues = new double[atts.size()];
+                // only project context factors + only one instance per project needed
+                int i = 0;
+                for( String pcf : this.project_context_factors ) {
+                        instanceValues[i] = testdata.instance(0).value(testdata.attribute(pcf));
+                        //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
+                        i++;
+                }
+                data.add(new DenseInstance(1.0, instanceValues));
+                // now for the projects of the training stet
+                for( Instances traindata : traindataSet ) {
+                        instanceValues = new double[atts.size()];  // ohne das hier immer dieselben werte?!
+                        i = 0;
+                        for( String pcf : this.project_context_factors ) {
+                                instanceValues[i] = traindata.instance(0).value(traindata.attribute(pcf));
+                                //Console.traceln(Level.INFO, "adding attribute: " + pcf + " value: " + instanceValues[i]);
+                                i++;
+                        }
+                        data.add(new DenseInstance(1.0, instanceValues));
+                }
+                return data;
+        }
+        /**
+         * Delete projects where the project context does not match the training project
+         *
+         * @param testdata
+         * @param traindataSet
+         * @param attribute
+         */
+        protected void removeWrongContext(Instances testdata, SetUniqueList<Instances> traindataSet, String attribute) {
+                Set<Instances> remove = new HashSet<Instances>();
+                for( Instances traindata : traindataSet ) {
+                        if( traindata.firstInstance().value(traindata.attribute(attribute)) != testdata.firstInstance().value(testdata.attribute(attribute)) ) {
+                                remove.add(traindata);
+                                //Console.traceln(Level.WARNING, "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute)));
+                        }
+                }
+                // now delete the projects from set
+                for( Instances i : remove ) {
+                        traindataSet.remove(i);
+                        //Console.traceln(Level.INFO, "removing training project from set");
+                }
+        }
+        /**
+         * Normalizes the data before it gets used for clustering
+         *
+         * @param testdata
+         * @param traindataSet
+         * @return
+         */
+        protected Instances normalizedCharacteristicInstances(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                Instances data = this.getContextFactors(testdata, traindataSet);
+                try {
+                        final Normalize normalizer = new Normalize();
+                        normalizer.setInputFormat(data);
+                        data = Filter.useFilter(data, normalizer);
+                } catch (Exception e) {
+                        throw new RuntimeException("Unexpected exception during normalization of distributional characteristics.", e);
+                }
+                return data;
+        }
+    private String[] project_context_factors; // = new String[]{"TND", "TNC", "TNF", "TLOC"};
+    @Override
+    public void setParameter(String parameters) {
+        if (parameters != null) {
+            project_context_factors = parameters.split(" ");
+        }
+    }
+    /**
+     * Uses the Weka EM clustering algorithm to cluster the projects by their project context
+     * factors and removes every training set that is not in the cluster of the test data.
+     * <p>
+     * The first row of the normalized context factor data is treated as the test data, every
+     * further row as one training set in the order of {@code traindataSet}. The clustering is
+     * repeated with a decreasing maximal number of clusters until the cluster of the test data
+     * contains at least one training set. An empty {@code traindataSet} is left untouched.
+     *
+     * @param testdata
+     *            test data
+     * @param traindataSet
+     *            candidate training sets; modified in place
+     * @throws RuntimeException
+     *             if the clustering fails
+     */
+    protected void cluster(Instances testdata, SetUniqueList<Instances> traindataSet) {
+        if (traindataSet.isEmpty()) {
+            // without candidates nothing could ever end the clustering loop below
+            return;
+        }
+        // now do the clustering, normalizedCharacteristicInstances calls getContextFactors
+        final Instances data = this.normalizedCharacteristicInstances(testdata, traindataSet);
+        final Instance targetInstance = data.instance(0);
+        final List<Instance> candidateInstances = new LinkedList<Instance>();
+        for (int i = 1; i < data.numInstances(); i++) {
+            candidateInstances.add(data.instance(i));
+        }
+        // cluster and select
+        try {
+            final EM emeans = new EM();
+            boolean onlyTarget = true;
+            int targetCluster;
+            int maxNumClusters = candidateInstances.size();
+            do {
+                emeans.setMaximumNumberOfClusters(maxNumClusters);
+                emeans.buildClusterer(data);
+                targetCluster = emeans.clusterInstance(targetInstance);
+                // check if cluster only contains target project
+                for (Instance candidate : candidateInstances) {
+                    if (emeans.clusterInstance(candidate) == targetCluster) {
+                        onlyTarget = false;
+                        break;
+                    }
+                }
+                // allow one cluster less in case another round is required
+                maxNumClusters = emeans.numberOfClusters() - 1;
+            }
+            while (onlyTarget);
+            // NOTE(review): at this point maxNumClusters is numberOfClusters() - 1, so the value
+            // logged as "clusters" is one less than the number of clusters - confirm intent
+            Console.traceln(Level.INFO, "clusters: " + maxNumClusters);
+            Console.traceln(Level.INFO, "instances vor dem clustern: " + traindataSet.size());
+            // candidate k belongs to training set k; earlier removals shift the list index
+            int numRemoved = 0;
+            int index = 0;
+            for (Instance candidate : candidateInstances) {
+                if (emeans.clusterInstance(candidate) != targetCluster) {
+                    traindataSet.remove(index - numRemoved++);
+                }
+                index++;
+            }
+            Console.traceln(Level.INFO, "instances nach dem clustern: " + traindataSet.size());
+        }
+        catch (Exception e) {
+            throw new RuntimeException(
+                                       "error applying setwise EM clustering training data selection",
+                                       e);
+        }
+    }
+    @Override
+    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+        // issuetracking und pl muss passen
+        /*
+         * int s = traindataSet.size(); Console.traceln(Level.INFO,
+         * "remove non matching PL and IssueTracking projects, size now: " + s);
+         * this.removeWrongContext(testdata, traindataSet, "PL"); this.removeWrongContext(testdata,
+         * traindataSet, "IssueTracking"); s = traindataSet.size(); Console.traceln(Level.INFO,
+         * "size after removal: " + s);
+         */
+        // now cluster
+        this.cluster(testdata, traindataSet);
+    }
+    /**
+     * Builds the data set that is used for the clustering. It contains only the configured
+     * project context factors and one row per project: the first row is taken from the first
+     * instance of the test data, followed by one row for the first instance of each training set.
+     *
+     * @param testdata
+     *            test data
+     * @param traindataSet
+     *            training sets
+     * @return data set named "project_context_factors" with one numeric attribute per configured
+     *         context factor
+     */
+    protected Instances getContextFactors(Instances testdata, SetUniqueList<Instances> traindataSet)
+    {
+        // one attribute per project context factor, nothing else
+        final ArrayList<Attribute> atts = new ArrayList<Attribute>();
+        for (String factorName : this.project_context_factors) {
+            atts.add(new Attribute(factorName));
+        }
+        final Instances data = new Instances("project_context_factors", atts, 0);
+        // the test project comes first
+        data.add(new DenseInstance(1.0, contextFactorValues(testdata)));
+        // now the projects of the training set; each row gets its own value array
+        for (Instances traindata : traindataSet) {
+            data.add(new DenseInstance(1.0, contextFactorValues(traindata)));
+        }
+        return data;
+    }
+    /**
+     * Reads the values of the configured project context factors from the first instance of a
+     * project; a single instance per project is sufficient here.
+     */
+    private double[] contextFactorValues(Instances project) {
+        final double[] values = new double[this.project_context_factors.length];
+        for (int k = 0; k < values.length; k++) {
+            final String factorName = this.project_context_factors[k];
+            values[k] = project.instance(0).value(project.attribute(factorName));
+        }
+        return values;
+    }
+    /**
+     * Delete projects where the project context does not match the training project
+     *
+     * @param testdata
+     * @param traindataSet
+     * @param attribute
+     */
+    protected void removeWrongContext(Instances testdata,
+                                      SetUniqueList<Instances> traindataSet,
+                                      String attribute)
+    {
+        Set<Instances> remove = new HashSet<Instances>();
+        for (Instances traindata : traindataSet) {
+            if (traindata.firstInstance().value(traindata.attribute(attribute)) != testdata
+                .firstInstance().value(testdata.attribute(attribute)))
+            {
+                remove.add(traindata);
+                // Console.traceln(Level.WARNING,
+                // "rmove attribute "+attribute+" test: "+testdata.firstInstance().value(testdata.attribute(attribute))+" train: "+traindata.firstInstance().value(traindata.attribute(attribute)));
+            }
+        }
+        // now delete the projects from set
+        for (Instances i : remove) {
+            traindataSet.remove(i);
+            // Console.traceln(Level.INFO, "removing training project from set");
+        }
+    }
+    /**
+     * Normalizes the data before it gets used for clustering
+     *
+     * @param testdata
+     * @param traindataSet
+     * @return
+     */
+    protected Instances normalizedCharacteristicInstances(Instances testdata,
+                                                          SetUniqueList<Instances> traindataSet)
+    {
+        Instances data = this.getContextFactors(testdata, traindataSet);
+        try {
+            final Normalize normalizer = new Normalize();
+            normalizer.setInputFormat(data);
+            data = Filter.useFilter(data, normalizer);
+        }
+        catch (Exception e) {
+            throw new RuntimeException(
+                                       "Unexpected exception during normalization of distributional characteristics.",
+                                       e);
+        }
+        return data;
+    }
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/SetWiseKNNSelection.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
+ * Filter based on the k-nearest neighbor (KNN) algorithm S. Herbold: Training data selection for cross-project defect prediction
+ * Filter based on the k-nearest neighbor (KNN) algorithm, as proposed in S. Herbold: Training
+ * data selection for cross-project defect prediction.
+ *
  * @author Steffen Herbold
  */
 public class SetWiseKNNSelection extends AbstractCharacteristicSelection {
-        /**
-         * number of neighbors selected
-         */
-        private int k = 1;
-        /**
-         * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)
-         */
-        @Override
-        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
-                final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
-                final Set<Integer> selected = new HashSet<Integer>();
-                for( int i=0 ; i<k ; i++ ) {
-                        int closestIndex = getClosest(data);
-                        selected.add(closestIndex);
-                        data.delete(closestIndex);
+                }
-                for( int i=traindataSet.size()-1; i>=0 ; i-- ) {
-                        if( selected.contains(i) ) {
-                                traindataSet.remove(i);
+                        }
+                }
+        }
-        /**
-         * Helper method that determines the index of the instance with the smallest distance to the first instance (index 0).
-         * @param data data set
-         * @return index of the closest instance
-         */
-        private int getClosest(Instances data) {
-                double closestDistance = Double.MAX_VALUE;
-                int closestIndex = 1;
-                for( int i=1 ; i<data.numInstances() ; i++ ) {
-                        double distance = MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i).toDoubleArray());
-                        if( distance < closestDistance) {
-                                closestDistance = distance;
-                                closestIndex = i;
+                        }
+                }
-                return closestIndex;
+        }
+        /**
+         * Sets the number of neighbors followed by the distributional characteristics, the values are separated by blanks.
+         * @see AbstractCharacteristicSelection#setParameter(String)
+         */
+        @Override
+        public void setParameter(String parameters) {
+                if( !"".equals(parameters) ) {
+                        final String[] split = parameters.split(" ");
+                        k = Integer.parseInt(split[0]);
+                        String str = "";
+                        for( int i=1 ; i<split.length; i++ ) {
+                                str += split[i];
+                                if( i<split.length-1 )  {
+                                        str += " ";
+                                }
+                        }
+                        super.setParameter(str);
+                }
+        }
+    /**
+     * number of neighbors selected; defaults to 1 and can be changed via setParameter
+     */
+    private int k = 1;
+    /**
+     * @see de.ugoe.cs.cpdp.dataselection.SetWiseDataselectionStrategy#apply(weka.core.Instances,
+     *      org.apache.commons.collections4.list.SetUniqueList)
+     */
+    @Override
+    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+        final Instances data = normalizedCharacteristicInstances(testdata, traindataSet);
+        final Set<Integer> selected = new HashSet<Integer>();
+        for (int i = 0; i < k; i++) {
+            int closestIndex = getClosest(data);
+            selected.add(closestIndex);
+            data.delete(closestIndex);
+        }
+        for (int i = traindataSet.size() - 1; i >= 0; i--) {
+            if (selected.contains(i)) {
+                traindataSet.remove(i);
+            }
+        }
+    }
+    /**
+     * Helper method that determines the index of the instance with the smallest distance to the
+     * first instance (index 0).
+     *
+     * @param data
+     *            data set
+     * @return index of the closest instance
+     */
+    private int getClosest(Instances data) {
+        double closestDistance = Double.MAX_VALUE;
+        int closestIndex = 1;
+        for (int i = 1; i < data.numInstances(); i++) {
+            double distance =
+                MathArrays.distance(data.instance(0).toDoubleArray(), data.instance(i)
+                    .toDoubleArray());
+            if (distance < closestDistance) {
+                closestDistance = distance;
+                closestIndex = i;
+            }
+        }
+        return closestIndex;
+    }
+    /**
+     * Sets the number of neighbors followed by the distributional characteristics, the values are
+     * separated by blanks.
+     *
+     * @see AbstractCharacteristicSelection#setParameter(String)
+     */
+    @Override
+    public void setParameter(String parameters) {
+        if (!"".equals(parameters)) {
+            final String[] split = parameters.split(" ");
+            k = Integer.parseInt(split[0]);
+            String str = "";
+            for (int i = 1; i < split.length; i++) {
+                str += split[i];
+                if (i < split.length - 1) {
+                    str += " ";
+                }
+            }
+            super.setParameter(str);
+        }
+    }
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/TestAsTraining.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
  * Uses the test data as training data.
+ *
  * @author Steffen Herbold
+ *
+ *
  */
 public class TestAsTraining implements ISetWiseDataselectionStrategy {
         /**
          * no parameters
          */
         @Override
         public void setParameter(String parameters) {
                 // dummy
+        }
+    /**
+     * Does nothing, because this strategy has no parameters.
+     *
+     * @param parameters
+     *            ignored
+     */
+    @Override
+    public void setParameter(String parameters) {
+        // intentionally empty: there is nothing to configure
+    }
+        /**(non-Javadoc)
+         * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances, org.apache.commons.collections4.list.SetUniqueList)
+         */
+        @Override
+        public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+                traindataSet.clear();
+                traindataSet.add(new Instances(testdata));
+        }
+    /**
+     * Discards all candidate training data and uses the test data for training instead: the
+     * set is cleared and a new {@code Instances} object created from the test data is added.
+     *
+     * @param testdata
+     *            data of the target project
+     * @param traindataSet
+     *            candidate training data sets; afterwards it contains exactly one entry
+     * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances,
+     *      org.apache.commons.collections4.list.SetUniqueList)
+     */
+    @Override
+    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
+        traindataSet.clear();
+        traindataSet.add(new Instances(testdata));
+    }
+}

trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/TurhanFilter.java

-                      r2
+                      r41
+// Copyright 2015 Georg-August-Universität Göttingen, Germany
+//
+//   Licensed under the Apache License, Version 2.0 (the "License");
+//   you may not use this file except in compliance with the License.
+//   You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+//   Unless required by applicable law or agreed to in writing, software
+//   distributed under the License is distributed on an "AS IS" BASIS,
+//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//   See the License for the specific language governing permissions and
+//   limitations under the License.
 package de.ugoe.cs.cpdp.dataselection;
 …
 /**
+ * Filter according to B. Turhan, T. Menzies, A. Bener, and J. Die Stefano: On the relative value of cross-company and within company defect prediction
+ * Filter according to B. Turhan, T. Menzies, A. Bener, and J. Di Stefano: On the relative value
+ * of cross-company and within-company data for defect prediction.
+ *
  * @author Steffen Herbold
  */
 public class TurhanFilter implements IPointWiseDataselectionStrategy {
+        /**
+         * number of neighbors that are selected
+         */
+        private int k = 10;
+        /**
+         * Sets the number of neighbors.
+         * @param parameters number of neighbors
+         */
+        @Override
+        public void setParameter(String parameters) {
+                k = Integer.parseInt(parameters);
+        }
+    /**
+     * number of neighbors that are selected for each test instance; defaults to 10
+     */
+    private int k = 10;
+        /**
+         * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances, weka.core.Instances)
+         */
+        @Override
+        public Instances apply(Instances testdata, Instances traindata) {
+                final Attribute classAttribute = testdata.classAttribute();
+                final List<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>());
+                final double[][] trainDoubles = new double[traindata.numInstances()][testdata.numAttributes()];
+                for( int i=0; i<traindata.numInstances() ; i++ ) {
+                        Instance instance = traindata.instance(i);
+                        int tmp = 0;
+                        for( int j=0 ; j<testdata.numAttributes(); j++ ) {
+                                if( testdata.attribute(j)!=classAttribute ) {
+                                        trainDoubles[i][tmp++] = instance.value(j);
+                                }
+                        }
+                }
+                for( int i=0; i<testdata.numInstances() ; i++ ) {
+                        Instance testIntance = testdata.instance(i);
+                        double[] targetVector = new double[testdata.numAttributes()-1];
+                        int tmp = 0;
+                        for( int j=0 ; j<testdata.numAttributes(); j++ ) {
+                                if( testdata.attribute(j)!=classAttribute ) {
+                                        targetVector[tmp++] = testIntance.value(j);
+                                }
+                        }
+                        double farthestClosestDistance = Double.MAX_VALUE;
+                        int farthestClosestIndex = 0;
+                        double[] closestDistances = new double[k];
+                        for( int m=0 ; m<closestDistances.length ; m++ ) {
+                                closestDistances[m] = Double.MAX_VALUE;
+                        }
+                        int[] closestIndex = new int[k];
+                        for( int n=0; n<traindata.numInstances() ; n++ ) {
+                                double distance = MathArrays.distance(targetVector, trainDoubles[n]);
+                                if( distance<farthestClosestDistance ) {
+                                        closestIndex[farthestClosestIndex] = n;
+                                        closestDistances[farthestClosestIndex] = distance;
+                                        farthestClosestIndex = ArrayTools.findMax(closestDistances);
+                                        farthestClosestDistance = closestDistances[farthestClosestIndex];
+                                }
+                        }
+                        for( int index : closestIndex ) {
+                                selectedIndex.add(index);
+                        }
+                }
+                final Instances selected = new Instances(testdata);
+                selected.delete();
+                for( Integer i : selectedIndex) {
+                        selected.add(traindata.instance(i));
+                }
+                return selected;
+        }
+    /**
+     * Sets the number of neighbors.
+     *
+     * @param parameters
+     *            number of neighbors, given as a decimal integer
+     * @throws NumberFormatException
+     *             if the string cannot be parsed as an integer
+     */
+    @Override
+    public void setParameter(String parameters) {
+        k = Integer.parseInt(parameters);
+    }
+    /**
+     * @see de.ugoe.cs.cpdp.dataselection.PointWiseDataselectionStrategy#apply(weka.core.Instances,
+     *      weka.core.Instances)
+     */
+    @Override
+    public Instances apply(Instances testdata, Instances traindata) {
+        final Attribute classAttribute = testdata.classAttribute();
+        final List<Integer> selectedIndex = SetUniqueList.setUniqueList(new LinkedList<Integer>());
+        final double[][] trainDoubles =
+            new double[traindata.numInstances()][testdata.numAttributes()];
+        for (int i = 0; i < traindata.numInstances(); i++) {
+            Instance instance = traindata.instance(i);
+            int tmp = 0;
+            for (int j = 0; j < testdata.numAttributes(); j++) {
+                if (testdata.attribute(j) != classAttribute) {
+                    trainDoubles[i][tmp++] = instance.value(j);
+                }
+            }
+        }
+        for (int i = 0; i < testdata.numInstances(); i++) {
+            Instance testIntance = testdata.instance(i);
+            double[] targetVector = new double[testdata.numAttributes() - 1];
+            int tmp = 0;
+            for (int j = 0; j < testdata.numAttributes(); j++) {
+                if (testdata.attribute(j) != classAttribute) {
+                    targetVector[tmp++] = testIntance.value(j);
+                }
+            }
+            double farthestClosestDistance = Double.MAX_VALUE;
+            int farthestClosestIndex = 0;
+            double[] closestDistances = new double[k];
+            for (int m = 0; m < closestDistances.length; m++) {
+                closestDistances[m] = Double.MAX_VALUE;
+            }
+            int[] closestIndex = new int[k];
+            for (int n = 0; n < traindata.numInstances(); n++) {
+                double distance = MathArrays.distance(targetVector, trainDoubles[n]);
+                if (distance < farthestClosestDistance) {
+                    closestIndex[farthestClosestIndex] = n;
+                    closestDistances[farthestClosestIndex] = distance;
+                    farthestClosestIndex = ArrayTools.findMax(closestDistances);
+                    farthestClosestDistance = closestDistances[farthestClosestIndex];
+                }
+            }
+            for (int index : closestIndex) {
+                selectedIndex.add(index);
+            }
+        }
+        final Instances selected = new Instances(testdata);
+        selected.delete();
+        for (Integer i : selectedIndex) {
+            selected.add(traindata.instance(i));
+        }
+        return selected;
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 41 for trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection

Legend:

Download in other formats: