Context Navigation

← Previous Change
Next Change →

WekaLocalTraining2.java

Timestamp:

09/05/14 15:52:48 (10 years ago)

Author:

atrautsch

Message:

Aufräumarbeiten durchgeführt und jetzt auch eine Version die Funktioniert.
Debugausgaben habe ich erst mal nur auskommentiert.

File:

: 1 edited

trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalTraining2.java (modified) (24 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalTraining2.java

-                      r16
+                      r17
 import java.io.PrintStream;
 import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
 …
 import org.apache.commons.io.output.NullOutputStream;
+import de.ugoe.cs.cpdp.training.QuadTree;
 import de.ugoe.cs.util.console.Console;
 import weka.classifiers.AbstractClassifier;
 …
 /**
+ * ACHTUNG UNFERTIG
+ * Trainer with reimplementation of WHERE clustering algorithm from:
+ * Tim Menzies, Andrew Butcher, David Cok, Andrian Marcus, Lucas Layman,
+ * Forrest Shull, Burak Turhan, Thomas Zimmermann,
+ * "Local versus Global Lessons for Defect Prediction and Effort Estimation,"
+ * IEEE Transactions on Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013
+ *
  * With WekaLocalTraining2 we do the following:
 …
  * 3) We cluster the QuadTree nodes together if they have similar density (50%)
  * 4) We save the clusters and their training data
  * 5) We only use clusters with > ALPHA instances (currently Math.sqrt(SIZE)), rest is discarded
+ * 5) We only use clusters with > ALPHA instances (currently Math.sqrt(SIZE)), rest is discarded with the training data of this cluster
  * 6) We train a Weka classifier for each cluster with the clusters training data
  * 7) We recalculate Fastmap distances for a single instance and then try to find a cluster containing the coords of the instance.
+ * 7) We recalculate Fastmap distances for a single instance with the old pivots and then try to find a cluster containing the coords of the instance.
  * 7.1.) If we can not find a cluster (due to coords outside of all clusters) we find the nearest cluster.
  * 8) We classifiy the Instance with the classifier and traindata from the Cluster we found in 7.
+ * 8) We classify the Instance with the classifier and traindata from the Cluster we found in 7.
  */
 public class WekaLocalTraining2 extends WekaBaseTraining2 implements ITrainingStrategy {
         private final TraindatasetCluster classifier = new TraindatasetCluster();
-        // these values are set later when we have all the information we need (size)
-        /*Stopping rule for tree recursion (Math.sqrt(Instances)*/
-        public static double ALPHA = 0;
-        /*size of the complete set (used for density function)*/
-        public static int SIZE = 0;
-        /*Stopping rule for clustering*/
-        public static double DELTA = 0.5;
-        // we need these references later in the testing
-        private static QuadTree TREE;
-        private static Fastmap FMAP;
-        private static EuclideanDistance DIST;
-        private static Instances TRAIN;
-        // cluster payloads
-        private static ArrayList<ArrayList<QuadTreePayload<Instance>>> cluster = new ArrayList<ArrayList<QuadTreePayload<Instance>>>();
-        // cluster sizes (index is cluster number, arraylist is list of boxes (x0,y0,x1,y1)
-        private static HashMap<Integer, ArrayList<Double[][]>> CSIZE = new HashMap<Integer, ArrayList<Double[][]>>();
         @Override
 …
                 return classifier;
+        }
         @Override
 …
                 private static final long serialVersionUID = 1L;
+                /* classifier per cluster */
                 private HashMap<Integer, Classifier> cclassifier = new HashMap<Integer, Classifier>();
+                /* instances per cluster */
                 private HashMap<Integer, Instances> ctraindata = new HashMap<Integer, Instances>();
+                /* holds the instances and indices of the pivot objects of the Fastmap calculation in buildClassifier*/
+                private HashMap<Integer, Instance> cpivots = new HashMap<Integer, Instance>();
+                /* holds the indices of the pivot objects for x,y and the dimension [x,y][dimension]*/
+                private int[][] cpivotindices = new int[2][2];
+                /* holds the sizes of the cluster multiple "boxes" per cluster */
+                private HashMap<Integer, ArrayList<Double[][]>> csize;
+                private boolean show_biggest = true;
+                private int CFOUND = 0;
+                private int CNOTFOUND = 0;
 …
                 /**
                  * Because Fastmap saves only the image not the values of the attributes it used
+                 * we can not use it or the QuadTree to classify single instances to clusters.
+                 *
+                 * To classify a single instance we measure the distance to all instances we have clustered and
+                 * use the cluster where the distance is minimal.
+                 *
+                 * TODO: class attribute filter raus
+                 * TODO: werden auf die übergebene Instance ebenfalls die preprocessors angewendet? müsste eigentlich
+                 * we can not use the old data directly to classify single instances to clusters.
+                 *
+                 * To classify a single instance we do a new fastmap computation with only the instance and
+                 * the old pivot elements.
+                 *
+                 * After that we find the cluster with our fastmap result for x and y.
                  */
                 @Override
 …
                         double ret = 0;
                         try {
+                                // classinstance gets passed to classifier
                                 Instances traindata = ctraindata.get(0);
                                 Instance classInstance = createInstance(traindata, instance);
+                                // this one keeps the class attribute
+                                Instances traindata2 = ctraindata.get(1);
                                 // remove class attribute before clustering
 …
                                 filter.setInputFormat(traindata);
                                 traindata = Filter.useFilter(traindata, filter);
                                 Instance clusterInstance = createInstance(traindata, instance);
+                                // build temp dist matrix (2 Pivot per dimension + 1 instance we want to classify)
+                                Fastmap FMAP = new Fastmap(2);
+                                EuclideanDistance dist = new EuclideanDistance(traindata);
+                                // we set our pivot indices [x=0,y=1][dimension]
+                                int[][] npivotindices = new int[2][2];
+                                npivotindices[0][0] = 1;
+                                npivotindices[1][0] = 2;
+                                npivotindices[0][1] = 3;
+                                npivotindices[1][1] = 4;
+                                // build temp dist matrix (2 pivots per dimension + 1 instance we want to classify)
+                                // the instance we want to classify comes first after that the pivot elements in the order defined above
                                 double[][] distmat = new double[2*FMAP.target_dims+1][2*FMAP.target_dims+1];
+                                // vector of instances of pivots + 1 (for the instance we want to classify)
+                                int[] tmp = new int[FMAP.PA.length+1];
+                                Instance tmpi;
+                                Instance tmpj;
+                                for(int i=0; i < tmp.length; i++) {
+                                        for(int j=0; j < tmp.length; j++) {
+                                                if(i==0) {
+                                                        tmpi = instance;
+                                                }else{
+                                                        tmpi = TRAIN.get(i);
+                                distmat[0][0] = 0;
+                                distmat[0][1] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[0][0]));
+                                distmat[0][2] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[1][0]));
+                                distmat[0][3] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[0][1]));
+                                distmat[0][4] = dist.distance(clusterInstance, this.cpivots.get((Integer)this.cpivotindices[1][1]));
+                                distmat[1][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), clusterInstance);
+                                distmat[1][1] = 0;
+                                distmat[1][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[1][0]));
+                                distmat[1][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[0][1]));
+                                distmat[1][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][0]), this.cpivots.get((Integer)this.cpivotindices[1][1]));
+                                distmat[2][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), clusterInstance);
+                                distmat[2][1] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[0][0]));
+                                distmat[2][2] = 0;
+                                distmat[2][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[0][1]));
+                                distmat[2][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][0]), this.cpivots.get((Integer)this.cpivotindices[1][1]));
+                                distmat[3][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), clusterInstance);
+                                distmat[3][1] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[0][0]));
+                                distmat[3][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[1][0]));
+                                distmat[3][3] = 0;
+                                distmat[3][4] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[0][1]), this.cpivots.get((Integer)this.cpivotindices[1][1]));
+                                distmat[4][0] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), clusterInstance);
+                                distmat[4][1] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[0][0]));
+                                distmat[4][2] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[1][0]));
+                                distmat[4][3] = dist.distance(this.cpivots.get((Integer)this.cpivotindices[1][1]), this.cpivots.get((Integer)this.cpivotindices[0][1]));
+                                distmat[4][4] = 0;
+                                /* debug output: show biggest distance found within the new distance matrix
+                                double biggest = 0;
+                                for(int i=0; i < distmat.length; i++) {
+                                        for(int j=0; j < distmat[0].length; j++) {
+                                                if(biggest < distmat[i][j]) {
+                                                        biggest = distmat[i][j];
+                                                }
-                                                if(j == 0) {
-                                                        tmpj = instance;
-                                                }else {
-                                                        tmpj = TRAIN.get(j);
+                                                }
-                                                distmat[i][j] = DIST.distance(tmpi, tmpj);
+                                        }
+                                }
+                                // this is the projection vector for our instance
+                                double[] proj = FMAP.addInstance(distmat);
+                                if(this.show_biggest) {
+                                        Console.traceln(Level.INFO, String.format(""+clusterInstance));
+                                        Console.traceln(Level.INFO, String.format("biggest distances: "+ biggest));
+                                        this.show_biggest = false;
+                                }
+                                */
+                                FMAP.setDistmat(distmat);
+                                FMAP.setPivots(npivotindices);
+                                FMAP.calculate();
+                                double[][] x = FMAP.getX();
+                                double[] proj = x[0];
+                                // debug output: show the calculated distance matrix, our result vektor for the instance and the complete result matrix
+                                /*
+                                Console.traceln(Level.INFO, "distmat:");
+                            for(int i=0; i<distmat.length; i++){
+                                for(int j=0; j<distmat[0].length; j++){
+                                        Console.trace(Level.INFO, String.format("%20s", distmat[i][j]));
+                                }
+                                Console.traceln(Level.INFO, "");
+                            }
+                            Console.traceln(Level.INFO, "vector:");
+                            for(int i=0; i < proj.length; i++) {
+                                Console.trace(Level.INFO, String.format("%20s", proj[i]));
+                            }
+                            Console.traceln(Level.INFO, "");
+                                Console.traceln(Level.INFO, "resultmat:");
+                            for(int i=0; i<x.length; i++){
+                                for(int j=0; j<x[0].length; j++){
+                                        Console.trace(Level.INFO, String.format("%20s", x[i][j]));
+                                }
+                                Console.traceln(Level.INFO, "");
+                            }
+                            */
+                                // TODO: can we be in more cluster than one?
+                                // now we iterate over all clusters (well, boxes of sizes per cluster really) and save the number of the
+                                // cluster in which we are
                                 int cnumber;
                                 int found_cnumber = -1;
                                 Iterator<Integer> clusternumber = CSIZE.keySet().iterator();
                                 while ( clusternumber.hasNext() ) {
+                                Iterator<Integer> clusternumber = this.csize.keySet().iterator();
+                                while ( clusternumber.hasNext() && found_cnumber == -1) {
                                         cnumber = clusternumber.next();
+                                        // jetzt iterieren wir über die boxen und hoffen wir finden was (cluster könnte auch entfernt worden sein)
+                                        for ( int box=0; box < CSIZE.get(cnumber).size(); box++ ) {
+                                                Double[][] current = CSIZE.get(cnumber).get(box);
+                                                if(proj[0] <= current[0][0] && proj[0] >= current[0][1] &&  // x
+                                                   proj[1] <= current[1][0] && proj[1] >= current[1][1]) {  // y
+                                        // now iterate over the boxes of the cluster and hope we find one (cluster could have been removed)
+                                        // or we are too far away from any cluster
+                                        for ( int box=0; box < this.csize.get(cnumber).size(); box++ ) {
+                                                Double[][] current = this.csize.get(cnumber).get(box);
+                                                if(proj[0] >= current[0][0] && proj[0] <= current[0][1] &&  // x
+                                                   proj[1] >= current[1][0] && proj[1] <= current[1][1]) {  // y
                                                         found_cnumber = cnumber;
+                                                }
 …
+                                }
+                                // wenn wir keinen cluster finden, liegen wir außerhalb des bereichs
+                                // kann das vorkommen mit fastmap?
+                                // ja das kann vorkommen wir suchen also weiterhin den nächsten
+                                // müssten mal durchzählen wie oft das vorkommt
+                                // we want to count how often we are really inside a cluster
                                 if ( found_cnumber == -1 ) {
+                                        //Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster!"));
+                                        //throw new RuntimeException("no cluster for test instance found!");
+                                }
+                                // jetzt kann es vorkommen das der cluster gelöscht wurde (weil zuwenig instanzen), jetzt müssen wir den
+                                // finden der am nächsten dran ist
+                                        CNOTFOUND += 1;
+                                }else {
+                                        CFOUND += 1;
+                                }
+                                // now it can happen that we dont find a cluster because we deleted it previously (too few instances)
+                                // or we get bigger distance measures from weka so that we are completely outside of our clusters.
+                                // in these cases we just find the nearest cluster to our instance and use it for classification.
+                                // to do that we use the EuclideanDistance again to compare our distance to all other Instances
+                                // then we take the cluster of the closest weka instance
+                                dist = new EuclideanDistance(traindata2);
                                 if( !this.ctraindata.containsKey(found_cnumber) ) {
                                         double min_distance = 99999999;
 …
                                                 cnumber = clusternumber.next();
                                                 for(int i=0; i < ctraindata.get(cnumber).size(); i++) {
                                                         if(DIST.distance(clusterInstance, ctraindata.get(cnumber).get(i)) <= min_distance) {
+                                                        if(dist.distance(instance, ctraindata.get(cnumber).get(i)) <= min_distance) {
                                                                 found_cnumber = cnumber;
                                                                 min_distance = DIST.distance(clusterInstance, ctraindata.get(cnumber).get(i));
+                                                                min_distance = dist.distance(instance, ctraindata.get(cnumber).get(i));
+                                                        }
+                                                }
 …
                                 // here we have the cluster where an instance has the minimum distance between itself the
                                 // instance we want to classify
+                                // if we still have not found a cluster we exit because something is really wrong
                                 if( found_cnumber == -1 ) {
-                                        // this is an error condition
                                         Console.traceln(Level.INFO, String.format("ERROR matching instance to cluster with full search!"));
                                         throw new RuntimeException("min_cluster not found");
+                                }
                                 // classify the passed instance with the cluster we found
+                                        throw new RuntimeException("cluster not found with full search");
+                                }
+                                // classify the passed instance with the cluster we found and its training data
                                 ret = cclassifier.get(found_cnumber).classifyInstance(classInstance);
 …
                 public void buildClassifier(Instances traindata) throws Exception {
+                        //Console.traceln(Level.INFO, String.format("found: "+ CFOUND + ", notfound: " + CNOTFOUND));
+                        this.show_biggest = true;
                         // 1. copy traindata
                         Instances train = new Instances(traindata);
+                        Instances train2 = new Instances(traindata);  // this one keeps the class attribute
                         // 2. remove class attribute for clustering
+                        //Remove filter = new Remove();
+                        //filter.setAttributeIndices("" + (train.classIndex() + 1));
+                        //filter.setInputFormat(train);
+                        //train = Filter.useFilter(train, filter);
+                        TRAIN = train;
+                        Remove filter = new Remove();
+                        filter.setAttributeIndices("" + (train.classIndex() + 1));
+                        filter.setInputFormat(train);
+                        train = Filter.useFilter(train, filter);
                         // 3. calculate distance matrix (needed for Fastmap because it starts at dimension 1)
+                        DIST = new EuclideanDistance(train);
+                        double[][] dist = new double[train.size()][train.size()];
+                        for(int i=0; i < train.size(); i++) {
+                                for(int j=0; j < train.size(); j++) {
+                                        dist[i][j] = DIST.distance(train.get(i), train.get(j));
+                                }
+                        }
+                        double biggest = 0;
+                        EuclideanDistance dist = new EuclideanDistance(train);
+                        double[][] distmat = new double[train.size()][train.size()];
+                        for( int i=0; i < train.size(); i++ ) {
+                                for( int j=0; j < train.size(); j++ ) {
+                                        distmat[i][j] = dist.distance(train.get(i), train.get(j));
+                                        if( distmat[i][j] > biggest ) {
+                                                biggest = distmat[i][j];
+                                        }
+                                }
+                        }
+                        //Console.traceln(Level.INFO, String.format("biggest distances: "+ biggest));
                         // 4. run fastmap for 2 dimensions on the distance matrix
+                        FMAP = new Fastmap(2, dist);
+                        Fastmap FMAP = new Fastmap(2);
+                        FMAP.setDistmat(distmat);
                         FMAP.calculate();
+                        cpivotindices = FMAP.getPivots();
                         double[][] X = FMAP.getX();
 …
                         // set quadtree payload values and get max and min x and y values for size
                     for(int i=0; i<X.length; i++){
+                    for( int i=0; i<X.length; i++ ){
                         if(X[i][0] >= big[0]) {
                                 big[0] = X[i][0];
 …
                                 small[1] = X[i][1];
+                        }
                         QuadTreePayload<Instance> tmp = new QuadTreePayload<Instance>(X[i][0], X[i][1], train.get(i));
+                        QuadTreePayload<Instance> tmp = new QuadTreePayload<Instance>(X[i][0], X[i][1], train2.get(i));
                         qtp.add(tmp);
+                    }
+                    Console.traceln(Level.INFO, String.format("size for cluster ("+small[0]+","+small[1]+") - ("+big[0]+","+big[1]+")"));
                     // 5. generate quadtree
+                    TREE = new QuadTree(null, qtp);
+                    ALPHA = Math.sqrt(train.size());
+                    SIZE = train.size();
+                    //Console.traceln(Level.INFO, String.format("Generate QuadTree with "+ SIZE + " size, Alpha: "+ ALPHA+ ""));
+                    QuadTree TREE = new QuadTree(null, qtp);
+                    QuadTree.size = train.size();
+                    QuadTree.alpha = Math.sqrt(train.size());
+                    QuadTree.ccluster = new ArrayList<ArrayList<QuadTreePayload<Instance>>>();
+                    QuadTree.csize = new HashMap<Integer, ArrayList<Double[][]>>();
+                    //Console.traceln(Level.INFO, String.format("Generate QuadTree with "+ QuadTree.size + " size, Alpha: "+ QuadTree.alpha+ ""));
                     // set the size and then split the tree recursively at the median value for x, y
                     TREE.setSize(new double[] {small[0], big[0]}, new double[] {small[1], big[1]});
+                    // recursive split und grid clustering eher static
                     TREE.recursiveSplit(TREE);
 …
                     ArrayList<QuadTree> l = new ArrayList<QuadTree>(TREE.getList(TREE));
                     // recursive grid clustering (tree pruning), the values are stored in cluster
+                    // recursive grid clustering (tree pruning), the values are stored in ccluster
                     TREE.gridClustering(l);
                     // wir iterieren durch die cluster und sammeln uns die instanzen daraus
+                    for(int i=0; i < cluster.size(); i++) {
+                        ArrayList<QuadTreePayload<Instance>> current = cluster.get(i);
+                    //ctraindata.clear();
+                    for( int i=0; i < QuadTree.ccluster.size(); i++ ) {
+                        ArrayList<QuadTreePayload<Instance>> current = QuadTree.ccluster.get(i);
                         // i is the clusternumber
                         // we only allow clusters with Instances > ALPHA, other clusters are not considered!
+                        if(current.size() > ALPHA) {
+                                for(int j=0; j < current.size(); j++ ) {
+                                        if(!ctraindata.containsKey(i)) {
+                                                ctraindata.put(i, new Instances(traindata));
+                        //if(current.size() > QuadTree.alpha) {
+                        if( current.size() > 4 ) {
+                                for( int j=0; j < current.size(); j++ ) {
+                                        if( !ctraindata.containsKey(i) ) {
+                                                ctraindata.put(i, new Instances(train2));
                                                 ctraindata.get(i).delete();
+                                        }
                                         ctraindata.get(i).add(current.get(j).getInst());
+                                }
+                        }else{
+                                Console.traceln(Level.INFO, String.format("drop cluster, only: " + current.size() + " instances"));
+                        }
+                    }
+                        // here we keep things we need later on
+                        // QuadTree sizes for later use
+                        this.csize = new HashMap<Integer, ArrayList<Double[][]>>(QuadTree.csize);
+                        // pivot elements
+                        //this.cpivots.clear();
+                        for( int i=0; i < FMAP.PA[0].length; i++ ) {
+                                this.cpivots.put(FMAP.PA[0][i], (Instance)train.get(FMAP.PA[0][i]).copy());
+                        }
+                        for( int j=0; j < FMAP.PA[0].length; j++ ) {
+                                this.cpivots.put(FMAP.PA[1][j], (Instance)train.get(FMAP.PA[1][j]).copy());
+                        }
+                        /* debug output
+                        int pnumber;
+                        Iterator<Integer> pivotnumber = cpivots.keySet().iterator();
+                        while ( pivotnumber.hasNext() ) {
+                                pnumber = pivotnumber.next();
+                                Console.traceln(Level.INFO, String.format("pivot: "+pnumber+ " inst: "+cpivots.get(pnumber)));
+                        }
+                        */
                     // train one classifier per cluster, we get the clusternumber from the traindata
                     int cnumber;
                         Iterator<Integer> clusternumber = ctraindata.keySet().iterator();
+                        //cclassifier.clear();
                         while ( clusternumber.hasNext() ) {
                                 cnumber = clusternumber.next();
                                 cclassifier.put(cnumber,setupClassifier());
+                                cnumber = clusternumber.next();
+                                cclassifier.put(cnumber,setupClassifier()); // das hier ist der eigentliche trainer
                                 cclassifier.get(cnumber).buildClassifier(ctraindata.get(cnumber));
                                 //Console.traceln(Level.INFO, String.format("classifier in cluster "+cnumber));
                                 //Console.traceln(Level.INFO, String.format("" + ctraindata.get(cnumber).size() + " instances in cluster "+cnumber));
+                        }
+                        //Console.traceln(Level.INFO, String.format("num clusters: "+cclassifier.size()));
+                }
+        }
 …
         /**
+         * hier stecken die Fastmap koordinaten drin
+         * sowie als Payload jeweils 1 weka instanz
+         * Payload for the QuadTree.
+         * x and y are the calculated Fastmap values.
+         * T is a weka instance.
          */
         public class QuadTreePayload<T> {
 …
                 private int target_dims = 0;
+                /*3 x k tmp projections array, we need this for later projections*/
+                double[][] tmpX;
+                /**/
+                public Fastmap(int k, double[][] O) {
+                        this.tmpX = new double[2*k+1][k];
+                // if we already have the pivot elements
+                private boolean pivot_set = false;
+                public Fastmap(int k) {
+                        this.target_dims = k;
+                }
+                /**
+                 * Sets the distance matrix
+                 * and params that depend on this
+                 * @param O
+                 */
+                public void setDistmat(double[][] O) {
                         this.O = O;
                         int N = O.length;
+                        this.target_dims = k;
+                        this.X = new double[N][k];
+                        this.PA = new int[2][k];
+                        this.X = new double[N][this.target_dims];
+                        this.PA = new int[2][this.target_dims];
+                }
+                /**
+                 * Set pivot elements, we need that to classify instances
+                 * after the calculation is complete (because we then want to reuse
+                 * only the pivot elements).
+                 *
+                 * @param pi
+                 */
+                public void setPivots(int[][] pi) {
+                        this.pivot_set = true;
+                        this.PA = pi;
+                }
+                /**
+                 * Return the pivot elements that were chosen during the calculation
+                 *
+                 * @return
+                 */
+                public int[][] getPivots() {
+                        return this.PA;
+                }
 …
                         // basis is object distance, we get this from our distance matrix
-                        // alternatively we could provide a distance function that takes 2 vectors
                         double tmp = this.O[x][y] * this.O[x][y];
                         // decrease by projections
+                        for(int i=0; i < k; i++) {
+                                //double tmp2 = Math.abs(this.X[x][i] - this.X[y][i]);
+                                double tmp2 =  (this.X[x][i] - this.X[y][i]);
+                        for( int i=0; i < k; i++ ) {
+                                double tmp2 = (this.X[x][i] - this.X[y][i]);
                                 tmp -= tmp2 * tmp2;
+                        }
                         return Math.abs(tmp);
+                }
-                /**
-                 * Distance calculation used for adding an Instance after initialization is complete
+                 *
-                 * @param x x index of x image (if k==0 x object)
-                 * @param y y index of y image (if k==0 y object)
-                 * @param kdimensionality
-                 * @param distmat temp distmatrix for the instance to be added
-                 * @return distance between x, y
-                 */
-                public double tmpDist(int x, int y, int k, double[][] distmat) {
-                        double tmp = distmat[x][y] * distmat[x][y];
-                        // decrease by projections
-                        for(int i=0; i < k; i++) {
-                                double tmp2 = (this.tmpX[x][i] - this.tmpX[y][i]);
-                                tmp -= tmp2 * tmp2;
+                        }
-                        //return Math.abs(tmp);
-                        return tmp;
+                }
-                /**
-                 * Projects an instance after initialization is complete
+                 *
-                 * This uses the previously saved pivot elements.
+                 *
-                 * @param distmat distance matrix of the instance and pivot elements (3x3 matrix)
-                 * @return vector of the projection values (k-vector)
-                 */
-                public double[] addInstance(double[][] distmat) {
-                        for(int k=0; k < this.target_dims; k++) {
-                                double dxy = this.dist(this.PA[0][k], this.PA[1][k], k);
-                                for(int i=0; i < distmat.length; i++) {
-                                        double dix = this.tmpDist(i, 2*k+1, k, distmat);
-                                        double diy = this.tmpDist(i, 2*k+2, k, distmat);
-                                        // projektion speichern
-                                        this.tmpX[i][k] = (dix + dxy - diy) / (2 * Math.sqrt(dxy));
+                                }
+                        }
-                        double[] ret = new double[this.target_dims];
-                        for(int k=0; k < this.target_dims; k++) {
-                                ret[k] = this.tmpX[0][k];
+                        }
-                        return ret;
+                }
 …
                         int ret = 0;
                         for(int i=0; i < O.length; i++) {
+                        for( int i=0; i < O.length; i++ ) {
                                 double dist = this.dist(i, index, this.col);
                                 if(i != index && dist > furthest) {
+                                if( i != index && dist > furthest ) {
                                         furthest = dist;
                                         ret = i;
 …
                 /**
+                 * Calculates the new k-vector values
+                 * Calculates the new k-vector values (projections)
+                 *
+                 * This is basically algorithm 2 of the fastmap paper.
+                 * We just added the possibility to pre-set the pivot elements because
+                 * we need to classify single instances after the computation is already done.
+                 *
                  * @param dims dimensionality
 …
                 public void calculate() {
+                        for(int k=0; k <this.target_dims; k++) {
+                        for( int k=0; k < this.target_dims; k++ ) {
                                 // 2) choose pivot objects
+                                int[] pivots = this.findDistantObjects();
+                                // 3) record ids of pivot objects
+                                this.PA[0][this.col] = pivots[0];
+                                this.PA[1][this.col] = pivots[1];
+                                if ( !this.pivot_set ) {
+                                        int[] pivots = this.findDistantObjects();
+                                        // 3) record ids of pivot objects
+                                        this.PA[0][this.col] = pivots[0];
+                                        this.PA[1][this.col] = pivots[1];
+                                }
                                 // 4) inter object distances are zero (this.X is initialized with 0 so we just continue)
                                 if(this.dist(pivots[0], pivots[1], this.col) == 0) {
+                                if( this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col) == 0 ) {
                                         continue;
+                                }
                                 // 5) project the objects on the line between the pivots
                                 double dxy = this.dist(pivots[0], pivots[1], this.col);
                                 for(int i=0; i < this.O.length; i++) {
+                                double dxy = this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col);
+                                for( int i=0; i < this.O.length; i++ ) {
                                         double dix = this.dist(i, pivots[0], this.col);
                                         double diy = this.dist(i, pivots[1], this.col);
+                                        double dix = this.dist(i, this.PA[0][this.col], this.col);
+                                        double diy = this.dist(i, this.PA[1][this.col], this.col);
                                         double tmp = (dix + dxy - diy) / (2 * Math.sqrt(dxy));
+                                        // save the projection
                                         this.X[i][this.col] = tmp;
+                                }
 …
                 /**
+                 * returns the result matrix
+                 * returns the result matrix of the projections
+                 *
                  * @return calculated result
                  */
 …
+                }
+        }
-        /**
-         * QuadTree implementation
+         *
-         * QuadTree gets a list of instances and then recursively split them into 4 childs
-         * For this it uses the median of the 2 values x,y
-         */
-        public class QuadTree {
-                // 1 parent or null
-                private QuadTree parent = null;
-                // 4 childs, 1 per quadrant
-                private QuadTree child_nw;
-                private QuadTree child_ne;
-                private QuadTree child_se;
-                private QuadTree child_sw;
-                // list (only helps with generate list of childs!)
-                private ArrayList<QuadTree> l = new ArrayList<QuadTree>();
-                // level only used for debugging
-                public int level = 0;
-                // size of the quadrant
-                private double[] x;
-                private double[] y;
-                public boolean verbose = false;
-                // payload, mal sehen ob das geht mit dem generic
-                // evtl. statt ArrayList eigene QuadTreePayloadlist
-                private ArrayList<QuadTreePayload<Instance>> payload;
-                public QuadTree(QuadTree parent, ArrayList<QuadTreePayload<Instance>> payload) {
-                        this.parent = parent;
-                        this.payload = payload;
+                }
-                public String toString() {
-                        String n = "";
-                        if(this.parent == null) {
-                                n += "rootnode ";
+                        }
-                        String level = new String(new char[this.level]).replace("\0", "-");
-                        n += level + " instances: " + this.getNumbers();
-                        return n;
+                }
-                /**
-                 * Returns the payload, used for clustering
-                 * in the clustering list we only have children with paylod
+                 *
-                 * @return payload
-                 */
-                public ArrayList<QuadTreePayload<Instance>> getPayload() {
-                        return this.payload;
+                }
-                /**
-                 * Calculate the density of this quadrant
+                 *
-                 * density = number of instances / global size (all instances)
+                 *
-                 * @return density
-                 */
-                public double getDensity() {
-                        double dens = 0;
-                        dens = (double)this.getNumbers() / SIZE;
-                        return dens;
+                }
-                public void setSize(double[] x, double[] y){
-                        this.x = x;
-                        this.y = y;
+                }
-                public double[][] getSize() {
-                        return new double[][] {this.x, this.y};
+                }
-                public Double[][] getSizeDouble() {
-                        Double[] tmpX = new Double[2];
-                        Double[] tmpY = new Double[2];
-                        tmpX[0] = this.x[0];
-                        tmpX[1] = this.x[1];
-                        tmpY[0] = this.y[0];
-                        tmpY[1] = this.y[1];
-                        return new Double[][] {tmpX, tmpY};
+                }
-                /**
-                 * TODO: DRY, median ist immer dasselbe
+                 *
-                 * @return median for x
-                 */
-                private double getMedianForX() {
-                        double med_x =0 ;
-                        Collections.sort(this.payload, new Comparator<QuadTreePayload<Instance>>() {
-                        @Override
-                        public int compare(QuadTreePayload<Instance> x1, QuadTreePayload<Instance> x2) {
-                            return Double.compare(x1.x, x2.x);
+                        }
-                    });
-                        if(this.payload.size() % 2 == 0) {
-                                int mid = this.payload.size() / 2;
-                                med_x = (this.payload.get(mid).x + this.payload.get(mid+1).x) / 2;
-                        }else {
-                                int mid = this.payload.size() / 2;
-                                med_x = this.payload.get(mid).x;
+                        }
-                        if(this.verbose) {
-                                System.out.println("sorted:");
-                                for(int i = 0; i < this.payload.size(); i++) {
-                                        System.out.print(""+this.payload.get(i).x+",");
+                                }
-                                System.out.println("median x: " + med_x);
+                        }
-                        return med_x;
+                }
-                private double getMedianForY() {
-                        double med_y =0 ;
-                        Collections.sort(this.payload, new Comparator<QuadTreePayload<Instance>>() {
-                        @Override
-                        public int compare(QuadTreePayload<Instance> y1, QuadTreePayload<Instance> y2) {
-                            return Double.compare(y1.y, y2.y);
+                        }
-                    });
-                        if(this.payload.size() % 2 == 0) {
-                                int mid = this.payload.size() / 2;
-                                med_y = (this.payload.get(mid).y + this.payload.get(mid+1).y) / 2;
-                        }else {
-                                int mid = this.payload.size() / 2;
-                                med_y = this.payload.get(mid).y;
+                        }
-                        if(this.verbose) {
-                                System.out.println("sorted:");
-                                for(int i = 0; i < this.payload.size(); i++) {
-                                        System.out.print(""+this.payload.get(i).y+",");
+                                }
-                                System.out.println("median y: " + med_y);
+                        }
-                        return med_y;
+                }
-                /**
-                 * Reurns the number of instances in the payload
+                 *
-                 * @return int number of instances
-                 */
-                public int getNumbers() {
-                        int number = 0;
-                        if(this.payload != null) {
-                                number = this.payload.size();
+                        }
-                        return number;
+                }
-                /**
-                 * Calculate median values of payload for x, y and split into 4 sectors
+                 *
-                 * @return Array of QuadTree nodes (4 childs)
-                 * @throws Exception if we would run into an recursive loop
-                 */
-                public QuadTree[] split() throws Exception {
-                        double medx = this.getMedianForX();
-                        double medy = this.getMedianForY();
-                        // Payload lists for each child
-                        ArrayList<QuadTreePayload<Instance>> nw = new ArrayList<QuadTreePayload<Instance>>();
-                        ArrayList<QuadTreePayload<Instance>> sw = new ArrayList<QuadTreePayload<Instance>>();
-                        ArrayList<QuadTreePayload<Instance>> ne = new ArrayList<QuadTreePayload<Instance>>();
-                        ArrayList<QuadTreePayload<Instance>> se = new ArrayList<QuadTreePayload<Instance>>();
-                        // sort the payloads to new payloads
-                        // here we have the problem that payloads with the same values are sorted
-                        // into the same slots and it could happen that medx and medy = size_x[1] and size_y[1]
-                        // in that case we would have an endless loop
-                        for(int i=0; i < this.payload.size(); i++) {
-                                QuadTreePayload<Instance> item = this.payload.get(i);
-                                // north west
-                                if(item.x <= medx && item.y >= medy) {
-                                        nw.add(item);
+                                }
-                                // south west
-                                else if(item.x <= medx && item.y <= medy) {
-                                        sw.add(item);
+                                }
-                                // north east
-                                else if(item.x >= medx && item.y >= medy) {
-                                        ne.add(item);
+                                }
-                                // south east
-                                else if(item.x >= medx && item.y <= medy) {
-                                        se.add(item);
+                                }
+                        }
-                        // if we assign one child a payload equal to our own (see problem above)
-                        // we throw an exceptions which stops the recursion on this node
-                        // second error is minimum number of instances
-                        //Console.traceln(Level.INFO, String.format("NW: "+ nw.size() + " SW: " + sw.size() + " NE: " + ne.size() + " SE: " + se.size()));
-                        if(nw.equals(this.payload)) {
-                                throw new Exception("payload equal");
+                        }
-                        if(sw.equals(this.payload)) {
-                                throw new Exception("payload equal");
+                        }
-                        if(ne.equals(this.payload)) {
-                                throw new Exception("payload equal");
+                        }
-                        if(se.equals(this.payload)) {
-                                throw new Exception("payload equal");
+                        }
-                        this.child_nw = new QuadTree(this, nw);
-                        this.child_nw.setSize(new double[] {this.x[0], medx}, new double[] {medy, this.y[1]});
-                        this.child_nw.level = this.level + 1;
-                        this.child_sw = new QuadTree(this, sw);
-                        this.child_sw.setSize(new double[] {this.x[0], medx}, new double[] {this.y[0], medy});
-                        this.child_sw.level = this.level + 1;
-                        this.child_ne = new QuadTree(this, ne);
-                        this.child_ne.setSize(new double[] {medx, this.x[1]}, new double[] {medy, this.y[1]});
-                        this.child_ne.level = this.level + 1;
-                        this.child_se = new QuadTree(this, se);
-                        this.child_se.setSize(new double[] {medx, this.x[1]}, new double[] {this.y[0], medy});
-                        this.child_se.level = this.level + 1;
-                        this.payload = null;
-                        return new QuadTree[] {this.child_nw, this.child_ne, this.child_se, this.child_sw};
+                }
-                /**
-                 * TODO: evt. auslagern, eigentlich auch eher ne statische methode
+                 *
-                 * @param q
-                 */
-                public void recursiveSplit(QuadTree q) {
-                        if(this.verbose) {
-                                System.out.println("splitting: "+ q);
+                        }
-                        if(q.getNumbers() < ALPHA) {
-                                return;
-                        }else{
-                                // exception wird geworfen wenn es zur endlosrekursion kommen würde (siehe text bei split())
-                                try {
-                                        QuadTree[] childs = q.split();
-                                        this.recursiveSplit(childs[0]);
-                                        this.recursiveSplit(childs[1]);
-                                        this.recursiveSplit(childs[2]);
-                                        this.recursiveSplit(childs[3]);
-                                }catch(Exception e) {
-                                        return;
+                                }
+                        }
+                }
-                /**
-                 * returns an list of childs sorted by density
+                 *
-                 * @param q QuadTree
-                 * @return list of QuadTrees
-                 */
-                private void generateList(QuadTree q) {
-                        // entweder es gibtes 4 childs oder keins
-                        if(q.child_ne == null) {
-                                this.l.add(q);
-                                //return;
+                        }
-                        if(q.child_ne != null) {
-                                this.generateList(q.child_ne);
+                        }
-                        if(q.child_nw != null) {
-                                this.generateList(q.child_nw);
+                        }
-                        if(q.child_se != null) {
-                                this.generateList(q.child_se);
+                        }
-                        if(q.child_sw != null) {
-                                this.generateList(q.child_sw);
+                        }
+                }
-                /**
-                 * Checks if passed QuadTree is neighbouring to us
+                 *
-                 * @param q QuadTree
-                 * @return true if passed QuadTree is a neighbour
-                 */
-                public boolean isNeighbour(QuadTree q) {
-                        boolean is_neighbour = false;
-                        double[][] our_size = this.getSize();
-                        double[][] new_size = q.getSize();
-                        // X is i=0, Y is i=1
-                        for(int i =0; i < 2; i++) {
-                                // check X and Y (0,1)
-                                // we are smaller than q
-                                // -------------- q
-                                //    ------- we
-                                if(our_size[i][0] >= new_size[i][0] && our_size[i][1] <= new_size[i][1]) {
-                                        is_neighbour = true;
+                                }
-                                // we overlap with q at some point
-                                //a) ---------------q
-                                //         ----------- we
-                                //b)     --------- q
-                                // --------- we
-                                if((our_size[i][0] >= new_size[i][0] && our_size[i][0] <= new_size[i][1]) ||
-                                   (our_size[i][1] >= new_size[i][0] && our_size[i][1] <= new_size[i][1])) {
-                                        is_neighbour = true;
+                                }
-                                // we are larger than q
-                                //    ---- q
-                                // ---------- we
-                                if(our_size[i][1] >= new_size[i][1] && our_size[i][0] <= new_size[i][0]) {
-                                        is_neighbour = true;
+                                }
+                        }
-                        if(is_neighbour && this.verbose) {
-                                System.out.println(this + " neighbour of: " + q);
+                        }
-                        return is_neighbour;
+                }
-                /**
-                 * todo
-                 */
-                public boolean isInside(double x, double y) {
-                        boolean is_inside_x = false;
-                        boolean is_inside_y = false;
-                        double[][] our_size = this.getSize();
-                        if(our_size[0][0] <= x && our_size[0][1] >= x) {
-                                is_inside_x = true;
+                        }
-                        if(our_size[1][0] <= y && our_size[1][1] >= y) {
-                                is_inside_y = true;
+                        }
-                        if(is_inside_y && is_inside_x && this.verbose) {
-                                System.out.println(this + " contains: " + x + ", "+ y);
+                        }
-                        return is_inside_x && is_inside_y;
+                }
-                /**
-                 * Perform Pruning and clustering of the quadtree
+                 *
-                 * 1) get list of leaf quadrants
-                 * 2) sort by their density
-                 * 3) set stop_rule to 0.5 * highest Density in the list
-                 * 4) merge all nodes with a density > stop_rule to the new cluster and remove all from list
-                 * 5) repeat
+                 *
-                 * @param q List of QuadTree (children only)
-                 */
-                public void gridClustering(ArrayList<QuadTree> list) {
-                        //System.out.println("listsize: " + list.size());
-                        // basisfall
-                        if(list.size() == 0) {
-                                return;
+                        }
-                        double stop_rule;
-                        QuadTree biggest;
-                        QuadTree current;
-                        // current clusterlist
-                        ArrayList<QuadTreePayload<Instance>> current_cluster;
-                        // remove list
-                    ArrayList<Integer> remove = new ArrayList<Integer>();
-                        // 1. find biggest
-                    biggest = list.get(list.size()-1);
-                    stop_rule = biggest.getDensity() * 0.5;
-                    current_cluster = new ArrayList<QuadTreePayload<Instance>>();
-                    current_cluster.addAll(biggest.getPayload());
-                    //System.out.println("adding "+biggest.getDensity() + " to cluster");
-                    // remove the biggest because we are starting with it
-                    remove.add(list.size()-1);
-                    //System.out.println("removing "+biggest.getDensity() + " from list");
-                    ArrayList<Double[][]> tmpSize = new ArrayList<Double[][]>();
-                    tmpSize.add(biggest.getSizeDouble());
-                        // check the items for their density
-                    for(int i=list.size()-1; i >= 0; i--) {
-                        current = list.get(i);
-                                // 2. find neighbours with correct density
-                        // if density > stop_rule and is_neighbour add to cluster and remove from list
-                        if(current.getDensity() > stop_rule && !current.equals(biggest) && current.isNeighbour(biggest)) {
-                                //System.out.println("adding " + current.getDensity() + " to cluster");
-                                //System.out.println("removing "+current.getDensity() + " from list");
-                                current_cluster.addAll(current.getPayload());
-                                // wir können hier nicht removen weil wir sonst den index verschieben
-                                remove.add(i);
-                                // außerdem brauchen wir die größe
-                                tmpSize.add(current.getSizeDouble());
+                        }
+                        }
-                        // 3. remove from list
-                    for(Integer item: remove) {
-                        list.remove((int)item);
+                    }
-                        // 4. add to cluster
-                    cluster.add(current_cluster);
-                    // 5. add size of our current (biggest)
-                    // we need that to classify test instances to a cluster
-                    Integer cnumber = new Integer(cluster.size()-1);
-                    if(CSIZE.containsKey(cnumber) == false) {
-                        CSIZE.put(cnumber, tmpSize);
+                    }
-                        // recurse
-                    //System.out.println("restlist " + list.size());
-                    this.gridClustering(list);
+                }
-                public void printInfo() {
-                    System.out.println("we have " + cluster.size() + " clusters");
-                    for(int i=0; i < cluster.size(); i++) {
-                        System.out.println("cluster: "+i+ " size: "+ cluster.get(i).size());
+                    }
+                }
-                /**
-                 * Helper Method to get a sortet list (by density) for all
-                 * children
+                 *
-                 * @param q QuadTree
-                 * @return Sorted ArrayList of quadtrees
-                 */
-                public ArrayList<QuadTree> getList(QuadTree q) {
-                        this.generateList(q);
-                        Collections.sort(this.l, new Comparator<QuadTree>() {
-                        @Override
-                        public int compare(QuadTree x1, QuadTree x2) {
-                            return Double.compare(x1.getDensity(), x2.getDensity());
+                        }
-                    });
-                        return this.l;
+                }
+        }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 17 for trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalTraining2.java

Legend:

trunk/CrossPare/src/de/ugoe/cs/cpdp/training/WekaLocalTraining2.java

Download in other formats: