// Copyright 2015 Georg-August-Universität Göttingen, Germany // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.ugoe.cs.cpdp.training; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Random; import java.util.Set; import java.util.logging.Level; import de.ugoe.cs.cpdp.training.QuadTree; import de.ugoe.cs.util.console.Console; import weka.classifiers.AbstractClassifier; import weka.classifiers.Classifier; import weka.core.DenseInstance; import weka.core.EuclideanDistance; import weka.core.Instance; import weka.core.Instances; import weka.filters.Filter; import weka.filters.unsupervised.attribute.Remove; /** *
* Trainer with reimplementation of WHERE clustering algorithm from: Tim Menzies, Andrew Butcher, * David Cok, Andrian Marcus, Lucas Layman, Forrest Shull, Burak Turhan, Thomas Zimmermann, * "Local versus Global Lessons for Defect Prediction and Effort Estimation," IEEE Transactions on * Software Engineering, vol. 39, no. 6, pp. 822-834, June, 2013 *
** With WekaLocalFQTraining we do the following: *
* Weka classifier for the local model with WHERE clustering *
* * @author Alexander Trautsch */ public class TraindatasetCluster extends AbstractClassifier { /** * default serialization ID */ private static final long serialVersionUID = 1L; /** * classifiers for each cluster */ private HashMap* copies an instance such that is is compatible with the local model *
* * @param instances * instance format * @param instance * instance that is copied * @return */ private Instance createInstance(Instances instances, Instance instance) { // attributes for feeding instance to classifier Set* Because Fastmap saves only the image not the values of the attributes it used we can not * use the old data directly to classify single instances to clusters. *
** To classify a single instance we do a new Fastmap computation with only the instance and * the old pivot elements. *
* * After that we find the cluster with our Fastmap result for x and y. * * * @param instance * instance that is classified * @see weka.classifiers.AbstractClassifier#classifyInstance(weka.core.Instance) */ @Override public double classifyInstance(Instance instance) { double ret = 0; try { // classinstance gets passed to classifier Instances traindata = ctraindata.get(0); Instance classInstance = createInstance(traindata, instance); // this one keeps the class attribute Instances traindata2 = ctraindata.get(1); // remove class attribute before clustering Remove filter = new Remove(); filter.setAttributeIndices("" + (traindata.classIndex() + 1)); filter.setInputFormat(traindata); traindata = Filter.useFilter(traindata, filter); Instance clusterInstance = createInstance(traindata, instance); Fastmap FMAP = new Fastmap(2); EuclideanDistance dist = new EuclideanDistance(traindata); // we set our pivot indices [x=0,y=1][dimension] int[][] npivotindices = new int[2][2]; npivotindices[0][0] = 1; npivotindices[1][0] = 2; npivotindices[0][1] = 3; npivotindices[1][1] = 4; // build temp dist matrix (2 pivots per dimension + 1 instance we want to classify) // the instance we want to classify comes first after that the pivot elements in the // order defined above double[][] distmat = new double[2 * FMAP.target_dims + 1][2 * FMAP.target_dims + 1]; distmat[0][0] = 0; distmat[0][1] = dist.distance(clusterInstance, this.cpivots.get((Integer) this.cpivotindices[0][0])); distmat[0][2] = dist.distance(clusterInstance, this.cpivots.get((Integer) this.cpivotindices[1][0])); distmat[0][3] = dist.distance(clusterInstance, this.cpivots.get((Integer) this.cpivotindices[0][1])); distmat[0][4] = dist.distance(clusterInstance, this.cpivots.get((Integer) this.cpivotindices[1][1])); distmat[1][0] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), clusterInstance); distmat[1][1] = 0; distmat[1][2] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), this.cpivots.get((Integer) this.cpivotindices[1][0])); distmat[1][3] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), this.cpivots.get((Integer) this.cpivotindices[0][1])); distmat[1][4] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][0]), this.cpivots.get((Integer) this.cpivotindices[1][1])); distmat[2][0] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), clusterInstance); distmat[2][1] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), this.cpivots.get((Integer) this.cpivotindices[0][0])); distmat[2][2] = 0; distmat[2][3] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), this.cpivots.get((Integer) this.cpivotindices[0][1])); distmat[2][4] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][0]), this.cpivots.get((Integer) this.cpivotindices[1][1])); distmat[3][0] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), clusterInstance); distmat[3][1] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), this.cpivots.get((Integer) this.cpivotindices[0][0])); distmat[3][2] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), this.cpivots.get((Integer) this.cpivotindices[1][0])); distmat[3][3] = 0; distmat[3][4] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[0][1]), this.cpivots.get((Integer) this.cpivotindices[1][1])); distmat[4][0] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), clusterInstance); distmat[4][1] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), this.cpivots.get((Integer) this.cpivotindices[0][0])); distmat[4][2] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), this.cpivots.get((Integer) this.cpivotindices[1][0])); distmat[4][3] = dist.distance(this.cpivots.get((Integer) this.cpivotindices[1][1]), this.cpivots.get((Integer) this.cpivotindices[0][1])); distmat[4][4] = 0; /* * debug output: show biggest distance found within the new distance matrix double * biggest = 0; for(int i=0; i < distmat.length; i++) { for(int j=0; j < * distmat[0].length; j++) { if(biggest < distmat[i][j]) { biggest = distmat[i][j]; * } } } if(this.show_biggest) { Console.traceln(Level.INFO, * String.format(""+clusterInstance)); Console.traceln(Level.INFO, String.format( * "biggest distances: "+ biggest)); this.show_biggest = false; } */ FMAP.setDistmat(distmat); FMAP.setPivots(npivotindices); FMAP.calculate(); double[][] x = FMAP.getX(); double[] proj = x[0]; // debug output: show the calculated distance matrix, our result vektor for the // instance and the complete result matrix /* * Console.traceln(Level.INFO, "distmat:"); for(int i=0; i* Payload for the QuadTree. x and y are the calculated Fastmap values. T is a Weka instance. *
* * @author Alexander Trautsch * @param* Constructor. Creates the payload. *
* * @param x * x-value * @param y * y-value * @param value * associated instace */ public QuadTreePayload(double x, double y, T value) { this.x = x; this.y = y; this.inst = value; } /** ** returns the instance *
* * @return the instance */ public T getInst() { return this.inst; } } /** *
* Fastmap implementation after:
* * Faloutsos, C., & Lin, K. I. (1995). FastMap: A fast algorithm for indexing, data-mining and
* visualization of traditional and multimedia datasets (Vol. 24, No. 2, pp. 163-174). ACM.
*
* Constructor. Creates a new Fastmap object. *
* * @param k */ public Fastmap(int k) { this.target_dims = k; } /** ** Sets the distance matrix and params that depend on this. *
* * @param O * distance matrix */ public void setDistmat(double[][] O) { this.O = O; int N = O.length; this.X = new double[N][this.target_dims]; this.PA = new int[2][this.target_dims]; } /** ** Set pivot elements, we need that to classify instances after the calculation is complete * (because we then want to reuse only the pivot elements). *
* * @param pi * the pivots */ public void setPivots(int[][] pi) { this.pivot_set = true; this.PA = pi; } /** ** Return the pivot elements that were chosen during the calculation *
* * @return the pivots */ public int[][] getPivots() { return this.PA; } /** ** The distance function for euclidean distance. Acts according to equation 4 of the Fastmap * paper. *
* * @param x * x index of x image (if k==0 x object) * @param y * y index of y image (if k==0 y object) * @param k * dimensionality * @return the distance */ private double dist(int x, int y, int k) { // basis is object distance, we get this from our distance matrix double tmp = this.O[x][y] * this.O[x][y]; // decrease by projections for (int i = 0; i < k; i++) { double tmp2 = (this.X[x][i] - this.X[y][i]); tmp -= tmp2 * tmp2; } return Math.abs(tmp); } /** ** Find the object farthest from the given index. This method is a helper Method for * findDistandObjects. *
* * @param index * of the object * @return index of the farthest object from the given index */ private int findFarthest(int index) { double furthest = Double.MIN_VALUE; int ret = 0; for (int i = 0; i < O.length; i++) { double dist = this.dist(i, index, this.col); if (i != index && dist > furthest) { furthest = dist; ret = i; } } return ret; } /** ** Finds the pivot objects. This method is basically algorithm 1 of the Fastmap paper. *
* * @return 2 indexes of the chosen pivot objects */ private int[] findDistantObjects() { // 1. choose object randomly Random r = new Random(); int obj = r.nextInt(this.O.length); // 2. find farthest object from randomly chosen object int idx1 = this.findFarthest(obj); // 3. find farthest object from previously farthest object int idx2 = this.findFarthest(idx1); return new int[] { idx1, idx2 }; } /** ** Calculates the new k-vector values (projections) This is basically algorithm 2 of the * fastmap paper. We just added the possibility to pre-set the pivot elements because we * need to classify single instances after the computation is already done. *
*/ public void calculate() { for (int k = 0; k < this.target_dims; k++) { // 2) choose pivot objects if (!this.pivot_set) { int[] pivots = this.findDistantObjects(); // 3) record ids of pivot objects this.PA[0][this.col] = pivots[0]; this.PA[1][this.col] = pivots[1]; } // 4) inter object distances are zero (this.X is initialized with 0 so we just // continue) if (this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col) == 0) { continue; } // 5) project the objects on the line between the pivots double dxy = this.dist(this.PA[0][this.col], this.PA[1][this.col], this.col); for (int i = 0; i < this.O.length; i++) { double dix = this.dist(i, this.PA[0][this.col], this.col); double diy = this.dist(i, this.PA[1][this.col], this.col); double tmp = (dix + dxy - diy) / (2 * Math.sqrt(dxy)); // save the projection this.X[i][this.col] = tmp; } this.col += 1; } } /** ** returns the result matrix of the projections *
* * @return calculated result */ public double[][] getX() { return this.X; } } }