source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/MahalanobisOutlierRemoval.java @ 136

Last change on this file since 136 was 135, checked in by sherbold, 8 years ago
  • code documentation and formatting
  • Property svn:mime-type set to text/plain
File size: 5.2 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import org.apache.commons.collections4.list.SetUniqueList;
18import org.apache.commons.math3.linear.BlockRealMatrix;
19import org.apache.commons.math3.linear.LUDecomposition;
20import org.apache.commons.math3.linear.RealMatrix;
21import org.apache.commons.math3.linear.SingularMatrixException;
22import org.apache.commons.math3.stat.correlation.Covariance;
23
24import de.lmu.ifi.dbs.elki.logging.Logging.Level;
25import de.ugoe.cs.cpdp.util.WekaUtils;
26import de.ugoe.cs.util.console.Console;
27import weka.core.Instances;
28
29/**
30 * <p>
31 * Uses the Mahalanobis distance for outlier removal. All instances that are epsilon times the
32 * distance are removed. The default for epsilon is 3.0.
33 * </p>
34 *
35 * @author Steffen Herbold
36 */
37public class MahalanobisOutlierRemoval
38    implements IPointWiseDataselectionStrategy, ISetWiseDataselectionStrategy
39{
40
41    /**
42     * Distance outside which entities are removed as outliers.
43     */
44    private double epsilon = 3.0d;
45
46    /**
47     * Sets epsilon. Default is 3.0.
48     *
49     * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
50     */
51    @Override
52    public void setParameter(String parameters) {
53        if (parameters != null && !parameters.isEmpty()) {
54            epsilon = Double.parseDouble(parameters);
55        }
56    }
57
58    /*
59     * (non-Javadoc)
60     *
61     * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances,
62     * org.apache.commons.collections4.list.SetUniqueList)
63     */
64    @Override
65    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
66        for (Instances traindata : traindataSet) {
67            applyMahalanobisDistancesRemoval(traindata);
68        }
69    }
70
71    /*
72     * (non-Javadoc)
73     *
74     * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances,
75     * weka.core.Instances)
76     */
77    @Override
78    public Instances apply(Instances testdata, Instances traindata) {
79        applyMahalanobisDistancesRemoval(traindata);
80        return traindata;
81    }
82
83    /**
84     * <p>
85     * removes all instances, whose Mahalanobi distance to the mean of the data is greater than
86     * epsilon.
87     * </p>
88     *
89     * @param data
90     *            data where the outliers are removed
91     */
92    private void applyMahalanobisDistancesRemoval(Instances data) {
93        RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1);
94        for (int i = 0; i < data.size(); i++) {
95            values.setRow(i, WekaUtils.instanceValues(data.get(i)));
96        }
97        RealMatrix inverseCovariance;
98        try {
99            inverseCovariance = new LUDecomposition(new Covariance(values).getCovarianceMatrix())
100                .getSolver().getInverse();
101        }
102        catch (SingularMatrixException e) {
103            Console
104                .traceln(Level.WARNING,
105                         "could not perform Mahalanobis outlier removal due to singular covariance matrix");
106            return;
107        }
108        // create mean vector
109        double[] meanValues = new double[data.numAttributes() - 1];
110        int k = 0;
111        for (int j = 0; j < data.numAttributes(); j++) {
112            if (j != data.classIndex()) {
113                meanValues[k] = data.attributeStats(j).numericStats.mean;
114                k++;
115            }
116        }
117
118        for (int i = data.size() - 1; i >= 0; i--) {
119            double distance =
120                mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)),
121                                    meanValues);
122            if (distance > epsilon) {
123                data.remove(i);
124            }
125        }
126    }
127
128    /**
129     * <p>
130     * Calculates the Mahalanobis distance between two vectors for a given inverse covariance
131     * matric.
132     * </p>
133     *
134     * @param inverseCovariance
135     * @param vector1
136     * @param vector2
137     * @return
138     */
139    private double mahalanobisDistance(RealMatrix inverseCovariance,
140                                       double[] vector1,
141                                       double[] vector2)
142    {
143        RealMatrix x = new BlockRealMatrix(1, vector1.length);
144        x.setRow(0, vector1);
145        RealMatrix y = new BlockRealMatrix(1, vector2.length);
146        y.setRow(0, vector2);
147
148        RealMatrix deltaxy = x.subtract(y);
149
150        return Math
151            .sqrt(deltaxy.multiply(inverseCovariance).multiply(deltaxy.transpose()).getEntry(0, 0));
152    }
153}
Note: See TracBrowser for help on using the repository browser.