source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/MahalanobisOutlierRemoval.java @ 120

Last change on this file since 120 was 117, checked in by sherbold, 8 years ago
  • Property svn:mime-type set to text/plain
File size: 5.1 KB
Line 
// Copyright 2015 Georg-August-Universität Göttingen, Germany
//
//   Licensed under the Apache License, Version 2.0 (the "License");
//   you may not use this file except in compliance with the License.
//   You may obtain a copy of the License at
//
//       http://www.apache.org/licenses/LICENSE-2.0
//
//   Unless required by applicable law or agreed to in writing, software
//   distributed under the License is distributed on an "AS IS" BASIS,
//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//   See the License for the specific language governing permissions and
//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import org.apache.commons.collections4.list.SetUniqueList;
18import org.apache.commons.math3.linear.BlockRealMatrix;
19import org.apache.commons.math3.linear.LUDecomposition;
20import org.apache.commons.math3.linear.RealMatrix;
21import org.apache.commons.math3.linear.SingularMatrixException;
22import org.apache.commons.math3.stat.correlation.Covariance;
23
24import de.lmu.ifi.dbs.elki.logging.Logging.Level;
25import de.ugoe.cs.cpdp.util.WekaUtils;
26import de.ugoe.cs.util.console.Console;
27import weka.core.Instances;
28
29/**
30 * <p>
31 * Uses the Mahalanobis distance for outlier removal. All instances that are epsilon times the
32 * distance are removed. The default for epsilon is 3.0.
33 * </p>
34 *
35 * @author Steffen Herbold
36 */
37public class MahalanobisOutlierRemoval
38    implements IPointWiseDataselectionStrategy, ISetWiseDataselectionStrategy
39{
40
41    /**
42     * Distance outside which entities are removed as outliers.
43     */
44    private double epsilon = 3.0d;
45
46    /**
47     * Sets epsilon. Default is 3.0.
48     *
49     * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
50     */
51    @Override
52    public void setParameter(String parameters) {
53        if (parameters != null && !parameters.isEmpty()) {
54            epsilon = Double.parseDouble(parameters);
55        }
56    }
57
58    /*
59     * (non-Javadoc)
60     *
61     * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances,
62     * org.apache.commons.collections4.list.SetUniqueList)
63     */
64    @Override
65    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
66        for (Instances traindata : traindataSet) {
67            applyMahalanobisDistancesRemoval(traindata);
68        }
69    }
70
71    /*
72     * (non-Javadoc)
73     *
74     * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances,
75     * weka.core.Instances)
76     */
77    @Override
78    public Instances apply(Instances testdata, Instances traindata) {
79        applyMahalanobisDistancesRemoval(traindata);
80        return traindata;
81    }
82
83    /**
84     * <p>
85     * removes all instances, whose Mahalanobi distance to the mean of the data is greater than
86     * epsilon.
87     * </p>
88     *
89     * @param data
90     *            data where the outliers are removed
91     */
92    private void applyMahalanobisDistancesRemoval(Instances data) {
93        RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1);
94        for (int i = 0; i < data.size(); i++) {
95            values.setRow(i, WekaUtils.instanceValues(data.get(i)));
96        }
97        RealMatrix inverseCovariance;
98        try {
99            inverseCovariance =
100            new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver()
101                .getInverse();
102        } catch(SingularMatrixException e) {
103            Console.traceln(Level.WARNING, "could not perform Mahalanobis outlier removal due to singular covariance matrix");
104            return;
105        }
106        // create mean vector
107        double[] meanValues = new double[data.numAttributes() - 1];
108        int k = 0;
109        for (int j = 0; j < data.numAttributes(); j++) {
110            if (j != data.classIndex()) {
111                meanValues[k] = data.attributeStats(j).numericStats.mean;
112                k++;
113            }
114        }
115
116        for (int i = data.size() - 1; i >= 0; i--) {
117            double distance =
118                mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)),
119                                    meanValues);
120            if (distance > epsilon) {
121                data.remove(i);
122            }
123        }
124    }
125
126    /**
127     * <p>
128     * Calculates the Mahalanobis distance between two vectors for a given inverse covariance
129     * matric.
130     * </p>
131     *
132     * @param inverseCovariance
133     * @param vector1
134     * @param vector2
135     * @return
136     */
137    private double mahalanobisDistance(RealMatrix inverseCovariance,
138                                       double[] vector1,
139                                       double[] vector2)
140    {
141        RealMatrix x = new BlockRealMatrix(1, vector1.length);
142        x.setRow(0, vector1);
143        RealMatrix y = new BlockRealMatrix(1, vector2.length);
144        y.setRow(0, vector2);
145
146        RealMatrix deltaxy = x.subtract(y);
147
148        return Math
149            .sqrt(deltaxy.multiply(inverseCovariance).multiply(deltaxy.transpose()).getEntry(0, 0));
150    }
151}
Note: See TracBrowser for help on using the repository browser.