source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataselection/MahalanobisOutlierRemoval.java @ 83

Last change on this file since 83 was 82, checked in by sherbold, 9 years ago
  • Property svn:mime-type set to text/plain
File size: 4.7 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataselection;
16
17import org.apache.commons.collections4.list.SetUniqueList;
18import org.apache.commons.math3.linear.BlockRealMatrix;
19import org.apache.commons.math3.linear.LUDecomposition;
20import org.apache.commons.math3.linear.RealMatrix;
21import org.apache.commons.math3.stat.correlation.Covariance;
22
23import de.ugoe.cs.cpdp.util.WekaUtils;
24import weka.core.Instances;
25
26/**
27 * <p>
28 * Uses the Mahalanobis distance for outlier removal. All instances that are epsilon times the
29 * distance are removed. The default for epsilon is 3.0.
30 * </p>
31 *
32 * @author Steffen Herbold
33 */
34public class MahalanobisOutlierRemoval
35    implements IPointWiseDataselectionStrategy, ISetWiseDataselectionStrategy
36{
37
38    /**
39     * Distance outside which entities are removed as outliers.
40     */
41    private double epsilon = 3.0d;
42
43    /**
44     * Sets epsilon. Default is 3.0.
45     *
46     * @see de.ugoe.cs.cpdp.IParameterizable#setParameter(java.lang.String)
47     */
48    @Override
49    public void setParameter(String parameters) {
50        if (parameters != null && !parameters.isEmpty()) {
51            epsilon = Double.parseDouble(parameters);
52        }
53    }
54
55    /*
56     * (non-Javadoc)
57     *
58     * @see de.ugoe.cs.cpdp.dataselection.ISetWiseDataselectionStrategy#apply(weka.core.Instances,
59     * org.apache.commons.collections4.list.SetUniqueList)
60     */
61    @Override
62    public void apply(Instances testdata, SetUniqueList<Instances> traindataSet) {
63        for (Instances traindata : traindataSet) {
64            applyMahalanobisDistancesRemoval(traindata);
65        }
66    }
67
68    /*
69     * (non-Javadoc)
70     *
71     * @see de.ugoe.cs.cpdp.dataselection.IPointWiseDataselectionStrategy#apply(weka.core.Instances,
72     * weka.core.Instances)
73     */
74    @Override
75    public Instances apply(Instances testdata, Instances traindata) {
76        applyMahalanobisDistancesRemoval(traindata);
77        return traindata;
78    }
79
80    /**
81     * <p>
82     * removes all instances, whose Mahalanobi distance to the mean of the data is greater than
83     * epsilon.
84     * </p>
85     *
86     * @param data
87     *            data where the outliers are removed
88     */
89    private void applyMahalanobisDistancesRemoval(Instances data) {
90        RealMatrix values = new BlockRealMatrix(data.size(), data.numAttributes() - 1);
91        for (int i = 0; i < data.size(); i++) {
92            values.setRow(i, WekaUtils.instanceValues(data.get(i)));
93        }
94        RealMatrix inverseCovariance =
95            new LUDecomposition(new Covariance(values).getCovarianceMatrix()).getSolver()
96                .getInverse();
97
98        // create mean vector
99        double[] meanValues = new double[data.numAttributes() - 1];
100        int k = 0;
101        for (int j = 0; j < data.numAttributes(); j++) {
102            if (j != data.classIndex()) {
103                meanValues[k] = data.attributeStats(j).numericStats.mean;
104                k++;
105            }
106        }
107
108        for (int i = data.size() - 1; i >= 0; i--) {
109            double distance =
110                mahalanobisDistance(inverseCovariance, WekaUtils.instanceValues(data.get(i)),
111                                    meanValues);
112            if (distance > epsilon) {
113                data.remove(i);
114            }
115        }
116    }
117
118    /**
119     * <p>
120     * Calculates the Mahalanobis distance between two vectors for a given inverse covariance
121     * matric.
122     * </p>
123     *
124     * @param inverseCovariance
125     * @param vector1
126     * @param vector2
127     * @return
128     */
129    private double mahalanobisDistance(RealMatrix inverseCovariance,
130                                       double[] vector1,
131                                       double[] vector2)
132    {
133        RealMatrix x = new BlockRealMatrix(1, vector1.length);
134        x.setRow(0, vector1);
135        RealMatrix y = new BlockRealMatrix(1, vector2.length);
136        y.setRow(0, vector2);
137
138        RealMatrix deltaxy = x.subtract(y);
139
140        return Math
141            .sqrt(deltaxy.multiply(inverseCovariance).multiply(deltaxy.transpose()).getEntry(0, 0));
142    }
143}
Note: See TracBrowser for help on using the repository browser.