source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/NormalizationUtil.java @ 70

Last change on this file since 70 was 51, checked in by sherbold, 8 years ago
  • refactored existing normalization approaches into common util class
  • added zScoreTraining normalization after Nam et al.
  • added TCAPlus normalization selection after Nam et al.
  • Property svn:mime-type set to text/plain
File size: 5.7 KB
Line 
1// Copyright 2015 Georg-August-Universität Göttingen, Germany
2//
3//   Licensed under the Apache License, Version 2.0 (the "License");
4//   you may not use this file except in compliance with the License.
5//   You may obtain a copy of the License at
6//
7//       http://www.apache.org/licenses/LICENSE-2.0
8//
9//   Unless required by applicable law or agreed to in writing, software
10//   distributed under the License is distributed on an "AS IS" BASIS,
11//   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//   See the License for the specific language governing permissions and
13//   limitations under the License.
14
15package de.ugoe.cs.cpdp.dataprocessing;
16
17import org.apache.commons.collections4.list.SetUniqueList;
18
19import weka.core.Instance;
20import weka.core.Instances;
21
22/**
23 * <p>
24 * Helper class for normalization of data sets.
25 * </p>
26 *
27 * @author Steffen Herbold
28 */
29public class NormalizationUtil {
30
31    /**
32     * <p>
33     * Min-Max normalization to scale all data to the interval [0,1] (N1 in Transfer Defect Learning
34     * by Nam et al.).
35     * </p>
36     *
37     * @param data
38     *            data that is normalized
39     */
40    public static void minMax(Instances data) {
41        for (int j = 0; j < data.numAttributes(); j++) {
42            if (data.classIndex() != j) {
43                double min = data.attributeStats(j).numericStats.min;
44                double max = data.attributeStats(j).numericStats.max;
45
46                for (int i = 0; i < data.numInstances(); i++) {
47                    Instance inst = data.instance(i);
48                    double newValue = (inst.value(j) - min) / (max - min);
49                    inst.setValue(j, newValue);
50                }
51            }
52        }
53    }
54
55    /**
56     * <p>
57     * Z-Score normalization (N2 in Transfer Defect Learning by Nam et al.).
58     * </p>
59     *
60     * @param data
61     *            data that is normalized
62     */
63    public static void zScore(Instances data) {
64        final double[] mean = new double[data.numAttributes()];
65        final double[] std = new double[data.numAttributes()];
66
67        // get means and stddevs of data
68        for (int j = 0; j < data.numAttributes(); j++) {
69            if (data.classIndex() != j) {
70                mean[j] = data.meanOrMode(j);
71                std[j] = Math.sqrt(data.variance(j));
72            }
73        }
74        applyZScore(data, mean, std);
75    }
76
77    /**
78     * <p>
79     * Z-Score normalization using the mean and std of the training data (N3 in Transfer Defect
80     * Learning by Nam et al.).
81     * </p>
82     *
83     * @param testdata
84     *            test data of the target product
85     * @param traindata
86     *            training data
87     */
88    public static void zScoreTraining(Instances testdata, Instances traindata) {
89        final double[] mean = new double[testdata.numAttributes()];
90        final double[] std = new double[testdata.numAttributes()];
91
92        // get means of training
93        for (int j = 0; j < traindata.numAttributes(); j++) {
94            if (traindata.classIndex() != j) {
95                mean[j] = traindata.meanOrMode(j);
96                std[j] = Math.sqrt(traindata.variance(j));
97            }
98        }
99
100        applyZScore(testdata, mean, std);
101        applyZScore(traindata, mean, std);
102    }
103
104    /**
105     * <p>
106     * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning
107     * by Nam et al.).
108     * </p>
109     *
110     * @param testdata
111     *            test data of the target product
112     * @param traindata
113     *            training data
114     */
115    public static void zScoreTarget(Instances testdata, Instances traindata) {
116        final double[] mean = new double[testdata.numAttributes()];
117        final double[] std = new double[testdata.numAttributes()];
118
119        // get means of testdata
120        for (int j = 0; j < testdata.numAttributes(); j++) {
121            if (testdata.classIndex() != j) {
122                mean[j] = testdata.meanOrMode(j);
123                std[j] = Math.sqrt(testdata.variance(j));
124            }
125        }
126
127        applyZScore(testdata, mean, std);
128        applyZScore(traindata, mean, std);
129    }
130
131    /**
132     * <p>
133     * Z-Score normalization using the mean and std of the test data (N4 in Transfer Defect Learning
134     * by Nam et al.).
135     * </p>
136     *
137     * @param testdata
138     *            test data of the target product
139     * @param traindata
140     *            training data
141     */
142    public static void zScoreTarget(Instances testdata, SetUniqueList<Instances> traindataSet) {
143        final double[] mean = new double[testdata.numAttributes()];
144        final double[] std = new double[testdata.numAttributes()];
145
146        // get means of testdata
147        for (int j = 0; j < testdata.numAttributes(); j++) {
148            if (testdata.classIndex() != j) {
149                mean[j] = testdata.meanOrMode(j);
150                std[j] = Math.sqrt(testdata.variance(j));
151            }
152        }
153
154        applyZScore(testdata, mean, std);
155        for (Instances traindata : traindataSet) {
156            applyZScore(traindata, mean, std);
157        }
158    }
159
160    /**
161     * <p>
162     * Internal helper function
163     * </p>
164     */
165    private static void applyZScore(Instances data, double[] mean, double[] std) {
166        for (int i = 0; i < data.numInstances(); i++) {
167            Instance instance = data.instance(i);
168            for (int j = 0; j < data.numAttributes(); j++) {
169                if (data.classIndex() != j) {
170                    instance.setValue(j, instance.value(j) - mean[j] / std[j]);
171                }
172            }
173        }
174    }
175}
Note: See TracBrowser for help on using the repository browser.