source: trunk/CrossPare/src/de/ugoe/cs/cpdp/dataprocessing/TCAPlusNormalization.java @ 51

Last change on this file since 51 was 51, checked in by sherbold, 9 years ago
  • refactored existing normalization approaches into common util class
  • added zScoreTraining normalization after Nam et al.
  • added TCAPlus normalization selection after Nam et al.
  • Property svn:mime-type set to text/plain
File size: 4.2 KB
Line 
1package de.ugoe.cs.cpdp.dataprocessing;
2
3import org.apache.commons.math3.ml.distance.EuclideanDistance;
4
5import weka.core.Instances;
6
7// normalization selected according to TCA+ rules (TCA has to be applied separately)
8public class TCAPlusNormalization implements IProcessesingStrategy {
9
10    private class DistChar {
11        private final double mean;
12        private final double std;
13        private final double min;
14        private final double max;
15        private int num;
16        private DistChar(double mean, double std, double min, double max, int num) {
17            this.mean = mean;
18            this.std = std;
19            this.min = min;
20            this.max = max;
21            this.num = num;
22        }
23    }
24   
25    /**
26     * Does not have parameters. String is ignored.
27     *
28     * @param parameters
29     *            ignored
30     */
31    @Override
32    public void setParameter(String parameters) {
33        // TODO Auto-generated method stub
34       
35    }
36
37    @Override
38    public void apply(Instances testdata, Instances traindata) {
39        applyTCAPlus(testdata, traindata);
40    }
41   
42    private void applyTCAPlus(Instances testdata, Instances traindata) {
43        DistChar dcTest = datasetDistance(testdata);
44        DistChar dcTrain = datasetDistance(traindata);
45       
46        // RULE 1:
47        if( 0.9*dcTrain.mean<=dcTest.mean && 1.1*dcTrain.mean>=dcTest.mean &&
48            0.9*dcTrain.std<=dcTest.std && 1.1*dcTrain.std>=dcTest.std) {
49            // do nothing
50        }
51        // RULE 2:
52        else if((0.4*dcTrain.min>dcTest.min || 1.6*dcTrain.min<dcTest.min) &&
53                (0.4*dcTrain.max>dcTest.max || 1.6*dcTrain.min<dcTest.max) &&
54                (0.4*dcTrain.min>dcTest.num || 1.6*dcTrain.min<dcTest.num)) {
55            NormalizationUtil.minMax(testdata);
56            NormalizationUtil.minMax(traindata);
57        }
58        // RULE 3:
59        else if((0.4*dcTrain.std>dcTest.std && dcTrain.num<dcTest.num) ||
60                (1.6*dcTrain.std<dcTest.std)&& dcTrain.num>dcTest.num) {
61            NormalizationUtil.zScoreTraining(testdata, traindata);
62        }
63        // RULE 4:
64        else if((0.4*dcTrain.std>dcTest.std && dcTrain.num>dcTest.num) ||
65                (1.6*dcTrain.std<dcTest.std)&& dcTrain.num<dcTest.num) {
66            NormalizationUtil.zScoreTarget(testdata, traindata);
67        }
68        //RULE 5:
69        else {
70            NormalizationUtil.zScore(testdata);
71            NormalizationUtil.zScore(traindata);
72        }
73    }
74   
75    private DistChar datasetDistance(Instances data) {
76        double distance;
77        double sumAll = 0.0;
78        double sumAllQ = 0.0;
79        double min = Double.MAX_VALUE;
80        double max = Double.MIN_VALUE;
81        int numCmp = 0;
82        int l = 0;
83        double[] inst1 = new double[data.numAttributes()-1];
84        double[] inst2 = new double[data.numAttributes()-1];
85        EuclideanDistance euclideanDistance = new EuclideanDistance();
86        for( int i=0; i<data.numInstances(); i++ ) {
87            l=0;
88            for( int k=0; k<data.numAttributes(); k++ ) {
89                if( k!=data.classIndex() ) {
90                    inst1[l] = data.instance(i).value(k);
91                }
92            }
93            for( int j=0; j<data.numInstances(); j++ ) {
94                l=0;
95                for( int k=0; k<data.numAttributes(); k++ ) {
96                    if( k!=data.classIndex() ) {
97                        inst2[l] = data.instance(j).value(k);
98                    }
99                }
100                distance = euclideanDistance.compute(inst1, inst2);
101                sumAll += distance;
102                sumAllQ += distance*distance;
103                numCmp++;
104                if( distance < min ) {
105                    min = distance;
106                }
107                if( distance > max ) {
108                    max = distance;
109                }
110            }
111        }
112        double mean = sumAll / numCmp;
113        double std = Math.sqrt((sumAllQ-(sumAll*sumAll)/numCmp) *
114                                  (1.0d / (numCmp - 1)));
115        return new DistChar(mean, std, min, max, data.numInstances());
116    }
117
118}
Note: See TracBrowser for help on using the repository browser.