[Thesis in Progress: Industrial Big Data, Data Mining] An Entropy-Based Discretization Algorithm for Continuous Attributes

Author: 张照博 | Published 2018-05-14 21:09

    Before the Main Text

    Here is the situation, which I have mentioned before... when my thesis hit 14,200 words I simply could not keep writing. The performance evaluation, the acknowledgements, and a large batch of references were still missing, but it did not feel like enough, so I deliberately added more material. Between post-pruning methods and the discretization of continuous values I picked discretization as the more worthwhile topic; there did not seem to be much left to say about post-pruning.


    Main Text

    As always, no padding: code first!

    /* *********************
     * Author   :   HustWolf --- 张照博
     * Time     :   2018.1-2018.5
     * Address  :   HUST
     * Version  :   1.0
     * Defines some static parameters and provides getters.
     ********************* */

    import java.text.NumberFormat;
    import java.util.*;

    // One (attribute value, class label) pair. A record is split into one
    // instance per attribute; sorting is by the attribute value.
    class Alone_Value_Category implements Comparable<Alone_Value_Category> {
        private float sensor;
        private float category;

        Alone_Value_Category(float sensor, float category) {
            this.sensor = sensor;
            this.category = category;
        }

        float getSensor() {
            return sensor;
        }
        float getCategory() {
            return category;
        }

        @Override
        public String toString() {
            return "\n[sensor=" + sensor + ", category=" + category + "]";
        }
        @Override
        public int compareTo(Alone_Value_Category o) {
            return Float.compare(this.sensor, o.sensor);
        }
    }

    

    The class above 👆 defines where a single data point is stored. It splits the data into single-attribute-versus-class form: one record with 4 sensor values and 1 category is broken into 4 instances of this class, and each instance goes through the EADC discretization separately.
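    For example, one record with four sensor readings and one class label (the values here are made up by me purely for illustration) splits like this:

    // A single record: four sensor readings, class label in the last slot.
    float[] record = {3.2f, 7.1f, 0.4f, 5.5f, 1.0f};

    // One Alone_Value_Category per sensor attribute, all sharing the label.
    List<Alone_Value_Category> pairs = new ArrayList<>();
    for (int i = 0; i < record.length - 1; ++i) {
        pairs.add(new Alone_Value_Category(record[i], record[record.length - 1]));
    }
    Collections.sort(pairs); // ordered by sensor value, ready for EADC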

    // One candidate interval: [bottom, top) plus the samples that fall in it,
    // grouped by class label (category --> list of samples).
    class Interval {
        private float top;
        private float bottom;
        public Map<Float, List<Alone_Value_Category>> sample = new HashMap<>();

        Interval() {}
        Interval(Interval b) {
            top = b.top;
            bottom = b.bottom;
            sample = new HashMap<>(b.sample); // copies the map; lists are shared
        }
        Interval(float top, float bottom, float category, List<Alone_Value_Category> samples) {
            this.top = top;
            this.bottom = bottom;
            sample.put(category, samples);
        }

        public float getTop() {
            return top;
        }
        public float getBottom() {
            return bottom;
        }
        public void setTop(float top) {
            this.top = top;
        }
        public void setBottom(float bottom) {
            this.bottom = bottom;
        }
        public void setSample(Map<Float, List<Alone_Value_Category>> sample) {
            this.sample = sample;
        }

        // Union of this interval and b in a fresh object, leaving both
        // originals untouched; used for the trial merges in EADC.
        public Interval addTmp(Interval b) {
            Interval re = new Interval();
            re.top = Math.max(top, b.top);
            re.bottom = Math.min(bottom, b.bottom);
            appendSamples(re.sample, sample);
            appendSamples(re.sample, b.sample);
            return re;
        }

        // Destructive merge of b into this interval.
        public void merge(Interval b) {
            if (top < b.top) top = b.top;
            if (bottom > b.bottom) bottom = b.bottom;
            appendSamples(sample, b.sample);
        }

        // Append list-by-list; a plain putAll would overwrite the lists of
        // categories present in both intervals and lose samples.
        private static void appendSamples(Map<Float, List<Alone_Value_Category>> into,
                                          Map<Float, List<Alone_Value_Category>> from) {
            for (Map.Entry<Float, List<Alone_Value_Category>> e : from.entrySet()) {
                into.computeIfAbsent(e.getKey(), x -> new LinkedList<>()).addAll(e.getValue());
            }
        }

        public int getCount() {
            int count = 0;
            for (List<Alone_Value_Category> s : sample.values()) {
                count += s.size();
            }
            return count;
        }

        @Override
        public String toString() {
            return "bottom:" + bottom + " top:" + top + " size:" + getCount();
        }
    }

    

    The interval class: every interval has an upper bound, a lower bound, and the set of Alone_Value_Category samples that fall inside it. The set is stored in category --> List form, so with my data every Interval should hold exactly two lists (one per class).
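    A quick sketch of that layout (the numbers are mine, purely for illustration): three samples from two classes landing in the same interval give a map with exactly two lists:

    Interval interval = new Interval();
    interval.setBottom(0.0f);
    interval.setTop(2.0f);
    for (Alone_Value_Category s : Arrays.asList(
            new Alone_Value_Category(0.5f, 0.0f),
            new Alone_Value_Category(1.2f, 0.0f),
            new Alone_Value_Category(1.8f, 1.0f))) {
        // Group by class label, the same way EADC fills its initial bins.
        interval.sample.computeIfAbsent(s.getCategory(), x -> new LinkedList<>()).add(s);
    }
    System.out.println(interval);               // bottom:0.0 top:2.0 size:3
    System.out.println(interval.sample.size()); // 2 -> one list per class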

    public class Parameter {
        // Static experiment configuration plus the EADC implementation.
        private static int rate = 2;
        private static int trainNum = 40000;
        private static int testNum = trainNum / rate;

        public static int getTrainNum() {
            return trainNum;
        }
        public static int getRate() {
            return rate;
        }
        public static int getTestNum() {
            return testNum;
        }
        // 2,000,000 is the size of the raw data set; the "distance" is the
        // sampling stride used to draw the test/train sets from it.
        public static int getTestDistance() {
            return 2000000 / testNum;
        }
        public static int getTrainDistance() {
            return 2000000 / trainNum;
        }
        public static void setRate(int r) {
            rate = r;
            testNum = trainNum / rate;
        }
        public static void setTrainNum(int t) {
            trainNum = t;
            testNum = trainNum / rate;
        }
        public static void setTestNum(int t) {
            testNum = t;
            trainNum = testNum * rate;
        }

        // Merge every empty interval into a neighbour so the bounds stay
        // contiguous and no interval is left holding zero samples.
        public static void Clear(ArrayList<Interval> allInterval) {
            for (int s = 1; s < allInterval.size(); ) {
                if (allInterval.get(s).getCount() == 0) {
                    allInterval.get(s - 1).merge(allInterval.get(s));
                    allInterval.remove(s);
                } else {
                    ++s;
                }
            }
            if (allInterval.size() > 1 && allInterval.get(0).getCount() == 0) {
                allInterval.get(1).merge(allInterval.get(0));
                allInterval.remove(0);
            }
        }

        // Shannon entropy of the partition: H = -sum(p_i * log2 p_i), where
        // p_i is the fraction of all samples falling into interval i.
        static double Entropy(ArrayList<Interval> set, int size) {
            double entropy = 0;
            NumberFormat nf = NumberFormat.getNumberInstance();
            nf.setMaximumFractionDigits(4);
            for (Interval x : set) {
                double p = (double) x.getCount() / (double) size;
                if (p > 0) { // empty intervals contribute nothing
                    entropy -= p * (Math.log(p) / Math.log(2));
                }
            }
            // Round to four decimals to damp floating-point noise.
            return Double.parseDouble(nf.format(entropy));
        }

        public static ArrayList<List<Float>> EADC(float[][] dat) {
            ArrayList<List<Float>> re = new ArrayList<>();
            // Discretize each attribute (column) on its own; the last column
            // of every record is the class label.
            for (int valueindex = 0; valueindex < dat[0].length - 1; ++valueindex) {
                ArrayList<Alone_Value_Category> LIST = new ArrayList<>();
                for (int i = 0; i < dat.length; ++i) {
                    // Pair each attribute value with its record's class label.
                    LIST.add(new Alone_Value_Category(dat[i][valueindex], dat[i][dat[i].length - 1]));
                }
                Collections.sort(LIST);
                // Start from k equal-width bins, padded by 0.5 on each side so
                // the extreme values fall inside the outer bins.
                float len = LIST.get(LIST.size() - 1).getSensor() - LIST.get(0).getSensor();
                int k = 40;
                float gap = (len + 1) / k;
                float Lowest = LIST.get(0).getSensor() - 0.50f;
                NumberFormat nf = NumberFormat.getNumberInstance();
                nf.setMaximumFractionDigits(1);
                List<Float> range = new LinkedList<>();
                for (int x = 0; x <= k; ++x) {
                    range.add(Float.parseFloat(nf.format(Lowest + x * gap)));
                }
                ArrayList<Interval> allInterval = new ArrayList<>();
                for (int i = 0; i < k; ++i) {
                    Interval newarea = new Interval();
                    newarea.setBottom(range.get(i));
                    newarea.setTop(range.get(i + 1));
                    for (Alone_Value_Category s : LIST) {
                        // Lower bound inclusive, so samples sitting exactly on
                        // a cut point are not silently dropped.
                        if (s.getSensor() >= range.get(i) && s.getSensor() < range.get(i + 1)) {
                            if (!newarea.sample.containsKey(s.getCategory())) {
                                newarea.sample.put(s.getCategory(), new LinkedList<>());
                            }
                            newarea.sample.get(s.getCategory()).add(s);
                        }
                    }
                    allInterval.add(newarea);
                }
                int size = 0;
                Clear(allInterval);
                for (Interval s : allInterval) {
                    size += s.getCount();
                }
                k = allInterval.size();
                int k0 = k;
                double Ck0 = 0.5;
                boolean Loop = true;
                double Hpk_1 = 0;
                while (Loop && k >= 10) {
                    double minD = 1000;
                    int mergePoint = 0;
                    double Hp0 = Entropy(allInterval, size);
                    double Hpk;
                    ArrayList<Interval> newA = new ArrayList<>();
                    // Try every adjacent pair and remember the merge with the
                    // smallest entropy difference. addTmp builds the trial
                    // merge in a fresh object, so the real intervals stay
                    // untouched until the best pair is known.
                    for (int i = 0; i < allInterval.size() - 1; ++i) {
                        newA.addAll(allInterval);
                        newA.set(i, allInterval.get(i).addTmp(allInterval.get(i + 1)));
                        newA.remove(i + 1);
                        Hpk = Entropy(newA, size);
                        if (Hpk - Hp0 < minD) {
                            Hpk_1 = Hpk;
                            minD = Hpk - Hp0;
                            mergePoint = i;
                        }
                        newA.clear();
                    }
                    // Commit the best merge.
                    allInterval.get(mergePoint).merge(allInterval.get(mergePoint + 1));
                    allInterval.remove(mergePoint + 1);
                    // Keep merging only while the measure stays above the
                    // threshold Ck0.
                    double Ck_1 = (k0 - 1) * Hpk_1 - Hp0 * (k - 2);
                    if (Ck_1 > Ck0) {
                        --k;
                    } else {
                        Loop = false;
                        --k;
                    }
                }
                // Emit the cut points for this attribute, with sentinel outer
                // bounds wide enough to cover unseen values.
                range.clear();
                range.add(-100f);
                for (Interval s : allInterval) {
                    range.add(s.getTop());
                }
                range.add(100f);
                re.add(range);
            }
            return re;
        }
    }

    

    This is the main class: the Java implementation of EADC (an entropy-based discretization algorithm for continuous attributes)! My schedule has been three days of slacking for every day of work, but today I finally ground it out... which means the project is nearly wrapped up. Feels great!!!

    Honestly it is not that complicated... once I graduate I will rewrite my thesis as Jianshu posts and everyone will be able to follow it. For now, the mathematical formulation first!
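    Written out explicitly (the notation is mine, reconstructed from the code above rather than copied from the thesis), the two quantities EADC relies on are the partition entropy, for N samples split into k intervals I_1, ..., I_k,

        H(P_k) = -\sum_{i=1}^{k} p_i \log_2 p_i, \qquad p_i = \frac{|I_i|}{N}

    and the merge measure, matching the line double Ck_1 = (k0 - 1) * Hpk_1 - Hp0 * (k - 2):

        C_{k-1} = (k_0 - 1)\,H(P_{k-1}) - (k - 2)\,H(P_k)

    where k_0 is the initial number of non-empty intervals, H(P_k) the entropy before the merge, and H(P_{k-1}) the entropy after the best trial merge. As a quick sanity check of the entropy formula: two intervals holding 30 and 10 of 40 samples give H = -(0.75 log2 0.75 + 0.25 log2 0.25) ≈ 0.811.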

    The pseudocode this produces is a bit hard to read on its own, so here is my explanation instead!

    The whole discretization process runs as follows (a usage sketch follows after the flow chart):
    (1) Read the data from the database and pass it into the discretization method;
    (2) For one attribute at a time, extract all of its values and sort them;
    (3) Partition the sorted values into intervals, compute the initial entropy with the entropy formula, and set the measure Ck = 0;
    (4) Merge the pair of adjacent intervals whose merge changes the entropy the least, reset the cut point, and record the entropy after the merge;
    (5) Compute Ck-1 = h from the measure formula above;
    (6) If Ck-1 > Ck, set k = k - 1 and go back to step (4);
    (7) If Ck-1 < Ck, keep the current interval partition and stop the partitioning;
    (8) Discretize the input data according to the final partition.
    The discretization flow chart is as follows:

    That flow chart took me forever; only after drawing it did I really have the whole process straight... not easy, I tell you!!
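    To make step (8) concrete, here is a minimal usage sketch. The toy matrix and the EadcDemo class are my own illustration, not part of the thesis code; with this little data the merge loop barely runs, but the shape of the output, one list of cut points per attribute, is the same:

    import java.util.*;

    public class EadcDemo {
        public static void main(String[] args) {
            // Toy records: two sensor columns plus the class label last.
            float[][] data = {
                {1.0f, 10.0f, 0.0f},
                {1.2f, 11.0f, 0.0f},
                {1.1f,  9.5f, 0.0f},
                {5.0f, 30.0f, 1.0f},
                {5.2f, 31.0f, 1.0f},
                {5.1f, 29.5f, 1.0f},
            };
            ArrayList<List<Float>> cuts = Parameter.EADC(data);
            for (int i = 0; i < cuts.size(); ++i) {
                System.out.println("Attribute " + i + " cut points: " + cuts.get(i));
            }
            // A raw value v of attribute i is then discretized to the bin j
            // with cuts.get(i).get(j) <= v < cuts.get(i).get(j + 1).
        }
    }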

    After the Main Text

    The goal is to finish the thesis tonight and finish the layout tomorrow; ideally I self-check it tomorrow as well, then run it through the CNKI plagiarism check in three days... And the day after that, I want to give a certain someone a surprise. I just do not know whether she will ever see it!!

