美文网首页每天AI你三千遍
统计特征IV值算法实现demo

统计特征IV值算法实现demo

作者: AI_Engine | 来源:发表于2018-04-21 23:37 被阅读89次

    import numpy as np

    from math import log

    from math import e

    import os,sys

    # Python 2 only: make UTF-8 the process-wide default codec for implicit
    # str <-> unicode conversions.  reload(sys) is required there because
    # sys.setdefaultencoding is removed from the module during interpreter
    # start-up.  Python 3 has neither the reload builtin nor
    # sys.setdefaultencoding (its default is already UTF-8), so the hack is
    # skipped instead of crashing with NameError.
    if sys.version_info[0] < 3:
        reload(sys)
        sys.setdefaultencoding('utf-8')

    #获取数据并以列表的形式返回

    def achieve_data(path):
        """Load every ``.txt`` file directly inside ``path`` into one list.

        Each matching file is parsed with ``numpy.genfromtxt`` as floats and
        the entries obtained by iterating the parsed array (its rows, for a
        2-D array) are appended to a single flat list.

        Args:
            path: Directory that holds the sample files.

        Returns:
            list: The collected entries, or an empty list when ``path`` is
            not a directory or contains no ``.txt`` file.
        """
        feaValue_list = []
        if not os.path.isdir(path):
            return feaValue_list

        # Sorted so that the row order does not depend on the arbitrary order
        # in which the operating system lists the directory.
        for f in sorted(os.listdir(path)):
            if not f.endswith('.txt'):
                # Only parsed files may contribute rows.  Previously the
                # extend() ran for every directory entry, which re-appended
                # the last parsed file (or raised NameError when the first
                # entry was not a .txt file).
                continue
            # os.path.join instead of a hard-coded '\\' keeps this portable.
            fea_array = np.genfromtxt(os.path.join(path, f), dtype=float)
            # NOTE(review): a file with a single row (or a single column) is
            # parsed as a 1-D array, so its scalars - not a row - get
            # appended here; confirm whether such files can occur.
            feaValue_list.extend(fea_array)
        return feaValue_list

    #计算每个特征的分组临界值点,这里分为4组。

    def cal_stage_vlaue(sample_list):
        """Compute the group boundaries (4 groups) of every feature column.

        Each column is sorted on its own and the values found at 1/4, 2/4 and
        3/4 of the sample count, plus the column maximum, are used as the
        boundaries.

        Args:
            sample_list: Rectangular sequence of samples, one row per sample
                and one column per feature.

        Returns:
            list: One ``[0, q1, q2, q3, max]`` list per feature, or an empty
            list when there are no samples.  The lowest boundary is the
            constant 0, not the column minimum.
        """
        if len(sample_list) == 0:
            # Nothing to split; avoids the IndexError on sample_list[0].
            return []

        # Floor division: the step is used as an array index, so it has to
        # stay an int on Python 3 as well.
        step_num = len(sample_list) // 4
        fea_num = len(sample_list[0])
        sample_array = np.array(sample_list)

        all_fea_step = []
        for i in range(fea_num):
            # Only column i is needed, so sort that column instead of
            # reordering the whole sample matrix for every feature.
            sorted_col = np.sort(sample_array[:, i])
            one_fea_step = [
                0,
                sorted_col[step_num],
                sorted_col[2 * step_num],
                sorted_col[3 * step_num],
                sorted_col[-1],
            ]
            all_fea_step.append(one_fea_step)
        return all_fea_step

    #计算每个特征的不同分组情况下,黑白样本分别覆盖的数量

    def _count_in_groups(samples, fea_index, bounds):
        """Count how many samples fall into each group of one feature.

        Group ``g`` covers ``bounds[g] <= value < bounds[g + 1]``; the last
        group also includes its upper bound.  Values outside every group are
        ignored.
        """
        group_num = len(bounds) - 1
        counts = [0] * group_num
        for sample in samples:
            value = sample[fea_index]
            for g in range(group_num):
                if value < bounds[g]:
                    continue
                is_last = g == group_num - 1
                if value < bounds[g + 1] or (is_last and value <= bounds[g + 1]):
                    counts[g] += 1
                    break
        return counts

    def cal_proportion(stage_array,white_list,black_list):
        """Count, per feature and per group, the black and white samples.

        Args:
            stage_array: Group boundaries per feature, e.g. the
                ``[0, q1, q2, q3, max]`` lists built by ``cal_stage_vlaue``.
            white_list: White samples, one row per sample.
            black_list: Black samples, one row per sample.

        Returns:
            list: One ``[black_counts, white_counts]`` pair per feature,
            each a list with one count per group, after ``fix_pro`` has
            replaced every zero count by 1.
        """
        all_fea_pro_list = []
        for i in range(len(stage_array)):
            bounds = stage_array[i]
            # Both classes share one counting rule.  The black branch used to
            # be a syntax error (missing colon) and excluded the top boundary
            # that the white branch includes, dropping maximum-valued black
            # samples.
            black_fea_pro_list = _count_in_groups(black_list, i, bounds)
            white_fea_pro_list = _count_in_groups(white_list, i, bounds)
            all_fea_pro_list.append([black_fea_pro_list, white_fea_pro_list])

        # Zero counts would later cause a division by zero / log of zero.
        all_fea_pro_list = fix_pro(all_fea_pro_list)
        return all_fea_pro_list

    #这里是人工处理黑白样本在分组中覆盖数量为0的情况。

    def fix_pro(pro_list):
        """Replace every zero count by 1, in place, and return ``pro_list``.

        ``pro_list`` is walked three levels deep (feature -> class ->
        group); any innermost entry equal to 0 is overwritten with 1.
        """
        for per_feature in pro_list:
            for group_counts in per_feature:
                for position, count in enumerate(group_counts):
                    if count == 0:
                        group_counts[position] = 1
        return pro_list

    #计算每个特征的IV值

    def cal_IV(propor_array):
        """Compute the Information Value (IV) of every feature.

        For each group the black share ``DB`` (count / black total) and the
        white share ``DG`` (count / white total) are computed, and the
        feature's IV is the sum over its groups of
        ``(DG - DB) * ln(DG / DB)``.

        Args:
            propor_array: One ``[black_counts, white_counts]`` pair per
                feature; nested lists and numpy arrays are both accepted.

        Returns:
            list: One float IV per feature, in input order.
        """
        IV_list = []
        for fea_counts in propor_array:
            # asarray lets plain nested lists (what cal_proportion returns)
            # work too; previously .T required an ndarray.
            counts = np.asarray(fea_counts)
            # Same convention as fix_pro: a zero count becomes 1 so that the
            # ratio and its logarithm stay defined instead of raising.
            counts = np.where(counts == 0, 1, counts)

            # One row per group, columns = (black, white).
            pro_part_array = counts.T
            print(pro_part_array)
            black_sum, white_sum = pro_part_array.sum(axis=0)
            # Formatted explicitly so the output is the same on Python 2 and 3.
            print('%s %s' % (black_sum, white_sum))

            IV = 0.0
            for black_num, white_num in pro_part_array:
                DB = float(black_num) / float(black_sum)
                DG = float(white_num) / float(white_sum)
                IV += (DG - DB) * log(DG / DB)
            IV_list.append(IV)
        return IV_list

    相关文章

      网友评论

        本文标题:统计特征IV值算法实现demo

        本文链接:https://www.haomeiwen.com/subject/astqlftx.html