Example (1): Feature Engineering

Author: 飘涯 | Published 2018-04-23 20:53

    Preface: machine learning engineers spend half their time on data cleaning, feature selection, dimensionality reduction, and other data processing. Below, using a spam-filtering system as an example, we walk through some of the essential work that comes before building a machine learning model.

    • Collect the data

    Different projects have different data sources; this was covered earlier.

    • Inspect the data

    The training data consists of 60,000+ emails together with their labels, as shown below:

    (Figure: sample emails and their labels)

    From the data we can determine the following:
    Task

    • Supervised or unsupervised learning? Binary or multi-class classification? Text classification or structured-data classification? Short-text or long-text classification?
      Answer: labels are provided, so this is supervised learning; binary classification; long-text classification

    Data

    • How should a sample be defined? What data should serve as features? How should we split the training and test sets?
      Answer: candidate features include the sender address, recipient address, send time, email content, and email length
      How do we pick suitable features from that list?
      Answer: through statistical computation (a minimal sketch follows this list)
      After that: choose a suitable model, optimize it for the specific task, tune it, and optionally ensemble several models
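
    A minimal sketch of what that statistical screening can look like, assuming a pandas DataFrame with one column per candidate feature and a 0/1 label column (the names here are illustrative, not from the original code):

    import pandas as pd

    def feature_vs_label(df, feature, label='label'):
        # cross-tabulate a candidate feature against the label;
        # if the class ratio is roughly the same in every row, the feature
        # carries little signal, while large deviations suggest a useful one
        return pd.crosstab(df[feature], df[label], normalize='index')

    # illustrative usage on a toy frame
    toy = pd.DataFrame({'has_date': [1, 1, 0, 0, 1, 0],
                        'label':    [0, 0, 1, 1, 0, 1]})
    print(feature_vs_label(toy, 'has_date'))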
    • Data preprocessing

    • Extract the features above into a CSV file
      1. Convert the labels to numbers
      The code is as follows:
    import sys
    import os
    import time
    '''
    Write the 60,000+ emails out one per line and build the labels;
    the labels are already annotated in the index file.
    '''
    # 1. build the label dictionary: file path -> "1" (spam) / "0" (ham)
    def label_dict(label_path):
        type_dict = {"spam": "1", "ham": "0"}
        content = open(label_path)
        index_dict = {}
        # try/finally so the file is closed even if an error occurs
        try:
            for line in content:
                arr = line.split(" ")
                if len(arr) == 2:
                    key, value = arr
                    value = value.replace("../data", '').replace("\n", '')
                    index_dict[value] = type_dict[key.lower()]
        finally:
            content.close()
        return index_dict

    a = label_dict("./full/index")
    print(a)
    

    The output looks like this:

    '/028/239': '0', '/028/240': '0', '/028/241': '1', '/028/242': '1', '/028/243': '1', '/028/244': '1', '/028/245': '1', '/028/2
    

    2. Extract the features; first define feature extraction for a single email file

    def feature_dict(email_path):
        # the corpus is GB2312-encoded; undecodable bytes are dropped
        email_content = open(email_path, 'r', encoding="gb2312", errors="ignore")
        content_dict = {}
        try:
            is_content = False
            for line in email_content:
                line = line.strip()  # strip leading/trailing whitespace
                if not is_content:
                    if line.startswith("From:"):
                        content_dict["from"] = line[5:]
                    elif line.startswith("To:"):
                        content_dict["to"] = line[3:]
                    elif line.startswith("Date:"):
                        content_dict["date"] = line[5:]
                    elif not line:
                        # the first blank line separates the headers from the body
                        is_content = True
                if is_content:
                    if "content" in content_dict:
                        content_dict['content'] += line
                    else:
                        content_dict['content'] = line
        finally:
            email_content.close()
        return content_dict
    

    Output:

    {'from': ' "yan"<(8月27-28,上海)培训课程>', 'to': ' lu@ccert.edu.cn', 'date': ' Tue, 30 Aug 2005 10:08:15 +0800', 'content': '非财务纠淼牟莆窆芾-(沙盘模拟
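
    The garbled body text above is what errors="ignore" produces when GBK-only bytes are decoded as gb2312. A minimal alternative sketch (a suggestion, not part of the original code) would try progressively wider Chinese codecs before falling back:

    def read_email_text(path):
        # gb18030 is a superset of gbk, which is a superset of gb2312,
        # so trying them from narrow to wide recovers more characters
        for enc in ('gb2312', 'gbk', 'gb18030'):
            try:
                with open(path, 'r', encoding=enc) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        # last resort: drop undecodable bytes, as the original code does
        with open(path, 'r', encoding='gb2312', errors='ignore') as f:
            return f.read()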
    

    3. Convert the dictionary above into one line of comma-separated text

    def dict_to_text(email_path):
        content_dict = feature_dict(email_path)
        # strip commas from each field so the line stays valid CSV
        result_str = content_dict.get('from', 'unknown').replace(',', '').strip() + ","
        result_str += content_dict.get('to', 'unknown').replace(',', '').strip() + ","
        result_str += content_dict.get('date', 'unknown').replace(',', '').strip() + ","
        result_str += content_dict.get('content', 'unknown').replace(',', ' ').strip()
        return result_str
    

    The output is:

    "yan"<(8月27-28上海)培训课程>,lu@ccert.edu.cn,Tue 30 Aug 2005 10:08:15 +0800,非财务纠淼牟莆窆芾-(沙盘模拟)------如何运用财务岳硖岣吖芾砑
    

    4. Extract these features for every email and write them to one file, using two nested for loops

    start = time.time()
    index_dict = label_dict("./full/index")
    list0 = os.listdir('./data')  # names of the per-batch folders

    os.makedirs('./process', exist_ok=True)  # make sure the output folder exists

    for l1 in list0:  # write the files of each of the N folders into N intermediate files
        l1_path = './data/' + l1
        print('Processing folder ' + l1_path)
        list1 = os.listdir(l1_path)

        write_file_path = './process/process01_' + l1

        with open(write_file_path, "w", encoding='utf-8') as writer:
            for l2 in list1:
                l2_path = l1_path + "/" + l2  # full path of the file to process

                index_key = "/" + l1 + "/" + l2

                if index_key in index_dict:
                    content_str = dict_to_text(l2_path)
                    content_str += "," + index_dict[index_key] + "\n"
                    writer.writelines(content_str)

    with open('./result_process01', "w", encoding='utf-8') as writer:
        for l1 in list0:
            file_path = './process/process01_' + l1
            print("Merging file: " + file_path)

            with open(file_path, encoding='utf-8') as file:
                for line in file:
                    writer.writelines(line)

    end = time.time()

    print('Total processing time: %.2f s' % (end - start))
    

    The result looks like this:

    (Figure: the merged output file)
    • Data analysis

    Examine how each feature relates to the label.
    1. Does the sender/recipient address affect the label?

    import re
    import pandas as pd

    df = pd.read_csv('./result_process01', sep=',', header=None, names=['from', 'to', 'date', 'content', 'label'])

    def 获取邮件收发地址(strl):  # extract the mail domain from a sender/recipient field
        it = re.findall(r"@([A-Za-z0-9]*\.[A-Za-z0-9\.]+)", str(strl))  # regex match
        if len(it) > 0:
            result = it[0]
        else:
            result = 'unknown'
        return result

    df['from_address'] = pd.Series(map(lambda s: 获取邮件收发地址(s), df['from']))  # map over the column and add it
    df['to_address'] = pd.Series(map(lambda s: 获取邮件收发地址(s), df['to']))
    # analysis: how many distinct domains, and how many emails per domain?
    print(df['from_address'].unique().shape)
    print(df['from_address'].value_counts().head(5))
    from_address_df = df.from_address.value_counts().to_frame()  # structured output, keeps the index
    print(from_address_df.head(5))
    

    Result:

    (3567,)
    163.com                  7500
    mail.tsinghua.edu.cn     6498
    126.com                  5822
    tom.com                  4075
    mails.tsinghua.edu.cn    3205
    

    The counts show that spam and normal mail come from much the same common domains, so the address tells us little about the label (a quick check of this claim is sketched below). The send time likewise has no effect.
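
    A minimal sketch of that check, assuming the df, pd, and from_address column from the block above:

    # label ratio within each of the ten most common sender domains;
    # near-identical rows mean the domain says little about the label
    top = df['from_address'].value_counts().head(10).index
    ratio = pd.crosstab(df[df['from_address'].isin(top)]['from_address'],
                        df['label'], normalize='index')
    print(ratio)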
    2. Tokenize the email content

    import jieba

    print('=' * 30 + 'Starting word segmentation; please wait about 5 minutes...' + '=' * 20)
    df['content'] = df['content'].astype('str')  # make sure every row is a str
    df['jieba_cut_content'] = list(map(lambda st: "  ".join(jieba.cut(st)), df['content']))
    print(df["jieba_cut_content"].head(4))
    

    3. Does the email length affect whether an email is spam?

    import bisect
    import matplotlib.pyplot as plt

    def 邮件长度统计(lg):
        # bucket the content length into 15 bands, 0 through 14
        bounds = [10, 100, 500, 1000, 1500, 2000, 2500, 3000,
                  4000, 5000, 10000, 20000, 30000, 50000]
        return bisect.bisect_left(bounds, lg)

    df['content_length'] = pd.Series(map(lambda st: len(st), df['content']))
    df['content_length_type'] = pd.Series(map(lambda st: 邮件长度统计(st), df['content_length']))
    # print(df.head(10))  # without count, rows keep their natural order
    # agg adds a 'count' column for the later ratio computation
    df2 = df.groupby(['content_length_type', 'label'])['label'].agg(['count']).reset_index()
    df3 = df2[df2.label == 1][['content_length_type', 'count']].rename(columns={'count': 'c1'})
    df4 = df2[df2.label == 0][['content_length_type', 'count']].rename(columns={'count': 'c2'})
    df5 = pd.merge(df3, df4)  # note: pandas merge joins on the shared key, unlike concat
    df5['c1_ratio'] = df5.apply(lambda r: r['c1'] / (r['c1'] + r['c2']), axis=1)
    df5['c2_ratio'] = df5.apply(lambda r: r['c2'] / (r['c1'] + r['c2']), axis=1)
    # print(df5)
    # plot both ratios to decide whether length is a usable signal
    plt.plot(df5['content_length_type'], df5['c1_ratio'], label='spam ratio')
    plt.plot(df5['content_length_type'], df5['c2_ratio'], label='ham ratio')
    plt.grid(True)
    plt.legend(loc=0)  # add a legend
    plt.show()
    

    The plot shows that length does have some effect on whether an email is spam.
    Next, write a function fitted to that curve to turn length into a feature:


    import numpy as np

    def process_content_sema(x):
        # hand-fitted mapping from content length to a numeric score
        if x > 10000:
            return 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1) - np.log(abs(x - 10000)) + 1
        else:
            return 0.5 / np.exp(np.log10(x) - np.log10(500)) + np.log(abs(x - 500) + 1)
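
    A quick sketch to see the shape of the score (the sample lengths are arbitrary):

    # print the score for a few representative content lengths
    for length in (10, 100, 500, 1000, 5000, 10000, 20000, 50000):
        print(length, round(process_content_sema(length), 3))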
    

    4. Feature selection
    Drop the features that turned out to be useless and save the useful ones.

    df['content_length_sema'] = list(map(lambda st: process_content_sema(st), df['content_length']))
    # print(df.head(10))
    # sys.exit(0)
    print(df.dtypes)  # shows the name and dtype of every column

    # date_week, date_hour and date_time_quantum come from date-processing
    # steps omitted from this excerpt (see the full code on GitHub)
    df.drop(['from', 'to', 'date', 'from_address', 'to_address',
             'date_week', 'date_hour', 'date_time_quantum', 'content',
             'content_length', 'content_length_type'], axis=1, inplace=True)
    print(df.info())
    print(df.head(10))

    df.to_csv('./result_process02', encoding='utf-8', index=False)
    df.to_csv('./result_process02.csv', encoding='utf-8', index=False)
    

    The result looks like this:

    (Figure: preview of the saved feature table)

    • Model training

    We choose a naive Bayes model because it is fast to train and works well on text.
    We evaluate the model primarily by recall (together with precision and F1), as defined below.
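
    For reference, with TP = spam correctly flagged, FP = normal mail wrongly flagged, and FN = spam missed: precision = TP / (TP + FP), recall = TP / (TP + FN), and F1 is their harmonic mean. High recall means few spam messages slip through the filter.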

    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # turn tokenized text into count / tf-idf matrices
    from sklearn.decomposition import TruncatedSVD
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import f1_score, precision_score, recall_score

    # mpl.rcParams['font.sans-serif'] = [u'simHei']
    # mpl.rcParams['axes.unicode_minus'] = False

    df = pd.read_csv('./result_process02.csv', sep=',')
    # print(df.head(5))
    df.dropna(axis=0, how='any', inplace=True)  # drop any row containing NaN
    # print(df.head(5))
    # print(df.info())

    # has_date comes from a date-processing step omitted from this excerpt
    x_train, x_test, y_train, y_test = train_test_split(df[['has_date', 'jieba_cut_content']],
                                                        df['label'], test_size=0.2, random_state=0)

    # print("Training set size: %d" % x_train.shape[0])
    # print("Test set size: %d" % x_test.shape[0])
    # print(x_train.head(10))
    # print(x_test.head(10))  # note the indices carried over from df
    # ================================================================================================
    print('=' * 30 + 'Computing tf-idf weights' + '=' * 30)
    transformer = TfidfVectorizer(norm='l2', use_idf=True)  # use inverse document frequency
    svd = TruncatedSVD(n_components=20)
    jieba_cut_content = list(x_train['jieba_cut_content'].astype('str'))
    transformer_model = transformer.fit(jieba_cut_content)
    df1 = transformer_model.transform(jieba_cut_content)
    # print(df1)
    # print(df1.shape)
    print('=' * 30 + 'Starting SVD dimensionality reduction' + '=' * 30)
    svd_model = svd.fit(df1)
    df2 = svd_model.transform(df1)
    data = pd.DataFrame(df2)
    # print(data.head(10))
    # print(data.info())
    print('=' * 30 + 'Rebuilding the feature matrix' + '=' * 30)
    data['has_date'] = list(x_train['has_date'])
    # data['content_length_sema'] = list(x_train['content_length_sema'])
    # print(data.head(10))
    # print(data.info())
    print('=' * 30 + 'Building the Bernoulli naive Bayes model' + '=' * 30)
    nb = BernoulliNB(alpha=1.0, binarize=0.0005)  # binarize: threshold for turning features into 0/1
    model = nb.fit(data, y_train)
    # ================================================================================
    print('=' * 30 + 'Building the test set' + '=' * 30)
    jieba_cut_content_test = list(x_test['jieba_cut_content'].astype('str'))
    data_test = pd.DataFrame(svd_model.transform(transformer_model.transform(jieba_cut_content_test)))
    data_test['has_date'] = list(x_test['has_date'])
    # data_test['content_length_sema'] = list(x_test['content_length_sema'])
    # print(data_test.head(10))
    # print(data_test.info())
    print('=' * 30 + 'Predicting on the test set' + '=' * 30)
    y_predict = model.predict(data_test)

    precision = precision_score(y_test, y_predict)
    recall = recall_score(y_test, y_predict)
    f1mean = f1_score(y_test, y_predict)

    print('Precision: %0.5f' % precision)
    print('Recall: %0.5f' % recall)
    print('F1 score: %0.5f' % f1mean)
    

    Results:
    Precision: 0.94549
    Recall: 0.98925
    F1 score: 0.96688
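
    As one more sanity check, a minimal sketch (assuming y_test and y_predict from the block above) that prints the confusion matrix:

    from sklearn.metrics import confusion_matrix

    # rows = true class (0 = ham, 1 = spam), columns = predicted class
    print(confusion_matrix(y_test, y_predict))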
    The full code and documentation are on GitHub: https://github.com/dctongsheng/Spam-filtering-projects001
