美文网首页
classification_procedure_python

classification_procedure_python

作者: Tim_Chen | 来源:发表于2017-07-28 16:32 被阅读0次

    import sklearn
    import pandas as pd
    import numpy as np

    '''----------导入导出数据---------'''
    # Section: loading and exporting data.

    # Read the raw CSV into a DataFrame.
    file1 = '/Users/e/Desktop/active_lost_prediction_model.csv'
    t = pd.read_csv(file1)

    # Pivot table: rows are keyed by (user_id, exist_type), columns are spread
    # over the distinct `service` values, each cell keeps the maximum of the
    # matching rows, and empty cells are filled with 0.
    # The aggregation is given as the string 'max' instead of the builtin
    # `max`: recent pandas versions emit a FutureWarning for the builtin and
    # recommend the string to keep the current behaviour.
    t = t.pivot_table(
        index=['user_id', 'exist_type'],
        columns='service',
        aggfunc='max',
        fill_value=0,
    )

    # Write the pivoted table back out as CSV.
    file2 = '/Users/e/Desktop/export.csv'
    t.to_csv(file2)
    
    '''----------数据处理---------'''
    # Section: data cleaning.

    # Fill missing values with the sentinel -1.
    t_pro = t.fillna(-1)

    # Replace values: encode the '男' (male) / '女' (female) labels in the
    # 'sex' column as 1 / 0.
    t_pro['sex'] = t_pro['sex'].replace(['男','女'],[1,0])

    # Recode selected values through a lookup table.
    replace_dict = {
        -1.0:20,
        1.0:1,
        2.0:20
    }
    # Values that have no entry in the table (such as the 0 produced by the
    # replacement just above) are passed through unchanged; indexing the dict
    # directly raised KeyError on the first such value.
    t_pro['sex'] = t_pro['sex'].map(lambda value: replace_dict.get(value, value))
    
    
    '''----------平衡类---------'''
    # Section: class balancing by down-sampling.

    # Split the rows by class label and count the rows in each class.
    t_pro_label1 = t_pro[t_pro['label']==1]
    t_pro_label0 = t_pro[t_pro['label']==0]

    # len() gives the row count as a plain int. DataFrame.count() returns a
    # per-column Series of non-null counts, which cannot be used as `n` below.
    label1_cnt = len(t_pro_label1)
    label0_cnt = len(t_pro_label0)

    # Sampling: draw as many label-1 rows as there are label-0 rows.
    # NOTE(review): sampling without replacement raises ValueError when
    # label 1 has fewer rows than label 0 -- this assumes label 1 is the
    # majority class; confirm against the data.
    t_pro_label1_sampled = t_pro_label1.sample(n = label0_cnt)

    # Merge: stack the two classes row-wise. pd.concat takes the frames as a
    # single sequence, not as separate positional arguments.
    t_prod = pd.concat([t_pro_label1_sampled, t_pro_label0], axis = 0)
    
    
    '''----------划分训练集---------'''
    # Section: train/test split.

    # Split into features X and target y. The frame used here is `t_prod`,
    # the balanced table built earlier in the script; the name `data` that was
    # used before is never defined anywhere in the file.
    y = t_prod['label']
    X = t_prod.drop(['label','user_id','selected_day'],axis = 1)

    # Hold out 30% of the rows for testing, stratified on the label so both
    # parts keep the same class proportions; random_state fixes the shuffle.
    # train_test_split lives in sklearn.model_selection; the old
    # sklearn.cross_validation module was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y,
    )
    
    '''----------训练与预测模型---------'''
    # Section: model training and prediction.
    from sklearn.ensemble import RandomForestClassifier

    # Random forest of 10 entropy-criterion trees, built with 2 parallel jobs
    # and a fixed seed.
    forest = RandomForestClassifier(
        n_estimators=10,
        criterion='entropy',
        n_jobs=2,
        random_state=1,
    )
    forest.fit(X_train, y_train)

    # Predict on the held-out rows and on the training rows themselves, so
    # that the two sets of predictions can be compared afterwards.
    y_pred = forest.predict(X_test)
    y_train_pred = forest.predict(X_train)
    
    
    
    
    '''----------训练与测试的准确性---------'''
    # Section: accuracy on the test set and on the training set.
    from sklearn.metrics import accuracy_score, classification_report

    # For each split print a heading, the accuracy score, and the
    # classification report: first the test split under 'predict', then the
    # training split under 'model'.
    for heading, truth, predicted in (
        ('predict', y_test, y_pred),
        ('model', y_train, y_train_pred),
    ):
        print(heading)
        print(accuracy_score(truth, predicted))
        print(classification_report(truth, predicted))
    
    
    
    
    '''----------随机森林相关函数---------'''
    # Section: random-forest helpers.

    # Feature importances: list every feature of the fitted forest, ordered
    # from most to least important.
    importances = forest.feature_importances_
    feature_labels = X_train.columns
    indices = np.argsort(importances)[::-1]  # reversed ascending sort = descending
    for rank, idx in enumerate(indices, start=1):
        # Apply the format with `%`. Passing the values as extra print()
        # arguments printed the raw format string followed by the values.
        print("%d  %20s  %f" % (rank, feature_labels[idx], importances[idx]))

    相关文章

      网友评论

          本文标题:classification_procedure_python

          本文链接:https://www.haomeiwen.com/subject/dzkflxtx.html