美文网首页
2020-09-21

2020-09-21

作者: 池中之于 | 来源:发表于2020-09-21 22:47 被阅读0次

    金融风控练习赛——Task3 特征工程

    1. 数据预处理

    1.1 数据清洗

    数据清洗主要用于对原始数据中的缺失值、异常值进行处理。

    
    #数据读取及分类
    
    import pandas as pd
    
    import numpy as np
    
    import matplotlib.pyplot as plt
    
    import seaborn as sns
    
    import datetime
    
    from tqdm import tqdm
    
    from sklearn.preprocessing import LabelEncoder
    
    from sklearn.feature_selection import SelectKBest
    
    from sklearn.feature_selection import chi2
    
    from sklearn.preprocessing import MinMaxScaler
    
    import xgboost as xgb
    
    import lightgbm as lgb
    
    from catboost import CatBoostRegressor
    
    import warnings
    
    from sklearn.model_selection import StratifiedKFold, KFold
    
    from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
    
    warnings.filterwarnings('ignore')
    
    # Load the training set and the "A" test set.
    data_train = pd.read_csv('../data/train.csv')
    data_test_a = pd.read_csv('../data/testA.csv')

    # Split the training columns by dtype: every non-object column is treated as
    # numerical, every remaining column as categorical.
    numerical_fea = data_train.select_dtypes(exclude=['object']).columns.tolist()
    category_fea = [col for col in data_train.columns if col not in numerical_fea]

    # The target column is not a feature, so take it out of the numerical list.
    label = 'isDefault'
    numerical_fea.remove(label)
    
    # Numerical features: fill missing values with the per-column MEDIAN of the
    # training set. The training statistic is reused for the test set so both
    # frames are imputed with the same values.
    numerical_fill = data_train[numerical_fea].median()
    data_train[numerical_fea] = data_train[numerical_fea].fillna(numerical_fill)
    data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(numerical_fill)
    # Categorical features: fill missing values with the most frequent training value.
    # DataFrame.mode() returns a DataFrame (one row per tied mode); handing that
    # to fillna() aligns on the row index, so only row 0 would ever be filled.
    # Taking the first row yields a column -> value Series that fills every row.
    category_fill = data_train[category_fea].mode().iloc[0]
    data_train[category_fea] = data_train[category_fea].fillna(category_fill)
    data_test_a[category_fea] = data_test_a[category_fea].fillna(category_fill)
    
    

    1.2 数据分箱

    • 分位数分箱
    # Quantile binning: pd.qcut is asked for 10 quantile-based bins, and
    # labels=False makes it return integer bin indices instead of interval labels.
    # NOTE(review): `data` is not defined before this line in the visible code;
    # presumably it stands for data_train / data_test_a. Confirm before running.
    data['loanAmnt_bin'] = pd.qcut(data['loanAmnt'], 10, labels=False)
    

    2. 变量选择

    主要以变量的线性相关性为例。

    x_train = data_train.drop(['isDefault'], axis=1)
    # Correlation (not covariance) of each feature column with the label.
    # Only numeric columns are passed to corrwith(): the frame may still hold
    # object-typed columns, and newer pandas versions no longer silently skip
    # non-numeric columns when computing correlations.
    numeric_features = x_train.select_dtypes(include='number')
    data_corr = numeric_features.corrwith(data_train['isDefault'])
    # Two-column summary table: feature name and its correlation with the label.
    result = pd.DataFrame({'features': data_corr.index, 'corr': data_corr.values})
    

    方差选择特征

    from sklearn.feature_selection import VarianceThreshold
    # The `threshold` argument is the variance cut-off given to the selector.
    # NOTE(review): `train` and `target_train` are not defined in the visible code,
    # and the transformed result is not assigned to anything. Confirm intent.
    VarianceThreshold(threshold=3).fit_transform(train,target_train)
    

    3. 时间特征处理

    # Reference date used as "day zero" for the elapsed-days feature. It does not
    # depend on the frame being processed, so it is built once, outside the loop.
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    for data in [data_train, data_test_a]:
        # Parse the issue date strings into datetimes.
        data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
        # Derived feature: whole days between the reference date and the issue date.
        # Subtracting directly on the datetime column is vectorized, unlike a
        # row-by-row apply().
        data['issueDateDT'] = (data['issueDate'] - startdate).dt.days
    
    def to_employmentLength(s):
        """Convert an employment-length string such as ``'3 years'`` to ``np.int8``.

        Values that ``pd.isnull`` reports as missing are returned unchanged.
        Otherwise the first whitespace-separated token of ``s`` is parsed as
        the integer value.
        """
        if not pd.isnull(s):
            leading_token = s.split()[0]
            return np.int8(leading_token)
        return s
    for data in [data_train, data_test_a]:
        # Rewrite the two non-numeric labels so every value starts with an integer
        # token. The result is assigned back rather than using inplace=True on a
        # column selection, which may act on a temporary copy and leave the frame
        # unchanged.
        data['employmentLength'] = data['employmentLength'].replace(
            {'10+ years': '10 years', '< 1 year': '0 years'}
        )
        # Call the converter under the name it is defined with in this file;
        # `employmentLength_to_int` does not exist and raised NameError.
        data['employmentLength'] = data['employmentLength'].apply(to_employmentLength)
    

    处理类别特征

    # Inspect the categorical features.
    category_fea
    # ['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']

    # Keep only the last four characters of each value, converted to int
    # (presumably the year part of the string; verify against the data).
    for data in (data_train, data_test_a):
        data['earliesCreditLine'] = data['earliesCreditLine'].map(lambda text: int(text[-4:]))

    # Ordinal-encode the grade letters A..G as 1..7.
    grade_codes = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
    for data in (data_train, data_test_a):
        data['grade'] = data['grade'].map(grade_codes)
    
    # One-hot encode the listed columns, dropping the first level of each.
    # Rebinding a loop variable (`data = pd.get_dummies(data, ...)`) only changes
    # the local name and leaves data_train / data_test_a untouched, so each frame
    # is reassigned explicitly.
    dummy_columns = ['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode']
    data_train = pd.get_dummies(data_train, columns=dummy_columns, drop_first=True)
    data_test_a = pd.get_dummies(data_test_a, columns=dummy_columns, drop_first=True)
    # NOTE(review): the two frames are encoded independently, so their dummy
    # columns can differ if a category value appears in only one of them.
    # Confirm the columns line up before fitting a model.
    

    相关文章

      网友评论

          本文标题:2020-09-21

          本文链接:https://www.haomeiwen.com/subject/enabyktx.html