美文网首页
贷款利润最大化

贷款利润最大化

作者: ForgetThatNight | 来源:发表于2018-07-07 11:20 被阅读369次

    数据下载地址 https://www.lendingclub.com/info/download-data.action

    样本示例
    import pandas as pd

    # Load the raw Lending Club export; the first line of the file is a
    # disclaimer row, so skip it.
    loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1)
    # `thresh` must be an integer count of required non-null values.
    # Plain `/` produces a float on Python 3, so use floor division.
    half_count = len(loans_2007) // 2
    # Drop columns that are more than half empty, plus the free-text
    # description/url columns which carry no predictive signal.
    loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
    loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)
    loans_2007.to_csv('loans_2007.csv', index=False)

    import pandas as pd
    loans_2007 = pd.read_csv("loans_2007.csv")
    # loans_2007.drop_duplicates()  # show the first row and the column count
    print(loans_2007.iloc[0])
    print(loans_2007.shape[1])
    

    数据预处理 --去掉没用的特征

    # Remove identifiers and every field that leaks the loan outcome
    # (values only known after the loan is funded or repaid).
    leakage_columns = [
        "id", "member_id", "funded_amnt", "funded_amnt_inv",
        "grade", "sub_grade", "emp_title", "issue_d",
        "zip_code", "out_prncp", "out_prncp_inv", "total_pymnt",
        "total_pymnt_inv", "total_rec_prncp",
        "total_rec_int", "total_rec_late_fee", "recoveries",
        "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt",
    ]
    loans_2007 = loans_2007.drop(leakage_columns, axis=1)
    print(loans_2007.iloc[0])
    print(loans_2007.shape[1])
    

    统计loan_status取值的次数

    print(loans_2007['loan_status'].value_counts())
    

    数据预处理 -- 拿到最多的两个值作为分类的值--这里做二分类

    
    # Keep only the two terminal statuses and binarize them for a
    # two-class problem: Fully Paid -> 1, Charged Off -> 0.
    loans_2007 = loans_2007[(loans_2007['loan_status'] == "Fully Paid") | (loans_2007['loan_status'] == "Charged Off")]

    status_replace = {
        "loan_status" : {
            "Fully Paid": 1,
            "Charged Off": 0,
        }
    }
    # Let pandas map the label strings to the numeric target.
    loans_2007 = loans_2007.replace(status_replace)

    # Columns that hold a single unique value carry no information, so
    # find and remove them (NaNs are dropped before counting uniques).
    orig_columns = loans_2007.columns
    drop_columns = []
    for col in orig_columns:
        col_series = loans_2007[col].dropna().unique()
        if len(col_series) == 1:
            drop_columns.append(col)
    loans_2007 = loans_2007.drop(drop_columns, axis=1)
    print(drop_columns)
    # Python 3 print function -- the original used the Python 2
    # statement form, which is a SyntaxError on Python 3.
    print(loans_2007.shape)
    loans_2007.to_csv('filtered_loans_2007.csv', index=False)
    

    只有24列了:
    ['initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens']
    (39560, 24)
    现在数据集里还包含了很多的缺失值、字符、标点符号之类的

    import pandas as pd

    # Reload the filtered data and report, column by column, how many
    # values are missing (isnull flags each hole, sum totals them).
    loans = pd.read_csv('filtered_loans_2007.csv')
    null_counts = loans.isnull().sum()
    print(null_counts)
    
    缺失值较多的列要么去掉样本要么去掉列 这里去掉列

    缺失值较少的就直接去掉对应的样本

    统计各类型的特征个数

    # pub_rec_bankruptcies has too many holes, so drop the whole column;
    # the remaining sparse NaNs are cheap to remove row-wise.
    loans = loans.drop("pub_rec_bankruptcies", axis=1).dropna(axis=0)
    # Tally how many columns of each dtype remain.
    print(loans.dtypes.value_counts())
    
    sklearn不认object这种字符类的数据,只认数字

    统计各个object类的数据到底是什么情况,以便进行数据清洗

    # sklearn only accepts numeric input, so inspect the first row of
    # every string (object) column to plan the cleanup.
    object_columns_df = loans.select_dtypes(include=["object"])
    first_row = object_columns_df.iloc[0]
    print(first_row)
    
    某些列可以去掉单位 然后保留
    # Survey the candidate categorical columns; some of them only need a
    # unit suffix (e.g. "%", " years") stripped to become numeric.
    for column_name in ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']:
        print(loans[column_name].value_counts())
    
    可以将部分特征转换为对应的数值型
    # Compare the two free-text-ish columns: purpose has a manageable
    # set of values, while title is far too fragmented to keep.
    for text_col in ("purpose", "title"):
        print(loans[text_col].value_counts())
    
    title的选项较多,这里去掉title这个特征

    将数据转换为sklearn能处理的数据

    # Encode employment tenure as an ordinal year count;
    # "< 1 year" and "n/a" both collapse to 0.
    emp_length_values = {"10+ years": 10, "1 year": 1, "< 1 year": 0, "n/a": 0}
    for n_years in range(2, 10):
        emp_length_values["%d years" % n_years] = n_years
    mapping_dict = {"emp_length": emp_length_values}

    # Dates, state and the fragmented title field are dropped rather
    # than encoded.
    loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
    # Strip the trailing "%" and convert the rate columns to floats.
    loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
    loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
    loans = loans.replace(mapping_dict)
    
    混淆矩阵的判定标准 如TP表示能还钱我们预测为要还钱

    由于关系到利润最大化问题,不能再单纯的考虑精度和召回率了,比如FP和FN对公司的损失不是一样大的


    # One-hot encode the remaining categorical columns.
    # NOTE: emp_length was already mapped to integers above, so it must
    # NOT be in this list -- pd.get_dummies passes numeric columns
    # through unchanged, concat would duplicate it, and the final
    # drop(cat_columns) would then remove BOTH copies, silently losing
    # the feature.
    cat_columns = ["home_ownership", "verification_status", "purpose", "term"]
    dummy_df = pd.get_dummies(loans[cat_columns])
    loans = pd.concat([loans, dummy_df], axis=1)
    loans = loans.drop(cat_columns, axis=1)
    # pymnt_plan is effectively constant -- drop it too.
    loans = loans.drop("pymnt_plan", axis=1)

    loans.to_csv('cleaned_loans2007.csv', index=False)
    
    # Reload the cleaned, fully numeric dataset and summarize it.
    import pandas as pd
    loans = pd.read_csv("cleaned_loans2007.csv")
    # info() prints the dtype / non-null summary itself and returns
    # None, so print() adds a trailing "None" line.
    print(loans.info())
    
    清洗过后的样本集
    import pandas as pd

    # NOTE(review): this fragment assumes a `predictions` Series aligned
    # with `loans` already exists -- it is only produced further down in
    # the article.
    actual = loans["loan_status"]

    # False positives: predicted repay (1) but the loan charged off (0).
    fp_filter = (predictions == 1) & (actual == 0)
    fp = len(predictions[fp_filter])

    # True positives: predicted repay and the loan was fully paid.
    tp_filter = (predictions == 1) & (actual == 1)
    tp = len(predictions[tp_filter])

    # False negatives: predicted default but the loan was repaid.
    fn_filter = (predictions == 0) & (actual == 1)
    fn = len(predictions[fn_filter])

    # True negatives: predicted default and it defaulted.
    tn_filter = (predictions == 0) & (actual == 0)
    tn = len(predictions[tn_filter])
    

    先用逻辑回归来预测利润最大化问题的表现情况(二分类问题)

    from sklearn.linear_model import LogisticRegression

    # Baseline fit on the full matrix; loan_status is the target.
    lr = LogisticRegression()
    cols = loans.columns
    train_cols = cols.drop("loan_status")
    features = loans[train_cols]
    target = loans["loan_status"]
    lr.fit(features, target)
    predictions = lr.predict(features)

    # Cross-validated predictions give an honest estimate; predicting on
    # the training rows (above) is wildly optimistic.
    # sklearn.cross_validation was removed in scikit-learn 0.20 -- use
    # model_selection, whose KFold takes n_splits instead of the sample
    # count.  The old call did not shuffle (random_state had no effect),
    # so n_splits=3 with the default shuffle=False is equivalent.
    from sklearn.model_selection import cross_val_predict, KFold
    lr = LogisticRegression()
    kf = KFold(n_splits=3)
    predictions = cross_val_predict(lr, features, target, cv=kf)
    predictions = pd.Series(predictions)

    # False positives: we lent to someone who did not pay the loan back.
    fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
    fp = len(predictions[fp_filter])

    # True positives.
    tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
    tp = len(predictions[tp_filter])

    # False negatives.
    fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
    fn = len(predictions[fn_filter])

    # True negatives.
    tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
    tn = len(predictions[tn_filter])

    # Rates: true-positive rate (recall) and false-positive rate.
    tpr = tp / float(tp + fn)
    fpr = fp / float(fp + tn)

    print(tpr)
    print(fpr)
    # Python 3 print function (the original used the py2 statement).
    print(predictions[:20])
    
    FPR 达到了惊人的 99.8%,即我们把一大批不会还钱的人预测成了会还钱,并把钱借了出去

    打印预测结果发现,全都是1,全借,证明这是一个废模型--原因是样本不均衡,因为大部分都信用良好
    增加权重项--负样本权重加大,正样本权重降低

    from sklearn.linear_model import LogisticRegression
    # cross_validation was removed in scikit-learn 0.20; KFold also needs
    # an explicit import here (the original relied on a previous cell).
    from sklearn.model_selection import cross_val_predict, KFold

    # Rebalance the classes: most borrowers repay, so an unweighted model
    # degenerates into "always lend".
    lr = LogisticRegression(class_weight="balanced")
    kf = KFold(n_splits=3)
    predictions = cross_val_predict(lr, features, target, cv=kf)
    predictions = pd.Series(predictions)

    # False positives.
    fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
    fp = len(predictions[fp_filter])

    # True positives.
    tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
    tp = len(predictions[tp_filter])

    # False negatives.
    fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
    fn = len(predictions[fn_filter])

    # True negatives.
    tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
    tn = len(predictions[tn_filter])

    # Rates.
    tpr = tp / float(tp + fn)
    fpr = fp / float(fp + tn)

    print(tpr)
    print(fpr)
    # Python 3 print function (the original used the py2 statement).
    print(predictions[:20])
    
    可以看到 TPR 和 FPR 都降低了

    这也不是我们业务所需要的结果,所以需要继续爬坑,
    目标是TP越大越好FP越小越好

    from sklearn.linear_model import LogisticRegression
    # cross_validation was removed in scikit-learn 0.20; KFold must be
    # imported explicitly as well (the original relied on an earlier cell).
    from sklearn.model_selection import cross_val_predict, KFold

    # Hand-tuned class weights instead of sklearn's "balanced" policy:
    # misclassifying a defaulter (class 0) costs five times more than
    # turning away a good borrower.
    penalty = {
        0: 5,
        1: 1
    }

    lr = LogisticRegression(class_weight=penalty)
    kf = KFold(n_splits=3)
    predictions = cross_val_predict(lr, features, target, cv=kf)
    predictions = pd.Series(predictions)

    # False positives.
    fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
    fp = len(predictions[fp_filter])

    # True positives.
    tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
    tp = len(predictions[tp_filter])

    # False negatives.
    fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
    fn = len(predictions[fn_filter])

    # True negatives.
    tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
    tn = len(predictions[tn_filter])

    # Rates.
    tpr = tp / float(tp + fn)
    fpr = fp / float(fp + tn)

    print(tpr)
    print(fpr)
    

    输出 :
    0.731799521545
    0.478985635751

    from sklearn.ensemble import RandomForestClassifier
    # cross_validation was removed in scikit-learn 0.20; KFold must be
    # imported explicitly as well (the original relied on an earlier cell).
    from sklearn.model_selection import cross_val_predict, KFold

    # Try a tree ensemble with balanced class weights.
    rf = RandomForestClassifier(n_estimators=10, class_weight="balanced", random_state=1)
    kf = KFold(n_splits=3)
    predictions = cross_val_predict(rf, features, target, cv=kf)
    predictions = pd.Series(predictions)

    # False positives.
    fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
    fp = len(predictions[fp_filter])

    # True positives.
    tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
    tp = len(predictions[tp_filter])

    # False negatives.
    fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
    fn = len(predictions[fn_filter])

    # True negatives.
    tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
    tn = len(predictions[tn_filter])

    # Rates.
    tpr = tp / float(tp + fn)
    fpr = fp / float(fp + tn)
    

    相关文章

      网友评论

          本文标题:贷款利润最大化

          本文链接:https://www.haomeiwen.com/subject/gbziuftx.html