美文网首页
使用SKLearn构建随机森林,预测科比进球数

使用SKLearn构建随机森林,预测科比进球数

作者: FredricZhu | 来源:发表于2020-04-07 14:28 被阅读0次

    科比数据集可以在CSDN下载,
    https://download.csdn.net/download/qq_40694502/10583092
    代码如下,
    const.py

    '''
    Shared constants for the Kobe shot scripts.

    Created on April 7, 2020

    @author: Lenovo
    '''

    import os

    # data.csv is looked up in the same directory as this module.
    CSV_PATH = os.path.join(os.path.split(__file__)[0], "data.csv")
    

    1_load_data.py

    '''
    Load the Kobe shot CSV, print a short summary and plot the shot positions.

    Created on April 7, 2020

    @author: Lenovo
    '''

    from const import CSV_PATH
    import matplotlib.pyplot as plt
    import pandas as pd

    # Data loading module

    # Raise pandas' display limits so the printed frames are not truncated.
    pd.set_option('display.max_rows', 9999)
    pd.set_option('display.max_columns', 9999)
    pd.set_option('display.width', 9999)


    print(CSV_PATH)
    raw = pd.read_csv(CSV_PATH)
    # Print the shape of the data
    print(raw.shape)
    # Print the first 5 rows
    print(raw.head())

    # Keep only the rows whose shot_made_flag (shot made or not) is present
    kobe = raw[pd.notnull(raw['shot_made_flag'])]
    print(kobe.shape)

    # Figure size
    plt.figure(figsize=(10, 10))

    # Marker transparency; kept very low so overlapping points stay readable
    alpha = 0.02
    # Subplot 1: loc_x / loc_y
    plt.subplot(121)
    # Single-letter colour codes must be lowercase: Matplotlib >= 3.3 rejects "R"/"B".
    plt.scatter(kobe["loc_x"], kobe["loc_y"], color="r", alpha=alpha)
    plt.title("loc_x and loc_y")

    # Subplot 2: lon / lat
    plt.subplot(122)
    plt.scatter(kobe["lon"], kobe["lat"], color="b", alpha=alpha)
    plt.title("lon and lat")
    plt.show()
    

    2_train.py

    '''
    Search for good RandomForestClassifier hyper-parameters on the Kobe shot data.

    First n_estimators and then max_depth are tried over the values 1, 10, 100;
    each candidate is scored by the mean log loss of a shuffled 10-fold
    cross-validation, and both score curves are plotted at the end.

    Created on April 7, 2020

    @author: Lenovo
    '''

    import matplotlib.pyplot as plt
    import pandas as pd
    from const import CSV_PATH
    from sklearn.metrics import log_loss
    import time
    import numpy as np
    # find the best n_estimators for RandomForestClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import KFold

    # Train and search for the best random-forest parameters.
    # The original author reported best_n = 100 (number of trees) and
    # best_m = 10 (tree depth); the folds are shuffled without a fixed seed,
    # so the exact scores vary from run to run.

    # Number of cross-validation folds
    N_SPLITS = 10


    def _cv_log_loss(model, features, labels, n_splits=N_SPLITS):
        '''Return the mean log loss of `model` over a shuffled K-fold split.

        The model is re-fitted on the training part of every fold and scored on
        the held-out part.
        '''
        total = 0.
        # KFold yields positional indices for the training / validation rows of
        # each fold (n_splits - 1 parts for training, one part for validation).
        for train_k, test_k in KFold(n_splits=n_splits, shuffle=True).split(features):
            model.fit(features.iloc[train_k], labels.iloc[train_k])
            # log_loss expects class probabilities; feeding it hard 0/1
            # predictions would clip every miss to an extreme penalty.
            proba = model.predict_proba(features.iloc[test_k])
            total += log_loss(labels.iloc[test_k], proba, labels=model.classes_)
        # Average over the folds.
        return total / n_splits


    print(CSV_PATH)
    # Read the data
    raw = pd.read_csv(CSV_PATH)
    # Drop the columns that are not used as features
    drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
             'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
             'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
    raw = raw.drop(columns=drops)

    # Replace each categorical feature by its one-hot columns, because the
    # classifier needs numeric inputs.
    categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
    for var in categorical_vars:
        raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
        # axis must be given by keyword: pandas 2.x no longer accepts it positionally.
        raw = raw.drop(var, axis=1)
    print(raw.shape)
    print(raw.head(1))

    # Rows with a shot_made_flag form the training set
    train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
    # shot_made_flag is the ground-truth label
    train_label = train_kobe['shot_made_flag']
    # Remove the label column from the training features
    train_kobe = train_kobe.drop('shot_made_flag', axis=1)
    # Rows without a shot_made_flag form the test set
    test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
    # Remove the label column
    test_kobe = test_kobe.drop('shot_made_flag', axis=1)

    # Find the best number of trees using 10-fold cross-validation
    print('Finding best n_estimators for RandomForestClassifier...')
    min_score = 100000
    best_n = 0
    scores_n = []
    # 10 ** 0, 10 ** 1, 10 ** 2 (geometric sequence)
    range_n = np.logspace(0, 2, num=3).astype(int)
    for n in range_n:  # number of trees
        print("the number of trees : {0}".format(n))
        t1 = time.time()

        rfc_score = _cv_log_loss(RandomForestClassifier(n_estimators=n), train_kobe, train_label)
        scores_n.append(rfc_score)
        if rfc_score < min_score:
            min_score = rfc_score
            best_n = n

        t2 = time.time()
        print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))

    print("BEST N->")
    print(best_n, min_score)

    # find best max_depth for RandomForestClassifier
    print('Finding best max_depth for RandomForestClassifier...')
    min_score = 100000
    best_m = 0
    scores_m = []
    range_m = np.logspace(0, 2, num=3).astype(int)
    for m in range_m:  # tree depth
        print("the max depth : {0}".format(m))
        t1 = time.time()

        rfc_score = _cv_log_loss(RandomForestClassifier(max_depth=m, n_estimators=best_n),
                                 train_kobe, train_label)
        scores_m.append(rfc_score)
        if rfc_score < min_score:
            min_score = rfc_score
            best_m = m

        t2 = time.time()
        # This loop varies the depth, not the tree count, so report it as such.
        print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2 - t1))

    print("BEST M")
    print(best_m, min_score)
    plt.figure(figsize=(10, 5))
    plt.subplot(121)
    plt.plot(range_n, scores_n)
    plt.ylabel('score')
    plt.xlabel('number of trees')

    plt.subplot(122)
    plt.plot(range_m, scores_m)
    plt.ylabel('score')
    plt.xlabel('max depth')
    plt.show()
    

    3_predict.py

    '''
    Fit a random forest on the labelled Kobe shots and predict unlabelled ones.

    Created on April 7, 2020

    @author: Lenovo
    '''

    import pandas as pd
    from const import CSV_PATH
    from sklearn.ensemble import RandomForestClassifier

    # Prediction script

    print(CSV_PATH)
    # Read the data
    raw = pd.read_csv(CSV_PATH)


    # Drop the columns that are not used as features
    drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
             'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
             'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
    raw = raw.drop(columns=drops)

    # Replace each categorical feature by its one-hot columns, because the
    # classifier needs numeric inputs.
    categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
    for var in categorical_vars:
        raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
        # axis must be given by keyword: pandas 2.x no longer accepts it positionally.
        raw = raw.drop(var, axis=1)
    print(raw.shape)
    print(raw.head(1))


    # Labelled rows are the training set; shot_made_flag is the label.
    train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
    train_label = train_kobe['shot_made_flag']
    train_kobe = train_kobe.drop('shot_made_flag', axis=1)
    # Unlabelled rows are the test set.
    test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
    test_kobe = test_kobe.drop('shot_made_flag', axis=1)

    # Build the random forest with the parameters chosen during training
    model = RandomForestClassifier(n_estimators=100, max_depth=10)
    # Fit on the whole training set
    model.fit(train_kobe, train_label)

    # Predict the first 10 unlabelled test rows. Slicing from position 0 fixes
    # the original range(1, 11), which skipped the first row, and also copes
    # with a test set shorter than 10 rows.
    pred = model.predict(test_kobe.iloc[:10])
    # The original author reported roughly 3 made and 7 missed shots here; the
    # forest is not seeded, so the output can differ between runs.
    print(pred)
    

    工程结构如下,


    image.png

    相关文章

      网友评论

          本文标题:使用SKLearn构建随机森林,预测科比进球数

          本文链接:https://www.haomeiwen.com/subject/enfnphtx.html