使用SKLearn构建随机森林，预测科比进球数

作者: FredricZhu | 来源:发表于2020-04-07 14:28 被阅读0次

使用SKLearn构建随机森林，预测科比进球数
用户漏电检测
RandomForest（随机森林）
【火炉炼AI】机器学习007-用随机森林构建共享单车需求预测模型
随机森林算法预测法官判决，准确度优于人类水平丨数据工匠简报（Ma
SKlearn实现--随机森林
随机梯度下降模型回归实战
机器学习学习笔记--随机森林算法
使用 sklearn 进行房价预测（机器学习）
在线作图|在线做随机森林分析

科比投篮数据集可以在 CSDN 下载（下载后请将文件命名为 data.csv，并与 const.py 放在同一目录下）：
https://download.csdn.net/download/qq_40694502/10583092
代码如下：
const.py

'''
Created on 2020年4月7日

@author: Lenovo
'''

import os

# The data set is expected as "data.csv" in the same directory as this module.
_MODULE_DIR = os.path.dirname(__file__)
CSV_PATH = os.path.join(_MODULE_DIR, "data.csv")

1_load_data.py

'''
Created on 2020年4月7日

@author: Lenovo
'''

from const import CSV_PATH
import matplotlib.pyplot as plt
import pandas as pd 

# Data loading module: read the shot CSV, print a short summary, and plot the
# two coordinate pairs recorded for each labeled shot side by side.

# Raise pandas' display limits so printed frames are not truncated.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)


print(CSV_PATH)
raw = pd.read_csv(CSV_PATH)
# Print the shape of the data.
print(raw.shape)
# Print the first 5 rows.
print(raw.head())

# Keep only the rows whose shot_made_flag is present (labeled shots).
kobe = raw[pd.notnull(raw['shot_made_flag'])]
print(kobe.shape)

# Canvas size.
plt.figure(figsize=(10, 10))

# Marker transparency; very low so that dense regions remain distinguishable.
alpha = 0.02
# Subplot 1: loc_x / loc_y.
plt.subplot(121)
# Single-letter colour codes must be lower-case: upper-case "R"/"B" were
# deprecated in Matplotlib 3.1 and are rejected by later releases.
plt.scatter(kobe["loc_x"], kobe["loc_y"], color="r", alpha=alpha)
plt.title("loc_x and loc_y")

# Subplot 2: lon / lat.
plt.subplot(122)
plt.scatter(kobe["lon"], kobe["lat"], color="b", alpha=alpha)
plt.title("lon and lat")
plt.show()

2_train.py

'''
Created on 2020年4月7日

@author: Lenovo
'''

import matplotlib.pyplot as plt
import pandas as pd 
from const import CSV_PATH
from sklearn.metrics import log_loss
import time
import numpy as np
# find the best n_estimators for RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

# Train and search for the best random-forest hyper-parameters.
# The number of trees is tuned first, then the maximum depth (using the best
# tree count found). The original author reports best_n = 100 and best_m = 10;
# nothing is seeded here, so exact scores differ from run to run.

N_SPLITS = 10  # number of cross-validation folds


def _cv_log_loss(model, features, labels, n_splits=N_SPLITS):
    """Return the mean log loss of *model* over a shuffled K-fold split.

    The model is refit on every training fold and scored on the held-out
    fold. Scoring uses predict_proba because log loss is defined on class
    probabilities; feeding it hard 0/1 predictions yields a clipped,
    degenerate value.
    """
    total = 0.
    splitter = KFold(n_splits=n_splits, shuffle=True)
    for train_k, test_k in splitter.split(features):
        model.fit(features.iloc[train_k], labels.iloc[train_k])
        proba = model.predict_proba(features.iloc[test_k])
        # Passing the class list keeps log_loss well defined even if a
        # validation fold happens to contain a single class.
        total += log_loss(labels.iloc[test_k], proba, labels=model.classes_)
    # Average over the folds.
    return total / n_splits


print(CSV_PATH)
# Read the data.
raw = pd.read_csv(CSV_PATH)
# Remove the columns that are not used as features.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
raw = raw.drop(columns=drops)

# One-hot encode the categorical features; get_dummies(columns=...) appends
# "<column>_<value>" indicator columns and removes the source columns.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
raw = pd.get_dummies(raw, columns=categorical_vars)
print(raw.shape)
print(raw.head(1))

# Rows with a non-null shot_made_flag form the training set.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
# The flag itself is the ground-truth label.
train_label = train_kobe['shot_made_flag']
# Remove the label column from the training features.
train_kobe = train_kobe.drop('shot_made_flag', axis=1)
# Rows with a null flag form the (unlabeled) test set.
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
# axis must be given by keyword: positional axis was removed in pandas 2.0.
test_kobe = test_kobe.drop('shot_made_flag', axis=1)

# Search for the best number of trees.
print('Finding best n_estimators for RandomForestClassifier...')
min_score = float('inf')
best_n = 0
scores_n = []
# Geometric sequence 10**0, 10**1, 10**2.
range_n = np.logspace(0, 2, num=3).astype(int)
for n in range_n:  # number of trees
    print("the number of trees : {0}".format(n))
    t1 = time.time()

    rfc = RandomForestClassifier(n_estimators=int(n))
    rfc_score = _cv_log_loss(rfc, train_kobe, train_label)
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = int(n)

    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))

print("BEST N->")
print(best_n, min_score)

# Search for the best max_depth, reusing the best tree count.
print('Finding best max_depth for RandomForestClassifier...')
min_score = float('inf')
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)
for m in range_m:  # maximum tree depth
    print("the max depth : {0}".format(m))
    t1 = time.time()

    rfc = RandomForestClassifier(max_depth=int(m), n_estimators=best_n)
    rfc_score = _cv_log_loss(rfc, train_kobe, train_label)
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = int(m)

    t2 = time.time()
    # This loop varies the depth, not the tree count, so report it as such.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2 - t1))

print("BEST M")
print(best_m, min_score)

# Plot the cross-validated score against each tuned parameter.
plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.plot(range_n, scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')

plt.subplot(122)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')
plt.show()

3_predict.py

'''
Created on 2020年4月7日

@author: Lenovo
'''

import pandas as pd 
from const import CSV_PATH
from sklearn.ensemble import RandomForestClassifier

# Prediction: fit a random forest on every labeled shot and predict the
# outcome of the first unlabeled shots.

print(CSV_PATH)
# Read the data.
raw = pd.read_csv(CSV_PATH)


# Remove the columns that are not used as features.
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
raw = raw.drop(columns=drops)

# One-hot encode the categorical features; get_dummies(columns=...) appends
# "<column>_<value>" indicator columns and removes the source columns.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
raw = pd.get_dummies(raw, columns=categorical_vars)
print(raw.shape)
print(raw.head(1))


# Labeled rows are the training set; rows with a null flag are the test set.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
# axis must be given by keyword: positional axis was removed in pandas 2.0.
test_kobe = test_kobe.drop('shot_made_flag', axis=1)

# Build the forest with the parameters chosen by the tuning script.
model = RandomForestClassifier(n_estimators=100, max_depth=10)
# Fit on the whole training set.
model.fit(train_kobe, train_label)

# Predict the first 10 unlabeled rows. A slice starting at 0 is used because
# positions 1..10 would silently skip the first row.
pred = model.predict(test_kobe.iloc[:10])
# NOTE(review): the original author reported 3 made / 7 missed for these
# shots; the model is unseeded, so the output may differ between runs.
print(pred)

工程结构如下：

image.png