1. Introduction to the Data Source
The data source is a dataset of Kobe Bryant's shot attempts. Let's take a quick look at it first.
(Screenshot of the first few rows of the dataset.)
Feature descriptions:
- action_type: shot type (detailed)
- combined_shot_type: shot type (broad category)
- game_event_id: event ID within the game
- game_id: game ID
- lat: latitude of the shot location
- loc_x: x coordinate of the shot location
- loc_y: y coordinate of the shot location
- lon: longitude of the shot location
- minutes_remaining: minutes remaining in the period
- period: period (quarter) number
- playoffs: whether the game is a playoff game
- season: season
- seconds_remaining: seconds remaining in the period
- shot_distance: shot distance
- shot_made_flag: whether the shot was made (the label)
- shot_type: 2-point or 3-point field goal
- shot_zone_area: shot zone area
- shot_zone_basic: shot zone (more detailed)
- shot_zone_range: shot range
- team_id: team ID
- team_name: team name
- game_date: game date
- matchup: the two teams playing
- opponent: opponent
- shot_id: shot ID
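A quick way to reproduce that first look at the data (a minimal sketch; the path E:/file/data.csv matches the one used in the sections below):
import pandas as pd

raw = pd.read_csv("E:/file/data.csv")

# Shape, column dtypes, and a preview of the first rows
print(raw.shape)
print(raw.dtypes)
print(raw.head(10))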
2. Data Preprocessing
2.1 A quick look at Kobe's shot locations
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

# Read the data source
filename = "E:/file/data.csv"
raw = pd.read_csv(filename)
#print(raw.head(10))

# Keep only the rows whose label (shot_made_flag) is not null
kobe = raw[pd.notnull(raw['shot_made_flag'])]

# Plot
alpha = 0.02
plt.figure(figsize=(10, 10))

# x and y coordinates
plt.subplot(121)
plt.scatter(kobe.loc_x, kobe.loc_y, color='r', alpha=alpha)
plt.title('loc_x and loc_y')

# longitude and latitude
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, color='b', alpha=alpha)
plt.title('lat and lon')
plt.show()
Test output:
Both (loc_x, loc_y) and (lon, lat) are coordinates, so either pair shows the distribution of Kobe's shot locations.
2.2 Preprocessing the data
From our quick analysis, the dataset has the following issues:
- shot_made_flag (the label) has missing values
- loc_x contains zero values, which would break the angle calculation below
- season has an inconsistent format
- some features need their unique values inspected
Preprocessing needs to address all of the above. A quick check of the missing labels is sketched next, followed by the full preprocessing code.
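As a quick sanity check (not part of the original script; it assumes the same CSV path), one might count the missing labels and peek at the raw season strings first:
import pandas as pd

raw = pd.read_csv("E:/file/data.csv")

# Rows with a missing label are the ones we will have to predict later
print(raw['shot_made_flag'].isnull().sum())
# A look at the inconsistent season format, e.g. values such as '2000-01'
print(raw['season'].unique()[:5])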
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

# Read the data source
filename = "E:/file/data.csv"
raw = pd.read_csv(filename)
#print(raw.head(10))

# Keep only the rows whose label is not null
kobe = raw[pd.notnull(raw['shot_made_flag'])]

# Derive a distance feature from the x and y coordinates
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# Handle the rows where loc_x is 0 (arctan would divide by zero there)
loc_x_zero = raw['loc_x'] == 0
#print (loc_x_zero)
raw['angle'] = np.zeros(len(raw))
raw.loc[~loc_x_zero, 'angle'] = np.arctan(raw['loc_y'][~loc_x_zero] / raw['loc_x'][~loc_x_zero])
raw.loc[loc_x_zero, 'angle'] = np.pi / 2

# Combine the remaining minutes and seconds into a single feature
raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']

# Look at the unique values of some features
print("Unique values of some features")
print(kobe.action_type.unique())
print(kobe.combined_shot_type.unique())
print(kobe.shot_type.unique())
print(kobe.shot_type.value_counts())

print("Fix the inconsistent season format")
kobe['season'].unique()
raw['season'] = raw['season'].apply(lambda x: int(x.split('-')[1]))
print(raw['season'].unique())
Test output:
Unique values of some features
['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot'
'Running Hook Shot' 'Alley Oop Dunk Shot' 'Dunk Shot'
'Alley Oop Layup shot' 'Running Dunk Shot' 'Driving Finger Roll Shot'
'Running Layup Shot' 'Finger Roll Shot' 'Fadeaway Jump Shot'
'Follow Up Dunk Shot' 'Hook Shot' 'Turnaround Hook Shot' 'Jump Hook Shot'
'Running Finger Roll Shot' 'Jump Bank Shot' 'Turnaround Finger Roll Shot'
'Hook Bank Shot' 'Driving Hook Shot' 'Running Tip Shot'
'Running Reverse Layup Shot' 'Driving Finger Roll Layup Shot'
'Fadeaway Bank shot' 'Pullup Jump shot' 'Finger Roll Layup Shot'
'Turnaround Fadeaway shot' 'Driving Reverse Layup Shot'
'Driving Slam Dunk Shot' 'Step Back Jump shot' 'Turnaround Bank shot'
'Reverse Slam Dunk Shot' 'Floating Jump shot' 'Putback Slam Dunk Shot'
'Running Bank shot' 'Driving Bank shot' 'Driving Jump shot'
'Putback Layup Shot' 'Putback Dunk Shot' 'Running Finger Roll Layup Shot'
'Pullup Bank shot' 'Running Slam Dunk Shot' 'Cutting Layup Shot'
'Driving Floating Jump Shot' 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
'Driving Floating Bank Jump Shot']
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal 20285
3PT Field Goal 5412
Name: shot_type, dtype: int64
Fix the inconsistent season format
[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 97 98 99 0]
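As a side note (not in the original), np.arctan2 removes the need to special-case loc_x == 0; it returns pi/2 when loc_x is 0 and loc_y is positive. It is not numerically identical to arctan(loc_y/loc_x) for negative loc_x, since it distinguishes the left and right halves of the court:
import numpy as np
import pandas as pd

raw = pd.read_csv("E:/file/data.csv")

# arctan2 handles a zero denominator directly, so no masking is needed
raw['angle2'] = np.arctan2(raw['loc_y'], raw['loc_x'])
print(raw[['loc_x', 'loc_y', 'angle2']].head())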
2.3 Checking whether features are linearly related
If two features are linearly related, we only need to keep one of them; using both adds no information.
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

# Read the data source
filename = "E:/file/data.csv"
raw = pd.read_csv(filename)
#print(raw.head(10))

# Derive a distance feature from the x and y coordinates
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# Scatter plot
# These two features are linearly related, so one of them is enough
plt.figure(figsize=(5, 5))
plt.scatter(raw['dist'], raw['shot_distance'], color='blue')
plt.title('dist and shot_distance')
plt.show()
Test output:
The plot shows that the two features are linearly related, so we only need to keep one of them.
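To quantify the relationship rather than eyeballing the plot, one could also compute the correlation coefficient (a small sketch, not in the original):
import numpy as np
import pandas as pd

raw = pd.read_csv("E:/file/data.csv")
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# A Pearson correlation very close to 1 confirms the (near) linear relationship
print(raw['dist'].corr(raw['shot_distance']))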
2.4 Using groupby to look at the data distribution
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold

# Read the data source
filename = "E:/file/data.csv"
raw = pd.read_csv(filename)
#print(raw.head(10))

# Keep only the rows whose label is not null
kobe = raw[pd.notnull(raw['shot_made_flag'])]

# Group the data to inspect it
gs = kobe.groupby('shot_zone_area')
print(kobe['shot_zone_area'].value_counts())
print(len(gs))
Test output:
Center(C) 11289
Right Side Center(RC) 3981
Right Side(R) 3859
Left Side Center(LC) 3364
Left Side(L) 3132
Back Court(BC) 72
Name: shot_zone_area, dtype: int64
6
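The same groupby can also answer more interesting questions, for example the shooting percentage per zone (a small extension, not part of the original code):
import pandas as pd

raw = pd.read_csv("E:/file/data.csv")
kobe = raw[pd.notnull(raw['shot_made_flag'])]

# Mean of the 0/1 label per zone = shooting percentage in that zone
print(kobe.groupby('shot_zone_area')['shot_made_flag'].mean().sort_values(ascending=False))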
2.5 Visualizing shot zone area, basic zone, and range
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import matplotlib.cm as cm

# Read the data source
filename = "E:/file/data.csv"
raw = pd.read_csv(filename)
#print(raw.head(10))

# Handle the rows where loc_x is 0 (arctan would divide by zero there)
loc_x_zero = raw['loc_x'] == 0
#print (loc_x_zero)
raw['angle'] = np.zeros(len(raw))
raw.loc[~loc_x_zero, 'angle'] = np.arctan(raw['loc_y'][~loc_x_zero] / raw['loc_x'][~loc_x_zero])
raw.loc[loc_x_zero, 'angle'] = np.pi / 2

# Combine the remaining minutes and seconds into a single feature
raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']

# Keep only the rows whose label is not null
kobe = raw[pd.notnull(raw['shot_made_flag'])]

# Plot the shot locations, colored by category
plt.figure(figsize=(20, 10))

def scatter_plot_by_category(feat):
    alpha = 0.1
    gs = kobe.groupby(feat)
    cs = cm.rainbow(np.linspace(0, 1, len(gs)))
    for g, c in zip(gs, cs):
        plt.scatter(g[1].loc_x, g[1].loc_y, color=c, alpha=alpha)

# shot_zone_area
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

# shot_zone_basic
plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

# shot_zone_range
plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')
plt.show()
Test output: (three court scatter plots, colored by shot_zone_area, shot_zone_basic, and shot_zone_range respectively)
2.6 One-hot encoding with pandas
One-hot encoding is used for categorical variables. For example, if a column holds the season with the values spring, summer, autumn, and winter, it has to be one-hot encoded before modeling; pandas.get_dummies does exactly that. Its main parameters are listed below, followed by a small usage sketch.
- data: array-like, Series, or DataFrame; the input data
- prefix: string; prefix prepended to the generated column names, default None
- columns: the column names to convert; if omitted, all categorical columns are converted
- dummy_na: bool, default False; add a column that flags missing values (if False, NaNs are ignored)
- drop_first: bool, default False; keep only k-1 of the k category levels by dropping the first, to avoid multicollinearity
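A minimal sketch of get_dummies on one of this dataset's own columns (combined_shot_type; the prefix value is just an illustration):
import pandas as pd

raw = pd.read_csv("E:/file/data.csv")

# One-hot encode the broad shot type; columns become e.g. combined_shot_type_Jump Shot
dummies = pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')
print(dummies.head())

# Equivalent call on the whole frame, encoding only the requested column
encoded = pd.get_dummies(raw, columns=['combined_shot_type'], prefix='combined_shot_type')
print(encoded.filter(like='combined_shot_type').head())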
3. Training the Model
Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import matplotlib.cm as cm
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, log_loss
import time

# Read the data source
filename = "E:/file/data.csv"
raw = pd.read_csv(filename)
#print(raw.head(10))

# Drop the features we do not need
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic', \
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining', \
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']
for drop in drops:
    raw = raw.drop(drop, axis=1)
#print(raw['combined_shot_type'].value_counts())
#pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]

# One-hot encode the categorical features and drop the original columns
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(var, axis=1)

# Split into a training set (label present) and a test set (label missing)
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
train_label = train_kobe['shot_made_flag'].astype(np.int64)
train_kobe = train_kobe.drop('shot_made_flag', axis=1)
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)

print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3, dtype=np.int64).astype(np.int64)
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()
    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    skfolds = StratifiedKFold(
        n_splits=3,
        random_state=42,  # fix the random seed
        shuffle=True
    )
    for train_k, test_k in skfolds.split(train_kobe, train_label):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # rfc_score += rfc.score(train.iloc[test_k], train_y.iloc[test_k])/10
        pred = rfc.predict(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)

# Find the best max_depth for RandomForestClassifier
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3, dtype=np.int64).astype(np.int64)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()
    skfolds = StratifiedKFold(
        n_splits=3,
        random_state=42,  # fix the random seed
        shuffle=True
    )
    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    for train_k, test_k in skfolds.split(train_kobe, train_label):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # rfc_score += rfc.score(train.iloc[test_k], train_y.iloc[test_k])/10
        pred = rfc.predict(train_kobe.iloc[test_k])
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)

# Plot the scores against the two hyperparameters
plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.plot(range_n, scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')
plt.subplot(122)
plt.plot(range_m, scores_m)
plt.ylabel('score')
plt.xlabel('max depth')
plt.show()
Test output:
Finding best n_estimators for RandomForestClassifier...
the number of trees : 1
Done processing 1 trees (0.556sec)
the number of trees : 10
Done processing 10 trees (1.138sec)
the number of trees : 100
Done processing 100 trees (10.486sec)
100 3.9020328477633024
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing 1 trees (1.294sec)
the max depth : 10
Done processing 10 trees (3.215sec)
the max depth : 100
Done processing 100 trees (10.556sec)
10 3.316936324282448
(Plot of the cross-validation score against the number of trees and the max depth.)
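The same search could also be expressed with scikit-learn's GridSearchCV. Scoring with neg_log_loss uses predicted probabilities (predict_proba) rather than hard 0/1 predictions, which usually gives a more meaningful log loss. This is an alternative sketch, not the original author's code; it reuses train_kobe and train_label from the script above, and the parameter grid simply mirrors the ranges tried there:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# train_kobe / train_label as built in the script above
param_grid = {
    'n_estimators': [1, 10, 100],
    'max_depth': [1, 10, 100],
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# neg_log_loss evaluates predict_proba, so the score reflects the model's confidence
search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=-1,
)
search.fit(train_kobe, train_label)
print(search.best_params_, -search.best_score_)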