科比数据集分析

作者: ForgetThatNight | 来源:发表于2018-07-06 21:02 被阅读19次

科比职业生涯数据集分析
科比数据集分析
细胞通讯-2单个数据集进行细胞间通讯分析
Python数据分析与机器学习42-Python库分析科比生涯数
38年来,NBA最有价值球员|数据分析
最简单的GEO分析
tensorflow笔记 - bug - onehot
2018-03-11
R语言实战 | 新手福利~认识数据集的内在
空间组数据和单细胞数据的相关性分析（Seurat）2022-05

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold


# Load the complete shot log from the CSV file, report its dimensions and
# preview the first rows.
filename = "data.csv"
raw = pd.read_csv(filepath_or_buffer=filename)
print(raw.shape)
raw.head(5)

输出： (30697, 25)

# Keep only the shots whose outcome is recorded; rows with a missing
# shot_made_flag are left out of this exploration frame.
kobe = raw[raw['shot_made_flag'].notnull()]
print(kobe.shape)

输出： (25697, 25)

# Compare the two coordinate systems stored in the data set side by side.
# In plt.subplot(RCI) the first digit is the number of rows, the second the
# number of columns and the third the index of the active panel.
alpha = 0.02
plt.figure(figsize=(10, 10))

# loc_x and loc_y
plt.subplot(121)
# Single-letter colour codes must be lower case: upper-case 'R'/'B' are
# deprecated and rejected by recent Matplotlib releases.
plt.scatter(kobe.loc_x, kobe.loc_y, color='r', alpha=alpha)
plt.title('loc_x and loc_y')

# lat and lon
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, color='b', alpha=alpha)
plt.title('lat and lon')

# Distance of every shot from the origin of the (loc_x, loc_y) coordinates.
raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

# Angle of every shot, arctan(loc_y / loc_x).  The ratio is undefined where
# loc_x == 0, so those rows get pi / 2 instead.
loc_x_zero = raw['loc_x'] == 0
# Start from a float column holding the loc_x == 0 value, then overwrite the
# remaining rows through .loc.  The previous chained form
# raw['angle'][mask] = ... wrote into an integer column via an intermediate
# Series, which triggers SettingWithCopyWarning and is silently ignored when
# pandas runs with copy-on-write.
raw['angle'] = np.pi / 2
raw.loc[~loc_x_zero, 'angle'] = np.arctan(
    raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
)

# Collapse the two clock columns into a single number of seconds.
raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']


# List the distinct values of the shot-description columns, then count how
# often each shot_type occurs.
print(kobe['action_type'].unique())
print(kobe['combined_shot_type'].unique())
print(kobe['shot_type'].unique())
print(kobe['shot_type'].value_counts())

输出：
['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot' 'Running Hook Shot'
'Alley Oop Dunk Shot' 'Dunk Shot' 'Alley Oop Layup shot'
'Running Dunk Shot' 'Driving Finger Roll Shot' 'Running Layup Shot'
'Finger Roll Shot' 'Fadeaway Jump Shot' 'Follow Up Dunk Shot' 'Hook Shot'
'Turnaround Hook Shot' 'Jump Hook Shot' 'Running Finger Roll Shot'
'Jump Bank Shot' 'Turnaround Finger Roll Shot' 'Hook Bank Shot'
'Driving Hook Shot' 'Running Tip Shot' 'Running Reverse Layup Shot'
'Driving Finger Roll Layup Shot' 'Fadeaway Bank shot' 'Pullup Jump shot'
'Finger Roll Layup Shot' 'Turnaround Fadeaway shot'
'Driving Reverse Layup Shot' 'Driving Slam Dunk Shot'
'Step Back Jump shot' 'Turnaround Bank shot' 'Reverse Slam Dunk Shot'
'Floating Jump shot' 'Putback Slam Dunk Shot' 'Running Bank shot'
'Driving Bank shot' 'Driving Jump shot' 'Putback Layup Shot'
'Putback Dunk Shot' 'Running Finger Roll Layup Shot' 'Pullup Bank shot'
'Running Slam Dunk Shot' 'Cutting Layup Shot' 'Driving Floating Jump Shot'
'Running Pull-Up Jump Shot' 'Tip Layup Shot'
'Driving Floating Bank Jump Shot']
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal 20285
3PT Field Goal 5412
Name: shot_type, dtype: int64

# Distinct season labels present in the labelled shots.
kobe.season.unique()

输出：
array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
'2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
'2012-13', '2013-14', '2014-15', '2015-16', '1996-97', '1997-98',
'1998-99', '1999-00'], dtype=object)

# Replace each season label by the integer after the dash
# (e.g. '2000-01' -> 1, '1999-00' -> 0), then show the resulting values.
raw['season'] = raw['season'].map(lambda label: int(label.split('-')[1]))
raw.season.unique()

输出：
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 97,
98, 99, 0], dtype=int64)

# Show the distinct team identifiers and team names in the labelled shots.
print(kobe.team_id.unique())
print(kobe.team_name.unique())

输出：
[1610612747]
['Los Angeles Lakers']

# Put the matchup and opponent columns next to each other for comparison.
pd.DataFrame(dict(matchup=kobe['matchup'], opponent=kobe['opponent']))

# Plot the computed 'dist' feature against the provided 'shot_distance'
# column to see how the two relate.
plt.figure(figsize=(5, 5))

plt.scatter(raw['dist'], raw['shot_distance'], color='blue')
plt.title('dist and shot_distance')

# Count the shots per shot_zone_area, then report how many groups a groupby
# on that column yields.
print(kobe['shot_zone_area'].value_counts())
gs = kobe.groupby('shot_zone_area')
print(len(gs))

输出：
Center(C) 11289
Right Side Center(RC) 3981
Right Side(R) 3859
Left Side Center(LC) 3364
Left Side(L) 3132
Back Court(BC) 72
Name: shot_zone_area, dtype: int64
6

import matplotlib.cm as cm
plt.figure(figsize=(20, 10))


def scatter_plot_by_category(feat):
    """Scatter the labelled shots at (loc_x, loc_y), one colour per category.

    The ``kobe`` frame is grouped by the column named ``feat``; each group is
    drawn on the current axes with its own colour taken from the rainbow
    colormap, evenly spaced over the number of groups.
    """
    groups = kobe.groupby(feat)
    palette = cm.rainbow(np.linspace(0, 1, len(groups)))
    for (_, shots), colour in zip(groups, palette):
        plt.scatter(shots.loc_x, shots.loc_y, color=colour, alpha=0.1)


# Left panel: shot_zone_area
plt.subplot(1, 3, 1)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')

# Middle panel: shot_zone_basic
plt.subplot(1, 3, 2)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')

# Right panel: shot_zone_range
plt.subplot(1, 3, 3)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')

# Columns removed before modelling.
drops = [
    'shot_id', 'team_id', 'team_name',
    'shot_zone_area', 'shot_zone_range', 'shot_zone_basic',
    'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining',
    'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date',
]
# Drop everything in one call and name the axis explicitly: passing the axis
# positionally (drop(label, 1)) is no longer accepted by pandas 2.x.
raw = raw.drop(drops, axis=1)


# Show the frequency of each combined_shot_type and preview the first two
# rows of its one-hot encoding.
print(raw.combined_shot_type.value_counts())
pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type').iloc[0:2]

# Replace every categorical column by its one-hot (dummy) columns, which are
# appended on the right and prefixed with the original column name.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    # axis is passed by keyword: the positional form (concat(objs, 1) /
    # drop(label, 1)) is rejected by pandas 2.x.
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    raw = raw.drop(var, axis=1)

# Rows with a known shot_made_flag form the training set; rows where it is
# missing form the set to predict.
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
# Take the label BEFORE removing the column from the features; doing it in
# the opposite order raises KeyError because the column is already gone.
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix,log_loss
import time

import numpy as np
# Five values spaced evenly on a log scale between 10**0 and 10**2,
# truncated to integers.
range_m = np.logspace(start=0, stop=2, num=5).astype(int)
range_m

输出： array([ 1, 3, 10, 31, 100])

# find the best n_estimators for RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

# For each candidate number of trees, run a shuffled 10-fold cross-validation
# on the training set and keep the candidate with the lowest mean log loss.
print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0,2,num=3).astype(int)
for n in range_n:
    print("the number of trees : {0}".format(n))
    t1 = time.time()

    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    for train_k, test_k in KFold(len(train_kobe), n_folds=10, shuffle=True):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss scores predicted probabilities.  Feeding it the hard 0/1
        # labels from predict() clips every prediction to the extremes and
        # inflates the score, so predict_proba() is used instead.  Scores
        # produced this way are on a different scale than those obtained
        # from hard labels.
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        # Each of the 10 folds contributes one tenth to the mean.
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n

    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2-t1))
print(best_n, min_score)


# find best max_depth for RandomForestClassifier
# For each candidate depth (with the number of trees fixed to best_n), run a
# shuffled 10-fold cross-validation on the training set and keep the depth
# with the lowest mean log loss.
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0,2,num=3).astype(int)
for m in range_m:
    print("the max depth : {0}".format(m))
    t1 = time.time()

    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    for train_k, test_k in KFold(len(train_kobe), n_folds=10, shuffle=True):
        rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
        # log_loss scores predicted probabilities.  Feeding it the hard 0/1
        # labels from predict() clips every prediction to the extremes and
        # inflates the score, so predict_proba() is used instead.  Scores
        # produced this way are on a different scale than those obtained
        # from hard labels.
        pred = rfc.predict_proba(train_kobe.iloc[test_k])
        # Each of the 10 folds contributes one tenth to the mean.
        rfc_score += log_loss(train_label.iloc[test_k], pred) / 10
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m

    t2 = time.time()
    # The value reported here is a depth, not a tree count.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2-t1))
print(best_m, min_score)

输出：
Finding best n_estimators for RandomForestClassifier...
the number of trees : 1
Done processing 1 trees (1.407sec)
the number of trees : 10
Done processing 10 trees (7.093sec)
the number of trees : 100
Done processing 100 trees (67.297sec)
100 11.8669680428
Finding best max_depth for RandomForestClassifier...
the max depth : 1
Done processing 1 trees (6.658sec)
the max depth : 10
Done processing 10 trees (23.687sec)
the max depth : 100
Done processing 100 trees (70.740sec)
10 11.0039977617

# Plot the cross-validation score of both searches side by side.
plt.figure(figsize=(10, 5))

# Left panel: score as a function of the number of trees.
plt.subplot(1, 2, 1)
plt.plot(range_n, scores_n)
plt.xlabel('number of trees')
plt.ylabel('score')

# Right panel: score as a function of the maximum depth.
plt.subplot(1, 2, 2)
plt.plot(range_m, scores_m)
plt.xlabel('max depth')
plt.ylabel('score')

# Fit the final forest on the whole training set using the selected
# depth and number of trees.
model = RandomForestClassifier(max_depth=best_m, n_estimators=best_n)
model.fit(train_kobe, train_label)
# 474241623

网友评论

本文标题：科比数据集分析

本文链接：https://www.haomeiwen.com/subject/urgbuftx.html

延伸阅读

深度阅读

您也可以注册成为美文阅读网的作者，发表您的原创作品、分享您的心情！

科比数据集分析

相关文章

科比职业生涯数据集分析