案例3-NBA

作者: 7125messi | 来源:发表于2018-01-13 14:40 被阅读69次

    Apache Spark已经成为大规模数据分析的常用工具,本文我们将展示如何使用Spark来分析NBA数据。具体来说,我们将使用1979年到2016年的赛季数据以及投篮图数据来展示NBA如何继续朝着越来越多的三分投篮的方向发展。(主要是勇士队掀起的小球打法以及小学生“库里”的变态准)。
    使用Python 3,我们利用Spark Python API(PySpark)来创建和分析Spark DataFrame。

    导入各种模块

    %matplotlib inline
    import os
    
    import numpy as np
    import pandas as pd
    import seaborn as sns
    
    from nba_utils import draw_3pt_piechart,plot_shot_chart
    
    from IPython.core.display import display, HTML
    from IPython.core.magic import register_cell_magic, register_line_cell_magic, register_line_magic
    from matplotlib import pyplot as plt
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.feature import VectorAssembler
    from pyspark.sql.functions import array, col, count, mean, sum, udf, when
    from pyspark.sql.types import DoubleType, IntegerType, StringType, Row
    from pyspark.sql.functions import sum, col, udf
    
    import warnings
    # Silence all warnings for the rest of the session so they do not
    # clutter the notebook output.
    warnings.filterwarnings("ignore")
    
    # Seaborn look: white axes style, and seaborn's interpretation of the
    # matplotlib single-letter color shorthands.
    sns.set_style("white")
    sns.set_color_codes()
    

    自定义可视化样式

    # Start from the FiveThirtyEight stylesheet, then override figure size,
    # fonts and label sizes with a single rcParams update.
    plt.style.use('fivethirtyeight')
    plt.rcParams.update({
        'figure.figsize': (9, 5),
        'font.family': 'serif',
        'font.serif': 'Ubuntu',
        'font.monospace': 'Ubuntu Mono',
        'font.size': 9,
        'axes.labelsize': 11,
        'axes.labelweight': 'bold',
        'xtick.labelsize': 11,
        'ytick.labelsize': 11,
        'legend.fontsize': 14,
        'figure.titlesize': 18,
    })
    
    # Set the notebook container width to 80% via injected CSS.
    display(HTML('<style>.container {width:80% !important;}</style>'))
    # Script that sets the page title; as a bare expression it is only
    # rendered when it is the last expression of the notebook cell.
    update_title = 'document.title = "Using Python and Apache Spark to Analyze the NBA and the 3-point Shot";'
    HTML('<script>{}</script>'.format(update_title))
    
    # Load the season totals CSV: first row is the header, column types are
    # inferred by Spark.
    season_reader = spark.read.options(header='true', inferSchema='true')
    df = season_reader.csv('data/season_totals.csv')
    # Cache the data, since it is queried repeatedly below.
    df.cache()
    DataFrame[_c0: int, player: string, pos: string, age: int, team_id: string, g: int, gs: int, mp: int, fg: int, fga: int, fg_pct: double, fg3: int, fg3a: int, fg3_pct: double, fg2: int, fg2a: int, fg2_pct: double, efg_pct: double, ft: int, fta: int, ft_pct: double, orb: int, drb: int, trb: int, ast: int, stl: int, blk: int, tov: int, pf: int, pts: int, yr: int]
    
    # Ten highest single-season point totals, shown with a few key columns.
    top_scorers = df.orderBy(col('pts').desc()).limit(10).toPandas()
    top_scorers[['yr', 'player', 'age', 'pts', 'fg3']]
    
    yr  player  age pts fg3
    0   1987    Jordan,Michael  23  3041    12
    1   1988    Jordan,Michael  24  2868    7
    2   2006    Bryant,Kobe 27  2832    180
    3   1990    Jordan,Michael  26  2753    92
    4   1989    Jordan,Michael  25  2633    27
    5   2014    Durant,Kevin    25  2593    192
    6   1980    Gervin,George   27  2585    32
    7   1991    Jordan,Michael  27  2580    29
    8   1982    Gervin,George   29  2551    10
    9   1993    Jordan,Michael  29  2541    81
    
    # List every column name of the season totals DataFrame.
    print(df.columns)
    ['_c0', 'player', 'pos', 'age', 'team_id', 'g', 'gs', 'mp', 'fg', 'fga', 'fg_pct', 'fg3', 'fg3a', 'fg3_pct', 'fg2', 'fg2a', 'fg2_pct', 'efg_pct', 'ft', 'fta', 'ft_pct', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'yr']
    
    # 3-point attempts per 36 minutes, one row per season: total attempts
    # divided by total minutes played, scaled by 36.
    yearly_totals = df.groupBy('yr').agg({'mp': 'sum', 'fg3a': 'sum'})
    attempts_per_36 = 36 * col('sum(fg3a)') / col('sum(mp)')
    fga_py = yearly_totals.select(col('yr'), attempts_per_36.alias('fg3a_p36m')).orderBy('yr')
    
    # or could use SQL
    # Expose the DataFrame as temp view 'df' and query it through the
    # SparkSession (the same `spark` entry point used to read the CSV)
    # instead of the legacy SQLContext helper.
    df.createOrReplaceTempView('df')
    fga_py = spark.sql('''SELECT yr,
                                 sum(fg3a)/sum(mp)*36 fg3a_p36m
                          FROM df GROUP BY yr
                          ORDER BY yr''')
    

    可视化

    # Plot the per-season 3-point attempt rate and annotate rule changes.
    _df = fga_py.toPandas()
    plt.plot(_df.yr,_df.fg3a_p36m, color = '#00a79c')
    plt.xlabel('Year')
    plt.ylabel('Number of attempts')
    # `_ =` swallows the return value so the notebook does not echo it.
    _=plt.title('Player average 3-point attempts (per 36 minutes)')
    _=plt.annotate('3 pointer introduced', xy=(1980.5, .5), xytext=(1981, 1.1), fontsize = 12,
                   arrowprops=dict(facecolor='grey', shrink=0.05, linewidth = 2))
    _=plt.annotate('NBA moved in\n3-point line', xy=(1993.7, 1.5), xytext=(1987, 1.79), fontsize = 12,
                   arrowprops=dict(facecolor='grey', shrink=0.05, linewidth = 2))
    _=plt.annotate('NBA moved back\n3-point line', xy=(1998, 2.), xytext=(1998.5, 2.4), fontsize = 12,
                   arrowprops=dict(facecolor='grey', shrink=0.05, linewidth = 2))
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the output
    # folder exists before writing the image.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/3_point_trend.png')
    
    image.png

    我们可以看到,自1979-80赛季引入三分球以来,三分球出手次数稳步上升。值得注意且合乎逻辑的是,90年代中期NBA将三分线内移几英尺期间,出手次数出现了一个小高峰。另外,过去5年的出手次数也出现了更为急剧的增长。

    建立线性模型

    # Build the training set: 'yr' is packed into the 'features' vector and
    # the per-36-minute attempt rate is copied into 'label'.
    t = VectorAssembler(inputCols=['yr'], outputCol='features')
    assembled = t.transform(fga_py)
    training = assembled.withColumn('yr', fga_py.yr).withColumn('label', fga_py.fg3a_p36m)
    # Preview the first rows of the assembled training data.
    training.toPandas().head()
        yr  fg3a_p36m   features    label
    0   1980    0.410089    [1980.0]    0.410089
    1   1981    0.309376    [1981.0]    0.309376
    2   1982    0.341511    [1982.0]    0.341511
    3   1983    0.331479    [1983.0]    0.331479
    4   1984    0.357110    [1984.0]    0.357110
    
    # Fit a linear regression of the attempt rate ('label') on the season
    # year ('features').
    lr = LinearRegression(maxIter=10)
    model = lr.fit(training)
    
    # apply model for the 1979-80 season thru 2020-21 season
    # Fetch years and observed rates with ONE collect so both lists come from
    # the same rows in the same order; two separate actions would not
    # guarantee that pairing.
    training_rows = training.select('yr', 'fg3a_p36m').collect()
    training_yrs = [r[0] for r in training_rows]
    training_y = [r[1] for r in training_rows]
    prediction_yrs = [2017, 2018, 2019, 2020, 2021]
    all_yrs = training_yrs + prediction_yrs
    
    # build the testing DataFrame: one 'yr' row per season, run through the
    # same VectorAssembler used for training
    test_rdd = sc.parallelize(all_yrs)
    row = Row('yr')
    all_years_features = t.transform(test_rdd.map(row).toDF())
    
    # apply linear regression model
    df_results = model.transform(all_years_features).toPandas()
    
    # Dashed line: model prediction for all years (training + future);
    # solid line: observed attempt rate for the training years.
    plt.plot(df_results.yr,df_results.prediction, linewidth = 2, linestyle = '--',color = '#fc4f30', label = 'L2 Fit')
    plt.plot(training_yrs, training_y, color = '#00a79c', label = None)
    plt.xlabel('Year')
    plt.ylabel('Number of attempts')
    plt.legend(loc = 4)
    _=plt.title('Player average 3-point attempts (per 36 minutes)')
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the output
    # folder exists before writing the image.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/model_prediction.png')
    
    image.png

    投篮数据

    除赛季总计数据外,我们还处理和分析NBA投篮图(shot chart)数据,以观察三分球革命对投篮选择的影响。投篮图数据来自NBAsavant(https://www.nbasavant.com),该网站的数据源自NBA.com和ESPN。

    投篮图数据包含每位球员每次出手的xy坐标、比赛日期、出手时间、出手距离、是否命中的标志等字段。我们汇总了2010-11赛季至2015-16赛季期间,所有单赛季出手至少1000次的球员赛季数据。

    如前所述,我们可以将CSV数据读入Spark DataFrame。

    # Switch back to matplotlib's default style (with seaborn's white axes)
    # so the shot charts render cleanly.
    plt.style.use('default')
    sns.set_style("white")
    
    # Load the shot chart CSV with a header row and inferred column types.
    shot_reader = spark.read.options(header='true', inferSchema='true')
    df = shot_reader.csv('data/shot_charts_top_10/1000_plus_shot_charts_2011_2016.csv')
    df.cache()  # keep the data around for the repeated queries below
    print(df.count())
    # Preview the ten earliest shots by game date.
    preview_cols = ['yr', 'name', 'game_date', 'shot_distance', 'x', 'y', 'shot_made_flag']
    df.orderBy('game_date').limit(10).toPandas()[preview_cols]
    
    243719
    
    yr  name    game_date   shot_distance   x   y   shot_made_flag
    0   2011    LaMarcus Aldridge   2010-10-26  1   4   11  0
    1   2011    Paul Pierce 2010-10-26  25  67  246 1
    2   2011    Paul Pierce 2010-10-26  18  165 83  0
    3   2011    Paul Pierce 2010-10-26  24  159 186 0
    4   2011    Paul Pierce 2010-10-26  24  198 148 1
    5   2011    Paul Pierce 2010-10-26  23  231 4   1
    6   2011    Paul Pierce 2010-10-26  1   -7  9   0
    7   2011    Paul Pierce 2010-10-26  0   -2  -5  1
    8   2011    LaMarcus Aldridge   2010-10-26  21  39  211 0
    9   2011    LaMarcus Aldridge   2010-10-26  8   -82 23  0
    

    我们可以查询个人球员和赛季,并可视化他们的投篮位置。

    以Steph Curry的2015-2016历史性的投射赛季为例。

    
    # Select one player-season and draw its shot locations as a hexbin chart.
    player = 'Stephen Curry'
    yr = '2016'
    # NOTE(review): the `y < 400` cut presumably drops very long heaves so
    # they do not stretch the chart -- confirm against the data.
    df_steph = df.filter('''name == "{player}"
                            and yr == {yr}
                            and y < 400'''.format(player = player,yr = yr))
    
    # Collect both coordinates in ONE action so every x is paired with the y
    # of the same row; two separate collects would not guarantee that.
    shot_rows = df_steph.select('x', 'y').collect()
    x = np.array([r[0] for r in shot_rows])
    y = np.array([r[1] for r in shot_rows])
    p=plot_shot_chart(x, y, gridsize = 30,
                          kind='hex',
                          label='Steph Curry\n2016')
    # Make sure the output folder exists before writing the image.
    os.makedirs('results', exist_ok=True)
    p.savefig('results/steph_curry_2016_shotchart.png')
    
    image.png
    # For each (distance, corner_3, normal_3, is_a_3) bucket: count attempts
    # and average the made flag to get the make rate.
    per_bucket = df.groupBy('shot_distance', 'corner_3', 'normal_3', 'is_a_3').agg(
        count('*').alias('num_attempts'),
        mean(df.shot_made_flag).alias('shot_accuracy'),
    )
    # Expected points per shot: make rate times 3 for threes, times 2 otherwise.
    shot_value = when(col('is_a_3') == 1, col('shot_accuracy') * 3).otherwise(col('shot_accuracy') * 2)
    # Keep buckets with more than 5 attempts, ordered by distance, as pandas.
    shot_acc = (
        per_bucket.withColumn('points_per_shot', shot_value)
        .filter('num_attempts > 5')
        .orderBy('shot_distance')
        .toPandas()
    )
    
    plt.style.use('fivethirtyeight')
    
    def plot_acc_vs_dist(df, kwargs=None):
        """Plot points per shot against shot distance on the current axes.

        Args:
            df: object exposing ``shot_distance`` and ``points_per_shot``
                attributes (a pandas DataFrame in this notebook).
            kwargs: optional dict of keyword arguments forwarded to
                ``plt.plot``; ``None`` means no extra arguments.
        """
        # A None default avoids sharing one mutable dict across calls.
        plt.plot(df.shot_distance, df.points_per_shot, **(kwargs or {}))
        
    # Draw 2-point and 3-point buckets as two separate lines (same color) so
    # the curve is not connected across the 2-to-3 boundary.
    plot_acc_vs_dist(shot_acc.query('is_a_3 == False'), {'color' : '#008fd5'})
    plot_acc_vs_dist(shot_acc.query('is_a_3 == True'), {'color' : '#008fd5'})
    plt.title('Shot value vs. shot distance, 2011-2016 seasons\n Players with 1000+ attempts in a season', size = 14)
    plt.xlim(0,30)
    plt.xlabel('Shot Distance (ft)')
    plt.ylabel('Points per shot')
    plt.annotate('high efficiency 2s', xy=(2., 1.15), xytext=(4.5, 1.28),
                arrowprops=dict(facecolor='grey', shrink=0.05),
                )
    plt.annotate('high efficiency 3s', xy=(22, 1.15), xytext=(13.5, 1.15),
                arrowprops=dict(facecolor='grey', shrink=0.05),
                )
    plt.text(22, 1.28, 'corner 3s', fontsize = 12)
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the output
    # folder exists before writing the image.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/pps.png')
    
    image.png

    相关文章

      网友评论

        本文标题:案例3-NBA

        本文链接:https://www.haomeiwen.com/subject/hdscoxtx.html