美文网首页
科比数据集的处理和预测(机器学习)

科比数据集的处理和预测(机器学习)

作者: Radiance_sty | 来源:发表于2019-03-20 13:28 被阅读0次

    练习:科比数据集的处理和预测

    • 数据导入

       import matplotlib.pyplot as plt
       import pandas as pd

       # pandas truncates wide or long frames when printing; lift those limits
       # so the whole table is visible while exploring the data.
       pd.set_option('display.max_columns', None)    # show every column
       pd.set_option('display.max_rows', None)       # show every row
       pd.set_option('display.max_colwidth', 100)    # max characters shown per cell (default 50)
       pd.set_option('display.width', 1000)          # wrap printed output only past 1000 characters

       # Load the data set.
       filename = "data.csv"
       raw = pd.read_csv(filename)

       print(raw.shape)
       print(raw.head())
      
    输出结果为:
    • 散点图:通过图表观察数据可以看出,有两组数值不同的坐标(loc_x、loc_y 与 lon、lat),但它们表达的含义相同(都是投篮位置),所以数据处理时只需保留其中一组

      # Keep only the rows whose shot outcome is known (shot_made_flag is not null).
      kobe = raw[pd.notnull(raw['shot_made_flag'])]
      print(kobe.shape)

      # Plot the shot positions in both coordinate systems, side by side.
      alpha = 0.02                          # opacity; keep it low so dense regions stay readable
      plt.figure(figsize=(10, 10))

      # Single-letter colour codes must be lowercase: current matplotlib
      # releases reject 'R' / 'B' as invalid colours.
      plt.subplot(121)
      plt.scatter(kobe.loc_x, kobe.loc_y, color='r', alpha=alpha)
      plt.title('loc_x and loc_y')

      plt.subplot(122)
      plt.scatter(kobe.lon, kobe.lat, color='b', alpha=alpha)
      plt.title('lat and lon')

      plt.show()
      
    散点图
    • 特征提取

      import numpy as np
      import pandas as pd
      
      file_name = 'data.csv'
      raw = pd.read_csv(file_name)

      # Feature extraction: convert the court coordinates to polar form.
      raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)

      # arctan(loc_y / loc_x) is undefined where loc_x == 0, so initialise every
      # row to pi/2 (a float column) and overwrite only the rows with a non-zero
      # loc_x.  A single .loc assignment is used instead of chained indexing
      # (raw['angle'][mask] = ...), which writes to a temporary object and is
      # not guaranteed to update `raw`.
      loc_x_zero = raw['loc_x'] == 0
      raw['angle'] = np.pi / 2
      raw.loc[~loc_x_zero, 'angle'] = np.arctan(
          raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
      )

      # Time left in the period, in seconds.
      raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']

      # Labelled rows only; taken after the feature columns exist so that
      # `kobe` carries them as well.
      kobe = raw[pd.notnull(raw['shot_made_flag'])]

      # unique() lists the distinct values of a column.
      print(kobe.action_type.unique())
      print(kobe.combined_shot_type.unique())

      print(kobe.shot_type.unique())
      print(kobe.shot_type.value_counts())

      print(kobe.season.unique())
      print(kobe.season.value_counts())
      
    运行结果为:
    # Plot the derived 'dist' column against the data set's own shot_distance.
    plt.figure(figsize=(5, 5))
    plt.scatter(raw['dist'], raw['shot_distance'], color='blue')
    plt.title('dist and shot_distance')
    plt.show()

    # Count the shots per court area, then print how many distinct areas exist.
    gs = kobe.groupby('shot_zone_area')
    print(kobe.shot_zone_area.value_counts())
    print(gs.ngroups)
    
    运行结果为:
    • 画图-对科比投篮的区域进行统计

      # `cm` provides the rainbow colormap used below; none of the earlier
      # snippets import it, so bring it into scope here.
      from matplotlib import cm

      # Plot the shots coloured by each of the three court-zone features.
      plt.figure(figsize=(20, 10))

      def scatter_plot_by_category(feat):
          """Scatter the shots in `kobe` on the current axes, one colour per value of `feat`.

          `feat` is the name of the column of `kobe` to group by.  Each group
          is drawn at (loc_x, loc_y) with its own colour taken from the
          rainbow colormap.
          """
          alpha = 0.1
          gs = kobe.groupby(feat)
          # One evenly spaced colormap sample per group.
          cs = cm.rainbow(np.linspace(0, 1, len(gs)))
          for (_, group), c in zip(gs, cs):
              plt.scatter(group.loc_x, group.loc_y, color=c, alpha=alpha)

      # shot_zone_area
      plt.subplot(131)
      scatter_plot_by_category('shot_zone_area')
      plt.title('shot_zone_area')

      # shot_zone_basic
      plt.subplot(132)
      scatter_plot_by_category('shot_zone_basic')
      plt.title('shot_zone_basic')

      # shot_zone_range
      plt.subplot(133)
      scatter_plot_by_category('shot_zone_range')
      plt.title('shot_zone_range')

      plt.show()
      
    运行结果如下:可以看出,这三列数据的内容不同,但实际表达的意思相同(都是对投篮区域的划分),所以数据处理时可以只保留其中一列
    • 删除不重要的列,同时对一些字符进行one-hot处理

      # Drop the columns that are not used as features; the categorical text
      # columns are one-hot encoded afterwards.
      drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic', 'matchup', 'lon',
         'lat', 'seconds_remaining', 'minutes_remaining','shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id',
         'game_date']
      # `columns=` replaces the positional axis argument (raw.drop(name, 1)),
      # which pandas 2.x no longer accepts.
      raw = raw.drop(columns=drops)

      print(raw['combined_shot_type'].value_counts())
      # One-hot encode with the prefix 'combined_shot_type' and show the first two rows.
      x = pd.get_dummies(raw['combined_shot_type'], prefix='combined_shot_type')[0:2]
      print(x)
      

    运行结果为:


    独热编码可以参考:https://www.cnblogs.com/lianyingteng/p/7755545.html
    • 开始训练模型,判断科比能否进球

      import numpy as np
      import pandas as pd
      import time
      
      from sklearn.ensemble import  RandomForestRegressor, RandomForestClassifier
      from sklearn.metrics import confusion_matrix, log_loss
      from sklearn.model_selection import KFold
      
      import  matplotlib.pyplot as plt
      
      file_name = 'data.csv'
      raw = pd.read_csv(file_name)

      # Derive the engineered features on the freshly loaded frame BEFORE the
      # columns they are built from are dropped; otherwise the model would see
      # no shot-position or game-clock information at all.
      raw['dist'] = np.sqrt(raw['loc_x']**2 + raw['loc_y']**2)
      loc_x_zero = raw['loc_x'] == 0
      raw['angle'] = np.pi / 2                  # arctan(y / x) is undefined where loc_x == 0
      raw.loc[~loc_x_zero, 'angle'] = np.arctan(
          raw.loc[~loc_x_zero, 'loc_y'] / raw.loc[~loc_x_zero, 'loc_x']
      )
      raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']

      # Drop the columns that are not used as features.
      drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic', 'matchup', 'lon',
         'lat', 'seconds_remaining', 'minutes_remaining','shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id',
         'game_date']
      # Keyword arguments are used throughout: pandas 2.x rejects a positional axis.
      raw = raw.drop(columns=drops)

      # One-hot encode the categorical columns.
      categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
      for var in categorical_vars:
          raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)   # append the dummy columns
          raw = raw.drop(columns=var)

      # Data preparation is done.  Rows with a known 'shot_made_flag' form the
      # training set; rows where it is missing form the test set.
      train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
      train_label = train_kobe['shot_made_flag']
      train_kobe = train_kobe.drop(columns='shot_made_flag')
      test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
      test_kobe = test_kobe.drop(columns='shot_made_flag')

      # Tune the random forest.  To keep the run short only three values
      # (1, 10, 100) are tried for the number of trees and for the depth.
      print('Finding best n_estimators for RandomForestClassifier...')
      min_score = 100000
      best_n = 0
      scores_n = []
      range_n = np.logspace(0, 2, num=3).astype(int)                            # geometric sequence 1, 10, 100
      kf = KFold(n_splits=10, shuffle=True)

      for n in range_n:
          print('the number of trees : {0}'.format(n))
          t1 = time.time()

          rfc_score = 0.
          # Tune the same estimator type that is fitted at the end (a
          # classifier), not a regressor.
          rfc = RandomForestClassifier(n_estimators=n)
          for train_k, test_k in kf.split(train_kobe):
              rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])      # features, labels

              # log_loss expects probabilities, not hard 0/1 predictions;
              # column 1 of predict_proba is P(shot_made_flag == 1).
              pred = rfc.predict_proba(train_kobe.iloc[test_k])[:, 1]
              rfc_score += log_loss(train_label.iloc[test_k], pred) / kf.get_n_splits()   # mean over folds
          scores_n.append(rfc_score)
          if rfc_score < min_score:
              min_score = rfc_score
              best_n = n
          t2 = time.time()
          print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
      print(best_n, min_score)

      # Find the best max_depth for RandomForestClassifier.
      print('Finding best max_depth for RandomForestClassifier...')
      min_score = 100000
      best_m = 0
      scores_m = []
      range_m = np.logspace(0, 2, num=3).astype(int)
      kf = KFold(n_splits=10, shuffle=True)

      for m in range_m:
          print("the max depth : {0}".format(m))
          t1 = time.time()

          rfc_score = 0.
          rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
          for train_k, test_k in kf.split(train_kobe):
              rfc.fit(train_kobe.iloc[train_k], train_label.iloc[train_k])
              pred = rfc.predict_proba(train_kobe.iloc[test_k])[:, 1]
              rfc_score += log_loss(train_label.iloc[test_k], pred) / kf.get_n_splits()
          scores_m.append(rfc_score)
          if rfc_score < min_score:
              min_score = rfc_score
              best_m = m

          t2 = time.time()
          # This loop varies the depth, not the tree count, so say so in the log.
          print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2 - t1))
      print(best_m, min_score)
      
    运行结果为:
    • 画图

      # Draw the cross-validated score for each tried value: tree count on
      # the left panel, maximum depth on the right panel.
      plt.figure(figsize=(10, 5))
      panels = (
          (121, range_n, scores_n, 'number of trees'),
          (122, range_m, scores_m, 'max depth'),
      )
      for position, tried_values, tried_scores, x_label in panels:
          plt.subplot(position)
          plt.plot(tried_values, tried_scores)
          plt.ylabel('score')
          plt.xlabel(x_label)
      plt.show()

      # Fit the final classifier on all labelled rows with the best settings
      # found above; it can then be used for prediction.
      final_params = {'n_estimators': best_n, 'max_depth': best_m}
      model = RandomForestClassifier(**final_params)
      model.fit(train_kobe, train_label)
      
    输出结果为:

    相关文章

      网友评论

          本文标题:科比数据集的处理和预测(机器学习)

          本文链接:https://www.haomeiwen.com/subject/vbtxvqtx.html