美文网首页
使用Pandas和Python探索数据集1

使用Pandas和Python探索数据集1

作者: python测试开发 | 来源:发表于2020-02-24 18:04 被阅读0次

加载数据集

>>> import pandas as pd
>>> nba = pd.read_csv(r"d:\nbaallelo.csv")
>>> type(nba)
<class 'pandas.core.frame.DataFrame'>
>>> len(nba)
126314
>>> nba.shape
(126314, 23)
>>> nba.head()
   gameorder       game_id lg_id  ...  game_result  forecast notes
0          1  194611010TRH   NBA  ...            L  0.640065   NaN
1          1  194611010TRH   NBA  ...            W  0.359935   NaN
2          2  194611020CHS   NBA  ...            W  0.631101   NaN
3          2  194611020CHS   NBA  ...            L  0.368899   NaN
4          3  194611020DTF   NBA  ...            L  0.640065   NaN

[5 rows x 23 columns]

>>> pd.set_option("display.max.columns", None) # 显示所有列

>>> nba.head()
   gameorder       game_id lg_id  _iscopy  year_id  date_game  seasongame  \
0          1  194611010TRH   NBA        0     1947  11/1/1946           1   
1          1  194611010TRH   NBA        1     1947  11/1/1946           1   
2          2  194611020CHS   NBA        0     1947  11/2/1946           1   
3          2  194611020CHS   NBA        1     1947  11/2/1946           2   
4          3  194611020DTF   NBA        0     1947  11/2/1946           1   

   is_playoffs team_id  fran_id  pts      elo_i      elo_n  win_equiv opp_id  \
0            0     TRH  Huskies   66  1300.0000  1293.2767  40.294830    NYK   
1            0     NYK   Knicks   68  1300.0000  1306.7233  41.705170    TRH   
2            0     CHS    Stags   63  1300.0000  1309.6521  42.012257    NYK   
3            0     NYK   Knicks   47  1306.7233  1297.0712  40.692783    CHS   
4            0     DTF  Falcons   33  1300.0000  1279.6189  38.864048    WSC   

   opp_fran  opp_pts  opp_elo_i  opp_elo_n game_location game_result  \
0    Knicks       68  1300.0000  1306.7233             H           L   
1   Huskies       66  1300.0000  1293.2767             A           W   
2    Knicks       47  1306.7233  1297.0712             H           W   
3     Stags       63  1300.0000  1309.6521             A           L   
4  Capitols       50  1300.0000  1320.3811             H           L   

   forecast notes  
0  0.640065   NaN  
1  0.359935   NaN  
2  0.631101   NaN  
3  0.368899   NaN  
4  0.640065   NaN  

>>> pd.set_option("display.max.columns", 8) # 显示8列

>>> nba.head()
   gameorder       game_id lg_id  _iscopy  ...  game_location game_result  \
0          1  194611010TRH   NBA        0  ...              H           L   
1          1  194611010TRH   NBA        1  ...              A           W   
2          2  194611020CHS   NBA        0  ...              H           W   
3          2  194611020CHS   NBA        1  ...              A           L   
4          3  194611020DTF   NBA        0  ...              H           L   

   forecast  notes  
0  0.640065    NaN  
1  0.359935    NaN  
2  0.631101    NaN  
3  0.368899    NaN  
4  0.640065    NaN  

[5 rows x 23 columns]

>>> pd.set_option("display.precision", 2) # 设置浮点数的精度

>>> nba.tail()
        gameorder       game_id lg_id  _iscopy  ...  game_location  \
126309      63155  201506110CLE   NBA        0  ...              H   
126310      63156  201506140GSW   NBA        0  ...              H   
126311      63156  201506140GSW   NBA        1  ...              A   
126312      63157  201506170CLE   NBA        0  ...              H   
126313      63157  201506170CLE   NBA        1  ...              A   

       game_result  forecast  notes  
126309           L      0.55    NaN  
126310           W      0.77    NaN  
126311           L      0.23    NaN  
126312           L      0.48    NaN  
126313           W      0.52    NaN  

[5 rows x 23 columns]

head.7c86dafd4141.png tail.0dc48c8c2803.png

数据文件 'nbaallelo.csv' 可以在 QQ 群 630011153、144081101 中找到。

了解数据集

>>> nba.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126314 entries, 0 to 126313
Data columns (total 23 columns):
gameorder        126314 non-null int64
game_id          126314 non-null object
lg_id            126314 non-null object
_iscopy          126314 non-null int64
year_id          126314 non-null int64
date_game        126314 non-null object
seasongame       126314 non-null int64
is_playoffs      126314 non-null int64
team_id          126314 non-null object
fran_id          126314 non-null object
pts              126314 non-null int64
elo_i            126314 non-null float64
elo_n            126314 non-null float64
win_equiv        126314 non-null float64
opp_id           126314 non-null object
opp_fran         126314 non-null object
opp_pts          126314 non-null int64
opp_elo_i        126314 non-null float64
opp_elo_n        126314 non-null float64
game_location    126314 non-null object
game_result      126314 non-null object
forecast         126314 non-null float64
notes            5424 non-null object
dtypes: float64(6), int64(7), object(10)
memory usage: 22.2+ MB

>>> nba.describe()

           gameorder        _iscopy  ...      opp_elo_n       forecast
count  126314.000000  126314.000000  ...  126314.000000  126314.000000
mean    31579.000000       0.500000  ...    1495.236055       0.500000
std     18231.927643       0.500002  ...     112.461687       0.215252
min         1.000000       0.000000  ...    1085.774400       0.020447
25%     15790.000000       0.000000  ...    1416.994900       0.327989
50%     31579.000000       0.500000  ...    1500.954400       0.500000
75%     47368.000000       1.000000  ...    1576.291625       0.672011
max     63157.000000       1.000000  ...    1853.104500       0.979553

[8 rows x 13 columns]

>>> import numpy as np # nba.describe() 默认只分析数值列;分析其他类型的列需要传递 include=object(旧写法 np.object 自 NumPy 1.24 起已移除)

>>> nba.describe(include=object)
             game_id   lg_id  ... game_result           notes
count         126314  126314  ...      126314            5424
unique         63157       2  ...           2             231
top     201403190MEM     NBA  ...           L  at New York NY
freq               2  118016  ...       63157             440

[4 rows x 10 columns]
describe.0be00956e704.png describe_object.2ec0a6039517.png

探索数据集

>>> nba["team_id"].value_counts()
BOS    5997
NYK    5769
LAL    5078
DET    4985
PHI    4533
       ...
INJ      60
DTF      60
PIT      60
TRH      60
SDS      11
Name: team_id, Length: 104, dtype: int64

>>> nba["fran_id"].value_counts()
Lakers          6024
Celtics         5997
Knicks          5769
Warriors        5657
Pistons         5650
Sixers          5644
Hawks           5572
Kings           5475
Wizards         4582
Spurs           4309
Bulls           4307
Pacers          4227
Thunder         4178
Rockets         4154
Nuggets         4120
Nets            4106
Suns            4080
Bucks           4034
Trailblazers    3870
Cavaliers       3810
Clippers        3733
Jazz            3555
Mavericks       3013
Heat            2371
Pelicans        2254
Magic           2207
Timberwolves    2131
Grizzlies       1657
Raptors         1634
Hornets          894
Colonels         846
Squires          799
Spirits          777
Stars            756
Sounds           697
Baltimore        467
Floridians       440
Condors          430
Capitols         291
Olympians        282
Sails            274
Stags            260
Bombers          249
Steamrollers     168
Packers           72
Redskins          65
Rebels            63
Denver            62
Waterloo          62
Huskies           60
Falcons           60
Ironmen           60
Jets              60
Name: fran_id, dtype: int64

>>> nba.loc[nba["fran_id"] == "Lakers", "team_id"].value_counts()
LAL    5078
MNL     946
Name: team_id, dtype: int64

>>> nba.loc[nba["team_id"] == "MNL", "date_game"].min() # 注意:date_game 是字符串(object)列,min/max 按字典序比较,结果不一定是真实的最早/最晚日期;需先用 pd.to_datetime 转换
'1/1/1949'

>>> nba.loc[nba["team_id"] == "MNL", "date_game"].max()
'4/9/1959'

>>> nba.loc[nba["team_id"] == "MNL", "date_game"].agg(("min", "max"))

min    1/1/1949
max    4/9/1959
Name: date_game, dtype: object

>>> nba.loc[nba["team_id"] == "BOS", "pts"].sum()
626484

pandas数据结构基础

  • 序列(Series)
>>> revenues = pd.Series([5555, 7000, 1980])
>>> revenues.values
array([5555, 7000, 1980], dtype=int64)

>>> revenues.index
RangeIndex(start=0, stop=3, step=1)

>>> type(revenues.values)
numpy.ndarray

>>> city_revenues = pd.Series(
    [4200, 8000, 6500],
    index=["Amsterdam", "Toronto", "Tokyo"]
)

>>> city_revenues
Amsterdam    4200
Toronto      8000
Tokyo        6500
dtype: int64

>>> city_employee_count = pd.Series({"Amsterdam": 5, "Tokyo": 8})

>>> city_employee_count
Amsterdam    5
Tokyo        8
dtype: int64

>>> city_employee_count.keys()
Index(['Amsterdam', 'Tokyo'], dtype='object')

>>> "Tokyo" in city_employee_count
True

>>> "New York" in city_employee_count
False
  • DataFrame
>>> city_data = pd.DataFrame({
    "revenue": city_revenues,
    "employee_count": city_employee_count
})

>>> city_data
           revenue  employee_count
Amsterdam     4200             5.0
Tokyo         6500             8.0
Toronto       8000             NaN

>>> city_data.index
Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')

>>> city_data.values

array([[4.2e+03, 5.0e+00],
       [6.5e+03, 8.0e+00],
       [8.0e+03,     nan]])

>>> city_data.axes

[Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'),
 Index(['revenue', 'employee_count'], dtype='object')]

>>> city_data.axes[0]
Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')

>>> city_data.axes[1]
Index(['revenue', 'employee_count'], dtype='object')

>>> city_data.keys()  # 注意这里是列
Index(['revenue', 'employee_count'], dtype='object')

>>> "Amsterdam" in city_data
False

>>> "revenue" in city_data
True

>>> nba.index
RangeIndex(start=0, stop=126314, step=1)

>>> nba.axes
[RangeIndex(start=0, stop=126314, step=1),
 Index(['gameorder', 'game_id', 'lg_id', '_iscopy', 'year_id', 'date_game',
        'seasongame', 'is_playoffs', 'team_id', 'fran_id', 'pts', 'elo_i',
        'elo_n', 'win_equiv', 'opp_id', 'opp_fran', 'opp_pts', 'opp_elo_i',
        'opp_elo_n', 'game_location', 'game_result', 'forecast', 'notes'],
       dtype='object')]

>>> "points" in nba.keys()
False
>>> "pts" in nba.keys()
True

访问序列(Series)元素

>>> city_revenues["Toronto"]
8000

>>> city_revenues[1]
8000

>>> city_revenues[-1]
6500

>>> city_revenues[1:]

Toronto    8000
Tokyo      6500
dtype: int64

>>> city_revenues["Toronto":]

Toronto    8000
Tokyo      6500
dtype: int64

>>> colors = pd.Series(
    ["red", "purple", "blue", "green", "yellow"],
    index=[1, 2, 3, 5, 8]
)

>>> colors # 索引是整数标签,此时 colors[1] 是按标签还是按位置取值存在歧义
1       red
2    purple
3      blue
5     green
8    yellow
dtype: object

>>> colors.loc[1] # loc 按标签访问,iloc 按位置访问;二者没有歧义,且性能更好
'red'
>>> colors.iloc[1]
'purple'

>>> colors.iloc[1:3]
2    purple
3      blue
dtype: object

>>> colors.loc[3:8] # 注意loc包含最后一个元素,iloc不包含
3      blue
5     green
8    yellow
dtype: object

相关文章

网友评论

      本文标题:使用Pandas和Python探索数据集1

      本文链接:https://www.haomeiwen.com/subject/sgwvqhtx.html