加载数据集
>>> import pandas as pd
>>> nba = pd.read_csv(r"d:\nbaallelo.csv")
>>> type(nba)
<class 'pandas.core.frame.DataFrame'>
>>> len(nba)
126314
>>> nba.shape
(126314, 23)
>>> nba.head()
gameorder game_id lg_id ... game_result forecast notes
0 1 194611010TRH NBA ... L 0.640065 NaN
1 1 194611010TRH NBA ... W 0.359935 NaN
2 2 194611020CHS NBA ... W 0.631101 NaN
3 2 194611020CHS NBA ... L 0.368899 NaN
4 3 194611020DTF NBA ... L 0.640065 NaN
[5 rows x 23 columns]
>>> pd.set_option("display.max_columns", None) # 显示所有列
>>> nba.head()
gameorder game_id lg_id _iscopy year_id date_game seasongame \
0 1 194611010TRH NBA 0 1947 11/1/1946 1
1 1 194611010TRH NBA 1 1947 11/1/1946 1
2 2 194611020CHS NBA 0 1947 11/2/1946 1
3 2 194611020CHS NBA 1 1947 11/2/1946 2
4 3 194611020DTF NBA 0 1947 11/2/1946 1
is_playoffs team_id fran_id pts elo_i elo_n win_equiv opp_id \
0 0 TRH Huskies 66 1300.0000 1293.2767 40.294830 NYK
1 0 NYK Knicks 68 1300.0000 1306.7233 41.705170 TRH
2 0 CHS Stags 63 1300.0000 1309.6521 42.012257 NYK
3 0 NYK Knicks 47 1306.7233 1297.0712 40.692783 CHS
4 0 DTF Falcons 33 1300.0000 1279.6189 38.864048 WSC
opp_fran opp_pts opp_elo_i opp_elo_n game_location game_result \
0 Knicks 68 1300.0000 1306.7233 H L
1 Huskies 66 1300.0000 1293.2767 A W
2 Knicks 47 1306.7233 1297.0712 H W
3 Stags 63 1300.0000 1309.6521 A L
4 Capitols 50 1300.0000 1320.3811 H L
forecast notes
0 0.640065 NaN
1 0.359935 NaN
2 0.631101 NaN
3 0.368899 NaN
4 0.640065 NaN
>>> pd.set_option("display.max_columns", 8) # 最多显示8列
>>> nba.head()
gameorder game_id lg_id _iscopy ... game_location game_result \
0 1 194611010TRH NBA 0 ... H L
1 1 194611010TRH NBA 1 ... A W
2 2 194611020CHS NBA 0 ... H W
3 2 194611020CHS NBA 1 ... A L
4 3 194611020DTF NBA 0 ... H L
forecast notes
0 0.640065 NaN
1 0.359935 NaN
2 0.631101 NaN
3 0.368899 NaN
4 0.640065 NaN
[5 rows x 23 columns]
>>> pd.set_option("display.precision", 2) # 设置浮点数的精度
>>> nba.tail()
gameorder game_id lg_id _iscopy ... game_location \
126309 63155 201506110CLE NBA 0 ... H
126310 63156 201506140GSW NBA 0 ... H
126311 63156 201506140GSW NBA 1 ... A
126312 63157 201506170CLE NBA 0 ... H
126313 63157 201506170CLE NBA 1 ... A
game_result forecast notes
126309 L 0.55 NaN
126310 W 0.77 NaN
126311 L 0.23 NaN
126312 L 0.48 NaN
126313 W 0.52 NaN
[5 rows x 23 columns]


数据集文件'nbaallelo.csv'可以在QQ群630011153、144081101中找到。
了解数据集
>>> nba.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126314 entries, 0 to 126313
Data columns (total 23 columns):
gameorder 126314 non-null int64
game_id 126314 non-null object
lg_id 126314 non-null object
_iscopy 126314 non-null int64
year_id 126314 non-null int64
date_game 126314 non-null object
seasongame 126314 non-null int64
is_playoffs 126314 non-null int64
team_id 126314 non-null object
fran_id 126314 non-null object
pts 126314 non-null int64
elo_i 126314 non-null float64
elo_n 126314 non-null float64
win_equiv 126314 non-null float64
opp_id 126314 non-null object
opp_fran 126314 non-null object
opp_pts 126314 non-null int64
opp_elo_i 126314 non-null float64
opp_elo_n 126314 non-null float64
game_location 126314 non-null object
game_result 126314 non-null object
forecast 126314 non-null float64
notes 5424 non-null object
dtypes: float64(6), int64(7), object(10)
memory usage: 22.2+ MB
>>> nba.describe()
gameorder _iscopy ... opp_elo_n forecast
count 126314.000000 126314.000000 ... 126314.000000 126314.000000
mean 31579.000000 0.500000 ... 1495.236055 0.500000
std 18231.927643 0.500002 ... 112.461687 0.215252
min 1.000000 0.000000 ... 1085.774400 0.020447
25% 15790.000000 0.000000 ... 1416.994900 0.327989
50% 31579.000000 0.500000 ... 1500.954400 0.500000
75% 47368.000000 1.000000 ... 1576.291625 0.672011
max 63157.000000 1.000000 ... 1853.104500 0.979553
[8 rows x 13 columns]
>>> import numpy as np # nba.describe() 默认只分析数值列,分析其他类型需要传递include=object(np.object已在NumPy 1.20中弃用、1.24中移除)
>>> nba.describe(include=object)
game_id lg_id ... game_result notes
count 126314 126314 ... 126314 5424
unique 63157 2 ... 2 231
top 201403190MEM NBA ... L at New York NY
freq 2 118016 ... 63157 440
[4 rows x 10 columns]


探索数据集
>>> nba["team_id"].value_counts()
BOS 5997
NYK 5769
LAL 5078
DET 4985
PHI 4533
...
INJ 60
DTF 60
PIT 60
TRH 60
SDS 11
Name: team_id, Length: 104, dtype: int64
>>> nba["fran_id"].value_counts()
Lakers 6024
Celtics 5997
Knicks 5769
Warriors 5657
Pistons 5650
Sixers 5644
Hawks 5572
Kings 5475
Wizards 4582
Spurs 4309
Bulls 4307
Pacers 4227
Thunder 4178
Rockets 4154
Nuggets 4120
Nets 4106
Suns 4080
Bucks 4034
Trailblazers 3870
Cavaliers 3810
Clippers 3733
Jazz 3555
Mavericks 3013
Heat 2371
Pelicans 2254
Magic 2207
Timberwolves 2131
Grizzlies 1657
Raptors 1634
Hornets 894
Colonels 846
Squires 799
Spirits 777
Stars 756
Sounds 697
Baltimore 467
Floridians 440
Condors 430
Capitols 291
Olympians 282
Sails 274
Stags 260
Bombers 249
Steamrollers 168
Packers 72
Redskins 65
Rebels 63
Denver 62
Waterloo 62
Huskies 60
Falcons 60
Ironmen 60
Jets 60
Name: fran_id, dtype: int64
>>> nba.loc[nba["fran_id"] == "Lakers", "team_id"].value_counts()
LAL 5078
MNL 946
Name: team_id, dtype: int64
>>> # 注意:date_game列是字符串(object类型),下面的min/max是按字典序比较,并非按日期先后;要得到真实的最早/最晚日期,需先用pd.to_datetime转换
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].min()
'1/1/1949'
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].max()
'4/9/1959'
>>> nba.loc[nba["team_id"] == "MNL", "date_game"].agg(("min", "max"))
min 1/1/1949
max 4/9/1959
Name: date_game, dtype: object
>>> nba.loc[nba["team_id"] == "BOS", "pts"].sum()
626484
pandas数据结构基础
- 序列(Series)
>>> revenues = pd.Series([5555, 7000, 1980])
>>> revenues.values
array([5555, 7000, 1980], dtype=int64)
>>> revenues.index
RangeIndex(start=0, stop=3, step=1)
>>> type(revenues.values)
<class 'numpy.ndarray'>
>>> city_revenues = pd.Series(
...     [4200, 8000, 6500],
...     index=["Amsterdam", "Toronto", "Tokyo"]
... )
>>> city_revenues
Amsterdam 4200
Toronto 8000
Tokyo 6500
dtype: int64
>>> city_employee_count = pd.Series({"Amsterdam": 5, "Tokyo": 8})
>>> city_employee_count
Amsterdam 5
Tokyo 8
dtype: int64
>>> city_employee_count.keys()
Index(['Amsterdam', 'Tokyo'], dtype='object')
>>> "Tokyo" in city_employee_count
True
>>> "New York" in city_employee_count
False
- DataFrame
>>> city_data = pd.DataFrame({
...     "revenue": city_revenues,
...     "employee_count": city_employee_count
... })
>>> city_data
revenue employee_count
Amsterdam 4200 5.0
Tokyo 6500 8.0
Toronto 8000 NaN
>>> city_data.index
Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')
>>> city_data.values
array([[4.2e+03, 5.0e+00],
[6.5e+03, 8.0e+00],
[8.0e+03, nan]])
>>> city_data.axes
[Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object'),
Index(['revenue', 'employee_count'], dtype='object')]
>>> city_data.axes[0]
Index(['Amsterdam', 'Tokyo', 'Toronto'], dtype='object')
>>> city_data.axes[1]
Index(['revenue', 'employee_count'], dtype='object')
>>> city_data.keys() # 注意这里是列
Index(['revenue', 'employee_count'], dtype='object')
>>> "Amsterdam" in city_data
False
>>> "revenue" in city_data
True
>>> nba.index
RangeIndex(start=0, stop=126314, step=1)
>>> nba.axes
[RangeIndex(start=0, stop=126314, step=1),
Index(['gameorder', 'game_id', 'lg_id', '_iscopy', 'year_id', 'date_game',
'seasongame', 'is_playoffs', 'team_id', 'fran_id', 'pts', 'elo_i',
'elo_n', 'win_equiv', 'opp_id', 'opp_fran', 'opp_pts', 'opp_elo_i',
'opp_elo_n', 'game_location', 'game_result', 'forecast', 'notes'],
dtype='object')]
>>> "points" in nba.keys()
False
>>> "pts" in nba.keys()
True
访问序列(Series)元素
>>> city_revenues["Toronto"]
8000
>>> city_revenues[1] # 按位置访问;新版pandas已弃用在标签索引的Series上用[]按位置取值,建议写成city_revenues.iloc[1]
8000
>>> city_revenues[-1] # 同上,建议写成city_revenues.iloc[-1]
6500
>>> city_revenues[1:]
Toronto 8000
Tokyo 6500
dtype: int64
>>> city_revenues["Toronto":]
Toronto 8000
Tokyo 6500
dtype: int64
>>> colors = pd.Series(
...     ["red", "purple", "blue", "green", "yellow"],
...     index=[1, 2, 3, 5, 8]
... )
>>> colors # 此时colors[1]有歧义
1 red
2 purple
3 blue
5 green
8 yellow
dtype: object
>>> colors.loc[1] # loc和iloc的性能更好
'red'
>>> colors.iloc[1]
'purple'
>>> colors.iloc[1:3]
2 purple
3 blue
dtype: object
>>> colors.loc[3:8] # 注意loc包含最后一个元素,iloc不包含
3 blue
5 green
8 yellow
dtype: object

网友评论