import pandas as pd
import numpy as np
s = pd.Series([1,3,6,np,np.nan,44,1])
dates = pd.date_range('20160101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
df2 = pd.DataFrame({"A":1.,"B":pd.Categorical(["test",'tesst'])})
print('S',s)
print('data',dates)
print('df',df)
print('df1',df1)
print('df1',df2)
S 0 1
1 3
2 6
3 <module 'numpy' from 'E:\\program\\Anaconda3\\...
4 NaN
5 44
6 1
dtype: object
data DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')
df a b c d
2016-01-01 -1.018450 0.116236 0.171505 -0.989516
2016-01-02 0.608064 -0.523510 0.335354 -0.895313
2016-01-03 2.804357 -0.105679 -0.821816 -0.340312
2016-01-04 1.239038 -1.592973 -0.060665 -1.785517
2016-01-05 -0.307189 -1.255811 -0.420571 1.545740
2016-01-06 -1.469857 2.646400 0.101558 0.438751
df1 0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
df1 A B
0 1.0 test
1 1.0 tesst
print("df2.dtypes",df2.dtypes)
print("df2.index",df2.index)
print("df2.columns",df2.columns)
print("df2.values",df2.values)
print("df2.describe",df2.describe())
print("df2.T",df2.T)
print("df2.sort_index",df2.sort_index(axis=1,ascending=False))
print("df2.sort_index",df2.sort_values(by='B'))
df2.dtypes A float64
B category
dtype: object
df2.index RangeIndex(start=0, stop=2, step=1)
df2.columns Index(['A', 'B'], dtype='object')
df2.values [[1.0 'test']
[1.0 'tesst']]
df2.describe A
count 2.0
mean 1.0
std 0.0
min 1.0
25% 1.0
50% 1.0
75% 1.0
max 1.0
df2.T 0 1
A 1 1
B test tesst
df2.sort_index B A
0 test 1.0
1 tesst 1.0
df2.sort_index A B
1 1.0 tesst
0 1.0 test
选择数据
dates=pd.date_range('20130101',periods=6)
df=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)
print(df['A'],df.A)
print(df[0:3],'\n\n',df["20130102":"20130104"])
# select by label:loc
print(df.loc['20130102'])
print(df.loc[:,["A","B"]])
# select by position:iloc
print(df.iloc[3,1])
print(df.iloc[3:5,1])
# mixed selection:ix
print(df.ix[:3,['A','C']])
#Boolean indexing
print(df[df.A>8])
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32 2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
A B C D
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
A 4
B 5
C 6
D 7
Name: 2013-01-02 00:00:00, dtype: int32
A B
2013-01-01 0 1
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21
13
2013-01-04 13
2013-01-05 17
Freq: D, Name: B, dtype: int32
A C
2013-01-01 0 2
2013-01-02 4 6
2013-01-03 8 10
A B C D
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
E:\program\Anaconda3\lib\site-packages\ipykernel_launcher.py:13: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
del sys.path[0]
设置值
df.iloc[2,2] = 11111
df['Z'] = 0
df["E"]=pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101',periods=6))
print(df)
A B C D Z E
2013-01-01 0 1 2 3 0 1
2013-01-02 4 5 6 7 0 2
2013-01-03 8 9 11111 11 0 3
2013-01-04 12 13 14 15 0 4
2013-01-05 16 17 18 19 0 5
2013-01-06 20 21 22 23 0 6
处理丢失值
dates=pd. date_range('20130101', periods=6)
df=pd. DataFrame(np. arange(24). reshape((6,4)), index=dates, columns=['A','B','C','D'])
df.iloc[0,1]=np. nan
df.iloc[1,2]=np. nan
print(df)
print(df.dropna())#丢掉行
print(df.dropna(axis=0,how='any'))#丢掉行how={fany','all}
print(df.fillna(value=0))#填充0
print(np.any(df.isna()) == True)#判断有无null
A B C D
2013-01-01 0 NaN 2.0 3
2013-01-02 4 5.0 NaN 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
A B C D
2013-01-01 0 0.0 2.0 3
2013-01-02 4 5.0 0.0 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
True
导入导出
read_csv
read_excel
read_hdf
read_sql
read_json
read_msgpack(experimental)
read_html
read_gbg(experimental)
read_stata
read_sas
read_clijpboard
read_pickle
to_csv
to_excel
to_hdf
to_sql
to_json
to_msgpack(experimental)
to_html
to_gbg(experimental)
to_stata
to_sas
to_clijpboard
to_pickle
合并
# concatenating
df1=pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2=pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(res)
# join,['inner','outer']
df1=pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2=pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'])
res1 = pd.concat([df1,df2],join='outer',ignore_index=True)
res2 = pd.concat([df1,df2],join='inner',ignore_index=True)
print(res1,res2)
# join_axes
dfl=pd. DataFrame(np. ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2=pd. DataFrame(np. ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
res=pd. concat([ df1, df2], axis=1, join_axes=[ df1. index])
print(res)
# append
df1=pd. DataFrame(np. ones((3,4))*0, columns=['a','b','c','d'])
df2=pd. DataFrame(np. ones((3,4))*1, columns=['a','b','c','d'])
df3=pd. DataFrame(np. ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
res=dfl. append(df2, ignore_index=True)
res=dfl. append([ df2, df3])
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
E:\program\Anaconda3\lib\site-packages\ipykernel_launcher.py:11: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
# This is added back by InteractiveShellApp.init_path()
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0 b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
a b c d b c d e
0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
0 1.0 1.0 1.0 1.0 NaN
1 1.0 1.0 1.0 1.0 NaN
2 1.0 1.0 1.0 1.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
E:\program\Anaconda3\lib\site-packages\pandas\core\frame.py:6211: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
sort=sort)
# merging two df by key/keys.(may be used in database)
# simple example
left=pd. DataFrame({"key":['KO','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['BO','B1','B2','B3']})
right=pd. DataFrame({"key":['K0','K1','K2','K3'],
'C':['CO','C1','C2','C3'],
'D':['DO','D1','D2','D3']})
print(left,right)
res = pd.merge(left,right,on='key')
print(res)
# consider two keys
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
res = pd.merge(left, right, on=['key1', 'key2'], how='inner') # default for how='inner'
print(res)
# how = ['left', 'right', 'outer', 'inner']
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
# indicator
df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df1)
print(df2)
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
# merged by index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
print(left)
print(right)
# left_index and right_index
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res)
# join function in pandas is similar with merge. If know merge, you will understand join
key A B
0 KO A0 BO
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3 key C D
0 K0 CO DO
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
key A B C D
0 K1 A1 B1 C1 D1
1 K2 A2 B2 C2 D2
2 K3 A3 B3 C3 D3
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
col1 col_left
0 0 a
1 1 b
col1 col_right
0 1 2
1 2 2
2 2 2
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C2 D2
K3 C3 D3
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
K3 NaN NaN C3 D3
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C2 D2
k age_boy age_girl
0 K0 1 4
1 K0 1 5
plt画图
import matplotlib.pyplot as plt
# plot data
# Series
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
data = data.cumsum()
##data.plot()
# DataFrame
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', scatter', hexbin', 'pie'
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label="Class 1")
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class 2', ax=ax)
plt.show()
参考
https://morvanzhou.github.io/tutorials/data-manipulation/np-pd/
网友评论