美文网首页
二、pandas

二、pandas

作者: 一闪一闪亮日日日日日日 | 来源:发表于2018-10-07 19:17 被阅读0次

1、数据读取

import pandas

# Load the 2013 NBA player statistics into a DataFrame.
nba_2013 = pandas.read_csv('nba_2013.csv')

# Print, in order: the object's type, the per-column dtypes, the first five
# rows (head() default), the last five rows (tail() default), the column
# labels and the (rows, columns) shape.
for summary in (
    type(nba_2013),
    nba_2013.dtypes,
    nba_2013.head(),
    nba_2013.tail(),
    nba_2013.columns,
    nba_2013.shape,
):
    print(summary)

print(type(nba_2013))的输出:
<class 'pandas.core.frame.DataFrame'>

2、索引与计算

import pandas

nba_2013 = pandas.read_csv('nba_2013.csv')
print(nba_2013.loc[0])  # the row whose index label is 0
print(nba_2013.loc[3:6])  # .loc slices are inclusive: rows labelled 3, 4, 5 and 6
print(nba_2013['player'])  # a single column
print(nba_2013[['player', 'age']])  # several columns, selected with a list of names
col_names = nba_2013.columns.tolist()  # column labels as a plain list
print(col_names)
# Keep only the columns whose name ends with a dot.
end_point_columns = [name for name in col_names if name.endswith('.')]
end_point = nba_2013[end_point_columns]
print(end_point)

添加变量:

# Element-wise arithmetic on a column yields a new Series of the same length.
a = nba_2013['age'].mul(10)
print(a)
print(nba_2013.shape)  # shape before the new column is attached
# Assigning to a new column label appends the Series as an extra column.
nba_2013['age*10'] = a
print(nba_2013)

3、数据排序

import pandas

nba_2013 = pandas.read_csv('nba_2013.csv')

# Sort the rows in place by age, youngest first (ascending is the default).
nba_2013.sort_values(by='age', ascending=True, inplace=True)
print(nba_2013['age'])

# Sort in place again, this time oldest first.
nba_2013.sort_values(by='age', ascending=False, inplace=True)
print(nba_2013['age'])

输出:
16 19
168 19
355 20
115 20
422 20
129 20
186 20
......
156 37
8 38
226 38
148 39
325 39
Name: age, Length: 481, dtype: int64
325 39
148 39
226 38
8 38
156 37
......
249 20
355 20
115 20
168 19
16 19
Name: age, Length: 481, dtype: int64

4、泰坦尼克数据简单处理

(1)首先导入数据,查看所有特征

import pandas
import numpy

# Show every column when a DataFrame is printed instead of eliding the middle ones.
pandas.options.display.max_columns = None
# Uncomment to show every row as well:
# pandas.options.display.max_rows = None
titanic = pandas.read_csv('titanic.csv')
print(titanic.columns)

输出:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')

(2)查找Age的缺失值,并计数

# Locate the passengers whose Age is missing and count them.
age = titanic['Age']
print(age.loc[0:10])  # label-based slice, so rows 0 through 10 inclusive
age_is_null = pandas.isnull(age)  # boolean Series: True where Age is missing
print(age_is_null)
age_is_NAN = age[age_is_null]  # boolean indexing keeps only the missing entries
print(age_is_NAN)
print(len(age_is_NAN)) # number of missing Age values: 177

输出:
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
5 NaN
6 54.0
7 2.0
8 27.0
9 14.0
10 4.0
Name: Age, dtype: float64
0 False
1 False
2 False
3 False
4 False
5 True
6 False
......
884 False
885 False
886 False
887 False
888 True
889 False
890 False
Name: Age, Length: 891, dtype: bool
5 NaN
17 NaN
19 NaN
26 NaN
28 NaN

......
863 NaN
868 NaN
878 NaN
888 NaN
Name: Age, Length: 177, dtype: float64
177

(3)含有缺失值时,直接求和再求均值会得到 nan,需要先去掉缺失值再计算

# The builtin sum() propagates NaN, so averaging the raw column yields nan.
mean_age = sum(titanic['Age']) / len(titanic['Age'])
print(mean_age)
# Keep only the entries that are not flagged as missing, then average those.
good_age = age[~age_is_null]
correct_mean_age = sum(good_age) / len(good_age)
print(correct_mean_age)

输出:
nan
29.69911764705882

(4)透视表(pivot table)

# Average fare per passenger class. The string alias 'mean' is used instead of
# numpy.mean because recent pandas versions deprecate passing NumPy callables
# as aggfunc and recommend the alias.
fare = titanic.pivot_table(index='Pclass', values='Fare', aggfunc='mean')
print(fare)
# Mean of the 0/1 Survived flag per class, i.e. the survival rate of each class.
passenager_survival = titanic.pivot_table(index='Pclass', values='Survived', aggfunc='mean')
print(passenager_survival)
# Average age per class.
passenager_age = titanic.pivot_table(index='Pclass', values='Age', aggfunc='mean')
print(passenager_age)

输出:
Fare
Pclass
1 84.154687
2 20.662183
3 13.675550
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
Age
Pclass
1 38.233441
2 29.877630
3 25.140620

(5)去掉缺失值

# Drop every column that contains a missing value (dropna works on rows by default).
drop_na_columns = titanic.dropna(axis='columns')
# Drop the rows in which Age or Sex is missing.
new_titanic = titanic.dropna(axis='index', subset=['Age', 'Sex'])
# Read from the original frame: row 5 has no Age there, so this prints nan.
row_index_5_age = titanic.loc[5, 'Age']
print(row_index_5_age)

输出:
nan
(6)排序

# Order the passengers from oldest to youngest; the original index labels are kept.
new_titanic = titanic.sort_values(by='Age', ascending=False)
print(new_titanic.head(10))
# drop=True discards the old index labels and renumbers the rows from 0.
titanic_reindex = new_titanic.reset_index(drop=True)
print('----------')
print(titanic_reindex.loc[0:10])  # label slice is inclusive: rows 0 through 10

输出:

 PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0      0      347060   7.7750   NaN        S  
493  male  71.0      0      0    PC 17609  49.5042   NaN        C  
96   male  71.0      0      0    PC 17754  34.6542    A5        C  
116  male  70.5      0      0      370369   7.7500   NaN        Q  
672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S  
745  male  70.0      1      1   WE/P 5735  71.0000   B22        S  
33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S  
54   male  65.0      0      1      113509  61.9792   B30        C  
280  male  65.0      0      0      336439   7.7500   NaN        Q  
----------
    PassengerId  Survived  Pclass                                  Name   Sex  \
0           631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1           852         0       3                   Svensson, Mr. Johan  male   
2           494         0       1               Artagaveytia, Mr. Ramon  male   
3            97         0       1             Goldschmidt, Mr. George B  male   
4           117         0       3                  Connors, Mr. Patrick  male   
5           673         0       2           Mitchell, Mr. Henry Michael  male   
6           746         0       1          Crosby, Capt. Edward Gifford  male   
7            34         0       2                 Wheadon, Mr. Edward H  male   
8            55         0       1        Ostby, Mr. Engelhart Cornelius  male   
9           281         0       3                      Duane, Mr. Frank  male   
10          457         0       1             Millet, Mr. Francis Davis  male   

     Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0   80.0      0      0       27042  30.0000   A23        S  
1   74.0      0      0      347060   7.7750   NaN        S  
2   71.0      0      0    PC 17609  49.5042   NaN        C  
3   71.0      0      0    PC 17754  34.6542    A5        C  
4   70.5      0      0      370369   7.7500   NaN        Q  
5   70.0      0      0  C.A. 24580  10.5000   NaN        S  
6   70.0      1      1   WE/P 5735  71.0000   B22        S  
7   66.0      0      0  C.A. 24579  10.5000   NaN        S  
8   65.0      0      1      113509  61.9792   B30        C  
9   65.0      0      0      336439   7.7500   NaN        Q  
10  65.0      0      0       13509  26.5500   E38        S  

Process finished with exit code 0

(7)自定义函数

A、第一百个人的数据

def hundredth_row(data):
    """Return the entry of ``data`` stored under index label 99.

    With a default 0-based index this is the hundredth entry. When passed to
    ``DataFrame.apply`` it receives one column at a time, so the combined
    result is the hundredth row of the frame.
    """
    return data.loc[99]
# Store the result under its own name: assigning it to `hundredth_row` would
# overwrite the function, so it could not be called again afterwards.
hundredth_row_values = titanic.apply(hundredth_row)
print(hundredth_row_values)

输出:
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object

B、统计各特征的缺失值个数

def not_null_count(data):
    """Return how many entries of ``data`` are missing (null/NaN).

    NOTE: despite its name, this counts the null entries, not the non-null ones.
    """
    missing_mask = pandas.isnull(data)
    return len(data[missing_mask])
# apply() hands each column to not_null_count in turn, giving one
# missing-value count per column.
data_null_count = titanic.apply(not_null_count)
print(data_null_count)

PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64

C、把数值型变量改成定性(分类)变量,并由此做透视表

def age_to_lei(data):
    """Classify one passenger record into an age category.

    Args:
        data: a row-like object that supports ``data['Age']``.

    Returns:
        'unknow' when the age is missing, 'adult' when it is greater than 18,
        and 'child' otherwise (so an age of exactly 18 counts as 'child').
    """
    age = data['Age']
    # isnull already returns a boolean, so no comparison against True is needed.
    if pandas.isnull(age):
        return 'unknow'
    if age > 18:
        return 'adult'
    return 'child'
# axis=1 passes each row to age_to_lei, producing one label per passenger.
new_age = titanic.apply(age_to_lei, axis=1)
print(new_age)
# Attach the labels as a new column so they can be used as a pivot key.
titanic['new_age'] = new_age
# Survival rate per age category (pivot_table aggregates with the mean by default).
newage_survived = titanic.pivot_table(index='new_age', values='Survived')
print(newage_survived)

输出:
0 adult
1 adult
2 adult
3 adult
4 adult
......
885 adult
886 adult
887 adult
888 unknow
889 adult
890 adult
Length: 891, dtype: object

new_age Survived
adult 0.382609
child 0.503597
unknow 0.293785

5、Series

import pandas as pd
import numpy as np

T10yr = pd.read_csv('T10yr.csv')
print(T10yr.columns)

# Selecting a single column of a DataFrame yields a Series.
serise_Date = T10yr['Date']
print(serise_Date.head(5))
print(type(serise_Date))

serise_High = T10yr['High']
print(serise_High.head(5))

输出:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'], dtype='object')
0 2000-01-03
1 2000-01-04
2 2000-01-05
3 2000-01-06
4 2000-01-07
Name: Date, dtype: object
<class 'pandas.core.series.Series'>
0 6.603
1 6.548
2 6.599
3 6.585
4 6.595
Name: High, dtype: float64


对两个series进行处理,一个作为索引,一个作为值

from pandas import Series

# .values exposes the underlying NumPy array of each Series.
serise_Date_value = serise_Date.values
print(type(serise_Date_value))
serise_High_value = serise_High.values

# Build a new Series whose values are the highs and whose index is the dates.
serise_High_Date = Series(serise_High_value, index=serise_Date_value)
print(serise_High_Date.loc[['2000-01-03', '2000-01-05']])  # pick individual labels
print(serise_High_Date.loc['2000-01-03':'2000-01-07'])  # label slice, end inclusive
print('-------------------')
print(serise_High_Date.iloc[0:5])  # positional slice: the first five entries

输出:
<class 'numpy.ndarray'>
2000-01-03 6.603
2000-01-05 6.599
dtype: float64
2000-01-03 6.603
2000-01-04 6.548
2000-01-05 6.599
2000-01-06 6.585
2000-01-07 6.595
dtype: float64


2000-01-03 6.603
2000-01-04 6.548
2000-01-05 6.599
2000-01-06 6.585
2000-01-07 6.595
dtype: float64


使用 .apply 可以对 DataFrame 的每一行(axis=1)或每一列(axis=0)调用函数

high_low = T10yr[['Low', 'High']]
# The function has to be *called* on each row: `lambda x: np.std` would only
# hand back the np.std function object for every row. The result is also
# stored, since apply() returns a new object.
row_std = high_low.apply(lambda x: np.std(x), axis=1)
# apply() leaves high_low itself untouched, so this prints the two original columns.
print(high_low)

输出:
Low High
0 6.498 6.603
1 6.485 6.548
2 6.508 6.599
3 6.540 6.585
......
4163 1.549 1.587
4164 1.511 1.570
4165 1.493 1.535
4166 1.458 1.530

[4167 rows x 2 columns]

相关文章

网友评论

      本文标题:二、pandas

      本文链接:https://www.haomeiwen.com/subject/qakkaftx.html