Pandas文件IO Tools
读CSV文件
import pandas as pd
classes = pd.read_csv('imagenet_class.csv',sep=',',header=1)
print(classes)
print(classes.keys())
print(classes[' 1'])
1 goldfish Carassius auratus
0 2 great white shark white shark man-eater man-e...
1 3 tiger shark Galeocerdo cuvieri
2 4 hammerhead hammerhead shark
3 5 electric ray crampfish numbfish torpedo
4 6 stingray
.. ... ...
993 995 earthstar
994 996 hen-of-the-woods hen of the woods Polyporus f...
995 997 bolete
996 998 ear spike capitulum
997 999 toilet tissue toilet paper bathroom tissue}
[998 rows x 2 columns]
Index([' 1', ' goldfish Carassius auratus'], dtype='object')
0 2
1 3
2 4
3 5
4 6
...
993 995
994 996
995 997
996 998
997 999
Name: 1, Length: 998, dtype: int64
csv文件读取常用参数
from io import StringIO #这里因为为了偷懒,不从本地读取文件,直接从内存创建读取文件
data = ('col1,col2,col3\n'
'a,b,1\n'
'a,b,2\n'
'c,d,3')
a = pd.read_csv(StringIO(data))
print(a) #没有设置其他参数,默认列索引为文件第一行
print(a.keys()) #查看列索引值
col1 col2 col3
0 a b 1
1 a b 2
2 c d 3
Index(['col1', 'col2', 'col3'], dtype='object')
#usecols查看指定列值,使用lambda表达式
print(pd.read_csv(StringIO(data),usecols = lambda x:x.upper() in ['COL1','COL3']))
#与下面的式等价
print(pd.read_csv(StringIO(data),usecols = ['col1','col3']))
col1 col3
0 a 1
1 a 2
2 c 3
col1 col3
0 a 1
1 a 2
2 c 3
使用skiprows去掉指定行之后读文件
skiprows=[0,1,2] 需要跳过的行号,从文件头0开始,skip_footer从文件尾开始
#从原始文件中去掉指定行之后读取文件
data = ('col1,col2,col3\n'
'a,b,1\n'
'a,b,2\n'
'c,d,3')
print(data)
print(pd.read_csv(StringIO(data),skiprows=lambda x: x % 2 != 0))
print(pd.read_csv(StringIO(data),skiprows=[1,2]))#划掉二三行 用列表的形式索引从0开始
print(pd.read_csv(StringIO(data),skiprows=[0,1])) #划掉一二行
print(pd.read_csv(StringIO(data),skiprows=0)) #相当于没划掉行
print(pd.read_csv(StringIO(data),skiprows=1)) #划掉第一行
print(pd.read_csv(StringIO(data),skiprows=2)) #划掉一二行
print(pd.read_csv(StringIO(data),skiprows=3)) #划掉一二三行
print(pd.read_csv(StringIO(data),skiprows=[3])) #划掉第四行
col1,col2,col3
a,b,1
a,b,2
c,d,3
col1 col2 col3
0 a b 2
col1 col2 col3
0 c d 3
a b 2
0 c d 3
col1 col2 col3
0 a b 1
1 a b 2
2 c d 3
a b 1
0 a b 2
1 c d 3
a b 2
0 c d 3
Empty DataFrame
Columns: [c, d, 3]
Index: []
col1 col2 col3
0 a b 1
1 a b 2
dtype设置各字段的数据类型
data = ('a,b,c,d\n'
....: '1,2,3,4\n'
....: '5,6,7,8\n'
....: '9,10,11')
print(data)
a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
import numpy as np
df = pd.read_csv(StringIO(data),dtype={'b': object, 'c': np.float64, 'd': 'Int64'})
print(df.dtypes)
a int64
b object
c float64
d Int64
dtype: object
converters对指定行进行转换
print(pd.read_csv(StringIO(data)))
a = pd.read_csv(StringIO(data),converters={'a':lambda x:int(x)*int(x)})
print(a)
#下面这种方式出错TypeError: can't multiply sequence by non-int of type 'str'
print(pd.read_csv(StringIO(data),converters={'a':lambda x:x*x}))
a b c d
0 1 2 3 4.0
1 5 6 7 8.0
2 9 10 11 NaN
a b c d
0 1 2 3 4.0
1 25 6 7 8.0
2 81 10 11 NaN
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-10-ea546e819413> in <module>
3 print(a)
4 #下面这种方式出错TypeError: can't multiply sequence by non-int of type 'str'
----> 5 print(pd.read_csv(StringIO(data),converters={'a':lambda x:x*x}))
D:\Anaconda\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
683 )
684
--> 685 return _read(filepath_or_buffer, kwds)
686
687 parser_f.__name__ = name
D:\Anaconda\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
461
462 try:
--> 463 data = parser.read(nrows)
464 finally:
465 parser.close()
D:\Anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1152 def read(self, nrows=None):
1153 nrows = _validate_integer("nrows", nrows)
-> 1154 ret = self._engine.read(nrows)
1155
1156 # May alter columns / col_dict
D:\Anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
2057 def read(self, nrows=None):
2058 try:
-> 2059 data = self._reader.read(nrows)
2060 except StopIteration:
2061 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data()
pandas\_libs\parsers.pyx in pandas._libs.parsers._apply_converter()
<ipython-input-10-ea546e819413> in <lambda>(x)
3 print(a)
4 #下面这种方式出错TypeError: can't multiply sequence by non-int of type 'str'
----> 5 print(pd.read_csv(StringIO(data),converters={'a':lambda x:x*x}))
TypeError: can't multiply sequence by non-int of type 'str'
利用names参数修改表头名称,header参数指定表头且去掉表头前的行,默认值为0即第一行为表头
print(data)
print(pd.read_csv(StringIO(data)))
print(pd.read_csv(StringIO(data),names = ['c1','c2','c3','c4']))#无表头
# 本来是C1,C2,C3,C4排列应为字典是无顺序的所以下面names字典无序排列
print(pd.read_csv(StringIO(data),names = {'c1','c2','c3','c4'},header=None)) #无表头
print(pd.read_csv(StringIO(data),names = ('c1','c2','c3','c4'),header=0))
a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
a b c d
0 1 2 3 4.0
1 5 6 7 8.0
2 9 10 11 NaN
c1 c2 c3 c4
0 a b c d
1 1 2 3 4
2 5 6 7 8
3 9 10 11 NaN
c3 c2 c1 c4
0 a b c d
1 1 2 3 4
2 5 6 7 8
3 9 10 11 NaN
c1 c2 c3 c4
0 1 2 3 4.0
1 5 6 7 8.0
2 9 10 11 NaN
comment参数去掉每行特殊字符后面的字符串之后读取文件
import pandas as pd
data = ( '\n'
'a,b,c\n'
' \n'
'# commented line\n'
'1,2,3\n'
'\n'
'4,5,6')
print(data)
print(pd.read_csv(StringIO(data), comment='#'))
a,b,c
# commented line
1,2,3
4,5,6
a b c
0 1 2 3
1 4 5 6
#skip_blank_lines去掉空行之后读文件 False的时候不忽视空行
data = ( 'a,b,c\n'
'\n'
'1,2,3\n'
'\n'
'\n'
'4,5,6')
print(pd.read_csv(StringIO(data), skip_blank_lines=False))
a b c
0 NaN NaN NaN
1 1.0 2.0 3.0
2 NaN NaN NaN
3 NaN NaN NaN
4 4.0 5.0 6.0
data = ('#comment\n'
'a,b,c\n'
'A,B,C\n'
'1,2,3')
print(data)
print()
print(pd.read_csv(StringIO(data), comment='#', header=1)) #先划掉行在读表
print()
data = ('A,B,C\n'
'#comment\n'
'a,b,c\n'
'1,2,3')
print(data)
print()
print(pd.read_csv(StringIO(data), comment='#', skiprows=0))
print(pd.read_csv(StringIO(data), comment='#', skiprows=1))
print(pd.read_csv(StringIO(data), comment='#', skiprows=2))
print(pd.read_csv(StringIO(data), skiprows=3))
#comment
a,b,c
A,B,C
1,2,3
A B C
0 1 2 3
A,B,C
#comment
a,b,c
1,2,3
A B C
0 a b c
1 1 2 3
a b c
0 1 2 3
a b c
0 1 2 3
Empty DataFrame
Columns: [1, 2, 3]
Index: []
|-WARNING:当有既有参数header又有参数comment或者skiprows的时候header的执行在后两个之后
而且header会忽视掉空行,comment划和skiprows忽视掉的行都为空行,并且在shiprows的时候是不会忽视空行的
即当comment之后的行skiprows处理的时候不会忽视
If a file has one more column of data than the number of column names, the first column will be used as the DataFrame’s row names:
In [85]: data = ('a,b,c\n'
....: '4,apple,bat,5.7\n'
....: '8,orange,cow,10')
....:
print(pd.read_csv(StringIO(data)))
a b c
4 apple bat 5.7
8 orange cow 10.0
# index_col设置某column为行names
In [87]: data = ('index,a,b,c\n'
....: '4,apple,bat,5.7\n'
....: '8,orange,cow,10')
....:
#index行有缺失
pd.read_csv(StringIO(data), index_col=0)
data = ('a,b,c\n'
....: '4,apple,bat,a\n'
....: '8,orange,cow')
....:
print(data)
print(pd.read_csv(StringIO(data)))
print(pd.read_csv(StringIO(data), index_col=False)) #这时候会自动对齐多余的列会丢掉
a,b,c
4,apple,bat,a
8,orange,cow
a b c
4 apple bat a
8 orange cow NaN
a b c
0 4 apple bat
1 8 orange cow
In [93]: data = ('a,b,c\n'
....: '4,apple,bat,\n'
....: '8,orange,cow,')
....:
print(data)
#下面两式等价
print(pd.read_csv(StringIO(data), usecols=['b', 'c']))
print(pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0))
a,b,c
4,apple,bat,
8,orange,cow,
b c
4 bat NaN
8 cow NaN
b c
4 bat NaN
8 cow NaN
sep = ','为分支界定符号
In [128]: print(open('tmp.csv').read())
ID|level|category
Patient1|123,000|x
Patient2|23,000|y
Patient3|1,234,018|z
In [129]: df = pd.read_csv('tmp.csv', sep='|')
In [130]: df
Out[130]:
ID level category
0 Patient1 123,000 x
1 Patient2 23,000 y
2 Patient3 1,234,018 z
In [131]: df.level.dtype
Out[131]: dtype('O')
In [132]: print(open('tmp.csv').read())
ID|level|category
Patient1|123,000|x
Patient2|23,000|y
Patient3|1,234,018|z
In [133]: df = pd.read_csv('tmp.csv', sep='|', thousands=',') #thousands去掉千分位符号
In [134]: df
Out[134]:
ID level category
0 Patient1 123000 x
1 Patient2 23000 y
2 Patient3 1234018 z
In [135]: df.level.dtype
Out[135]: dtype('int64')
多重索引
In [174]: print(open('data/mindex_ex.csv').read())
year,indiv,zit,xit
1977,"A",1.2,.6
1977,"B",1.5,.5
1977,"C",1.7,.8
1978,"A",.2,.06
1978,"B",.7,.2
1978,"C",.8,.3
1978,"D",.9,.5
1978,"E",1.4,.9
1979,"C",.2,.15
1979,"D",.14,.05
1979,"E",.5,.15
1979,"F",1.2,.5
1979,"G",3.4,1.9
1979,"H",5.4,2.7
1979,"I",6.4,1.2
In [175]: df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1]) #设置第一列和第二列为索引
#多重形式会自动合并重复索引
In [176]: df
Out[176]:
zit xit
year indiv
1977 A 1.20 0.60
B 1.50 0.50
C 1.70 0.80
1978 A 0.20 0.06
B 0.70 0.20
C 0.80 0.30
D 0.90 0.50
E 1.40 0.90
1979 C 0.20 0.15
D 0.14 0.05
E 0.50 0.15
F 1.20 0.50
G 3.40 1.90
H 5.40 2.70
I 6.40 1.20
In [177]: df.loc[1978]
Out[177]:
zit xit
indiv
A 0.2 0.06
B 0.7 0.20
C 0.8 0.30
D 0.9 0.50
E 1.4 0.90
chunksize返回一个文件迭代器对象一般用于数据量过多难以显示的时候
In [190]: reader = pd.read_csv('tmp.sv', sep='|', chunksize=4)
In [191]: reader
Out[191]: <pandas.io.parsers.TextFileReader at 0x7f3d18adb350>
In [192]: for chunk in reader:
.....: print(chunk)
.....:
Unnamed: 0 0 1 2 3
0 0 0.469112 -0.282863 -1.509059 -1.135632
1 1 1.212112 -0.173215 0.119209 -1.044236
2 2 -0.861849 -2.104569 -0.494929 1.071804
3 3 0.721555 -0.706771 -1.039575 0.271860
Unnamed: 0 0 1 2 3
4 4 -0.424972 0.567020 0.276232 -1.087401
5 5 -0.673690 0.113648 -1.478427 0.524988
6 6 0.404705 0.577046 -1.715002 -1.039268
7 7 -0.370647 -1.157892 -1.344312 0.844885
Unnamed: 0 0 1 2 3
8 8 1.075770 -0.10905 1.643563 -1.469388
9 9 0.357021 -0.67460 -1.776904 -0.968914
Specifying iterator=True will also return the TextFileReader object:
In [193]: reader = pd.read_csv('tmp.sv', sep='|', iterator=True)
In [194]: reader.get_chunk(5)
Out[194]:
Unnamed: 0 0 1 2 3
0 0 0.469112 -0.282863 -1.509059 -1.135632
1 1 1.212112 -0.173215 0.119209 -1.044236
2 2 -0.861849 -2.104569 -0.494929 1.071804
3 3 0.721555 -0.706771 -1.039575 0.271860
4 4 -0.424972 0.567020 0.276232 -1.087401
MyGitHub
https://github.com/luozekun1230/MyPyhonProgram/tree/master/Pandas
网友评论