Pandas-01:CSV

作者: 罗泽坤 | 来源:发表于2020-03-28 00:10 被阅读0次

Pandas文件IO Tools

读CSV文件

import pandas as pd
classes = pd.read_csv('imagenet_class.csv',sep=',',header=1)
print(classes)
print(classes.keys())
print(classes[' 1'])
       1                         goldfish Carassius auratus
0      2   great white shark white shark man-eater man-e...
1      3                     tiger shark Galeocerdo cuvieri
2      4                        hammerhead hammerhead shark
3      5            electric ray crampfish numbfish torpedo
4      6                                           stingray
..   ...                                                ...
993  995                                          earthstar
994  996   hen-of-the-woods hen of the woods Polyporus f...
995  997                                             bolete
996  998                                ear spike capitulum
997  999        toilet tissue toilet paper bathroom tissue}

[998 rows x 2 columns]
Index([' 1', ' goldfish Carassius auratus'], dtype='object')
0        2
1        3
2        4
3        5
4        6
      ... 
993    995
994    996
995    997
996    998
997    999
Name:  1, Length: 998, dtype: int64

csv文件读取常用参数

from io import StringIO  #这里因为为了偷懒,不从本地读取文件,直接从内存创建读取文件
data = ('col1,col2,col3\n'
            'a,b,1\n'
            'a,b,2\n'
            'c,d,3')
a = pd.read_csv(StringIO(data))
print(a)  #没有设置其他参数,默认列索引为文件第一行
print(a.keys())  #查看列索引值
  col1 col2  col3
0    a    b     1
1    a    b     2
2    c    d     3
Index(['col1', 'col2', 'col3'], dtype='object')
#usecols查看指定列值,使用lambda表达式
print(pd.read_csv(StringIO(data),usecols = lambda x:x.upper() in ['COL1','COL3']))
#与下面的式等价
print(pd.read_csv(StringIO(data),usecols = ['col1','col3']))
  col1  col3
0    a     1
1    a     2
2    c     3
  col1  col3
0    a     1
1    a     2
2    c     3

使用skiprows去掉指定行之后读文件

skiprows=[0,1,2] 需要跳过的行号,从文件头0开始,skip_footer从文件尾开始

#从原始文件中去掉指定行之后读取文件
data = ('col1,col2,col3\n'
            'a,b,1\n'
            'a,b,2\n'
            'c,d,3')
print(data)
print(pd.read_csv(StringIO(data),skiprows=lambda x: x % 2 != 0))
print(pd.read_csv(StringIO(data),skiprows=[1,2]))#划掉二三行 用列表的形式索引从0开始
print(pd.read_csv(StringIO(data),skiprows=[0,1])) #划掉一二行
print(pd.read_csv(StringIO(data),skiprows=0))  #相当于没划掉行
print(pd.read_csv(StringIO(data),skiprows=1))  #划掉第一行
print(pd.read_csv(StringIO(data),skiprows=2))  #划掉一二行
print(pd.read_csv(StringIO(data),skiprows=3))  #划掉一二三行
print(pd.read_csv(StringIO(data),skiprows=[3]))  #划掉第四行

col1,col2,col3
a,b,1
a,b,2
c,d,3
  col1 col2  col3
0    a    b     2
  col1 col2  col3
0    c    d     3
   a  b  2
0  c  d  3
  col1 col2  col3
0    a    b     1
1    a    b     2
2    c    d     3
   a  b  1
0  a  b  2
1  c  d  3
   a  b  2
0  c  d  3
Empty DataFrame
Columns: [c, d, 3]
Index: []
  col1 col2  col3
0    a    b     1
1    a    b     2

dtype设置各字段的数据类型

data = ('a,b,c,d\n'
   ....:         '1,2,3,4\n'
   ....:         '5,6,7,8\n'
   ....:         '9,10,11')
print(data)
a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
import numpy as np
df = pd.read_csv(StringIO(data),dtype={'b': object, 'c': np.float64, 'd': 'Int64'})
print(df.dtypes)
a      int64
b     object
c    float64
d      Int64
dtype: object

converters对指定行进行转换

print(pd.read_csv(StringIO(data)))
a = pd.read_csv(StringIO(data),converters={'a':lambda x:int(x)*int(x)})
print(a)
#下面这种方式出错TypeError: can't multiply sequence by non-int of type 'str'
print(pd.read_csv(StringIO(data),converters={'a':lambda x:x*x}))
   a   b   c    d
0  1   2   3  4.0
1  5   6   7  8.0
2  9  10  11  NaN
    a   b   c    d
0   1   2   3  4.0
1  25   6   7  8.0
2  81  10  11  NaN



---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-10-ea546e819413> in <module>
      3 print(a)
      4 #下面这种方式出错TypeError: can't multiply sequence by non-int of type 'str'
----> 5 print(pd.read_csv(StringIO(data),converters={'a':lambda x:x*x}))


D:\Anaconda\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684 
--> 685         return _read(filepath_or_buffer, kwds)
    686 
    687     parser_f.__name__ = name


D:\Anaconda\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    461 
    462     try:
--> 463         data = parser.read(nrows)
    464     finally:
    465         parser.close()


D:\Anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
   1152     def read(self, nrows=None):
   1153         nrows = _validate_integer("nrows", nrows)
-> 1154         ret = self._engine.read(nrows)
   1155 
   1156         # May alter columns / col_dict


D:\Anaconda\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
   2057     def read(self, nrows=None):
   2058         try:
-> 2059             data = self._reader.read(nrows)
   2060         except StopIteration:
   2061             if self._first_chunk:


pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read()


pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory()


pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows()


pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data()


pandas\_libs\parsers.pyx in pandas._libs.parsers._apply_converter()


<ipython-input-10-ea546e819413> in <lambda>(x)
      3 print(a)
      4 #下面这种方式出错TypeError: can't multiply sequence by non-int of type 'str'
----> 5 print(pd.read_csv(StringIO(data),converters={'a':lambda x:x*x}))


TypeError: can't multiply sequence by non-int of type 'str'

利用names参数修改表头名称,header参数指定表头且去掉表头前的行,默认值为0即第一行为表头

print(data)
print(pd.read_csv(StringIO(data)))
print(pd.read_csv(StringIO(data),names = ['c1','c2','c3','c4']))#无表头
# 本来是C1,C2,C3,C4排列应为字典是无顺序的所以下面names字典无序排列
print(pd.read_csv(StringIO(data),names = {'c1','c2','c3','c4'},header=None)) #无表头
print(pd.read_csv(StringIO(data),names = ('c1','c2','c3','c4'),header=0))
a,b,c,d
1,2,3,4
5,6,7,8
9,10,11
   a   b   c    d
0  1   2   3  4.0
1  5   6   7  8.0
2  9  10  11  NaN
  c1  c2  c3   c4
0  a   b   c    d
1  1   2   3    4
2  5   6   7    8
3  9  10  11  NaN
  c3  c2  c1   c4
0  a   b   c    d
1  1   2   3    4
2  5   6   7    8
3  9  10  11  NaN
   c1  c2  c3   c4
0   1   2   3  4.0
1   5   6   7  8.0
2   9  10  11  NaN

comment参数去掉每行特殊字符后面的字符串之后读取文件

import pandas as pd
data = (    '\n'
            'a,b,c\n'
            '  \n'
            '# commented line\n'
            '1,2,3\n'
            '\n'
            '4,5,6')
    

print(data)

print(pd.read_csv(StringIO(data), comment='#'))

a,b,c
  
# commented line
1,2,3

4,5,6
   a  b  c
0  1  2  3
1  4  5  6
#skip_blank_lines去掉空行之后读文件 False的时候不忽视空行
data = (    'a,b,c\n'
            '\n'
            '1,2,3\n'
            '\n'
            '\n'
            '4,5,6')
    

print(pd.read_csv(StringIO(data), skip_blank_lines=False))

     a    b    c
0  NaN  NaN  NaN
1  1.0  2.0  3.0
2  NaN  NaN  NaN
3  NaN  NaN  NaN
4  4.0  5.0  6.0
data = ('#comment\n'
            'a,b,c\n'
            'A,B,C\n'
            '1,2,3')
    
print(data)
print()
print(pd.read_csv(StringIO(data), comment='#', header=1)) #先划掉行在读表

print()
data = ('A,B,C\n'
            '#comment\n'
            'a,b,c\n'
            '1,2,3')
    
print(data)
print()
print(pd.read_csv(StringIO(data), comment='#', skiprows=0)) 
print(pd.read_csv(StringIO(data), comment='#', skiprows=1))
print(pd.read_csv(StringIO(data), comment='#', skiprows=2))
print(pd.read_csv(StringIO(data), skiprows=3))
#comment
a,b,c
A,B,C
1,2,3

   A  B  C
0  1  2  3

A,B,C
#comment
a,b,c
1,2,3

   A  B  C
0  a  b  c
1  1  2  3
   a  b  c
0  1  2  3
   a  b  c
0  1  2  3
Empty DataFrame
Columns: [1, 2, 3]
Index: []
|-WARNING:当有既有参数header又有参数comment或者skiprows的时候header的执行在后两个之后
    而且header会忽视掉空行,comment划和skiprows忽视掉的行都为空行,并且在shiprows的时候是不会忽视空行的
    即当comment之后的行skiprows处理的时候不会忽视

If a file has one more column of data than the number of column names, the first column will be used as the DataFrame’s row names:


In [85]: data = ('a,b,c\n'
   ....:         '4,apple,bat,5.7\n'
   ....:         '8,orange,cow,10')
   ....: 

print(pd.read_csv(StringIO(data)))
        a    b     c
4   apple  bat   5.7
8  orange  cow  10.0
# index_col设置某column为行names
In [87]: data = ('index,a,b,c\n'
   ....:         '4,apple,bat,5.7\n'
   ....:         '8,orange,cow,10')
   ....: 

#index行有缺失
pd.read_csv(StringIO(data), index_col=0) 
data = ('a,b,c\n'
   ....:         '4,apple,bat,a\n'
   ....:         '8,orange,cow')
   ....: 

print(data)
print(pd.read_csv(StringIO(data)))
print(pd.read_csv(StringIO(data), index_col=False)) #这时候会自动对齐多余的列会丢掉
a,b,c
4,apple,bat,a
8,orange,cow
        a    b    c
4   apple  bat    a
8  orange  cow  NaN
   a       b    c
0  4   apple  bat
1  8  orange  cow
In [93]: data = ('a,b,c\n'
   ....:         '4,apple,bat,\n'
   ....:         '8,orange,cow,')
   ....: 
print(data)
#下面两式等价
print(pd.read_csv(StringIO(data), usecols=['b', 'c']))
print(pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0))
a,b,c
4,apple,bat,
8,orange,cow,
     b   c
4  bat NaN
8  cow NaN
     b   c
4  bat NaN
8  cow NaN

sep = ','为分支界定符号

In [128]: print(open('tmp.csv').read())
ID|level|category
Patient1|123,000|x
Patient2|23,000|y
Patient3|1,234,018|z

In [129]: df = pd.read_csv('tmp.csv', sep='|')

In [130]: df
Out[130]: 
         ID      level category
0  Patient1    123,000        x
1  Patient2     23,000        y
2  Patient3  1,234,018        z

In [131]: df.level.dtype
Out[131]: dtype('O')

In [132]: print(open('tmp.csv').read())
ID|level|category
Patient1|123,000|x
Patient2|23,000|y
Patient3|1,234,018|z

In [133]: df = pd.read_csv('tmp.csv', sep='|', thousands=',') #thousands去掉千分位符号

In [134]: df
Out[134]: 
         ID    level category
0  Patient1   123000        x
1  Patient2    23000        y
2  Patient3  1234018        z

In [135]: df.level.dtype
Out[135]: dtype('int64')

多重索引

In [174]: print(open('data/mindex_ex.csv').read())
year,indiv,zit,xit
1977,"A",1.2,.6
1977,"B",1.5,.5
1977,"C",1.7,.8
1978,"A",.2,.06
1978,"B",.7,.2
1978,"C",.8,.3
1978,"D",.9,.5
1978,"E",1.4,.9
1979,"C",.2,.15
1979,"D",.14,.05
1979,"E",.5,.15
1979,"F",1.2,.5
1979,"G",3.4,1.9
1979,"H",5.4,2.7
1979,"I",6.4,1.2

In [175]: df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1]) #设置第一列和第二列为索引
#多重形式会自动合并重复索引
In [176]: df
Out[176]: 
             zit   xit
year indiv            
1977 A      1.20  0.60
     B      1.50  0.50
     C      1.70  0.80
1978 A      0.20  0.06
     B      0.70  0.20
     C      0.80  0.30
     D      0.90  0.50
     E      1.40  0.90
1979 C      0.20  0.15
     D      0.14  0.05
     E      0.50  0.15
     F      1.20  0.50
     G      3.40  1.90
     H      5.40  2.70
     I      6.40  1.20

In [177]: df.loc[1978]
Out[177]: 
       zit   xit
indiv           
A      0.2  0.06
B      0.7  0.20
C      0.8  0.30
D      0.9  0.50
E      1.4  0.90

chunksize返回一个文件迭代器对象一般用于数据量过多难以显示的时候

In [190]: reader = pd.read_csv('tmp.sv', sep='|', chunksize=4)

In [191]: reader
Out[191]: <pandas.io.parsers.TextFileReader at 0x7f3d18adb350>

In [192]: for chunk in reader:
   .....:     print(chunk)
   .....: 
   Unnamed: 0         0         1         2         3
0           0  0.469112 -0.282863 -1.509059 -1.135632
1           1  1.212112 -0.173215  0.119209 -1.044236
2           2 -0.861849 -2.104569 -0.494929  1.071804
3           3  0.721555 -0.706771 -1.039575  0.271860
   Unnamed: 0         0         1         2         3
4           4 -0.424972  0.567020  0.276232 -1.087401
5           5 -0.673690  0.113648 -1.478427  0.524988
6           6  0.404705  0.577046 -1.715002 -1.039268
7           7 -0.370647 -1.157892 -1.344312  0.844885
   Unnamed: 0         0        1         2         3
8           8  1.075770 -0.10905  1.643563 -1.469388
9           9  0.357021 -0.67460 -1.776904 -0.968914
Specifying iterator=True will also return the TextFileReader object:

In [193]: reader = pd.read_csv('tmp.sv', sep='|', iterator=True)

In [194]: reader.get_chunk(5)
Out[194]: 
   Unnamed: 0         0         1         2         3
0           0  0.469112 -0.282863 -1.509059 -1.135632
1           1  1.212112 -0.173215  0.119209 -1.044236
2           2 -0.861849 -2.104569 -0.494929  1.071804
3           3  0.721555 -0.706771 -1.039575  0.271860
4           4 -0.424972  0.567020  0.276232 -1.087401


MyGitHub

https://github.com/luozekun1230/MyPyhonProgram/tree/master/Pandas

相关文章

网友评论

    本文标题:Pandas-01:CSV

    本文链接:https://www.haomeiwen.com/subject/hdpduhtx.html