#!/usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd
file_name = '文件名'
file_path = r'文件路径%s.csv' % (file_name,)  # the file extension may vary

# Placeholder column names used by this template script.
KEY_COLUMN = '主键字段'    # column whose non-null values are counted
GROUP_COLUMN = '分组字段'  # column whose distinct values define the output files
ROW_LIMIT = 100 * 10000   # maximum number of rows per chunk file


def group_counts(frame, key_column=KEY_COLUMN, group_column=GROUP_COLUMN):
    """Return a one-column DataFrame of non-null ``key_column`` counts per group.

    Every group is reported. The counts are deliberately not de-duplicated:
    dropping duplicate count values would hide any group whose size happens
    to equal that of an earlier group.
    """
    return pd.DataFrame(frame[key_column].groupby(frame[group_column]).count())


def split_by_group(frame, prefix, key_column=KEY_COLUMN,
                   group_column=GROUP_COLUMN, row_limit=ROW_LIMIT):
    """Write one CSV per distinct ``group_column`` value and return the paths.

    Each group is written to ``<prefix><group>.csv``. A group holding more
    than ``row_limit`` rows is additionally written as consecutive chunks of
    at most ``row_limit`` rows, named ``<prefix><group>_<n>.csv``.

    The chunks are sliced from the in-memory group rather than re-read from
    the file just written, so they have exactly the same columns and dtypes
    as the group file. The ``_`` separator keeps chunk names from colliding
    with other groups (group ``1`` chunk ``0`` versus group ``10``).

    Raises:
        ValueError: if ``row_limit`` is smaller than 1.
    """
    if row_limit < 1:
        raise ValueError('row_limit must be >= 1, got %r' % (row_limit,))

    written = []
    for group_key, group in frame.groupby(group_column):
        group_file = '%s%s.csv' % (prefix, group_key)
        group.to_csv(group_file, mode='w', header=True)
        written.append(group_file)
        print('%s:' % (group_key,), group[key_column].count())

        # Decide on the row count, not the non-null key count, so that a
        # group with missing keys is still split when it is too large.
        total_rows = len(group)
        if total_rows > row_limit:
            for part, start in enumerate(range(0, total_rows, row_limit)):
                part_file = '%s%s_%s.csv' % (prefix, group_key, part)
                group.iloc[start:start + row_limit].to_csv(
                    part_file, mode='w', header=True)
                written.append(part_file)
    return written


def main():
    """Read the input CSV, print per-group counts, then split it by group."""
    file_read = pd.read_csv(
        file_path, sep=',', encoding='utf8', low_memory=False,
    ).sort_values(GROUP_COLUMN)

    print('Reading & GroupCount')
    print(group_counts(file_read))
    print('--------------------------------')
    print('Total:', file_read[KEY_COLUMN].count())
    print('')
    print('================================')
    print('')
    print('Splitting')
    print('--------------------------------')
    # Output files are created in the working directory, named after file_name.
    split_by_group(file_read, file_name)
    print('--------------------------------')
    print('Complete')


if __name__ == '__main__':
    main()
# User comments (web-page footer text captured along with the script; not part of the program)