1.替换
思路:
- 先去掉每行中的所有空白字符和空字符(\x00)
- 把双引号【"】替换为横线【-】
- 把双竖线【||】替换为单个双引号【"】
image.png
#替换
import re
from tqdm import tqdm
#清洗数据
def file_sub(old_file, new_file):
    """Clean a GB18030 text file and write the cleaned lines to ``new_file``.

    Every input line is transformed in this order:

    1. all whitespace characters (including the line terminator) and all
       NUL characters are removed;
    2. every double quote ``"`` becomes a hyphen ``-``;
    3. every double pipe ``||`` becomes a single double quote ``"``.

    Step 2 must run before step 3 so that quotes already present in the data
    are not mixed up with the quotes produced from ``||``.

    The whole input is read before the output file is opened, so ``old_file``
    and ``new_file`` may name the same path.

    Args:
        old_file: Path of the GB18030-encoded file to read.
        new_file: Path of the GB18030-encoded file to write; each cleaned
            line is written followed by a newline.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence
    # (a warning on current Python). One character class removes whitespace
    # and NUL in a single pass instead of two nested re.sub calls.
    junk = re.compile(r'[\s\x00]')

    with open(old_file, "r", encoding="GB18030") as f:
        print('开始替换...')
        rows = f.readlines()

    # The remaining substitutions are plain literals, so str.replace is enough.
    file_data = [
        junk.sub('', line).replace('"', '-').replace('||', '"')
        for line in tqdm(rows)
    ]

    with open(new_file, "w", encoding="GB18030") as f:  # write the cleaned text
        print('写入替换文本...')
        f.writelines(line + '\n' for line in tqdm(file_data))
    print('批量替换完成')
def main(old_file='work.txt', new_file='work_new.csv'):
    """Run the replacement step on one file.

    Args:
        old_file: Path of the raw input file passed to ``file_sub``.
            Defaults to ``'work.txt'``.
        new_file: Path of the cleaned output file passed to ``file_sub``.
            Defaults to ``'work_new.csv'``.
    """
    file_sub(old_file, new_file)
2.查看是否有串行
image.png
读取无报错
#2.发现串行
#发现错误行
import pandas as pd
file = 'work_new.csv'

# Load the whole cleaned file with every column kept as text and no header
# row. NOTE(review): a row with more fields than expected should make the
# parser raise here, which is how shifted rows are detected - confirm.
dat = pd.read_csv(
    file,
    sep=',',
    header=None,
    dtype=str,
    encoding="GB18030",
    low_memory=False,
)

# No malformed rows: export pipe-delimited with every field quoted
# (quoting=1 is csv.QUOTE_ALL).
dat.to_csv(
    'work_clean.csv',
    sep='|',
    index=False,
    quoting=1,
    encoding='GB18030',
)

# Inspect a suspect row: skip the first 2704912 lines and read only the next one.
dat = pd.read_csv(
    file,
    sep=',',
    header=None,
    dtype=str,
    encoding="GB18030",
    skiprows=2704912,
    nrows=1,
    low_memory=False,
)
网友评论