问题描述:文件夹下有75个txt文件,每个文件并不是用UTF-8或者GBK编码。
解决方案,首先读入这个目录下的所有文件
import pandas as pd, numpy as np
import os
import re
import sys
#首先读取各个txt文本并清洗,清洗后写回原路径(并不会生成csv文件)
def file_name(file_dir):
for root, dirs, files in os.walk(file_dir):
namelist=(files)# 当前路径下所有非目录子文件
return namelist
针对每个文件做以下操作,以其中一个作为例子
# Take the first file in the directory as an example and detect its
# encoding. file_name() returns bare names, so join with the directory —
# the original opened the bare name, which only works when the current
# working directory happens to be file_dir.
path = os.path.join(file_dir, file_name(file_dir)[0])
import chardet

# Read the raw bytes and let chardet guess the encoding. The context
# manager closes the file automatically (the original leaked the handle).
with open(path, 'rb') as f:
    f_read = f.read()
f_charInfo = chardet.detect(f_read)
print(f_charInfo)
查看该文件是什么类型的编码,chardet会输出置信度和预测的编码,这里是{'confidence': 0.73, 'encoding': 'Windows-1252', 'language': ''},我没有见过的编码。
用这个编码方式读入文件
# Re-read the file using the encoding chardet reported. Opening in text
# mode with an explicit encoding lets Python decode each line, instead of
# reading bytes and calling .decode() manually on every line.
with open(path, encoding='windows-1252') as f:
    # NOTE(review): str.strip() also strips Unicode whitespace such as
    # NBSP (0xA0 in cp1252), which the original bytes-level strip kept —
    # for this cleaning pipeline that is the desired behavior.
    doc_content_list = [line.strip() for line in f]
定制自己的清洗字符串文件
def clean_str(string):
    """Tokenization-style cleaning for English text; returns the lower-cased result.

    Adapted from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Keeps letters, digits and a small punctuation set, pads kept
    punctuation with spaces so each mark becomes its own token, splits
    common English contractions ("don't" -> "do n't"), collapses
    whitespace, and lower-cases.
    """
    # Replace every character outside the whitelist with a space
    # (this also removes hyphens, so the original trailing
    # re.sub(r"-", " ", ...) was dead code and has been dropped).
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    # Split contractions off as separate tokens.
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"'re", " 're", string)
    string = re.sub(r"'d", " 'd", string)
    string = re.sub(r"'ll", " 'll", string)
    # Pad punctuation with spaces.
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # Remove parentheses entirely (the original class listed each twice).
    string = re.sub(r"[()]", "", string)
    # BUG FIX: the original replacement " \? " left a literal backslash in
    # the output ("why?" -> "why \?"), because unknown non-letter escapes
    # are kept verbatim in re.sub replacement templates.
    string = re.sub(r"\?", " ? ", string)
    # Collapse runs of whitespace to a single space.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
然后就可以处理啦,最后保存回原路径
# Clean every line, join into one corpus string, and write it back to the
# original path.
clean_docs = [clean_str(doc_content).strip() for doc_content in doc_content_list]
clean_corpus_str = '\n'.join(clean_docs)
# Collapse *any* run of newlines left by lines that cleaned down to
# nothing. The original re.sub("\n\n", "\n", ...) only halved each run
# (non-overlapping matches), so "\n\n\n\n" still came out as "\n\n".
clean_corpus_str = re.sub(r"\n{2,}", "\n", clean_corpus_str)
# Write back as UTF-8 explicitly, so the result no longer depends on the
# platform's default encoding (the original omitted encoding=).
with open(path, 'w', encoding='utf-8') as f:
    f.write(clean_corpus_str)
再使用普通方法打开,统计每行的长度
# Re-open the cleaned file with an ordinary text read and report the
# minimum / maximum / average number of whitespace-separated tokens per
# non-blank line.
len_list = []
with open(path, 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of materializing
    # f.readlines().
    for line in f:
        if line == '\n':
            # Skip blank lines so they don't count as zero-length lines.
            continue
        len_list.append(len(line.strip().split()))
if len_list:
    print('min_len : ' + str(min(len_list)))
    print('max_len : ' + str(max(len_list)))
    print('average_len : ' + str(sum(len_list) / len(len_list)))
else:
    # Guard: min()/max() on an empty list would raise ValueError when the
    # file contains no non-blank lines.
    print('file contains no non-empty lines')
网友评论