import csv
import re
import random
import numpy as np
import os
import codecs
import pandas as pd
from pandas import Series,DataFrame
# Interactive text-cleaning script, first pass.
#
# Asks for the raw text file and an output name, prepares the output
# location, then normalises every input line and collects the surviving
# sentences in ``content``.
pa = input("请输入待清洗文本路径:")

# All cleaned files live in this fixed folder; create it on first use.
desktop_path = '/Users/lilong/Desktop/数据清洗/'
os.makedirs(desktop_path, exist_ok=True)

name = input("请输入清洗后文本名称:")
path1 = desktop_path + name + '.csv'

# Create (or reset) the output file so that it can be opened for reading
# right away.
with open(path1, 'w') as placeholder:
    placeholder.write(" ")

# Deliberately left open: the second pass further down reads the cleaned
# text back through this handle once write() has refilled the file.
file_pa1 = open(path1, 'r')

# Enumeration markers stripped from the text, removed one after another in
# this order: '1.' ... '8.' followed by '1:' ... '8:'.
list_markers = [str(n) + sep for sep in '.:' for n in range(1, 9)]

# Sentences that mention either fragment (typically part of a URL) are
# dropped. A plain `'com' or 'www'` evaluates to just 'com', so the
# alternation has to live inside the pattern itself.
regex = re.compile('com|www')

kept_sentences = []
with open(pa, 'r') as file:
    for raw_line in file:
        # Turn every space into a line break.
        # NOTE(review): the script performed this exact replacement twice in
        # a row; the second call could never match. Possibly a different
        # whitespace character was intended for one of them - confirm.
        cleaned = raw_line.replace(' ', '\r\n')
        cleaned = cleaned.replace('阿卡索', '立刻说')
        for marker in list_markers:
            cleaned = cleaned.replace(marker, '')

        # Split into sentences on the full stop; the stops themselves are
        # not carried over into the collected text.
        for sentence in cleaned.split('。'):
            sentence = sentence.replace(" ", "")
            if regex.search(sentence) is None:
                kept_sentences.append(sentence)

content = ''.join(kept_sentences)
def write(content):
    """Overwrite the output file named by the module-level ``path1``.

    The text is encoded as UTF-8 and written without any newline
    translation, so line endings already present in *content* reach the
    file unchanged. Any previous file content is discarded.

    Args:
        content: The text to store.

    Returns:
        The same *content* that was passed in.
    """
    # A single handle, closed by the context manager, both truncates the
    # file and writes the new text.
    with open(path1, 'w', encoding='utf-8', newline='') as out:
        out.write(content)
    return content
# Second pass and CSV export.
#
# Store the first-pass text; the already-open ``file_pa1`` handle then reads
# it back one line at a time.
write(content)

# Keep only lines whose length (line ending included) lies strictly between
# 120 and 350 characters.
try:
    kept_lines = [
        text_line for text_line in file_pa1
        if 120 < len(text_line) < 350
    ]
finally:
    # Release the reader on the output file as well as the input file.
    file_pa1.close()
    file.close()

con = ''.join(kept_lines)
# Leave the filtered text on disk before the table export replaces it.
write(con)

# Append the sentence terminator to every kept line. Each line still carries
# its own line ending, so the terminator lands after that line break.
# NOTE(review): this puts a line break inside every cell - confirm whether
# the line ending should be stripped before appending.
rows = [line + '。' for line in kept_lines]

# Build the table directly rather than hand-quoting rows into the file and
# parsing them back: re-parsing treated the first row as the header (which
# the column rename then overwrote, losing that row), broke on text that
# contains a double quote, and failed outright when no line survived.
df = pd.DataFrame({'"content"': rows})
df1 = df.drop_duplicates()
cleaning_result = df1.to_csv(path1, index=False)
# 网友评论  (web-page residue: "reader comments" heading, not part of the script)