import re
import os
import csv
import time
import codecs
import random
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# --- Interactive setup -------------------------------------------------------
# Ask for the raw text file that should be cleaned.
Inpath = input("请输入待清洗文本路径:")

# Make sure the output directory exists.  ``exist_ok=True`` replaces the
# separate "does it exist?" check, so there is no gap between checking and
# creating.  (os.makedirs returns None, so its result is not kept.)
os.makedirs('/Users/lilong/Desktop/Data_cleaning', exist_ok=True)

# Ask for the base name of the cleaned file and build its full path.
name = input("请输入清洗后文本名称:")
desktop_path = '/Users/lilong/Desktop/Data_cleaning/'
The_custom_path = desktop_path + name + '.csv'

# Create/truncate the output file with a single-space placeholder.
# The handle is closed immediately: if it stayed open, the buffered " " could
# be flushed much later (e.g. at interpreter exit) on top of the first byte of
# whatever had been written to the same path in the meantime.
with open(The_custom_path, 'w') as file_w:
    file_w.write(" ")

# Handles used by the statements further down in this script:
#   file   - the raw input text, iterated line by line
#   file_r - a reader on the output file
file = open(Inpath, 'r')
file_r = open(The_custom_path, 'r')
def stopwordslist(path='/Users/lilong/Desktop/stop_words'):
    """Load the stop-word list, one entry per line.

    Args:
        path: Location of the UTF-8 encoded stop-word file.  Defaults to the
            path this script has always used, so calls without arguments
            behave as before.

    Returns:
        list[str]: Every line of the file with leading/trailing whitespace
        removed.  Blank lines are kept as empty strings.
    """
    # The context manager guarantees the file is closed; previously the
    # handle was opened inline and never closed.
    with open(path, encoding='UTF-8') as handle:
        return [line.strip() for line in handle]
# Literal substitutions applied to every input line, strictly in this order.
# Order matters: an earlier substitution can create or remove matches for a
# later one, so the table reproduces the sequence of the original
# one-call-per-line chain exactly (including the repeated entries).
_LINE_REPLACEMENTS = (
    [
        (' ', '\r\n'),
        (' ', '\r\n'),
        ('阿卡索', '立刻说'),
        ('2019', '2020'),
        ('2018', '2020'),
        ('00年', '20年'),
    ]
    # List markers such as "1," "1." "(1)" "1:" "1、" are removed.
    + [(d + ',', '') for d in '12345678']
    + [(d + '.', '') for d in '123456789']
    + [('(' + d + ')', '') for d in '12345678']
    + [(d + ':', '') for d in '12345678']
    + [(d + '、', '') for d in '12345678']
    + [(c + '、', '') for c in '一二三四五六七八']
    + [(c + '.', '') for c in '一二三四五六七八']
    + [(c + '.', '') for c in 'abcd']
    # Punctuation normalisation.
    + [
        ('.。', '。'),
        ('。 ', ','),
        ('。。', '。'),
        ('?。', '。'),
        ('。,', '。'),
        ('。。', '。'),
        (',。', '。'),
        (',', ',,'),
        ('""', ''),
        ('。\n,', '。'),
        ('。\n。', '。'),
        (',\n。', '。'),
    ]
    # Remaining ordinal markers.
    + [(d + ')', '') for d in '12345678']
    + [('第' + c + '点', '') for c in '一二三四']
    + [('的第' + c + '步是', '') for c in '一二三四']
    + [(c + '。', '') for c in '一二三四五六七八']
    + [('第' + c, '') for c in '一二三四']
)

# A comma-delimited fragment is dropped when it contains any of these markers.
# The original expression `'com' or 'www' or 'text-align' or '元'` evaluates
# to just 'com' in Python, so only that first marker was ever checked; the
# alternation below checks all four.
_UNWANTED_FRAGMENT = re.compile('com|www|text-align|元')


def _clean_line(raw_line):
    """Return *raw_line* after substitutions and fragment filtering.

    The line is passed through ``_LINE_REPLACEMENTS``, split on ',', and each
    fragment that does not match ``_UNWANTED_FRAGMENT`` is re-emitted with a
    trailing ','.
    """
    text = raw_line
    for old, new in _LINE_REPLACEMENTS:
        text = text.replace(old, new)

    # NOTE(review): the original split argument was
    # `',' or '。' or ':' or ','`, which Python evaluates to ','.  That
    # effective behaviour is kept on purpose: each fragment gets a ','
    # appended below, so splitting on '。' or ':' as well would silently turn
    # those marks into commas.  Confirm whether that was ever intended.
    kept = []
    for fragment in text.split(','):
        piece = fragment + ','  # restore the comma removed by split()
        if _UNWANTED_FRAGMENT.search(piece) is None:
            kept.append(piece)
    return ''.join(kept)


# First cleaning pass over the raw input; `file` is the input handle opened
# earlier in this script.  join() avoids repeated string concatenation.
content = ''.join(_clean_line(raw_line) for raw_line in file)
def write(content, path=None):
    """Overwrite the output file with *content*, encoded as UTF-8.

    Args:
        content: Text to store.
        path: Destination file.  When omitted, the module-level
            ``The_custom_path`` is used, which is what every existing call
            in this script relies on.

    Returns:
        The *content* argument, unchanged.
    """
    if path is None:
        path = The_custom_path
    # Mode 'w' discards any previous content.  newline='' writes line endings
    # exactly as they appear in *content* (no platform translation), which
    # matches the previous codecs-based writer.  The context manager closes
    # the handle; the earlier version left a second, truncate-only handle
    # open and shadowed the imported `csv` module with a local name.
    with open(path, 'w', encoding='utf-8', newline='') as out:
        out.write(content)
    return content
# Persist the first-pass result so it can be re-read line by line.
write(content)

# The raw input and the reader opened during setup are no longer needed.
# (Previously only `file` was closed here, from the `finally` of a loop that
# was actually reading `file_r`, so `file_r` stayed open.)
file.close()
file_r.close()

# Keep only lines whose length - trailing newline included - is strictly
# between 150 and 250 characters.
with open(The_custom_path, 'r', encoding='utf-8') as first_pass:
    con = ''.join(line for line in first_pass if 150 < len(line) < 250)

# Collapse runs of four, then three, then two commas into one, and finally
# remove every space.
con = con.replace(',,,,', ',').replace(',,,', ',')
conn = con.replace(',,', ',')
connn = conn.replace(' ', '')
write(connn)

# Drop duplicate lines with pandas.  All spaces were removed above, so with
# sep=' ' each line is read as a single field.
# header=None: the file has no header row.  Without it pandas would take the
# first text line as the column name, and the rename below would then discard
# that line from the output.
df = pd.read_csv(The_custom_path, sep=' ', header=None)
df1 = df.drop_duplicates()
df1.columns = ['"ook"']
# to_csv returns None when given a path; the name is kept for compatibility.
cleaning_result = df1.to_csv(The_custom_path, index=False)
def _quote_line(raw_line):
    """Trim one line and wrap it as ``"<text>。"`` terminated by CRLF."""
    text = raw_line.strip()
    # Remove punctuation left at the start of the line.  The two lstrip calls
    # stay sequential on purpose: commas are stripped first, colons second.
    text = text.lstrip(',')
    text = text.lstrip(':')
    text = text.rstrip()  # remove whitespace at the end of the line
    return '"' + text + '。' + '"' + '\r\n'  # add the surrounding quotes


# Re-read the de-duplicated file and quote every line.
with open(The_custom_path, encoding='utf-8') as f3:
    cNames = [_quote_line(raw_line) for raw_line in f3]
all_content = ''.join(cNames)

# A single write is enough: the earlier version first wrote `cNames` through
# a plain text-mode handle and then immediately overwrote the same file with
# the identical text via write().
write(all_content)
# Whole-file literal substitutions, applied strictly in this order (an earlier
# entry can create or remove matches for a later one; repeated entries are
# kept as they were).
_FILE_REPLACEMENTS = [
    ('""', ''),
    ('。。', '。'),
    ('。。。', '。'),
    ('以下', '这些'),
    ('其次', ''),
    ('.。', '。'),
    # Move a full stop that ended up at the start of the next line back
    # behind the closing quote of the previous one.
    (',\n。"', '。"\n'),
    (',\n。"', '。"\n'),
    (':\n。"', '。"\n'),
    ('\n。"', '。"\n'),
    (',"。"', '。"'),
    ('。"。"', '。"'),
    (':"。"', '。"'),
    ('。。"', '。"'),
    ('。""', '。"'),
    ('。 。', '。'),
    (' ', ''),
    (' 。', '。'),
    (',。', '。'),
    (';,', '。'),
    (':(。"', '。'),
    ('。(。', '。"'),
    ('",', '"'),
    (',,', ','),
    ('①', ''),
    (':。', '。'),
    # NOTE(review): these five entries delete every digit 1-5 anywhere in the
    # text (e.g. "2020" becomes "00").  Kept as-is; confirm this is wanted.
    ('1', ''),
    ('2', ''),
    ('3', ''),
    ('4', ''),
    ('5', ''),
    ('?。', '?'),
    (':,', ','),
    (',,', ','),
    ('!important;text-align:left;}', ''),
    ('"“', '"'),
    ('。”。"', '。"'),
    ('?。', '?'),
    # Drop the full stop after an exclamation mark.  The replacement used to
    # be '?', which turned every "!。" into a question mark - a copy of the
    # entry above with only the search text edited.
    ('!。', '!'),
]

# Read the whole file in one go, using the encoding that write() produces.
with open(The_custom_path, 'r', encoding='utf-8') as file_r:
    ct = file_r.read()

cg = ct
for old, new in _FILE_REPLACEMENTS:
    cg = cg.replace(old, new)

write(cg)
# Final pass: collapse doubled full stops once more and discard lines of at
# most two characters (newline included).
# The reader is managed by `with`, so it is always closed; previously the
# `finally` clause closed the unrelated input handle `file` and left this
# reader open.
with open(The_custom_path, 'r', encoding='utf-8') as file_r:
    kept_lines = []
    for text_line in file_r:
        text_line = text_line.replace('。。', "。")
        if len(text_line) > 2:
            kept_lines.append(text_line)
con1 = ''.join(kept_lines)

write(con1)
print("ok")
# con = ''
# with open(The_custom_path)as file_r:
# try:
# con = ''
#
# while True:
#
# text_line = file_r.readline()
# if text_line:
# if len(text_line) > 150 and len(text_line) < 250:
# # text_line = text_line.replace("\n", "")
# con += text_line
# # print(con)
# else:
# break
# # print(con)
# finally:
# file_r.close()
# write(con)
# df2 = pd.read_csv(The_custom_path, sep=' ')
# df3 = df2.drop_duplicates()
# df2.columns = ['"content"']
# cleaning_result = df2.to_csv(The_custom_path, index=0)
# print("ok")
# ctt = ''
# for i in file_r:
# regex = 'com' or 'www'
# p_string = i.split(':')
# coo = ''
# for line1 in p_string:
# line1 = line1.replace(" ", "")
# if re.search(regex, line1) is None:
# coo += line1
# coo += line1
# ctt += coo
# df = pd.read_csv(The_custom_path, sep=' ')
# print("ok")
# df1 = df.drop_duplicates()
# df1.columns = ['"content"']
# cleaning_result = df1.to_csv(The_custom_path, index=0)
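# NOTE: "网友评论" ("reader comments") appears to be web-page residue that trailed the script; it is not part of the program.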