美文网首页
文本清洗工具

文本清洗工具

作者: 月夜星空下 | 来源:发表于2020-03-13 18:20 被阅读0次
import csv
import re
import random
import numpy as np
import os
import codecs
import pandas as pd
from pandas import Series,DataFrame
# --- Setup: ask for the input/output locations and open the working files ---
# Path of the raw text file that should be cleaned.
pa = input("请输入待清洗文本路径:")

# All results are written below this fixed directory; create it on first use.
desktop_path = '/Users/lilong/Desktop/数据清洗/'
os.makedirs(desktop_path, exist_ok=True)

# Base name (without extension) of the cleaned CSV file.
name = input("请输入清洗后文本名称:")
path1 = desktop_path + name + '.csv'

# Create the output file up front so it can be opened for reading below.
# The handle is closed immediately instead of being left to the garbage
# collector.
with open(path1, 'w') as placeholder:
    placeholder.write(" ")

# Both handles are deliberately left open: the following steps of the script
# iterate over `file` and read from `file_pa1`.
file = open(pa, 'r')
# write() always stores the output as UTF-8, so read it back the same way.
file_pa1 = open(path1, 'r', encoding='utf-8')

# --- Cleaning pass over the raw input -------------------------------------
# List-numbering markers ("1." ... "8.", then "1:" ... "8:") that are removed
# from every line.  They are applied one after another in exactly this order.
numbering_markers = [digit + '.' for digit in '12345678']
numbering_markers += [digit + ':' for digit in '12345678']

# Sentences containing either of these fragments are treated as URLs/ads and
# dropped.  The previous `'com' or 'www'` expression evaluated to just 'com',
# so sentences containing only "www" were never filtered.
url_pattern = re.compile(r'com|www')

kept_sentences = []
for i in file:
    # Every space becomes a line break; a double space collapses to a single
    # break because it is replaced first.
    f = i.replace('  ', '\r\n').replace(' ', '\r\n')
    # Brand-name substitution.
    f = f.replace('阿卡索', '立刻说')
    for marker in numbering_markers:
        f = f.replace(marker, '')

    # Split into sentences on the full stop.  The delimiter itself is not
    # re-inserted when the kept sentences are joined back together.
    # (No spaces can remain at this point, so no further space stripping is
    # needed.)
    for line1 in f.split('。'):
        if url_pattern.search(line1) is None:
            kept_sentences.append(line1)

# Single join instead of repeated string concatenation.
content = ''.join(kept_sentences)
def write(content):
    """Overwrite the output file with *content* and return *content*.

    The destination is the module-level ``path1``.  The text is encoded as
    UTF-8 and written without any newline translation, so ``\\r\\n``
    sequences in *content* reach the file unchanged.

    Args:
        content: The text to store.

    Returns:
        The same ``content`` that was passed in.
    """
    # A single context-managed handle both truncates and writes the file;
    # previously a second handle opened only for truncation was never closed.
    with open(path1, 'w', encoding='utf-8', newline='') as out:
        out.write(content)
    return content
# --- Length filter ----------------------------------------------------------
# Persist the cleaned text first; it is then read back line by line.
write(content)

try:
    # Nothing has been read from file_pa1 so far, so iterating it starts at
    # the beginning of the text written just above.  Only lines longer than
    # 120 and shorter than 350 characters are kept; the length includes the
    # line terminator.
    con = ''.join(
        text_line for text_line in file_pa1
        if 120 < len(text_line) < 350
    )
finally:
    # Release both handles opened earlier in the script, even when reading
    # fails.  Previously only `file` was closed and file_pa1 leaked.
    file_pa1.close()
    file.close()
write(con)

# --- Quote every remaining line so it forms a one-column CSV record --------
# The file is written as UTF-8 by write() and parsed by pandas afterwards, so
# it is read and rewritten here with an explicit UTF-8 encoding instead of the
# locale default.
with open(path1, encoding='utf-8') as f1:
    # Each record becomes: opening quote, the line, a full stop, closing
    # quote, CRLF.
    # NOTE(review): the line still carries its own trailing newline, so that
    # newline ends up inside the quoted field, before the full stop.  Confirm
    # whether this is intended.
    cNames = ['"{}。"\r\n'.format(row) for row in f1]

# Kept as a module-level name for compatibility; not used by the code shown.
all_content = ''.join(cNames)

with open(path1, 'w', encoding='utf-8') as f2:
    f2.writelines(cNames)

# --- De-duplicate the records and store the final CSV -----------------------
# The file has no header row.  Without header=None pandas would consume the
# first record as the column name, and that record would then be lost when
# the column is renamed below.
df = pd.read_csv(path1, header=None)
df1 = df.drop_duplicates()
df1.columns = ['"content"']

# to_csv returns None when given a path; the name is kept for compatibility.
cleaning_result = df1.to_csv(path1, index=False)

相关文章

网友评论

      本文标题:文本清洗工具

      本文链接:https://www.haomeiwen.com/subject/byisshtx.html