美文网首页
文本清洗工具

文本清洗工具

作者: 月夜星空下 | 来源:发表于2020-03-13 18:20 被阅读0次
    import csv
    import re
    import random
    import numpy as np
    import os
    import codecs
    import pandas as pd
    from pandas import Series,DataFrame
    # --- Setup: ask for the input file and prepare the output CSV -----------
    pa = input("请输入待清洗文本路径:")
    # All cleaned files go into this fixed folder; keep the path in one place.
    desktop_path = '/Users/lilong/Desktop/数据清洗/'
    # exist_ok=True replaces the separate "exists?" check with a single call
    # that cannot race with another process creating the folder.
    os.makedirs(desktop_path, exist_ok=True)
    name = input("请输入清洗后文本名称:")
    path1 = desktop_path + name + '.csv'
    # Create the output file up front so it can be opened for reading below.
    # The placeholder handle is closed immediately instead of being left to
    # the garbage collector.
    with open(path1, 'w') as placeholder:
        placeholder.write(" ")
    # Both handles stay open on purpose: `file` is iterated by the cleaning
    # loop and `file_pa1` is read by the length filter further down.
    # NOTE(review): the input is read with the platform default encoding —
    # confirm what encoding the source texts actually use.
    file = open(pa, 'r')
    # write() stores the output as UTF-8, so read it back as UTF-8 as well.
    file_pa1 = open(path1, 'r', encoding='utf-8')
    
    # --- Pass 1: normalise every input line and drop URL-bearing sentences --
    # List markers "1." … "8." followed by "1:" … "8:". They are removed one
    # after another in exactly this order, as the original replace chain did.
    list_markers = [str(n) + sep for sep in '.:' for n in range(1, 9)]
    # A sentence is dropped when it contains "com" OR "www". The previous
    # `'com' or 'www'` expression evaluated to just 'com', so sentences that
    # only contained "www" slipped through.
    url_pattern = re.compile(r'com|www')

    kept = []
    for i in file:
        # Turn every run of blanks into line breaks (double blanks first so
        # they yield one break instead of two).
        f = i.replace('  ', '\r\n')
        f = f.replace(' ', '\r\n')
        f = f.replace('阿卡索', '立刻说')
        for marker in list_markers:
            f = f.replace(marker, '')
        # Split into sentences on the full stop; the stop itself is not put
        # back when the surviving sentences are joined. No blanks can remain
        # at this point, so no further space stripping is needed.
        for line1 in f.split('。'):
            if url_pattern.search(line1) is None:
                kept.append(line1)
    # Join once at the end instead of growing a string inside the loop.
    content = ''.join(kept)
    def write(content):
        """Replace the contents of the output CSV with *content*.

        The target is the module-level ``path1``. The file is created or
        truncated and *content* is written UTF-8 encoded; ``codecs.open``
        works on the underlying file in binary mode, so line endings inside
        *content* are stored exactly as given.

        Args:
            content: Text to store in the file.

        Returns:
            The same *content* that was passed in.
        """
        # One handle opened in 'w' mode both truncates and writes, and the
        # context manager guarantees it is flushed and closed. Previously a
        # second handle was opened just to truncate and was never closed.
        with codecs.open(path1, 'w', encoding='utf-8') as out:
            out.write(content)
        return content
    write(content)

    # --- Pass 2: keep only lines whose length is strictly between 120 and
    # 350 characters (the line terminator counts towards the length) --------
    try:
        con = ''.join(
            text_line for text_line in file_pa1
            if 120 < len(text_line) < 350
        )
    finally:
        # Release both handles. file_pa1 used to stay open, which left a
        # reader attached to the file while write() truncated it below.
        file.close()
        file_pa1.close()
    write(con)
    
    # --- Pass 3: turn every kept line into one quoted CSV field ending in
    # '。', terminated by CRLF ------------------------------------------------
    # The file is written as UTF-8 by write(), so it is read and rewritten
    # as UTF-8 here too.
    with open(path1, encoding='utf-8') as f1:
        cNames = [
            # Strip the line terminator first; otherwise it ends up inside
            # the quotes and pushes '。"' onto a line of its own. Embedded
            # double quotes are doubled so the field stays valid CSV.
            '"{}。"\r\n'.format(row.rstrip('\r\n').replace('"', '""'))
            for row in f1
        ]
    all_content = ''.join(cNames)
    # newline='' writes the explicit '\r\n' terminators verbatim on every
    # platform instead of letting text mode translate them again.
    with open(path1, 'w', encoding='utf-8', newline='') as f2:
        f2.writelines(cNames)
    
    # --- Pass 4: drop duplicate rows and add the column header --------------
    # The file has no header row at this point. header=None stops pandas from
    # consuming the first sentence as column names, which the rename below
    # would then throw away.
    df = pd.read_csv(path1, header=None)
    df1 = df.drop_duplicates()
    df1.columns = ['"content"']

    # to_csv returns None when given a path; the name is kept only so the
    # script defines the same module-level names as before.
    cleaning_result = df1.to_csv(path1, index=False)
    
    

    相关文章

      网友评论

          本文标题:文本清洗工具

          本文链接:https://www.haomeiwen.com/subject/byisshtx.html