美文网首页
数据清洗

数据清洗

作者: 月夜星空下 | 来源:发表于2020-07-13 10:04 被阅读0次
    import re
    import os
    import csv
    import time
    import codecs
    import random
    import numpy as np
    import pandas as pd
    from pandas import Series, DataFrame
    # --- Interactive setup: locate the input and prepare the output file ---
    Inpath = input("请输入待清洗文本路径:")
    # exist_ok=True replaces the separate exists() check, so there is no
    # window between checking for the directory and creating it.
    os.makedirs('/Users/lilong/Desktop/Data_cleaning', exist_ok=True)
    name = input("请输入清洗后文本名称:")
    desktop_path = '/Users/lilong/Desktop/Data_cleaning/'
    The_custom_path = desktop_path + name + '.csv'
    # Create the output file with a one-space placeholder and close it right
    # away.  Left open, the buffered " " is only flushed when the handle is
    # finalized, which writes it at offset 0 over the finished output.
    with open(The_custom_path, 'w') as file_w:
        file_w.write(" ")
    # Input text; read with the platform default encoding, as before.
    file = open(Inpath, 'r')
    # Read handle on the output file; its contents are written as UTF-8 by
    # write(), so decode them as UTF-8 instead of the locale default.
    file_r = open(The_custom_path, 'r', encoding='utf-8')
    def stopwordslist(path='/Users/lilong/Desktop/stop_words'):
        """Load the stop-word list, one entry per line.

        Args:
            path: UTF-8 text file to read.  Defaults to the location the
                script has always used.

        Returns:
            A list with every line of the file, stripped of surrounding
            whitespace.  Blank lines are kept as empty strings.
        """
        # The context manager closes the handle; the original leaked it.
        with open(path, encoding='UTF-8') as stop_file:
            return [line.strip() for line in stop_file]
    
    # Ordered (old, new) substitutions applied to every input line.  The order
    # is significant: each entry operates on the output of the previous ones.
    _LINE_REPLACEMENTS = (
        # Spaces become line breaks; brand and year rewrites.
        [('  ', '\r\n'), (' ', '\r\n'), ('阿卡索', '立刻说'),
         ('2019', '2020'), ('2018', '2020'), ('00年', '20年')]
        # List markers: "1," "1." "(1)" "1:" "1、" "一、" "一." "a."
        + [(d + ',', '') for d in '12345678']
        + [(d + '.', '') for d in '123456789']
        + [('(' + d + ')', '') for d in '12345678']
        + [(d + ':', '') for d in '12345678']
        + [(d + '、', '') for d in '12345678']
        + [(c + '、', '') for c in '一二三四五六七八']
        + [(c + '.', '') for c in '一二三四五六七八']
        + [(c + '.', '') for c in 'abcd']
        # Punctuation clean-up.
        + [('.。', '。'), ('。 ', ','), ('。。', '。'), ('?。', '。'),
           ('。,', '。'), ('。。', '。'), (',。', '。'), (',', ',,'),
           ('""', ''), ('。\n,', '。'), ('。\n。', '。'), (',\n。', '。')]
        # More list markers: "1)" and ordinal phrases.
        + [(d + ')', '') for d in '12345678']
        + [('第一点', ''), ('第二点', ''), ('第三点', ''), ('第四点', '')]
        + [('的第一步是', ''), ('的第二步是', ''),
           ('的第三步是', ''), ('的第四步是', '')]
        + [(c + '。', '') for c in '一二三四五六七八']
        + [('第一', ''), ('第二', ''), ('第三', ''), ('第四', '')]
    )

    # Fragments containing any of these markers are dropped.
    # Bug fix: the original wrote `'com' or 'www' or 'text-align' or '元'`,
    # which evaluates to just 'com', so the other three never matched.
    _NOISE_PATTERN = re.compile('com|www|text-align|元')


    def _clean_line(raw_line):
        """Return *raw_line* after substitutions and noise-fragment removal.

        The line is split on ',' and every kept fragment gets a ',' appended
        again, so the result always ends with ','.
        """
        text = raw_line
        for old, new in _LINE_REPLACEMENTS:
            text = text.replace(old, new)
        # The original `split(',' or '。' or ':' or ',')` evaluated to
        # split(','); that is kept, because the ',' re-appended below would
        # otherwise rewrite any other delimiter.
        return ''.join(
            fragment + ','
            for fragment in text.split(',')
            if _NOISE_PATTERN.search(fragment) is None
        )


    # Cleaned text of the whole input, joined once instead of repeated `+=`.
    content = ''.join(_clean_line(raw_line) for raw_line in file)
    def write(content, path=None):
        """Overwrite the output file with *content*, encoded as UTF-8.

        Args:
            content: Text to store; it replaces whatever the file held.
            path: Destination file.  Defaults to the module-level
                ``The_custom_path``.

        Returns:
            ``content`` unchanged.
        """
        target = The_custom_path if path is None else path
        # One truncating open replaces the original's leaked 'w' handle plus a
        # second 'r+' handle.  newline='' disables newline translation, so the
        # bytes on disk match what codecs.open() produced.
        with open(target, 'w', encoding='utf-8', newline='') as out:
            out.write(content)
        return content
    
    # Persist the cleaned text, then read it back line by line.
    write(content)
    try:
        # Keep only lines whose length (line terminator included) is strictly
        # between 150 and 250 characters.
        con = ''.join(
            text_line for text_line in file_r
            if 150 < len(text_line) < 250
        )
    finally:
        # Bug fix: the original closed only `file` here and left `file_r`,
        # the handle actually being read, open.
        file.close()
        file_r.close()
    # Collapse runs of commas, longest first, then drop four-space runs.
    con = con.replace(',,,,', ',')
    con = con.replace(',,,', ',')
    conn = con.replace(',,', ',')
    connn = conn.replace('    ', '')

    write(connn)
    
    # De-duplicate the filtered lines with pandas and write them back.
    # NOTE(review): read_csv is called without header=None, so the first line
    # of the file is taken as the column header rather than as data; it is then
    # replaced by the '"ook"' label below.  Confirm this is intended.
    df = pd.read_csv(The_custom_path, sep=' ')
    df1 = df.drop_duplicates()
    # Assigning a single label assumes the file parsed into exactly one column.
    df1.columns = ['"ook"']

    # to_csv() returns None when given a path; the name is kept for
    # compatibility with the rest of the script.
    cleaning_result = df1.to_csv(The_custom_path, index=False)
    
    
    # Wrap every line of the de-duplicated file as a quoted sentence: "<text>。"
    # The file was written as UTF-8, so decode it explicitly rather than with
    # the locale default.
    with open(The_custom_path, encoding='utf-8') as f3:
        cNames = f3.readlines()
    for i, raw_line in enumerate(cNames):
        # Strip surrounding whitespace, then leading ',' and then leading ':'.
        sentence = raw_line.strip().lstrip(',').lstrip(':')
        cNames[i] = '"' + sentence + '。' + '"' + '\r\n'
    all_content = ''.join(cNames)
    # The original also dumped cNames with writelines() first; write() then
    # truncated and rewrote the same text, so one write is enough.
    write(all_content)
    
    
    # Ordered (old, new) fix-ups applied to the whole quoted file.  The order
    # is significant: each entry operates on the output of the previous ones.
    _PUNCTUATION_FIXES = (
        ('""', ''),
        ('。。', '。'),
        ('。。。', '。'),
        ('以下', '这些'),
        ('其次', ''),
        ('.。', '。'),
        # Re-attach a closing 。" that ended up on its own line.
        (',\n。"', '。"\n'),
        (',\n。"', '。"\n'),
        (':\n。"', '。"\n'),
        ('\n。"', '。"\n'),
        (',"。"', '。"'),
        ('。"。"', '。"'),
        (':"。"', '。"'),
        ('。。"', '。"'),
        ('。""', '。"'),
        ('。    。', '。'),
        (' ', ''),
        (' 。', '。'),
        (',。', '。'),
        (';,', '。'),
        (':(。"', '。'),
        ('。(。', '。"'),
        ('",', '"'),
        (',,', ','),
        ('①', ''),
        (':。', '。'),
        # NOTE(review): these remove every digit 1-5 anywhere in the text
        # (e.g. "2020" becomes "00") - confirm this is intended.
        ('1', ''),
        ('2', ''),
        ('3', ''),
        ('4', ''),
        ('5', ''),
        ('?。', '?'),
        (':,', ','),
        (',,', ','),
        ('!important;text-align:left;}', ''),
        ('"“', '"'),
        ('。”。"', '。"'),
        ('?。', '?'),
        # NOTE(review): maps "!。" to "?" rather than "!" - possibly a typo.
        ('!。', '?'),
    )

    # The file was written as UTF-8 by write(); decode it explicitly.
    with open(The_custom_path, 'r', encoding='utf-8') as file_r:
        ct = file_r.read()
    cg = ct
    for _old, _new in _PUNCTUATION_FIXES:
        cg = cg.replace(_old, _new)

    write(cg)
    # Final pass: collapse doubled full stops and drop lines of at most two
    # characters (line terminator included).
    # The input handle is no longer needed; close() on a closed file is a no-op.
    file.close()
    # Bug fix: the original opened file_r here and never closed it (its
    # `finally` closed `file` instead).  The file is UTF-8, written by write().
    with open(The_custom_path, 'r', encoding='utf-8') as file_r:
        kept_lines = []
        for text_line in file_r:
            text_line = text_line.replace('。。', "。")
            if len(text_line) > 2:
                kept_lines.append(text_line)
    con1 = ''.join(kept_lines)
    write(con1)

    print("ok")
    
    
    # con = ''
    # with open(The_custom_path)as file_r:
    #     try:
    #         con = ''
    #
    #         while True:
    #
    #             text_line = file_r.readline()
    #             if text_line:
    #                 if len(text_line) > 150 and len(text_line) < 250:
    #                     # text_line = text_line.replace("\n", "")
    #                     con += text_line
    #                     # print(con)
    #             else:
    #                 break
    #         # print(con)
    #     finally:
    #         file_r.close()
    # write(con)
    
    
    # df2 = pd.read_csv(The_custom_path, sep=' ')
    # df3 = df2.drop_duplicates()
    # df2.columns = ['"content"']
    # cleaning_result = df2.to_csv(The_custom_path, index=0)
    # print("ok")
    
    # ctt = ''
    # for i in file_r:
    #     regex = 'com' or 'www'
    #     p_string = i.split(':')
    #     coo = ''
    #     for line1 in p_string:
    #         line1 = line1.replace(" ", "")
    #         if re.search(regex, line1) is None:
    #             coo += line1
    #         coo += line1
    #     ctt += coo
    
    # df = pd.read_csv(The_custom_path, sep=' ')
    # print("ok")
    # df1 = df.drop_duplicates()
    # df1.columns = ['"content"']
    # cleaning_result = df1.to_csv(The_custom_path, index=0)
    
    
    

    相关文章

      网友评论

          本文标题:数据清洗

          本文链接:https://www.haomeiwen.com/subject/qwzocktx.html