
A Memory-Friendly Short Text Clustering Tool

Author: 612twilight | Published 2019-12-01 18:58

    Introduction

    I came across a post on the AINLP WeChat account about a tool for short text clustering. Since a project at my company happens to involve short text clustering, I took a closer look. The project is not complicated: it measures similarity with the Jaccard index, and the metric can of course be swapped out. Its distinguishing feature is that it is memory friendly, which helps when clustering large volumes of data.

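    For reference, the Jaccard index used by the tool is the standard set-overlap measure over two token sets. A minimal sketch of that measure (the project's own jaccard helper may handle edge cases such as empty inputs differently):

    def jaccard_similarity(tokens_a, tokens_b):
        """Jaccard index of two token lists: |A ∩ B| / |A ∪ B|."""
        a, b = set(tokens_a), set(tokens_b)
        if not a and not b:
            return 1.0
        return len(a & b) / len(a | b)

    # Two short sentences sharing two of their three tokens score 0.5.
    print(jaccard_similarity(["短文本", "聚类", "工具"], ["短文本", "聚类", "方法"]))
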
    Code Walkthrough

    cluster.py

    The main function in cluster.py is the tool's entry point:

    def main():
        args = _get_parser()
    
        # preliminary work
        check_file(args.infile)
        ensure_dir(args.output)
        # == check that the input file exists and ensure the output directory exists
        if args.name_len_update:
            line_cnt = line_counter(args.infile)
            args.name_len = len(str(line_cnt)) + 1
        # == remove files in the output directory whose names match the cluster output naming pattern
        clean_dir(args.output, args.name_len)
        # end preliminary work
    
        p_bucket = defaultdict(list)
        save_idx = 0
        id_name = '{0:0' + str(args.name_len) + 'd}'
        # load stop words
        # == load the stop word list, if a stop word file is provided
        stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()
        # load tokenizer
        # == build the segmenter; two tokenizers are supported, covering Chinese and English
        seg = Segmentor(args)
    
        print('Splitting sentence into different clusters ...')
        infile = open(args.infile, 'r', encoding="utf-8")
        for line in tqdm(infile):
            # iterate over every line of the input file
            line = line.rstrip()
            is_match = False
            # tokenize the line and remove stop words
            seg_list = list(seg.cut(line))
            if stop_words:
                seg_list = list(filter(lambda x: x not in stop_words, seg_list))
            for wd in seg_list:
                # == p_bucket maps each token to the names of the existing cluster buckets (files) that contain it
                w_bucket = p_bucket[wd]
                # is_match = False
                for bucket in w_bucket:
                    # == for every bucket that already contains this token, sample lines from its file: take them all if there are fewer than the sample number (five by default), otherwise sample that many
                    bucket_path = os.path.join(args.output, bucket)
                    check_file(bucket_path)
                    selected = sample_file(bucket_path, args.sample_number)
                    # == tokenize the sampled lines and remove stop words
                    selected = list(map(lambda x: list(seg.cut(x)), selected))
                    if stop_words:
                        filt_selected = list()
                        for sen in selected:
                            sen = list(filter(lambda x: x not in stop_words, sen))
                            filt_selected.append(sen)
                        selected = filt_selected
                    # == if the Jaccard similarity between the current sentence and every sampled line exceeds the threshold, assign the sentence to this bucket
                    if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                        is_match = True
                        with open(bucket_path, 'a', encoding='utf-8') as outfile:
                            outfile.write(line+'\n')
                        break
                # == this looks like a flaw: deciding here, inside the token loop, means sentences made of the same words in a different order cannot be clustered together
                
                if not is_match:
                    bucket_name = ('tmp' + id_name).format(save_idx)
                    w_bucket.append(bucket_name)
                    bucket_path = os.path.join(args.output, bucket_name)
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line+'\n')
                    save_idx += 1
                    break
    
        infile.close()
    
        # sort and rename file
        file_list = os.listdir(args.output)
        file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
        cnt = dict()
        for file in file_list:
            file_path = os.path.join(args.output, file)
            cnt[file] = line_counter(file_path)
    
        sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
        for idx, (file_name, times) in enumerate(sorted_cnt):
            origin_path = os.path.join(args.output, file_name)
            new_path = os.path.join(args.output, id_name.format(idx))
            os.rename(origin_path, new_path)
    
        print('All is well')
    
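    The helpers used above (check_file, ensure_dir, clean_dir, line_counter, get_stop_words, sample_file, jaccard and Segmentor) come from the project's own helper modules. The memory friendliness comes from keeping each cluster as a file on disk and only reading a handful of sampled lines back for comparison. As a rough sketch of what sample_file presumably does (an assumption; the real helper lives in the project):

    import random

    # A sketch only: return at most `number` lines from a bucket file, sampling
    # at random when the file holds more than `number` lines. The project's real
    # sample_file helper may be implemented differently.
    def sample_file_sketch(path, number):
        with open(path, 'r', encoding='utf-8') as f:
            lines = [line.rstrip('\n') for line in f]
        if len(lines) <= number:
            return lines
        return random.sample(lines, number)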

    Evaluation

    Pros: memory friendly.

    Cons: the project has a bug: sentences made up of the same words in a different order are not grouped into the same cluster. My suggestion: after a sentence joins an existing cluster, do not break out of the token loop; instead, skip any further match-or-create decisions but append that cluster to the p_bucket list of every token in the sentence. The logic for creating a new cluster also needs adjusting: if the first token finds no match, put it into a temporary token list and move on to the next token; only when every token has failed to match should a new cluster be created, and the new cluster should then be added to the p_bucket entry of every token in that temporary list.
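
    To make the symptom concrete, here is a stripped-down, in-memory re-enactment of the loop above (buckets are plain Python lists instead of files, and the similarity check is skipped because the buckets being looked up are empty anyway). Two sentences built from the same words in a different order end up in two different clusters, because a new bucket is only ever registered under the sentence's first token:

    from collections import defaultdict

    def cluster_naive(sentences):
        p_bucket = defaultdict(list)   # token -> indices of buckets containing it
        buckets = []                   # each bucket is a list of sentences
        for tokens in sentences:
            is_match = False
            for wd in tokens:
                for b in p_bucket[wd]:
                    buckets[b].append(tokens)   # similarity assumed to pass
                    is_match = True
                    break
                if not is_match:
                    # the new bucket is registered only under the current token,
                    # and the loop breaks, so later tokens are never indexed
                    p_bucket[wd].append(len(buckets))
                    buckets.append([tokens])
                    break
        return buckets

    # Same word set, different order -> two clusters instead of one.
    print(len(cluster_naive([["聚类", "工具"], ["工具", "聚类"]])))   # prints 2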

    Project links

    Original project (I have already filed an issue, so this may get fixed):

    https://github.com/RandyPen/TextCluster

    A fork of the version that still has the bug:

    https://github.com/612yese/TextCluster

    Addendum 1

    def main():
        args = _get_parser()
    
        # preliminary work
        check_file(args.infile)
        ensure_dir(args.output)
    
        if args.name_len_update:
            line_cnt = line_counter(args.infile)
            args.name_len = len(str(line_cnt)) + 1
    
        clean_dir(args.output, args.name_len)
        # end preliminary work
    
        p_bucket = defaultdict(list)
        save_idx = 0
        id_name = '{0:0' + str(args.name_len) + 'd}'
        # load stop words
        stop_words = get_stop_words(args.stop_words) if os.path.exists(args.stop_words) else list()
        # load tokenizer
        seg = Segmentor(args)
    
        print('Splitting sentence into different clusters ...')
        infile = open(args.infile, 'r', encoding="utf-8")
        for line in tqdm(infile):
            line = line.rstrip()
            is_match = False
            seg_list = list(seg.cut(line))
            if stop_words:
                seg_list = list(filter(lambda x: x not in stop_words, seg_list))
            for wd in seg_list:
            ################# addition begin ####################
                if is_match:
                    break
            ################# addition end ####################
                w_bucket = p_bucket[wd]
                for bucket in w_bucket:
                    bucket_path = os.path.join(args.output, bucket)
                    check_file(bucket_path)
                    selected = sample_file(bucket_path, args.sample_number)
                    selected = list(map(lambda x: list(seg.cut(x)), selected))
                    # remove stop words
                    if stop_words:
                        filt_selected = list()
                        for sen in selected:
                            sen = list(filter(lambda x: x not in stop_words, sen))
                            filt_selected.append(sen)
                        selected = filt_selected
                    # calculate similarity with each bucket
                    if all(jaccard(seg_list, cmp_list) > args.threshold for cmp_list in selected):
                        is_match = True
                        with open(bucket_path, 'a', encoding='utf-8') as outfile:
                            outfile.write(line+'\n')
    ################# addition begin ####################
                        for w in seg_list:
                            if bucket not in p_bucket[w]:
                                p_bucket[w].append(bucket)
    ################# addition end ####################
                        break
    ################ change: this block's indentation changed (moved out of the token loop) ##################
            if not is_match:
                bucket_name = ('tmp' + id_name).format(save_idx)
                bucket_path = os.path.join(args.output, bucket_name)
                with open(bucket_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(line+'\n')
    ################# addition begin ####################
                for w in seg_list:
                    p_bucket[w].append(bucket_name)
    ################# addition end ####################
                save_idx += 1
    
        infile.close()
    
        # sort and rename file
        file_list = os.listdir(args.output)
        file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
        cnt = dict()
        for file in file_list:
            file_path = os.path.join(args.output, file)
            cnt[file] = line_counter(file_path)
    
        sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
        for idx, (file_name, times) in enumerate(sorted_cnt):
            origin_path = os.path.join(args.output, file_name)
            new_path = os.path.join(args.output, id_name.format(idx))
            os.rename(origin_path, new_path)
    
        print('All is well')
    

    The author's follow-up clearly fixes the problem: the logic for creating a new cluster is moved outside the token-level for loop, and when a sentence matches an existing cluster, that cluster is also added to the bucket list of every one of its tokens.
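
    Applying those two changes to the toy re-enactment from the evaluation section confirms the fix: the two permuted sentences now land in a single cluster.

    from collections import defaultdict

    # The same toy re-enactment with the two changes applied: a new bucket is
    # created only after every token has failed to match, and a bucket (new or
    # matched) is registered under every token of the sentence.
    def cluster_fixed(sentences):
        p_bucket = defaultdict(list)
        buckets = []
        for tokens in sentences:
            is_match = False
            for wd in tokens:
                if is_match:
                    break
                for b in p_bucket[wd]:
                    buckets[b].append(tokens)   # similarity assumed to pass
                    is_match = True
                    for w in tokens:            # register under every token
                        if b not in p_bucket[w]:
                            p_bucket[w].append(b)
                    break
            if not is_match:                    # moved outside the token loop
                idx = len(buckets)
                buckets.append([tokens])
                for w in tokens:
                    p_bucket[w].append(idx)
        return buckets

    print(len(cluster_fixed([["聚类", "工具"], ["工具", "聚类"]])))   # prints 1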
