美文网首页
使用TF-IDF和BM25提取文章关键词

使用TF-IDF和BM25提取文章关键词

作者: 予早 | 来源:发表于2019-03-19 18:00 被阅读0次

    评估方法:
    人工从文章中提取1-5个关键词,和机器提取的关键词做比较
    召回率 = |机器提词 ∩ 人工提词| / |人工提词|
    准确率 = |机器提词 ∩ 人工提词| / |机器提词|

    TF-IDF

    原理参考:http://www.ruanyifeng.com/blog/2013/03/tf-idf.html
    实现参考:tf-idf-keyword
    其他参考: 使用不同的方法计算TF-IDF值

    第一版 标题和正文加权计算tf-idf

    主要策略

    • 使用nlpc切词服务(可用jieba切词代替)+TF-IDF提取关键词。
    • 去除停用词
    • 按照体裁+年级分成若干类型,来训练模型,示例用高中+叙事类,取了20000条数据训练
    • 对标题进行加权,标题的每个词汇频率+6,再合一起计算tf-idf
    • 按照权重取前4个关键词,并过滤掉其中权重小于 最小出现次数(5) × 平均IDF / 总词数 的词
      注:以上数据均为调节后最优解

    代码实现

    config.py

    # Name of the logger used by this program.
    program = 'composition_term_weight'

    # Emit timestamped records on stderr, then force the root level to INFO.
    logging.basicConfig(
        stream=sys.stderr,
        datefmt='%a, %d %b %Y %H:%M:%S',
        format='%(asctime)s: %(levelname)s: %(message)s',
    )
    logging.root.setLevel(level=logging.INFO)

    logger = logging.getLogger(program)
    

    IDFLoader.py

    class IDFLoader(object):
        """Load a word -> IDF table from a text file.

        Every line is expected to hold ``<word> <value>`` separated by a single
        space.  A line whose word is ``LEN_AVG`` carries the average document
        length and terminates the table: nothing after it is read.

        Attributes:
            idf_path: Path of the IDF file given to the constructor.
            idf_freq: Mapping of word (decoded from UTF-8) to its IDF value.
            mean_len: Value of the ``LEN_AVG`` line, or 0 if none was read.
            mean_idf: Arithmetic mean of the loaded IDF values; 0.0 when the
                table is empty.
        """

        def __init__(self, idf_path):
            """Remember *idf_path* and load the table immediately."""
            self.idf_path = idf_path
            self.idf_freq = {}    # word -> idf
            self.mean_len = 0     # average document length
            self.mean_idf = 0.0   # mean of all idf values
            self.load_idf()

        @staticmethod
        def _parse_mean_len(text):
            """Parse the LEN_AVG value, keeping an int when the text is integral.

            Raises:
                ValueError: if *text* is neither an integer nor a float.
            """
            try:
                return int(text)
            except ValueError:
                # A fractional average used to be rejected and silently left
                # mean_len at 0; accept it as a float instead.
                return float(text)

        def load_idf(self):
            """Read ``self.idf_path`` and fill ``idf_freq``, ``mean_len``, ``mean_idf``.

            Lines that do not split into exactly two fields, or whose value is
            not numeric, are skipped (best-effort loading).
            """
            with open(self.idf_path, 'rb') as f:
                for line in f:
                    try:
                        word, value = line.strip().decode('utf-8', errors='ignore').split(' ')
                        if word == 'LEN_AVG':
                            self.mean_len = self._parse_mean_len(value)
                            break
                        self.idf_freq[word] = float(value)
                    except ValueError:
                        # Wrong field count or non-numeric value: ignore the line.
                        continue

            # Average over the distinct words actually stored, and avoid dividing
            # by zero when the file contained no usable entry.
            count = len(self.idf_freq)
            self.mean_idf = sum(self.idf_freq.values()) / count if count else 0.0
            # mean_idf is a float, so report it with %f rather than truncating via %d.
            logger.info('Vocabularies %s loaded: %d mean_idf: %f', self.idf_path, count, self.mean_idf)
    
    class TfIdf(object):
        """Extract keywords from an article by TF-IDF weight.

        The title is segmented together with the body, and every title word is
        credited ``title_add_times`` extra occurrences before weights are
        computed.
        """

        # Removes ASCII letters and digits from text before segmentation.
        p_cut = re.compile(r'[a-zA-Z0-9]', re.VERBOSE)
        # Removes title boilerplate: "作文", "<digits>字", "<any one char>年级", "_".
        # NOTE(review): the methods below apply this pattern to UTF-8 *encoded*
        # titles; if the pattern is a byte string at runtime, '.' consumes a
        # single byte rather than a whole Chinese character -- confirm.
        p_title = re.compile(r'作文|\d+字|.年级|_', re.VERBOSE)
        # Tokens that are always dropped (punctuation and a few very common
        # words); they could equally be placed in the stop-word file.
        ignored = ['', ' ', '', '。', ':', ',', ')', '(', '!', '?', '”', '“', '"', '―', '.', '说', '好', '时']
        # Minimum number of occurrences expected of a keyword; used to derive
        # the weight threshold that filters weak keywords.
        min_times = 5.0
        # Extra occurrences credited to each title word.
        title_add_times = 6.0
        # Number of top-ranked keywords to keep.
        words_num = 4

        def __init__(self, stop_words_path='stop_words.utf8.txt'):
            """Load the stop-word list.

            Args:
                stop_words_path: File holding one stop word per line.  The
                    default is the path that used to be hard-coded.
            """
            # Entries are the stripped raw lines, exactly as read in binary mode.
            # NOTE(review): deliberately kept as a list -- membership tests in
            # my_cut then rely on == rather than hashing; confirm the token
            # string type before switching to a set.
            with open(stop_words_path, 'rb') as fr:
                self.stop_words_dict = [line.strip() for line in fr]

        def my_cut(self, inTxt):
            """Segment *inTxt* into words and drop unwanted tokens.

            ASCII letters and digits are removed first; the text is then split
            on the Chinese full stop and each piece is passed to ``cut``.

            Returns:
                A list of the tokens that are not stop words, not in
                ``ignored`` and not blank.
            """
            text = self.p_cut.sub('', str(inTxt))
            words_list = []
            # Segment one sentence at a time for performance reasons.
            for sentence in text.split('。'):
                # ``cut`` is the NLPC segmentation service; jieba could replace it.
                segmented = cut(sentence)
                if segmented is not None:
                    words_list += segmented
            return [w for w in words_list
                    if w not in self.stop_words_dict and w not in self.ignored and len(w.strip()) > 0]

        def get_tfidf(self, idf_loader, title, content):
            """Compute the TF-IDF weight of every word of an article.

            Args:
                idf_loader: Object exposing ``idf_freq`` (word -> idf) and
                    ``mean_idf``, the fallback for words missing from the table.
                title: Article title; it is UTF-8 encoded before filtering.
                content: Article body.

            Returns:
                A tuple ``(ranked, words_number, title_words)``: the
                ``(word, weight)`` pairs sorted by decreasing weight, the number
                of tokens in title + body, and the title tokens.
            """
            filter_title = self.p_title.sub('', title.encode('utf-8', errors='ignore'))
            title_words = self.my_cut(filter_title)

            corpus0 = title_words + self.my_cut(content)

            freq = {}
            for w in corpus0:
                freq[w] = freq.get(w, 0.0) + 1.0
            # Boost title words.  Every title word is already counted above.
            for w in title_words:
                before = freq[w]
                freq[w] = before + self.title_add_times
                # Was two INFO records per title word; that is debugging detail.
                logger.debug('title word count boosted: %s -> %s', before, freq[w])
            total = sum(freq.values())

            # tf * idf, with tf normalised by the boosted total.  The loop body
            # never runs for an empty document, so total is non-zero here.
            for k in freq:
                freq[k] *= idf_loader.idf_freq.get(k, idf_loader.mean_idf) / total

            return sorted(freq.items(), key=lambda d: d[1], reverse=True), len(corpus0), title_words

        def get_term_weight(self, idf_loader, title, content):
            """Pick the keywords of an article.

            The ``words_num`` best-ranked words are kept if their weight exceeds
            ``min_times * mean_idf / words_number``.  If none qualifies, the
            title words are returned instead.

            Returns:
                A list of ``'word:offset:weight'`` strings, where ``offset`` is
                the summed character length of the keywords emitted before it
                and ``weight`` is rounded to 4 decimals.  Empty when the article
                yields no tokens.
            """
            result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
            if not words_number:
                # No token survived filtering: nothing to rank, and the bound
                # below would divide by zero.
                return []
            bound = self.min_times * idf_loader.mean_idf / words_number
            machine_words = [item for item in result[:self.words_num] if item[1] > bound]
            if not machine_words:
                # Fall back to the title words.  The membership test must use the
                # word (item[0]); it previously tested the weight (item[1]).
                machine_words = [item for item in result if item[0] in title_words]
            data = []
            offset = 0
            for word, weight in machine_words:
                data.append('%s:%d:%s' % (word, offset, str(round(weight, 4))))
                offset += len(word.decode('utf-8', errors='ignore'))
            return data

        def getCorpus(self, data_path):
            """Read and segment the training documents.

            Each line of *data_path* is parsed as a JSON object; its ``title``
            (boilerplate removed) and ``@merge_text`` are joined with a Chinese
            full stop and segmented.  Documents that yield no token are skipped.

            Returns:
                A list with one token list per retained document.
            """
            count = 0
            corpus_list = []
            with open(data_path, 'rb') as f:
                for line in f:
                    info = json.loads(line.decode('utf-8', errors='ignore'))
                    clean_title = self.p_title.sub('', info.get('title').encode('utf-8', errors='ignore'))
                    body = info.get('@merge_text').encode('utf-8', errors='ignore')
                    tokens = self.my_cut(clean_title + '。' + body)
                    if not tokens:
                        continue
                    corpus_list.append(tokens)
                    count += 1
                    if count % 1000 == 0:
                        logger.info("processd " + str(count) + " segment_sentence")
            return corpus_list

        def train(self, dir_name, data_path):
            """Build the IDF table and write it to ``data/<dir_name>/idf.txt``.

            The file gets one ``<word> <idf>`` line per word, with
            ``idf = log2(documents / documents containing the word)``, followed
            by a final ``LEN_AVG <average tokens per document>`` line.  Nothing
            is written when the corpus is empty.
            """
            idf_path = 'data/%s/idf.txt' % dir_name
            documents = self.getCorpus(data_path)
            id_freq = {}   # word -> number of documents containing it
            i = 0          # number of documents
            len_sum = 0    # total number of tokens
            for doc in documents:
                len_sum += len(doc)
                for x in set(doc):
                    id_freq[x] = id_freq.get(x, 0) + 1
                if i % 1000 == 0:
                    logger.info('Documents processed: ' + str(i) + ', time: ' + str(datetime.datetime.now()))
                i += 1

            del documents
            if not i:
                # Avoid truncating an existing table and then dividing by zero.
                logger.warning('No documents found in %s; %s not written', data_path, idf_path)
                return
            with open(idf_path, 'wb') as f:
                for key, value in id_freq.items():
                    # float() forces true division: with two ints, Python 2 would
                    # floor the ratio before the logarithm is taken.
                    f.write(key + ' ' + str(math.log(float(i) / value, 2)) + '\n')
                logger.info(str(i) + ' ' + str(len_sum))
                # Explicit floor division keeps LEN_AVG an integer, which is what
                # IDFLoader parses it as.
                f.write('LEN_AVG ' + str(len_sum // i))

        def test_one(self, dir_name, method='tfidf'):
            """Rank the words of each JSON article read from stdin and print them.

            Args:
                dir_name: Model directory; the table is read from
                    ``data/<dir_name>/idf.txt``.
                method: ``'tfidf'`` to use get_tfidf; any other value uses
                    get_bm25.
            """
            idf_loader = IDFLoader('data/%s/idf.txt' % dir_name)
            for line in sys.stdin:
                info = json.loads(line.decode('utf-8', errors='ignore'))
                title = info['title']
                content = info['@merge_text']
                if method == 'tfidf':
                    result, words_number, title_words = self.get_tfidf(idf_loader, title, content)
                else:
                    result, words_number, title_words = self.get_bm25(idf_loader, title, content)
                # An article with no tokens has no meaningful threshold.
                bound = self.min_times * idf_loader.mean_idf / words_number if words_number else 0.0

                # Single-argument print(...) parses under Python 2 and 3 alike.
                print('_____words_number bound_____')
                print('%s %s' % (words_number, bound))
                print('_____tfidf_result_____')
                for word, weight in result[:20]:
                    print('%s %s' % (word.encode('utf-8', errors='ignore'), weight))
    

    经调优,最优解为:min_times=5 title_add_times=6.0 words_num=4

    结果

    人工抽样评估了100个
    TF-IDF召回率:0.2778
    TF-IDF准确率:0.2778

    BM25

    算法参考: 搜索中的权重度量利器: TF-IDF和BM25

    第一版

    TfIdf.py 增加方法:

        def get_bm25(self, idf_loader, title, content, k=1.2, b=0.75, epsilon=0.25):
            """Rank the words of an article with a BM25-style weight.

            Args:
                idf_loader: Object exposing ``idf_freq`` (word -> idf),
                    ``mean_idf`` and ``mean_len`` (average document length).
                title: Article title; it is UTF-8 encoded before filtering.
                content: Article body.
                k: Caps how fast the score grows with term frequency.
                b: How strongly the document-length ratio affects the score.
                epsilon: A word missing from the IDF table gets
                    ``mean_idf * epsilon`` as its idf.

            Returns:
                A tuple ``(ranked, words_number, title_words)``: the
                ``(word, weight)`` pairs sorted by decreasing weight, the number
                of tokens in title + body, and the title tokens.
            """
            filter_title = self.p_title.sub('', title.encode('utf-8', errors='ignore'))
            title_words = self.my_cut(filter_title)

            corpus0 = title_words + self.my_cut(content)

            freq = {}
            for w in corpus0:
                freq[w] = freq.get(w, 0.0) + 1.0
            # Boost title words.
            for w in title_words:
                freq[w] = freq.get(w, 0.0) + self.title_add_times
            total = sum(freq.values())

            logger.info(str((k, b, total, idf_loader.mean_len)))

            # Ratio of this document's (boosted) length to the average length.
            # mean_len is 0 when the IDF file carried no usable LEN_AVG line; the
            # document is then treated as average-length instead of dividing by
            # zero.
            if idf_loader.mean_len:
                length_ratio = total / idf_loader.mean_len
            else:
                length_ratio = 1.0
            # Both values are the same for every word, so compute them once.
            length_norm = k * (1.0 - b + b * length_ratio)
            unseen_idf = idf_loader.mean_idf * epsilon

            for word in freq:
                tf = freq[word] / total
                idf = idf_loader.idf_freq.get(word, unseen_idf)
                freq[word] = idf * ((k + 1) * tf) / (length_norm + tf)

            return sorted(freq.items(), key=lambda d: d[1], reverse=True), len(corpus0), title_words
    

    经调优,最优解为:min_times=2.5 title_add_times=6.0 words_num=4 k=1.2 b=0.75 EPSILON=0.25

    结果

    人工抽样评估了100个
    BM25召回率:0.2889
    BM25准确率:0.3333

    相关文章

      网友评论

          本文标题:使用TF-IDF和BM25提取文章关键词

          本文链接:https://www.haomeiwen.com/subject/acynmqtx.html