基于同义词表,进行同义词替换
同义词词表如下,每行一组同义词,词与词之间用制表符(\t)隔开,每行第一个词为替换后的标准词:
开心 快乐 happy
生气 angrey 愤怒
什么玩意儿 孺子不可教也 气死了
代码如下:
import jieba
import multiprocessing
# Build the synonym lookup dictionary and the flat synonym word list.
def get_synonym_dict(synonym_file):
    """Parse a tab-separated synonym file.

    Each non-blank line holds one synonym group. The first word of the
    line is the canonical form; every following word on that line is
    mapped to it.

    Args:
        synonym_file: Path of a UTF-8 text file with one synonym group
            per line, words separated by tab characters.

    Returns:
        A ``(synonym_dict, synonym_words)`` tuple. ``synonym_dict`` maps
        each non-canonical word to the canonical word of its line.
        ``synonym_words`` lists every word (canonical ones included) in
        file order.
    """
    synonym_dict = {}   # non-canonical word -> canonical word
    synonym_words = []  # every word seen, in file order
    with open(synonym_file, 'r', encoding='utf-8') as f:
        for line in f:
            # ''.split('\t') yields [''], so a bare length check lets blank
            # lines through; drop empty / whitespace-only fields instead.
            fields = (field.strip() for field in line.split('\t'))
            words = [word for word in fields if word]
            if not words:
                continue
            canonical = words[0]
            synonym_words.extend(words)
            for alias in words[1:]:
                synonym_dict[alias] = canonical
    print('synonym_dict: ', synonym_dict)
    return synonym_dict, synonym_words
# Write the synonym word list to a file that is later loaded as a dictionary.
def generate_synonym_words_file(synonym_words, synonym_words_file):
    """Write each entry of ``synonym_words`` to ``synonym_words_file``, one per line.

    The target is opened in write mode with UTF-8 encoding, so any
    existing content is replaced.

    Args:
        synonym_words: Iterable of words to write.
        synonym_words_file: Path of the output file.
    """
    with open(synonym_words_file, 'w', encoding='utf-8') as out:
        out.writelines(word + '\n' for word in synonym_words)
# Load the synonym word file into the tokenizer.
def load_synonym_words(synonym_words_file, words_cutter=jieba):
    """Register the words in ``synonym_words_file`` with ``words_cutter``.

    Delegates to ``words_cutter.load_userdict``.

    Args:
        synonym_words_file: Path of the word-list file to load.
        words_cutter: Object exposing ``load_userdict``; defaults to the
            ``jieba`` module.
    """
    load_userdict = words_cutter.load_userdict
    load_userdict(synonym_words_file)
# Tokenize a sentence and replace each token that has a synonym mapping.
def replace_synonym(sentence, synonym_dict, words_cutter=jieba):
    """Segment ``sentence`` and substitute tokens found in ``synonym_dict``.

    Prints the sentence and the token list before and after replacement.

    Args:
        sentence: Text to segment.
        synonym_dict: Mapping from a word to its replacement.
        words_cutter: Object exposing ``lcut``; defaults to the ``jieba``
            module.

    Returns:
        The list of tokens after replacement.
    """
    print('sentence: ', sentence)
    # lcut is required here: cut returns a generator, lcut returns a list.
    segmented = words_cutter.lcut(sentence)
    print('before replace: ', segmented)
    # A missing key or a falsy mapped value leaves the token unchanged.
    replaced = [synonym_dict.get(token) or token for token in segmented]
    print('after repalce: ', replaced)
    return replaced
def replace_synonym_helper(param, words_cutter=jieba):
    """Adapter that lets ``replace_synonym`` be called with one packed argument.

    Args:
        param: Sequence whose first item is the sentence and whose second
            item is the synonym dictionary.
        words_cutter: Tokenizer forwarded to ``replace_synonym``; defaults
            to the ``jieba`` module.

    Returns:
        Whatever ``replace_synonym`` returns for the unpacked arguments.
    """
    # Forward the caller's tokenizer; the previous code always passed the
    # jieba module here, silently ignoring the ``words_cutter`` argument.
    return replace_synonym(param[0], param[1], words_cutter=words_cutter)
# Run synonym replacement over many sentences in parallel.
def multiprocess_replace_synonym(sentence_list, synonym_dict, words_cutter=jieba, occupation_rate=0.5):
    """Apply ``replace_synonym_helper`` to every sentence using a process pool.

    Args:
        sentence_list: Iterable of sentences to process.
        synonym_dict: Mapping from a word to its replacement; sent to the
            workers together with each sentence.
        words_cutter: Accepted for interface compatibility. It is not
            forwarded: ``pool.map`` hands only the ``(sentence,
            synonym_dict)`` tuple to ``replace_synonym_helper``, which
            then uses its own default tokenizer.
        occupation_rate: Fraction of ``multiprocessing.cpu_count()`` to use
            as the pool size.

    Returns:
        A list with one result per input sentence, in input order (empty
        when ``sentence_list`` is empty).

    NOTE(review): a user dictionary loaded in the parent process is
    presumably not visible to workers under the "spawn" start method --
    confirm on the target platform.
    """
    max_pool_num = multiprocessing.cpu_count()
    # int() truncates, so e.g. 1 core * 0.5 gives 0, and Pool raises
    # ValueError for processes < 1; always keep at least one worker.
    pool_num = max(1, int(max_pool_num * occupation_rate))
    print('occupation CPU core number: ', pool_num)
    param = [(sentence, synonym_dict) for sentence in sentence_list]
    if not param:
        # Nothing to do; avoid starting worker processes.
        return []
    # The context manager releases the worker processes on exit; map()
    # blocks until every result is available, so nothing is lost.
    with multiprocessing.Pool(processes=pool_num) as pool:
        return pool.map(replace_synonym_helper, param)
if __name__ == '__main__':
    # Build the synonym tables from the source file.
    synonym_dict, synonym_words = get_synonym_dict('tongyici.txt')

    # Write the flat word list out and register it with the tokenizer.
    synonym_words_file = 'tongyicibiao.txt'
    generate_synonym_words_file(synonym_words, synonym_words_file)
    load_synonym_words(synonym_words_file)

    # Sample sentences to run through the parallel replacement.
    sentence_list = ['你说你怎么这么愤怒', '今天真的是气死了的一天', '莫名地happy']
    multiprocess_replace_synonym(sentence_list, synonym_dict=synonym_dict)
输出:
synonym_dict: {'快乐': '开心', 'happy': '开心', 'angrey': '生气', '愤怒': '生气', '孺子不可教也': '什么玩意儿', '气死了': '什么玩意儿'}
occupation CPU core number: 2
sentence: 你说你怎么这么愤怒
before replace: ['你', '说', '你', '怎么', '这么', '愤怒']
after repalce: ['你', '说', '你', '怎么', '这么', '生气']
sentence: 莫名地happy
sentence: 今天真的是气死了的一天
before replace: ['莫名', '地', 'happy']
after repalce: ['莫名', '地', '开心']
before replace: ['今天', '真的', '是', '气死了', '的', '一天']
after repalce: ['今天', '真的', '是', '什么玩意儿', '的', '一天']
并行处理还有点小问题,后期再优化
网友评论