VSM模型原理
- 基本思路:两句话的用词越相似,它们的内容也就越相似。[思路详解]
- 实现步骤:文章分词 => TF-IDF关键词 => 合并关键词集 => 统计词频 => 词频向量 => 向量相似度 => 文章相似度。
VSM模型实现
from math import sqrt
# Merge the tag sets
def create_vocabulary(tag_list1, tag_list2):
    """Return the distinct tags of both inputs as a single list.

    Args:
        tag_list1: Iterable of hashable tags from the first document.
        tag_list2: Iterable of hashable tags from the second document.

    Returns:
        A list with every distinct tag exactly once. On Python 3.7+ the
        tags appear in first-seen order (``tag_list1`` first, then
        ``tag_list2``), so the result is reproducible between runs instead
        of depending on set iteration order.
    """
    # A dict keeps one entry per tag while remembering insertion order.
    vocabulary = dict.fromkeys(tag_list1)
    vocabulary.update(dict.fromkeys(tag_list2))
    return list(vocabulary)
# Count tag frequencies
def calc_tag_frequency(tag_list):
    """Count how many times each tag occurs in ``tag_list``.

    Args:
        tag_list: Iterable of hashable tags.

    Returns:
        A dict mapping each distinct tag to its number of occurrences.
        An empty input yields an empty dict.
    """
    tag_frequency = {}
    # Single pass over the input instead of one ``list.count`` scan per
    # distinct tag, which was quadratic in the worst case.
    for tag in tag_list:
        tag_frequency[tag] = tag_frequency.get(tag, 0) + 1
    return tag_frequency
# Build the term-frequency vector
def create_vector(tag_frequency, vocabulary):
    """Project a tag-frequency mapping onto the vocabulary.

    Args:
        tag_frequency: Dict mapping a tag to its occurrence count.
        vocabulary: Sequence of tags that fixes the vector's component order.

    Returns:
        A list with one entry per vocabulary tag, in vocabulary order: the
        tag's count from ``tag_frequency``, or 0 when the tag is absent.
    """
    return [tag_frequency.get(tag, 0) for tag in vocabulary]
# Compute the similarity of two term-frequency vectors
def calc_similar(vector1, vector2, tag_count):
    """Return the cosine similarity of two term-frequency vectors.

    Args:
        vector1: Sequence of numeric term frequencies.
        vector2: Sequence of numeric term frequencies; expected to have the
            same length as ``vector1`` (components are paired positionally,
            and any unpaired trailing components are ignored).
        tag_count: Kept for backward compatibility. It was previously used
            to scale every component, but a common scale factor cancels
            between numerator and denominator, so it does not affect the
            result.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a float, or 0.0 when either
        vector has zero magnitude (including empty vectors), where the
        ratio is undefined and the old code raised ``ZeroDivisionError``.
    """
    dot_product = 0.0   # numerator
    norm_sq1 = 0.0      # squared magnitude of vector1
    norm_sq2 = 0.0      # squared magnitude of vector2
    for count1, count2 in zip(vector1, vector2):
        dot_product += count1 * count2
        norm_sq1 += count1 * count1
        norm_sq2 += count2 * count2
    denominator = sqrt(norm_sq1 * norm_sq2)
    if denominator == 0:
        # No shared direction can be defined for a zero vector.
        return 0.0
    return dot_product / denominator
# VSM model implementation
def vsm(tag_list1, tag_list2):
    """Return the vector-space-model similarity of two tag lists.

    Both lists are mapped onto their merged vocabulary as term-frequency
    vectors, and the similarity of those vectors is returned.

    Args:
        tag_list1: List of tags (keywords) of the first document.
        tag_list2: List of tags (keywords) of the second document.

    Returns:
        The value computed by ``calc_similar`` for the two frequency
        vectors, or 0.0 when either list is empty (an empty document has
        no terms to compare, and the computation would otherwise divide
        by zero).
    """
    if not tag_list1 or not tag_list2:
        return 0.0
    count = len(tag_list1) + len(tag_list2)
    vocabulary = create_vocabulary(tag_list1, tag_list2)
    vector1 = create_vector(calc_tag_frequency(tag_list1), vocabulary)
    vector2 = create_vector(calc_tag_frequency(tag_list2), vocabulary)
    return calc_similar(vector1, vector2, count)
if __name__ == '__main__':
    # Placeholder outline of the intended pipeline; nothing is executed yet.
    # 1. Tokenize the articles
    # 2. Extract keywords with TF-IDF to use as article tags
    # 3. Compute the VSM similarity
    pass
网友评论