数据介绍
相似度系数使用场景比较
- 如果数据存在“分数膨胀”问题,就使用皮尔逊相关系数。
- 如果数据比较“密集”(几乎所有属性都有非零值),且属性值的大小本身很重要,那就使用欧几里得或曼哈顿距离。
- 如果数据是稀疏的,则使用余弦相似度。
本次实验使用皮尔逊相关系数。
皮尔逊相关系数
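皮尔逊相关系数的计算公式是:
$$r = \frac{\sum_{i=1}^{n}(x_i-\bar{x})(y_i-\bar{y})}{\sqrt{\sum_{i=1}^{n}(x_i-\bar{x})^2}\,\sqrt{\sum_{i=1}^{n}(y_i-\bar{y})^2}}$$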
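上面的公式除了看起来比较复杂,另一个问题是要获得计算结果必须对数据做多次遍历。好在我们有另外一个公式,能够计算皮尔逊相关系数的近似值:
$$r = \frac{\sum_{i=1}^{n} x_i y_i - \frac{\sum_{i=1}^{n} x_i \sum_{i=1}^{n} y_i}{n}}{\sqrt{\sum_{i=1}^{n} x_i^2 - \frac{\left(\sum_{i=1}^{n} x_i\right)^2}{n}}\,\sqrt{\sum_{i=1}^{n} y_i^2 - \frac{\left(\sum_{i=1}^{n} y_i\right)^2}{n}}}$$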
这个公式虽然看起来更加复杂,而且其计算结果会不太稳定,有一定误差存在,但它最大的优点是,用代码实现的时候可以只需遍历一次数据。代码如下:
def pearson_distance(self, usr1, usr2):
    """Return the Pearson correlation of two users over the movies both rated.

    Uses the single-pass formula, so the ratings are traversed only once.

    Args:
        usr1: Mapping of movie -> numeric rating for the first user.
        usr2: Mapping of movie -> numeric rating for the second user.

    Returns:
        A value in [-1, 1]. Returns 0 when the users share no movie or when
        either user's shared ratings have zero variance (the correlation is
        undefined in that case).
    """
    n = 0
    sum_x = sum_y = 0
    sum_x_y = 0
    sum_x_2 = sum_y_2 = 0
    for movie, x in usr1.items():
        if movie not in usr2:
            continue
        y = usr2[movie]
        n += 1
        sum_x_y += x * y
        sum_x += x
        sum_y += y
        sum_x_2 += x ** 2
        sum_y_2 += y ** 2
    if n == 0:
        return 0
    # The single-pass formula subtracts two nearly equal numbers; with float
    # ratings the difference can round to a tiny negative value, which would
    # make math.sqrt raise ValueError. Clamp it to zero.
    var_x = max(0.0, sum_x_2 - sum_x ** 2 / n)
    var_y = max(0.0, sum_y_2 - sum_y ** 2 / n)
    denominator = math.sqrt(var_x) * math.sqrt(var_y)
    if denominator == 0:
        return 0
    r = (sum_x_y - sum_x * sum_y / n) / denominator
    # Rounding can push the quotient marginally outside the valid range.
    return max(-1.0, min(1.0, r))
完整代码
import csv
import math
class recommender(object):
    """User-based movie recommender driven by Pearson correlation.

    Ratings are read from a CSV file whose header row lists user names
    (first column ignored) and whose following rows hold a movie title
    followed by one integer rating per user (empty cell = not rated).

    Attributes are plain instance fields:
      path        -- the CSV path passed to the constructor
      usr         -- name of the user to recommend for
      usr_rating  -- dict: user name -> {movie title: int rating}
    """

    def __init__(self, path, usr):
        """Store the target user and load the ratings found at *path*."""
        self.path = path
        self.usr = usr
        self.usr_rating = {}
        self.load_data(path)

    def o_distance(self, usr1, usr2):
        """Return the Euclidean distance over the movies both users rated.

        Returns 0.0 when the two rating dicts share no movie.
        """
        distance = 0
        for movie, rating in usr1.items():
            if movie in usr2:
                distance += (rating - usr2[movie]) ** 2
        return math.sqrt(distance)

    def pearson_distance(self, usr1, usr2):
        """Return the Pearson correlation of two users over shared movies.

        Uses the single-pass formula, so the ratings are traversed only once.

        Args:
            usr1: Mapping of movie -> numeric rating for the first user.
            usr2: Mapping of movie -> numeric rating for the second user.

        Returns:
            A value in [-1, 1]. Returns 0 when the users share no movie or
            when either user's shared ratings have zero variance.
        """
        n = 0
        sum_x = sum_y = 0
        sum_x_y = 0
        sum_x_2 = sum_y_2 = 0
        for movie, x in usr1.items():
            if movie not in usr2:
                continue
            y = usr2[movie]
            n += 1
            sum_x_y += x * y
            sum_x += x
            sum_y += y
            sum_x_2 += x ** 2
            sum_y_2 += y ** 2
        if n == 0:
            return 0
        # The single-pass formula subtracts two nearly equal numbers; with
        # float ratings the difference can round to a tiny negative value,
        # which would make math.sqrt raise ValueError. Clamp it to zero.
        var_x = max(0.0, sum_x_2 - sum_x ** 2 / n)
        var_y = max(0.0, sum_y_2 - sum_y ** 2 / n)
        denominator = math.sqrt(var_x) * math.sqrt(var_y)
        if denominator == 0:
            return 0
        r = (sum_x_y - sum_x * sum_y / n) / denominator
        # Rounding can push the quotient marginally outside the valid range.
        return max(-1.0, min(1.0, r))

    def k_nearst(self, k):
        """Return up to *k* (user, correlation) pairs, best correlation first.

        The target user is excluded, as is every user whose correlation with
        the target is exactly 0. Negatively correlated users are kept and
        sort after the positively correlated ones.

        Raises:
            KeyError: if ``self.usr`` has no ratings loaded.
        """
        own_ratings = self.usr_rating[self.usr]
        distances = []
        for usr, ratings in self.usr_rating.items():
            if usr == self.usr:
                continue
            distance = self.pearson_distance(own_ratings, ratings)
            if distance != 0:
                distances.append((usr, distance))
        distances.sort(key=lambda item: item[1], reverse=True)
        # Slicing already returns the whole list when k exceeds its length.
        return distances[:k]

    def load_data(self, path):
        """Merge the ratings stored in the CSV file at *path* into usr_rating.

        The csv module is used so that quoted titles containing commas are
        parsed correctly. An empty file loads nothing; blank rows, empty or
        whitespace-only cells, and cells beyond the header width are skipped.

        Raises:
            ValueError: if a non-empty rating cell is not an integer.
        """
        with open(path, newline='') as f:
            reader = csv.reader(f)
            header = next(reader, None)
            if header is None:
                return
            usr_names = [name.strip('"') for name in header[1:]]
            for row in reader:
                if not row:
                    continue
                movie = row[0].strip('"')
                for name, cell in zip(usr_names, row[1:]):
                    cell = cell.strip()
                    if not cell:
                        continue
                    self.usr_rating.setdefault(name, {})[movie] = int(cell)

    def recomend_k(self, nearst, k):
        """Return up to *k* (movie, score) pairs the target user has not rated.

        Each neighbour's rating is weighted by that neighbour's share of the
        summed correlations, and the weighted ratings are added per movie.

        Args:
            nearst: Sequence of (user, correlation) pairs, e.g. from k_nearst.
            k: Maximum number of recommendations to return.

        Returns:
            A list of (movie, score) tuples sorted by score, highest first.
            Empty when *nearst* is empty or its correlations sum to zero
            (the weights cannot be normalised in that case).
        """
        recommend = {}
        total_distance = sum(item[1] for item in nearst)
        # A zero total (e.g. +0.5 and -0.5 cancelling) used to raise
        # ZeroDivisionError; without a usable normaliser nothing is scored.
        if total_distance != 0:
            for item in nearst:
                u_name = item[0]
                weight = item[1] / total_distance
                for movie, rate in self.usr_rating[u_name].items():
                    if movie in self.usr_rating[self.usr]:
                        continue
                    recommend[movie] = recommend.get(movie, 0.0) + rate * weight
        print(recommend)
        top_k = sorted(recommend.items(), key=lambda x: x[1], reverse=True)
        return top_k[:k]

    def run(self, k=5):
        """Print the *k* nearest users and the top *k* recommended movies."""
        nearst = self.k_nearst(k)
        print(nearst)
        top_k = self.recomend_k(nearst, k)
        for item in top_k:
            print(f"为您推荐影片:{item[0]}\t推荐指数:{item[1]}")
# Example driver: runs at module level, so it also executes on import.
# The ratings file location is hard-coded to the author's machine.
path = '/home/ant2017/Downloads/Movie_Ratings.csv'
# Build a recommender for the user named 'vanessa'; the constructor loads the CSV.
r = recommender(path,'vanessa')
# Print the nearest users and the recommended movies.
r.run()
网友评论