#-*-coding:utf8-*-
import sys
import math
import operator
import os
def get_user_click(rating_file):
    """Collect, per user, the list of items the user rated favourably.

    The file is read as UTF-8 comma-separated text. The first line is
    treated as a header and skipped. Each remaining row is expected to start
    with ``userid,itemid,rating`` followed by at least one more field; rows
    with fewer than four fields are ignored, and any fields beyond the first
    three are not used. A rating of 3.0 or higher counts as a "like"; lower
    ratings are dropped.

    Args:
        rating_file: Path to the ratings file.

    Returns:
        A dict mapping userid (str) to the list of liked itemids (str), in
        the order the rows appear in the file. An empty dict is returned
        when ``rating_file`` does not exist.

    Raises:
        ValueError: If a rating field cannot be parsed as a float.
    """
    if not os.path.exists(rating_file):
        return {}
    user_click = {}
    # The context manager closes the file even if a row fails to parse.
    with open(rating_file, encoding='utf-8') as fp:
        for line_no, line in enumerate(fp):
            if line_no == 0:
                # Skip the header line.
                continue
            fields = line.strip().split(',')
            if len(fields) < 4:
                # Skip rows with missing information.
                continue
            # Only the first three columns are needed; slicing also keeps
            # rows with extra trailing columns from breaking the unpack.
            userid, itemid, rating = fields[:3]
            if float(rating) < 3.0:
                # Ratings below 3 are not treated as a like.
                continue
            user_click.setdefault(userid, []).append(itemid)
    return user_click
def get_item_info(item_file):
    """Load item titles and genres from a comma-separated item file.

    The file is read as UTF-8 text. The first line is treated as a header
    and skipped. For every other row the first field is the itemid, the last
    field is the genres string, and everything in between is re-joined with
    commas to form the title (so a title that itself contains commas is kept
    intact, including any surrounding quote characters). Rows with fewer
    than three fields are ignored. When an itemid appears more than once,
    the first occurrence wins.

    Args:
        item_file: Path to the item file.

    Returns:
        A dict mapping itemid (str) to ``[title, genres]``. An empty dict is
        returned when ``item_file`` does not exist.
    """
    if not os.path.exists(item_file):
        return {}
    item_info = {}
    # The context manager closes the file even if processing fails midway.
    with open(item_file, encoding='utf-8') as fp:
        for line_no, line in enumerate(fp):
            if line_no == 0:
                # Skip the header line.
                continue
            fields = line.strip().split(',')
            if len(fields) < 3:
                continue
            itemid = fields[0]
            genres = fields[-1]
            # For exactly three fields this is just fields[1]; for more it
            # restores the commas that were part of the title.
            title = ",".join(fields[1:-1])
            if itemid not in item_info:
                item_info[itemid] = [title, genres]
    return item_info
def base_contribute_score():
    """Return the contribution of one co-click to an item pair's co-occurrence.

    The baseline weighting is a constant 1 per co-click.
    """
    return 1
def cal_item_sim(user_click):
    """Compute item-to-item similarity from users' click lists.

    For every pair of items that appear together in one user's list, a
    co-occurrence weight (from ``base_contribute_score()``) is accumulated
    in both directions. The similarity of items i and j is then

        co_occurrence(i, j) / sqrt(clicks(i) * clicks(j))

    where ``clicks(x)`` is the number of list entries equal to x across all
    users.

    Args:
        user_click: dict mapping userid to a list of itemids.

    Returns:
        A dict mapping each itemid that co-occurred with at least one other
        entry to a list of ``(other_itemid, sim_score)`` tuples, sorted by
        score in descending order.
    """
    co_appear = {}
    item_user_click_num = {}
    for itemlist in user_click.values():
        for index_i, itemid_i in enumerate(itemlist):
            item_user_click_num[itemid_i] = item_user_click_num.get(itemid_i, 0) + 1
            for index_j in range(index_i + 1, len(itemlist)):
                itemid_j = itemlist[index_j]
                # The user clicked both items, so each one contributes to
                # the other's co-occurrence total.
                # Contribution of itemid_i towards itemid_j.
                row_i = co_appear.setdefault(itemid_i, {})
                row_i[itemid_j] = row_i.get(itemid_j, 0) + base_contribute_score()
                # Contribution of itemid_j towards itemid_i.
                row_j = co_appear.setdefault(itemid_j, {})
                row_j[itemid_i] = row_j.get(itemid_i, 0) + base_contribute_score()

    # Turn co-occurrence weights into similarity scores. Every item present
    # in co_appear came from some click list, so it also has an entry (>= 1)
    # in item_user_click_num and the denominator is never zero.
    item_sim_score = {}
    for itemid_i, relate_item in co_appear.items():
        for itemid_j, co_time in relate_item.items():
            denominator = math.sqrt(
                item_user_click_num[itemid_i] * item_user_click_num[itemid_j]
            )
            item_sim_score.setdefault(itemid_i, {})[itemid_j] = co_time / denominator

    # Rank each item's neighbours from most to least similar.
    item_sim_score_sorted = {}
    for itemid, scores in item_sim_score.items():
        item_sim_score_sorted[itemid] = sorted(
            scores.items(), key=operator.itemgetter(1), reverse=True
        )
    return item_sim_score_sorted
def cal_recom_result(sim_info, user_click, recent_click_num=3, topk=5):
    """Build per-user recommendations from item similarity lists.

    For each user, the first ``recent_click_num`` entries of the user's
    click list are taken as seeds. For every seed present in ``sim_info``,
    its first ``topk`` similar items are added to that user's result with
    their similarity score. If the same item is reached from several seeds,
    the score from the last seed processed is kept.

    Each user gets an independent result dict; results are not shared or
    accumulated across users.

    Args:
        sim_info: dict mapping itemid to a list of ``(itemid, score)``
            pairs, as produced by ``cal_item_sim``.
        user_click: dict mapping userid to a list of itemids.
        recent_click_num: How many leading entries of each click list to
            use as seeds.
        topk: How many similar items to take per seed.

    Returns:
        A dict mapping every userid in ``user_click`` to a dict of
        ``{recommended_itemid: score}`` (empty when no seed has similarity
        data).
    """
    recom_result = {}
    for user, click_list in user_click.items():
        # A fresh dict per user, so one user's recommendations never leak
        # into another's.
        user_recom = {}
        for itemid in click_list[:recent_click_num]:
            if itemid not in sim_info:
                # No similarity data for this seed item; skip it.
                continue
            for itemsimid, itemsimscore in sim_info[itemid][:topk]:
                user_recom[itemsimid] = itemsimscore
        recom_result[user] = user_recom
    return recom_result
def main_flow():
    """Run the item-CF pipeline and print the recommendations for user '1'.

    Reads ``./data/ratings.csv``, computes item similarities, derives the
    per-user recommendations, and prints the result dict for user ``'1'``.
    An empty dict is printed when that user has no entry (for example when
    the ratings file is missing).
    """
    # Step 1: collect each user's click sequence.
    user_click = get_user_click('./data/ratings.csv')
    # Step 2: compute item-to-item similarity.
    sim_info = cal_item_sim(user_click)
    # Step 3: recommend based on similarity.
    recom_result = cal_recom_result(sim_info, user_click)
    # get_user_click returns {} for a missing file, so user '1' may be
    # absent; fall back to an empty result instead of raising KeyError.
    print(recom_result.get('1', {}))
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main_flow()
# Reference: https://www.imooc.com/learn/1029
# (Reader comments section of the source web page followed here.)