推荐算法: SVD++ 协同过滤 代码实现


原理

论文笔记
GitHub 链接、源码及数据集

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import numpy as np
import random
import os


class SVDPP:
    """SVD++ collaborative-filtering model trained by stochastic gradient descent.

    The predicted rating for user ``u`` and item ``i`` is::

        avg + bi[i] + bu[u] + qi[i]^T (pu[u] + |N(u)|^-1/2 * sum_{j in N(u)} y[j])

    clipped to the range [1, 5], where ``N(u)`` is the list of items the user
    rated in the training data.

    Attributes set by ``__init__``:
        mat: training triples ``(user id, item id, rating)`` as a numpy array.
        K: number of latent factors.
        avg: mean of the training ratings.
        bi, bu: item / user bias, keyed by item id / user id.
        qi, pu: item / user latent vectors of shape ``(K, 1)``.
        y: implicit-feedback item vectors of shape ``(K, 1)``, keyed by item id.
        u_dict: user id -> list of item ids rated by that user in ``mat``.
    """

    def __init__(self, mat, K=20):
        """Store the training triples and initialise every model parameter.

        Args:
            mat: sequence of rows whose first three columns are
                user id, item id and rating.
            K: number of latent factors.
        """
        self.mat = np.array(mat)
        self.K = K
        self.bi = {}
        self.bu = {}
        self.qi = {}
        self.pu = {}
        self.avg = np.mean(self.mat[:, 2])
        self.y = {}
        self.u_dict = {}
        for i in range(self.mat.shape[0]):
            uid = self.mat[i, 0]
            iid = self.mat[i, 1]
            self.u_dict.setdefault(uid, []).append(iid)
            self.bi.setdefault(iid, 0)
            self.bu.setdefault(uid, 0)
            # Only draw random vectors for ids seen for the first time, so no
            # throw-away arrays are allocated for repeated users/items.
            if iid not in self.qi:
                self.qi[iid] = np.random.random((self.K, 1)) / 10 * np.sqrt(self.K)
            if uid not in self.pu:
                self.pu[uid] = np.random.random((self.K, 1)) / 10 * np.sqrt(self.K)
            if iid not in self.y:
                self.y[iid] = np.zeros((self.K, 1)) + .1

    def _estimate(self, uid, iid, u_impl_prf):
        """Return the clipped rating estimate given a precomputed implicit term.

        All parameters for ``uid`` and ``iid`` must already exist.
        """
        rating = (self.avg + self.bi[iid] + self.bu[uid]
                  + np.sum(self.qi[iid] * (self.pu[uid] + u_impl_prf)))
        # Ratings live in [1, 5]; clamp estimates that fall outside that range.
        if rating > 5:
            rating = 5
        if rating < 1:
            rating = 1
        return rating

    def predict(self, uid, iid):
        """Predict the rating of item ``iid`` by user ``uid``.

        Users or items that never appeared before get zero-valued biases and
        factor vectors (and an empty rated-item list) created on the fly, so
        their prediction falls back to the terms that are known.

        Returns:
            The estimated rating, clipped to [1, 5].
        """
        self.bi.setdefault(iid, 0)
        self.bu.setdefault(uid, 0)
        self.qi.setdefault(iid, np.zeros((self.K, 1)))
        self.pu.setdefault(uid, np.zeros((self.K, 1)))
        # y holds per-ITEM implicit factors, so it must be keyed by iid
        # (keying it by uid would insert bogus entries into the item table).
        self.y.setdefault(iid, np.zeros((self.K, 1)))
        self.u_dict.setdefault(uid, [])
        u_impl_prf, _ = self.getY(uid, iid)
        return self._estimate(uid, iid, u_impl_prf)

    def getY(self, uid, iid):
        """Compute the implicit-feedback term for user ``uid``.

        Args:
            uid: user id; must already be a key of ``u_dict``.
            iid: unused, kept for interface compatibility.

        Returns:
            ``(u_impl_prf, sqrt_Nu)`` where ``sqrt_Nu`` is ``sqrt(|N(u)|)`` and
            ``u_impl_prf`` is ``sum(y[j] for j in N(u)) / sqrt_Nu`` (a zero
            vector when the user has no rated items).
        """
        Nu = self.u_dict[uid]
        I_Nu = len(Nu)
        sqrt_Nu = np.sqrt(I_Nu)
        y_u = np.zeros((self.K, 1))
        if I_Nu == 0:
            u_impl_prf = y_u
        else:
            for i in Nu:
                y_u += self.y[i]
            u_impl_prf = y_u / sqrt_Nu

        return u_impl_prf, sqrt_Nu

    def train(self, steps=30, gamma=0.04, Lambda=0.15):
        """Fit the model with stochastic gradient descent.

        Args:
            steps: number of passes over the training data.
            gamma: initial learning rate; multiplied by 0.93 after every pass.
            Lambda: L2 regularisation strength.
        """
        n_samples = self.mat.shape[0]
        print('train data size', self.mat.shape)
        for step in range(steps):
            print('step', step + 1, 'is running')
            # Visit the samples in a fresh random order on every pass.
            order = np.random.permutation(n_samples)
            sq_err = 0.0
            for row in order:
                uid = self.mat[row, 0]
                iid = self.mat[row, 1]
                rating = self.mat[row, 2]
                # Compute the implicit term once and reuse it for both the
                # prediction and the gradient step.
                u_impl_prf, sqrt_Nu = self.getY(uid, iid)
                eui = rating - self._estimate(uid, iid, u_impl_prf)
                sq_err += eui ** 2
                self.bu[uid] += gamma * (eui - Lambda * self.bu[uid])
                self.bi[iid] += gamma * (eui - Lambda * self.bi[iid])
                # Snapshot q_i so that p_u and every y_j are updated from the
                # same pre-update item vector that produced the error.
                q_old = self.qi[iid].copy()
                self.qi[iid] += gamma * (eui * (self.pu[uid] + u_impl_prf) - Lambda * self.qi[iid])
                self.pu[uid] += gamma * (eui * q_old - Lambda * self.pu[uid])
                rated = self.u_dict[uid]
                if rated:
                    # The gradient w.r.t. y_j is e_ui * q_i / sqrt(|N(u)|): it
                    # uses the vector of the item being rated (iid), not q_j.
                    y_grad = eui * q_old / sqrt_Nu
                    for item in rated:
                        self.y[item] += gamma * (y_grad - Lambda * self.y[item])

            # Decay the learning rate by a factor of 0.93 after each pass.
            gamma = 0.93 * gamma
            print('rmse is', np.sqrt(sq_err / n_samples))

    def test(self, test_data):
        """Evaluate the model on held-out triples and print the RMSE.

        Args:
            test_data: sequence of rows whose first three columns are
                user id, item id and rating.

        Returns:
            The root-mean-square error over ``test_data``.
        """
        test_data = np.array(test_data)
        print('test data size', test_data.shape)
        sq_err = 0.0
        for i in range(test_data.shape[0]):
            uid = test_data[i, 0]
            iid = test_data[i, 1]
            rating = test_data[i, 2]
            eui = rating - self.predict(uid, iid)
            sq_err += eui ** 2
        rmse = np.sqrt(sq_err / test_data.shape[0])
        print('rmse of test data is', rmse)
        return rmse


def getData(file_name):
    """Load rating triples from a text file and split them 70/30.

    Each non-blank line must start with three whitespace-separated integers
    (user id, item id, rating); any further columns are ignored. Blank lines
    are skipped so they cannot produce empty rows.

    Args:
        file_name: path of the data file; a leading ``~`` is expanded.

    Returns:
        ``(train_data, test_data)``: two lists of ``[uid, iid, rating]`` rows
        holding the first 70% and the remaining 30% of the shuffled data.
    """
    data = []
    with open(os.path.expanduser(file_name)) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line would otherwise add an empty
                # row and make the data ragged.
                continue
            data.append([int(value) for value in fields[:3]])
    random.shuffle(data)
    split_at = int(len(data) * 7 / 10)
    train_data = data[:split_at]
    test_data = data[split_at:]
    print('load data finished')
    print('total data ', len(data))
    return train_data, test_data


if __name__ == '__main__':
    # Script entry point: load the ratings file, fit a 30-factor SVD++ model
    # on the training split, then report the RMSE on the held-out split.
    train_set, test_set = getData('./u.data')
    model = SVDPP(train_set, 30)
    model.train()
    model.test(test_set)

Author: BY 水蓝
Reprint policy: Unless otherwise stated, all articles in this blog are licensed under CC BY 4.0. If you reproduce this article, please credit the source: BY 水蓝.
  TOC