1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
| import numpy as np import random import os
class SVDPP:
    """SVD++-style rating predictor trained by stochastic gradient descent.

    A rating for user ``u`` and item ``i`` is estimated as::

        avg + bi[i] + bu[u] + qi[i] . (pu[u] + sum(y[j] for j in N(u)) / sqrt(|N(u)|))

    where ``N(u)`` is the list of items the user rated in the training
    matrix.  All parameters live in dicts keyed by user id or item id;
    factor vectors are ``(K, 1)`` column arrays.
    """

    def __init__(self, mat, K=20):
        """Build the parameter tables from the training ratings.

        Args:
            mat: Array-like whose rows are ``[user_id, item_id, rating]``.
            K: Number of latent factors per user / item.
        """
        self.mat = np.array(mat)
        self.K = K
        self.bi = {}      # item id -> item bias
        self.bu = {}      # user id -> user bias
        self.qi = {}      # item id -> item factor vector
        self.pu = {}      # user id -> user factor vector
        self.avg = np.mean(self.mat[:, 2])
        self.y = {}       # item id -> implicit-feedback factor vector
        self.u_dict = {}  # user id -> list of item ids rated by that user
        for i in range(self.mat.shape[0]):
            uid = self.mat[i, 0]
            iid = self.mat[i, 1]
            self.u_dict.setdefault(uid, [])
            self.u_dict[uid].append(iid)
            self.bi.setdefault(iid, 0)
            self.bu.setdefault(uid, 0)
            # The random draws are made for every row (even for ids already
            # present) so that seeded runs consume the same random stream
            # as before.
            self.qi.setdefault(iid, np.random.random((self.K, 1)) / 10 * np.sqrt(self.K))
            self.pu.setdefault(uid, np.random.random((self.K, 1)) / 10 * np.sqrt(self.K))
            self.y.setdefault(iid, np.zeros((self.K, 1)) + .1)

    def _register(self, uid, iid):
        """Ensure every parameter table has an entry for ``uid`` / ``iid``."""
        self.bi.setdefault(iid, 0)
        self.bu.setdefault(uid, 0)
        self.qi.setdefault(iid, np.zeros((self.K, 1)))
        self.pu.setdefault(uid, np.zeros((self.K, 1)))
        # ``y`` is keyed by item id; registering it under the user id would
        # pollute the item table with user keys.
        self.y.setdefault(iid, np.zeros((self.K, 1)))
        self.u_dict.setdefault(uid, [])

    def _estimate(self, uid, iid, u_impl_prf):
        """Return the rating estimate, clipped to the range [1, 5]."""
        rating = self.avg + self.bi[iid] + self.bu[uid] + np.sum(
            self.qi[iid] * (self.pu[uid] + u_impl_prf))
        if rating > 5:
            rating = 5
        if rating < 1:
            rating = 1
        return rating

    def predict(self, uid, iid):
        """Predict the rating of item ``iid`` by user ``uid``.

        Unknown users or items are registered with zero biases and zero
        factor vectors, so their prediction falls back to the remaining
        known terms (the global average for a fully unknown pair).

        Returns:
            The estimated rating, clipped to the range [1, 5].
        """
        self._register(uid, iid)
        u_impl_prf, _ = self.getY(uid, iid)
        return self._estimate(uid, iid, u_impl_prf)

    def getY(self, uid, iid):
        """Return the implicit-feedback profile of user ``uid``.

        Args:
            uid: User id; must already be present in ``self.u_dict``.
            iid: Unused; kept for interface compatibility.

        Returns:
            A tuple ``(u_impl_prf, sqrt_Nu)``: the sum of ``y[j]`` over the
            user's rated items divided by ``sqrt_Nu`` (a zero vector when
            the user has no rated items), and the square root of the number
            of rated items.
        """
        Nu = self.u_dict[uid]
        I_Nu = len(Nu)
        sqrt_Nu = np.sqrt(I_Nu)
        y_u = np.zeros((self.K, 1))
        if I_Nu == 0:
            u_impl_prf = y_u
        else:
            for i in Nu:
                y_u += self.y[i]
            u_impl_prf = y_u / sqrt_Nu
        return u_impl_prf, sqrt_Nu

    def train(self, steps=30, gamma=0.04, Lambda=0.15):
        """Fit the model with stochastic gradient descent.

        Args:
            steps: Number of passes over the training ratings.
            gamma: Initial learning rate; multiplied by 0.93 after each pass.
            Lambda: L2 regularisation strength.
        """
        print('train data size', self.mat.shape)
        n_ratings = self.mat.shape[0]
        for step in range(steps):
            print('step', step + 1, 'is running')
            order = np.random.permutation(n_ratings)
            rmse = 0.0
            for row in order:
                uid = self.mat[row, 0]
                iid = self.mat[row, 1]
                rating = self.mat[row, 2]
                self._register(uid, iid)
                # Compute the implicit profile once and reuse it for both
                # the prediction and the gradient step.
                u_impl_prf, sqrt_Nu = self.getY(uid, iid)
                eui = rating - self._estimate(uid, iid, u_impl_prf)
                rmse += eui ** 2

                # Derive every step from the pre-update parameters so the
                # updates below do not feed into one another.
                p_u = self.pu[uid]
                q_i = self.qi[iid]
                pu_step = gamma * (eui * q_i - Lambda * p_u)
                qi_step = gamma * (eui * (p_u + u_impl_prf) - Lambda * q_i)
                # d(prediction)/d(y_j) = q_i / sqrt(|N(u)|): the factor of
                # the item being rated, not the factor of item j.
                y_grad = eui * q_i / sqrt_Nu

                self.bu[uid] += gamma * (eui - Lambda * self.bu[uid])
                self.bi[iid] += gamma * (eui - Lambda * self.bi[iid])
                for rated in self.u_dict[uid]:
                    self.y[rated] += gamma * (y_grad - Lambda * self.y[rated])
                self.pu[uid] += pu_step
                self.qi[iid] += qi_step
            gamma = 0.93 * gamma
            print('rmse is', np.sqrt(rmse / self.mat.shape[0]))

    def test(self, test_data):
        """Print and return the RMSE of the model on ``test_data``.

        Args:
            test_data: Array-like whose rows are
                ``[user_id, item_id, rating]``.

        Returns:
            The root-mean-square error, or NaN when ``test_data`` is empty.
        """
        test_data = np.array(test_data)
        print('test data size', test_data.shape)
        n_ratings = test_data.shape[0]
        if n_ratings == 0:
            # Nothing to evaluate; avoid dividing by zero.
            result = float('nan')
        else:
            rmse = 0.0
            for i in range(n_ratings):
                uid = test_data[i, 0]
                iid = test_data[i, 1]
                rating = test_data[i, 2]
                eui = rating - self.predict(uid, iid)
                rmse += eui ** 2
            result = np.sqrt(rmse / n_ratings)
        print('rmse of test data is', result)
        return result
def getData(file_name):
    """Load a ratings file and split it into training and test sets.

    Each non-blank line is split on whitespace and its first three fields
    are parsed as integers (used by the rest of the file as user id, item
    id and rating).  The rows are shuffled and the first 70% become the
    training set; the remainder is the test set.

    Args:
        file_name: Path to the ratings file; ``~`` is expanded.

    Returns:
        A tuple ``(train_data, test_data)`` of lists of 3-integer rows.
    """
    data = []
    with open(os.path.expanduser(file_name)) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines (e.g. trailing newlines at end of file);
                # they would otherwise become empty rows.
                continue
            data.append([int(value) for value in fields[:3]])
    random.shuffle(data)
    split_at = len(data) * 7 // 10
    train_data = data[:split_at]
    test_data = data[split_at:]
    print('load data finished')
    print('total data ', len(data))
    return train_data, test_data
if __name__ == '__main__':
    # Script entry point: load and split the ratings file, fit a
    # 30-factor model on the training part, then report the test error.
    train_data, test_data = getData('./u.data')
    model = SVDPP(train_data, K=30)
    model.train()
    model.test(test_data)
|