1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
| import numpy as np import random import os
class SVD:
    """Biased matrix-factorisation rating predictor trained with SGD.

    A rating for ``(user, item)`` is modelled as::

        avg + bi[item] + bu[user] + sum(qi[item] * pu[user])

    and the result is clipped to the range ``[1, 5]`` by :meth:`predict`.
    """

    def __init__(self, mat, K=20):
        """Set up biases and latent factors for every user/item in ``mat``.

        Args:
            mat: Array-like of rows whose first three columns are
                ``(user_id, item_id, rating)``.
            K: Number of latent factors kept per user and per item.
        """
        self.mat = np.array(mat)
        self.K = K
        self.bi = {}  # item id -> item bias
        self.bu = {}  # user id -> user bias
        self.qi = {}  # item id -> (K, 1) item factor column
        self.pu = {}  # user id -> (K, 1) user factor column
        self.avg = np.mean(self.mat[:, 2])  # global mean rating
        for uid, iid in self.mat[:, :2]:
            self.bi.setdefault(iid, 0)
            self.bu.setdefault(uid, 0)
            # The random columns are drawn for every row (even for ids that
            # are already present) so that the sequence of random numbers
            # consumed, and therefore seeded results, stay unchanged.
            self.qi.setdefault(
                iid, np.random.random((self.K, 1)) / 10 * np.sqrt(self.K))
            self.pu.setdefault(
                uid, np.random.random((self.K, 1)) / 10 * np.sqrt(self.K))

    def predict(self, uid, iid):
        """Return the predicted rating of item ``iid`` by user ``uid``.

        Ids that the model has not seen are registered with a zero bias and
        zero factors, so their prediction falls back to the global mean (plus
        whatever bias the other id carries).

        Returns:
            The predicted rating, clipped to ``[1, 5]``.
        """
        self.bi.setdefault(iid, 0)
        self.bu.setdefault(uid, 0)
        # Allocate the zero column only for unseen ids; this method runs once
        # per rating per training step.
        if iid not in self.qi:
            self.qi[iid] = np.zeros((self.K, 1))
        if uid not in self.pu:
            self.pu[uid] = np.zeros((self.K, 1))
        rating = (self.avg + self.bi[iid] + self.bu[uid]
                  + np.sum(self.qi[iid] * self.pu[uid]))
        if rating > 5:
            rating = 5
        if rating < 1:
            rating = 1
        return rating

    def train(self, steps=30, gamma=0.04, Lambda=0.15):
        """Fit the model with stochastic gradient descent.

        Args:
            steps: Number of passes over the training data.
            gamma: Initial learning rate; multiplied by 0.93 after each pass.
            Lambda: L2 regularisation strength.
        """
        n_ratings = self.mat.shape[0]
        print('train data size', self.mat.shape)
        for step in range(steps):
            print('step', step + 1, 'is running')
            order = np.random.permutation(n_ratings)
            rmse = 0.0
            mae = 0
            for j in order:
                uid = self.mat[j, 0]
                iid = self.mat[j, 1]
                rating = self.mat[j, 2]
                eui = rating - self.predict(uid, iid)
                rmse += eui ** 2
                mae += abs(eui)
                self.bu[uid] += gamma * (eui - Lambda * self.bu[uid])
                self.bi[iid] += gamma * (eui - Lambda * self.bi[iid])
                # Snapshot the item factors: the in-place update below would
                # otherwise leak into the user-factor gradient through a
                # shared array reference.
                old_qi = self.qi[iid].copy()
                self.qi[iid] += gamma * (eui * self.pu[uid]
                                         - Lambda * self.qi[iid])
                self.pu[uid] += gamma * (eui * old_qi
                                         - Lambda * self.pu[uid])
            gamma = 0.93 * gamma
            # NOTE: the second value is the mean absolute error; the "ase"
            # label is kept so the program's output text does not change.
            print('rmse is {0:3f}, ase is {1:3f}'.format(
                np.sqrt(rmse / n_ratings), mae / n_ratings))

    def test(self, test_data):
        """Evaluate the model on held-out ratings and print the errors.

        Args:
            test_data: Array-like of rows whose first three columns are
                ``(user_id, item_id, rating)``.

        Returns:
            Tuple ``(rmse, mae)`` averaged over the rows of ``test_data``.
            An empty ``test_data`` yields ``(0.0, 0.0)``.
        """
        test_data = np.array(test_data)
        print('test data size', test_data.shape)
        n_ratings = test_data.shape[0]
        rmse = 0.0
        mae = 0.0
        if n_ratings:
            squared_error = 0.0
            absolute_error = 0.0
            for uid, iid, rating in test_data[:, :3]:
                eui = rating - self.predict(uid, iid)
                squared_error += eui ** 2
                absolute_error += abs(eui)
            # Average over the evaluated rows, not the training-set size.
            rmse = float(np.sqrt(squared_error / n_ratings))
            mae = float(absolute_error / n_ratings)
        print('rmse is {0:3f}, ase is {1:3f}'.format(rmse, mae))
        return rmse, mae
def getData(file_name):
    """Load integer rating rows from a text file and split them 70/30.

    Every non-blank line is split on whitespace and its first three fields
    are converted to ``int``. The rows are shuffled with ``random.shuffle``
    before being split.

    Args:
        file_name: Path of the file to read; ``~`` is expanded.

    Returns:
        Tuple ``(train_data, test_data)`` of lists of integer rows: the first
        70% of the shuffled rows and the remaining rows.

    Raises:
        ValueError: If one of the first three fields of a line is not an
            integer.
    """
    data = []
    with open(os.path.expanduser(file_name), encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line (e.g. at end of file) holds no rating; keeping
                # it would add an empty row to the dataset.
                continue
            data.append([int(value) for value in fields[:3]])
    random.shuffle(data)
    split_at = int(len(data) * 7 / 10)
    train_data = data[:split_at]
    test_data = data[split_at:]
    print('load data finished')
    print('total data ', len(data))
    return train_data, test_data
if __name__ == '__main__':
    # Script entry point: load ratings from ./u.data, fit a 30-factor model
    # on the training split and print its error on the held-out split.
    train_data, test_data = getData('./u.data')
    model = SVD(train_data, K=30)
    model.train()
    model.test(test_data)
|