A question about matrix factorization in PyTorch


I tried to implement matrix factorization in PyTorch on the Yelp dataset, but the results I get are very poor.

I use RMSE to evaluate my model, yet the RMSE stays stuck around 4 and I cannot tell which part is wrong.

Here is my code. My dataset: dataset

from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import time
import os      # used by seed_everything
import random  # used by seed_everything and MF_Data


dfRating = pd.read_pickle(dir_path + 'ratingData.pkl')
userNum = dfRating['user_id'].nunique()
itemNum = dfRating['business_id'].nunique()


def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # keep cuDNN deterministic

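# Dataset wrapper: returns (user id, item id, rating) triples as tensors.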
class RateDataset(Dataset):

    def __init__(self, userArr, itemArr, rateArr):
        self.userArr = userArr
        self.itemArr = itemArr
        self.rateArr = rateArr    

    def __len__(self):
        return len(self.userArr)

    def __getitem__(self, index):
        user = self.userArr[index]
        item = self.itemArr[index]
        rating = self.rateArr[index]
        return torch.tensor(user, dtype=torch.long), torch.tensor(item, dtype=torch.long), torch.tensor(rating, dtype=torch.float)

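# Plain matrix-factorization model: the predicted rating is the dot product of the
# user embedding and the item embedding.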
class MF(nn.Module):
    def __init__(self, userNum, itemNum, latenFactor):
        super(MF, self).__init__()
        self.userEmbedding = torch.nn.Embedding(userNum, latenFactor)
        self.itemEmbedding = torch.nn.Embedding(itemNum, latenFactor)

    def forward(self, userList, itemList):
        P = self.userEmbedding(userList)
        Q = self.itemEmbedding(itemList)
        return torch.mul(P, Q).sum(dim=1)

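# Holds the rating DataFrame, does the 80/20 train/test split and builds the DataLoaders.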
class MF_Data(object):

    def __init__(self, df):
        self.ratings = df
        self.batchSize = batchSize  # taken from the globals in the parameter section below
        self.userPool = set(self.ratings['user_id'].unique())
        self.itemPool = set(self.ratings['business_id'].unique())
        self.trainRatings, self.testRatings = self.splitData(self.ratings)
        random.seed(seed)           # seed is also a global

    def splitData(self, ratings):
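        # plain 80/20 split in the DataFrame's current row order (no shuffling)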
        dataNum = len(ratings)
        trainNum = int(dataNum * 0.8)
        train = ratings[:trainNum]
        test = ratings[trainNum:]
        return train, test

    def getTrainInstance(self):
        users, items, ratings = [], [], []
        for row in self.trainRatings.itertuples():
            users.append(int(row.user_id))
            items.append(int(row.business_id))
            ratings.append(float(row.stars))
        dataset = RateDataset(users,items,ratings)
        return DataLoader(dataset, self.batchSize, shuffle=True)

    def getTestInstance(self):
        users, items, ratings = [], [], []
        for row in self.testRatings.itertuples():
            users.append(int(row.user_id))
            items.append(int(row.business_id))
            ratings.append(float(row.stars))
        dataset = RateDataset(users, items, ratings)
        return DataLoader(dataset, self.batchSize, shuffle=False)

# parameter setting
gpu = 0
latent_features_guess=10
learning_rate=1e-3
lamda=0.05
epochs = 10
batchSize = 16
seed = 42

seed_everything(seed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# construct the train and test datasets
data = MF_Data(dfRating)
trainLoader = data.getTrainInstance()
testLoader = data.getTestInstance()

# set up model, loss and optimizer
model = MF(userNum, itemNum, latent_features_guess)
model.to(device)
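# Adam's weight_decay applies an L2 penalty (the lamda value) to all embedding weights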
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay = lamda)
criterion = torch.nn.MSELoss()

trainRMSE, testRMSE = [],[]

for epoch in range(1, epochs+1):

    model.train()  # switch to training mode (enables dropout/batch-norm, if any)
    
    start = time.time()
    trainLoss = 0
    numTrain = 0

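    # one pass over the training data: forward pass, MSE loss, backprop, parameter update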
    for users, items, r in trainLoader:
        users, items, r = users.to(device), items.to(device), r.to(device)
        
        # forward
        r_hat= model(users, items)
        loss=criterion(r_hat, r)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        trainLoss+=loss.item()
        numTrain+=1
                
    trainRMSE.append(np.sqrt(trainLoss/numTrain))
    model.eval()
    testLoss=0
    numTest=0
    with torch.no_grad():
        for users, items, r in testLoader:
            users, items, r = users.to(device), items.to(device), r.to(device)
            r_hat = model(users, items)
            loss = criterion(r_hat, r)
            testLoss += loss.item()
            numTest += 1
        testRMSE.append(np.sqrt(testLoss / numTest))


plt.style.use('seaborn-whitegrid')  # on newer matplotlib this style is named 'seaborn-v0_8-whitegrid'
x = list(range(epochs))
fig = plt.figure()
ax = plt.axes()

plt.plot(x, trainRMSE, label='train_rmse')
plt.plot(x, testRMSE, label='test_rmse')
leg = ax.legend()
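
For reference, here is a small sketch of how I would also compute an exact RMSE over the whole test set (summing squared errors instead of averaging per-batch MSE, which drifts slightly when the last batch is smaller). It only reuses the model, testLoader and device objects defined above.

# exact test RMSE: accumulate squared errors over every rating, then take one square root
model.eval()
sqErr, numRatings = 0.0, 0
with torch.no_grad():
    for users, items, r in testLoader:
        users, items, r = users.to(device), items.to(device), r.to(device)
        r_hat = model(users, items)
        sqErr += torch.sum((r_hat - r) ** 2).item()
        numRatings += r.numel()
print('exact test RMSE:', np.sqrt(sqErr / numRatings))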


Here is my result: RMSE

Why does the RMSE end up at 4?

Is it because the dataset is too sparse?

I expected the RMSE to fall somewhere between 0 and 1.
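
For scale, here is a quick baseline check I can run on the same data (assuming the stars column holds the usual 1-to-5 Yelp ratings); it is not the model, just a reference point for the RMSE numbers above.

# RMSE of two trivial predictors, for comparison with the model's RMSE
stars = dfRating['stars'].to_numpy(dtype=float)
meanRating = stars.mean()
rmsePredictZero = np.sqrt(np.mean((stars - 0.0) ** 2))          # always predict 0
rmsePredictMean = np.sqrt(np.mean((stars - meanRating) ** 2))   # always predict the global mean
print('mean rating:', round(meanRating, 2))
print('RMSE when predicting 0:', round(rmsePredictZero, 2))
print('RMSE when predicting the mean:', round(rmsePredictMean, 2))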

pytorch matrix-factorization