Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import torch
import random
import pandas as pd
from copy import deepcopy
from torch.utils.data import DataLoader, Dataset

random.seed(0)


class UserItemRatingDataset(Dataset):
    """Dataset wrapping aligned <user, item, rating> tensors.

    Each index yields one (user, item, rating) triple; all three tensors
    must have the same length.
    """

    def __init__(self, user_tensor, item_tensor, target_tensor):
        """
        args:

            target_tensor: torch.Tensor, the corresponding rating for <user, item> pair
        """
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __getitem__(self, index):
        # Return the aligned triple at this position.
        triple = (self.user_tensor[index],
                  self.item_tensor[index],
                  self.target_tensor[index])
        return triple

    def __len__(self):
        # len(tensor) is equivalent to tensor.size(0)
        return len(self.user_tensor)


class SampleGenerator(object):
    """Construct train/test datasets for NCF.

    Holds the raw ratings table, a binarized (implicit-feedback) copy,
    per-user negative-item pools, and a leave-one-out train/test split.
    """

    def __init__(self, ratings):
        """
        args:
            ratings: pd.DataFrame, which contains 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
        """
        assert 'userId' in ratings.columns
        assert 'itemId' in ratings.columns
        assert 'rating' in ratings.columns

        self.ratings = ratings
        # explicit feedback using _normalize and implicit using _binarize
        # self.preprocess_ratings = self._normalize(ratings)
        self.preprocess_ratings = self._binarize(ratings)
        self.user_pool = set(self.ratings['userId'].unique())
        self.item_pool = set(self.ratings['itemId'].unique())
        # create negative item samples for NCF learning
        self.negatives = self._sample_negative(ratings)
        self.train_ratings, self.test_ratings = self._split_loo(self.preprocess_ratings)

    def _normalize(self, ratings):
        """Normalize ratings into [0, 1] from [0, max_rating] (explicit feedback)."""
        ratings = deepcopy(ratings)
        max_rating = ratings.rating.max()
        ratings['rating'] = ratings.rating * 1.0 / max_rating
        return ratings

    def _binarize(self, ratings):
        """Binarize ratings into 0 or 1 (implicit feedback)."""
        ratings = deepcopy(ratings)
        # Use .loc instead of chained indexing: chained assignment may write
        # to a temporary copy and raises SettingWithCopyWarning.
        ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0
        return ratings

    def _split_loo(self, ratings):
        """Leave-one-out split: each user's most recent rating becomes the test row."""
        ratings = deepcopy(ratings)  # do not mutate the caller's frame
        ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
        test = ratings[ratings['rank_latest'] == 1]
        train = ratings[ratings['rank_latest'] > 1]
        # Every user must appear in both splits (i.e. has >= 2 ratings).
        assert train['userId'].nunique() == test['userId'].nunique()
        return train[['userId', 'itemId', 'rating']], test[['userId', 'itemId', 'rating']]

    def _sample_negative(self, ratings):
        """Return per-user negative item pools & 99 fixed negative samples for evaluation."""
        interact_status = ratings.groupby('userId')['itemId'].apply(set).reset_index().rename(
            columns={'itemId': 'interacted_items'})
        interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
        # random.sample() rejects sets on Python 3.11+; materialize to a list first.
        interact_status['negative_samples'] = interact_status['negative_items'].apply(
            lambda x: random.sample(list(x), 99))
        return interact_status[['userId', 'negative_items', 'negative_samples']]

    def instance_a_train_loader(self, num_negatives, batch_size):
        """Build a DataLoader of positives plus `num_negatives` sampled negatives per positive."""
        users, items, ratings = [], [], []
        train_ratings = pd.merge(self.train_ratings, self.negatives[['userId', 'negative_items']], on='userId')
        # random.sample() rejects sets on Python 3.11+; materialize to a list first.
        train_ratings['negatives'] = train_ratings['negative_items'].apply(
            lambda x: random.sample(list(x), num_negatives))
        for row in train_ratings.itertuples():
            users.append(int(row.userId))
            items.append(int(row.itemId))
            ratings.append(float(row.rating))
            for i in range(num_negatives):
                users.append(int(row.userId))
                items.append(int(row.negatives[i]))
                ratings.append(float(0))  # negative samples get 0 rating
        dataset = UserItemRatingDataset(user_tensor=torch.LongTensor(users),
                                        item_tensor=torch.LongTensor(items),
                                        target_tensor=torch.FloatTensor(ratings))
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    @property
    def evaluate_data(self):
        """Evaluation tensors: [test_users, test_items, negative_users, negative_items]."""
        test_ratings = pd.merge(self.test_ratings, self.negatives[['userId', 'negative_samples']], on='userId')
        test_users, test_items, negative_users, negative_items = [], [], [], []
        for row in test_ratings.itertuples():
            test_users.append(int(row.userId))
            test_items.append(int(row.itemId))
            for i in range(len(row.negative_samples)):
                negative_users.append(int(row.userId))
                negative_items.append(int(row.negative_samples[i]))
        return [torch.LongTensor(test_users), torch.LongTensor(test_items), torch.LongTensor(negative_users),
                torch.LongTensor(negative_items)]
86 changes: 86 additions & 0 deletions engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import torch
from torch.autograd import Variable
from tensorboardX import SummaryWriter

from utils import save_checkpoint, use_optimizer
from metrics import MetronAtK


class Engine(object):
    """Meta Engine for training & evaluating NCF model

    Note: Subclass should implement self.model (before calling this
    __init__, which wires the optimizer to it)!
    """

    def __init__(self, config):
        self.config = config  # model configuration
        self._metron = MetronAtK(top_k=10)  # hit-ratio / NDCG evaluator at cut-off 10
        self._writer = SummaryWriter(log_dir='runs/{}'.format(config['alias']))  # tensorboard writer
        self._writer.add_text('config', str(config), 0)
        self.opt = use_optimizer(self.model, config)
        # explicit feedback
        # self.crit = torch.nn.MSELoss()
        # implicit feedback
        self.crit = torch.nn.BCELoss()

    def train_single_batch(self, users, items, ratings):
        """Run one optimization step on a single batch; return the scalar loss."""
        assert hasattr(self, 'model'), 'Please specify the exact model !'
        if self.config['use_cuda'] is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
        self.opt.zero_grad()
        ratings_pred = self.model(users, items)
        loss = self.crit(ratings_pred.view(-1), ratings)
        loss.backward()
        self.opt.step()
        loss = loss.item()
        return loss

    def train_an_epoch(self, train_loader, epoch_id):
        """Train over every batch in `train_loader`, logging the summed epoch loss."""
        assert hasattr(self, 'model'), 'Please specify the exact model !'
        self.model.train()
        total_loss = 0
        for batch_id, batch in enumerate(train_loader):
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            rating = rating.float()
            loss = self.train_single_batch(user, item, rating)
            print('[Training Epoch {}] Batch {}, Loss {}'.format(epoch_id, batch_id, loss))
            total_loss += loss
        self._writer.add_scalar('model/loss', total_loss, epoch_id)

    def evaluate(self, evaluate_data, epoch_id):
        """Score test and negative items, then compute and log HR@10 and NDCG@10.

        args:
            evaluate_data: [test_users, test_items, negative_users, negative_items]
                as produced by SampleGenerator.evaluate_data.
        """
        assert hasattr(self, 'model'), 'Please specify the exact model !'
        self.model.eval()
        with torch.no_grad():
            test_users, test_items = evaluate_data[0], evaluate_data[1]
            negative_users, negative_items = evaluate_data[2], evaluate_data[3]
            if self.config['use_cuda'] is True:
                test_users = test_users.cuda()
                test_items = test_items.cuda()
                negative_users = negative_users.cuda()
                negative_items = negative_items.cuda()
            test_scores = self.model(test_users, test_items)
            negative_scores = self.model(negative_users, negative_items)
            if self.config['use_cuda'] is True:
                # move everything back to the CPU for metric computation
                test_users = test_users.cpu()
                test_items = test_items.cpu()
                test_scores = test_scores.cpu()
                negative_users = negative_users.cpu()
                negative_items = negative_items.cpu()
                negative_scores = negative_scores.cpu()
            self._metron.subjects = [test_users.data.view(-1).tolist(),
                                     test_items.data.view(-1).tolist(),
                                     test_scores.data.view(-1).tolist(),
                                     negative_users.data.view(-1).tolist(),
                                     negative_items.data.view(-1).tolist(),
                                     negative_scores.data.view(-1).tolist()]
        hit_ratio, ndcg = self._metron.cal_hit_ratio(), self._metron.cal_ndcg()
        self._writer.add_scalar('performance/HR', hit_ratio, epoch_id)
        self._writer.add_scalar('performance/NDCG', ndcg, epoch_id)
        # fixed typo in log message: 'Evluating' -> 'Evaluating'
        print('[Evaluating Epoch {}] HR = {:.4f}, NDCG = {:.4f}'.format(epoch_id, hit_ratio, ndcg))
        return hit_ratio, ndcg

    def save(self, alias, epoch_id, hit_ratio, ndcg):
        """Persist a checkpoint whose path is formatted from alias, epoch and metrics."""
        assert hasattr(self, 'model'), 'Please specify the exact model !'
        model_dir = self.config['model_dir'].format(alias, epoch_id, hit_ratio, ndcg)
        save_checkpoint(self.model, model_dir)
46 changes: 46 additions & 0 deletions gmf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import torch
from engine import Engine
from utils import use_cuda
from torch import nn


class GMF(torch.nn.Module):
    """Generalized Matrix Factorization: sigmoid(Linear(user_emb * item_emb))."""

    def __init__(self, config):
        super(GMF, self).__init__()
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']

        self.embedding_user = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim)
        self.embedding_item = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim)
        self.affine_output = torch.nn.Linear(in_features=self.latent_dim, out_features=1)
        self.logistic = torch.nn.Sigmoid()

        # Optionally re-initialize embedding/linear weights from N(0, 0.01).
        if config['weight_init_gaussian']:
            for module in self.modules():
                if isinstance(module, (nn.Embedding, nn.Linear)):
                    print(module)
                    torch.nn.init.normal_(module.weight.data, 0.0, 0.01)

    def forward(self, user_indices, item_indices):
        """Predicted rating in (0, 1) for each (user, item) index pair."""
        element_product = self.embedding_user(user_indices) * self.embedding_item(item_indices)
        return self.logistic(self.affine_output(element_product))

    def init_weight(self):
        pass


class GMFEngine(Engine):
    """Engine for training & evaluating GMF model"""

    def __init__(self, config):
        # The model must exist (and sit on the right device) before
        # Engine.__init__ wires the optimizer to self.model.
        model = GMF(config)
        if config['use_cuda'] is True:
            use_cuda(True, config['device_id'])
            model = model.cuda()  # Module.cuda() moves in place and returns self
        self.model = model
        super(GMFEngine, self).__init__(config)
57 changes: 57 additions & 0 deletions metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import math
import pandas as pd


class MetronAtK(object):
    """Ranking metrics (Hit Ratio and NDCG) evaluated at a fixed cut-off k."""

    def __init__(self, top_k):
        self._top_k = top_k
        self._subjects = None  # Subjects which we ran evaluation on

    @property
    def top_k(self):
        return self._top_k

    @top_k.setter
    def top_k(self, top_k):
        self._top_k = top_k

    @property
    def subjects(self):
        return self._subjects

    @subjects.setter
    def subjects(self, subjects):
        """
        args:
            subjects: list, [test_users, test_items, test_scores, negative users, negative items, negative scores]
        """
        assert isinstance(subjects, list)
        test_users, test_items, test_scores = subjects[0], subjects[1], subjects[2]
        neg_users, neg_items, neg_scores = subjects[3], subjects[4], subjects[5]
        # the golden set: one held-out positive item per user
        test = pd.DataFrame({'user': test_users,
                             'test_item': test_items,
                             'test_score': test_scores})
        # the full set: negatives plus the positive, all to be ranked together
        full = pd.DataFrame({'user': neg_users + test_users,
                             'item': neg_items + test_items,
                             'score': neg_scores + test_scores})
        full = pd.merge(full, test, on=['user'], how='left')
        # rank the items according to the scores for each user (rank 1 = best)
        full['rank'] = full.groupby('user')['score'].rank(method='first', ascending=False)
        full.sort_values(['user', 'rank'], inplace=True)
        self._subjects = full

    def cal_hit_ratio(self):
        """Hit Ratio @ top_K: fraction of users whose held-out item ranks within top_k."""
        full, top_k = self._subjects, self._top_k
        top_k = full[full['rank'] <= top_k]
        test_in_top_k = top_k[top_k['test_item'] == top_k['item']]  # golden items hit in the top_K items
        return len(test_in_top_k) * 1.0 / full['user'].nunique()

    def cal_ndcg(self):
        """NDCG @ top_K: log-discounted gain of the held-out item's rank (1.0 when ranked first)."""
        full, top_k = self._subjects, self._top_k
        top_k = full[full['rank'] <= top_k]
        test_in_top_k = top_k[top_k['test_item'] == top_k['item']]
        # Compute the discount as a fresh Series instead of assigning a new
        # column to a filtered slice (avoids the pandas chained-assignment
        # warning, and a silent no-op write under copy-on-write semantics).
        ndcg = test_in_top_k['rank'].apply(lambda x: math.log(2) / math.log(1 + x))  # the rank starts from 1
        return ndcg.sum() * 1.0 / full['user'].nunique()
71 changes: 71 additions & 0 deletions mlp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import torch
from gmf import GMF
from engine import Engine
from utils import use_cuda, resume_checkpoint
from torch import nn


class MLP(torch.nn.Module):
    """NCF MLP model: a stack of ReLU layers over concatenated user/item embeddings."""

    def __init__(self, config):
        super(MLP, self).__init__()
        self.config = config
        self.num_users = config['num_users']
        self.num_items = config['num_items']
        self.latent_dim = config['latent_dim']

        self.embedding_user = torch.nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.latent_dim)
        self.embedding_item = torch.nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.latent_dim)

        # config['layers'][0] must equal 2 * latent_dim (the concat width).
        self.fc_layers = torch.nn.ModuleList()
        for in_size, out_size in zip(config['layers'][:-1], config['layers'][1:]):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        self.affine_output = torch.nn.Linear(in_features=config['layers'][-1], out_features=1)
        self.logistic = torch.nn.Sigmoid()

        # Initialize model parameters with a Gaussian distribution (with a mean of 0 and standard deviation of 0.01)
        if config['weight_init_gaussian']:
            for sm in self.modules():
                if isinstance(sm, (nn.Embedding, nn.Linear)):
                    print(sm)
                    torch.nn.init.normal_(sm.weight.data, 0.0, 0.01)

    def forward(self, user_indices, item_indices):
        """Predicted rating in (0, 1) for each (user, item) index pair."""
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)  # the concat latent vector
        # Iterate the layers directly instead of enumerate(range(len(...))).
        for layer in self.fc_layers:
            vector = layer(vector)
            vector = torch.nn.ReLU()(vector)
            # vector = torch.nn.BatchNorm1d()(vector)
            # vector = torch.nn.Dropout(p=0.5)(vector)
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        return rating

    def init_weight(self):
        pass

    def load_pretrain_weights(self):
        """Copy user/item embedding weights from a trained GMF checkpoint."""
        config = self.config
        gmf_model = GMF(config)
        if config['use_cuda'] is True:
            gmf_model.cuda()
        resume_checkpoint(gmf_model, model_dir=config['pretrain_mf'], device_id=config['device_id'])
        self.embedding_user.weight.data = gmf_model.embedding_user.weight.data
        self.embedding_item.weight.data = gmf_model.embedding_item.weight.data


class MLPEngine(Engine):
    """Engine for training & evaluating the MLP model."""

    def __init__(self, config):
        # The model must exist (and sit on the right device) before
        # Engine.__init__ wires the optimizer to self.model.
        model = MLP(config)
        if config['use_cuda'] is True:
            use_cuda(True, config['device_id'])
            model = model.cuda()  # Module.cuda() moves in place and returns self
        self.model = model
        super(MLPEngine, self).__init__(config)
        print(self.model)

        if config['pretrain']:
            self.model.load_pretrain_weights()
Loading