Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions checkpoints/note.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Data is removed due to limited memory
112 changes: 112 additions & 0 deletions data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import torch
import random
import pandas as pd
from copy import deepcopy
from torch.utils.data import DataLoader, Dataset
import os
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)
random.seed(0)


class UserItemRatingDataset(Dataset):
    """Dataset over aligned <user, item, rating> tensors.

    Sample ``i`` is the triple ``(user_tensor[i], item_tensor[i],
    target_tensor[i])``; all three tensors must share their first dimension.
    """
    def __init__(self, user_tensor, item_tensor, target_tensor):
        """
        args:
            user_tensor: torch.Tensor of user indices
            item_tensor: torch.Tensor of item indices
            target_tensor: torch.Tensor, the corresponding rating for each <user, item> pair
        """
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __getitem__(self, index):
        # One (user, item, rating) triple per index.
        return (
            self.user_tensor[index],
            self.item_tensor[index],
            self.target_tensor[index],
        )

    def __len__(self):
        # All tensors are aligned, so the user tensor's length is the dataset size.
        return self.user_tensor.size(0)


class SampleGenerator(object):
    """Construct train/test splits and negative samples for NCF.

    Given an explicit-feedback ratings frame, this binarizes it to implicit
    feedback, performs a leave-one-out split (latest interaction per user is
    the test item), and samples negative (non-interacted) items per user.
    """

    def __init__(self, ratings):
        """
        args:
            ratings: pd.DataFrame, which contains 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
        """
        assert 'userId' in ratings.columns
        assert 'itemId' in ratings.columns
        assert 'rating' in ratings.columns

        self.ratings = ratings
        # explicit feedback using _normalize and implicit using _binarize
        # self.preprocess_ratings = self._normalize(ratings)
        self.preprocess_ratings = self._binarize(ratings)
        self.user_pool = set(self.ratings['userId'].unique())
        self.item_pool = set(self.ratings['itemId'].unique())
        # create negative item samples for NCF learning
        self.negatives = self._sample_negative(ratings)
        self.train_ratings, self.test_ratings = self._split_loo(self.preprocess_ratings)

    def _normalize(self, ratings):
        """normalize into [0, 1] from [0, max_rating], explicit feedback"""
        ratings = ratings.copy()
        max_rating = ratings.rating.max()
        ratings['rating'] = ratings.rating * 1.0 / max_rating
        return ratings

    def _binarize(self, ratings):
        """binarize into 0 or 1, implicit feedback"""
        ratings = ratings.copy()
        # Cast to float first so assigning 1.0 cannot trigger a dtype-change
        # warning, and use .loc instead of chained indexing
        # (ratings['rating'][mask] = ...), which raises SettingWithCopyWarning
        # and silently fails under pandas copy-on-write.
        ratings['rating'] = ratings['rating'].astype(float)
        ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0
        return ratings

    def _split_loo(self, ratings):
        """leave one out train/test split """
        # Work on a copy so the helper 'rank_latest' column does not leak
        # into the caller's frame (self.preprocess_ratings).
        ratings = ratings.copy()
        ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
        test = ratings[ratings['rank_latest'] == 1]    # latest interaction per user
        train = ratings[ratings['rank_latest'] > 1]    # everything else
        # Every user must appear in both splits (requires >= 2 ratings per user).
        assert train['userId'].nunique() == test['userId'].nunique()
        return train[['userId', 'itemId', 'rating']], test[['userId', 'itemId', 'rating']]

    def _sample_negative(self, ratings):
        """return all negative items & 100 sampled negative items"""
        interact_status = ratings.groupby('userId')['itemId'].apply(set).reset_index().rename(
            columns={'itemId': 'interacted_items'})
        # Negatives = global item pool minus the user's interacted items.
        interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
        # 99 negatives per user for evaluation (assumes every user has at
        # least 99 non-interacted items; random.sample raises otherwise).
        interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(list(x), 99))
        return interact_status[['userId', 'negative_items', 'negative_samples']]

    def instance_a_train_loader(self, num_negatives, batch_size):
        """instance train loader for one training epoch"""
        users, items, ratings = [], [], []
        train_ratings = pd.merge(self.train_ratings, self.negatives[['userId', 'negative_items']], on='userId')
        # Fresh negatives are drawn every epoch.
        train_ratings['negatives'] = train_ratings['negative_items'].apply(lambda x: random.sample(list(x), num_negatives))
        for row in train_ratings.itertuples():
            users.append(int(row.userId))
            items.append(int(row.itemId))
            ratings.append(float(row.rating))
            for i in range(num_negatives):
                users.append(int(row.userId))
                items.append(int(row.negatives[i]))
                ratings.append(float(0))  # negative samples get 0 rating
        dataset = UserItemRatingDataset(user_tensor=torch.LongTensor(users),
                                        item_tensor=torch.LongTensor(items),
                                        target_tensor=torch.FloatTensor(ratings))
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    @property
    def evaluate_data(self):
        """create evaluate data: [test_users, test_items, negative_users, negative_items]"""
        test_ratings = pd.merge(self.test_ratings, self.negatives[['userId', 'negative_samples']], on='userId')
        test_users, test_items, negative_users, negative_items = [], [], [], []
        for row in test_ratings.itertuples():
            test_users.append(int(row.userId))
            test_items.append(int(row.itemId))
            # Each user is repeated once per sampled negative item.
            for i in range(len(row.negative_samples)):
                negative_users.append(int(row.userId))
                negative_items.append(int(row.negative_samples[i]))
        return [torch.LongTensor(test_users), torch.LongTensor(test_items), torch.LongTensor(negative_users),
                torch.LongTensor(negative_items)]
1 change: 1 addition & 0 deletions data/note.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Data is removed due to limited memory
89 changes: 89 additions & 0 deletions engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import torch
from torch.autograd import Variable
from tensorboardX import SummaryWriter

from utils import save_checkpoint, use_optimizer
from metrics import MetronAtK
import os
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

class Engine(object):
    """Meta Engine for training & evaluating NCF model

    Note: Subclass should implement self.model BEFORE calling
    Engine.__init__, since the optimizer is built from self.model!
    """

    def __init__(self, config):
        self.config = config  # model configuration
        self._metron = MetronAtK(top_k=10)  # HR@10 / NDCG@10 evaluator
        self._writer = SummaryWriter(log_dir='runs/{}'.format(config['alias']))  # tensorboard writer
        self._writer.add_text('config', str(config), 0)
        self.opt = use_optimizer(self.model, config)
        # explicit feedback
        # self.crit = torch.nn.MSELoss()
        # implicit feedback
        self.crit = torch.nn.BCELoss()

    def train_single_batch(self, users, items, ratings):
        """Run one optimization step on a single batch; return the scalar loss."""
        assert hasattr(self, 'model'), 'Please specify the exact model !'
        if self.config['use_cuda'] is True:
            users, items, ratings = users.cuda(), items.cuda(), ratings.cuda()
        self.opt.zero_grad()
        ratings_pred = self.model(users, items)
        loss = self.crit(ratings_pred.view(-1), ratings)
        loss.backward()
        self.opt.step()
        loss = loss.item()
        return loss

    def train_an_epoch(self, train_loader, epoch_id):
        """Train over all batches in train_loader and log the summed loss."""
        print('train an epoch')
        assert hasattr(self, 'model'), 'Please specify the exact model !'
        self.model.train()
        total_loss = 0
        for batch_id, batch in enumerate(train_loader):
            # Batches come from UserItemRatingDataset as (user, item, rating);
            # ids must still be CPU LongTensors at this point.
            assert isinstance(batch[0], torch.LongTensor)
            user, item, rating = batch[0], batch[1], batch[2]
            rating = rating.float()
            loss = self.train_single_batch(user, item, rating)
            print('[Training Epoch {}] Batch {}, Loss {}'.format(epoch_id, batch_id, loss))
            total_loss += loss
        self._writer.add_scalar('model/loss', total_loss, epoch_id)

    def evaluate(self, evaluate_data, epoch_id):
        """Score test and negative pairs, log and return (HR@10, NDCG@10)."""
        assert hasattr(self, 'model'), 'Please specify the exact model !'
        self.model.eval()
        with torch.no_grad():
            test_users, test_items = evaluate_data[0], evaluate_data[1]
            negative_users, negative_items = evaluate_data[2], evaluate_data[3]
            if self.config['use_cuda'] is True:
                test_users = test_users.cuda()
                test_items = test_items.cuda()
                negative_users = negative_users.cuda()
                negative_items = negative_items.cuda()
            test_scores = self.model(test_users, test_items)
            negative_scores = self.model(negative_users, negative_items)
            if self.config['use_cuda'] is True:
                # MetronAtK works on plain Python lists, so move back to CPU.
                test_users = test_users.cpu()
                test_items = test_items.cpu()
                test_scores = test_scores.cpu()
                negative_users = negative_users.cpu()
                negative_items = negative_items.cpu()
                negative_scores = negative_scores.cpu()
            self._metron.subjects = [test_users.data.view(-1).tolist(),
                                     test_items.data.view(-1).tolist(),
                                     test_scores.data.view(-1).tolist(),
                                     negative_users.data.view(-1).tolist(),
                                     negative_items.data.view(-1).tolist(),
                                     negative_scores.data.view(-1).tolist()]
        hit_ratio, ndcg = self._metron.cal_hit_ratio(), self._metron.cal_ndcg()
        self._writer.add_scalar('performance/HR', hit_ratio, epoch_id)
        self._writer.add_scalar('performance/NDCG', ndcg, epoch_id)
        # Fixed typo in log message ('Evluating' -> 'Evaluating').
        print('[Evaluating Epoch {}] HR = {:.4f}, NDCG = {:.4f}'.format(epoch_id, hit_ratio, ndcg))
        return hit_ratio, ndcg

    def save(self, alias, epoch_id, hit_ratio, ndcg):
        """Checkpoint the model; the path template encodes alias/epoch/metrics."""
        assert hasattr(self, 'model'), 'Please specify the exact model !'
        model_dir = self.config['model_dir'].format(alias, epoch_id, hit_ratio, ndcg)
        save_checkpoint(self.model, model_dir)
174 changes: 174 additions & 0 deletions experiments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
import pandas as pd
import numpy as np
from gmf import GMFEngine
from mlp import MLPEngine
from neumf import NeuMFEngine
from data import SampleGenerator
import threading


import os
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)

def _mlp_config(alias, layers):
    """Build one MLP experiment config.

    The five experiment variants differ only in their alias and MLP tower
    (`layers`); everything else is shared, so build it in one place instead
    of duplicating the 17-key dict five times.
    """
    return {'alias': alias,
            'num_epoch': 50,
            'batch_size': 256,  # 1024,
            'optimizer': 'adam',
            'adam_lr': 1e-3,
            'num_users': 6040,
            'num_items': 3706,
            'latent_dim': 8,
            'num_negative': 4,
            'layers': layers,  # layers[0] is the concat of latent user vector & latent item vector
            'l2_regularization': 0.0000001,  # MLP model is sensitive to hyper params
            'weight_init_gaussian': True,
            'use_cuda': False,
            'device_id': 0,
            'pretrain': False,
            'pretrain_mf': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
            'model_dir': 'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}


# Depth sweep: 0 to 4 hidden layers on top of the 16-dim concat embedding.
mlp0_config = _mlp_config('mlp_0', [16])
mlp1_config = _mlp_config('mlp_1', [16, 8])
mlp2_config = _mlp_config('mlp_2', [16, 16, 8])
mlp3_config = _mlp_config('mlp_3', [16, 32, 16, 8])
mlp4_config = _mlp_config('mlp_4', [16, 64, 32, 16, 8])


# Accumulator for per-epoch metrics across all experiments.
result_exp = pd.DataFrame(columns=['alias', 'epoch', 'hit_ratio', 'ndcg'])


def train_model(engine, config, model_name, result_exp):
    """Train `engine` for config['num_epoch'] epochs and collect metrics.

    Uses the module-level `ml1m_rating` frame to build the sample generator.
    Each epoch is trained, evaluated (HR/NDCG) and checkpointed.

    Returns `result_exp` with one appended row per epoch
    (columns: alias, epoch, hit_ratio, ndcg).
    """
    sample_generator = SampleGenerator(ratings=ml1m_rating)
    evaluate_data = sample_generator.evaluate_data
    print(f"Training {model_name} starts!")

    # Collect rows in a list and concat once at the end: concatenating inside
    # the loop re-copies the growing frame every epoch (quadratic cost).
    records = []
    for epoch in range(config['num_epoch']):
        print(f'Epoch {epoch} for {model_name} starts!')
        print('-' * 80)
        train_loader = sample_generator.instance_a_train_loader(config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=epoch)
        engine.save(config['alias'], epoch, hit_ratio, ndcg)
        records.append({'alias': config['alias'],
                        'epoch': epoch,
                        'hit_ratio': hit_ratio,
                        'ndcg': ndcg})
    print(f"Training {model_name} finished!")
    if not records:
        return result_exp
    return pd.concat([result_exp, pd.DataFrame(records)], ignore_index=True)


# Load Data
ml1m_dir = 'data/ml-1m/ratings.dat'
ml1m_rating = pd.read_csv(ml1m_dir, sep='::', header=None, names=['uid', 'mid', 'rating', 'timestamp'], engine='python')
# Reindex raw MovieLens ids to dense 0-based userId / itemId
# (embedding tables require contiguous indices).
user_id = ml1m_rating[['uid']].drop_duplicates()  # dropped no-op .reindex()
user_id['userId'] = np.arange(len(user_id))
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]
print('Range of userId is [{}, {}]'.format(ml1m_rating.userId.min(), ml1m_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating.itemId.min(), ml1m_rating.itemId.max()))


# Train each MLP depth variant in turn, accumulating metrics in result_exp.
mlp0_engine = MLPEngine(mlp0_config)
result_exp = train_model(mlp0_engine, mlp0_config, "MLP", result_exp)

mlp1_engine = MLPEngine(mlp1_config)
result_exp = train_model(mlp1_engine, mlp1_config, "MLP", result_exp)

mlp2_engine = MLPEngine(mlp2_config)
result_exp = train_model(mlp2_engine, mlp2_config, "MLP", result_exp)

mlp3_engine = MLPEngine(mlp3_config)
result_exp = train_model(mlp3_engine, mlp3_config, "MLP", result_exp)

mlp4_engine = MLPEngine(mlp4_config)
result_exp = train_model(mlp4_engine, mlp4_config, "MLP", result_exp)

result_exp.to_csv('./result_exp.csv')


# Print the final (last-epoch) metrics of each model. Looping also fixes the
# original bug where the last stanza filtered on 'mlp_14' instead of 'mlp_4'
# and therefore printed an empty frame.
for alias in ('mlp_0', 'mlp_1', 'mlp_2', 'mlp_3', 'mlp_4'):
    print(f'{alias} metric:')
    print(result_exp[result_exp['alias'] == alias].tail(1))
Loading