Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions abl_study.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from experiment import experiment
import pandas as pd


factor_nums = [8]
layer_nums = [0, 1, 2, 3, 4]

# Ablation over MLP tower depth (Table 3). For each depth `layer_num` the
# layer widths form a halving pyramid ending at factor_num,
# e.g. layer_num=2 -> [32, 16, 8]; embed_size is half the widest layer.
rows = []
for factor_num in factor_nums:
    for layer_num in layer_nums:
        layers = [factor_num * (2 ** i) for i in range(layer_num + 1)][::-1]
        HR, _ = experiment(method="MLP",
                           embed_size=factor_num * (2 ** layer_num) // 2,
                           layers=layers,
                           epochs=10)
        rows.append({"factor_num": factor_num, "layer_num": layer_num, "HR@10": HR})
        # Progress log per configuration (the original print sat outside
        # both loops and therefore only reported the final layer_num).
        print("layer_num=: ", layer_num)

# Build the table once at the end: pd.concat inside the loop copies the
# whole frame each iteration and warns when concatenating an empty frame.
df_table3 = pd.DataFrame(rows, columns=["factor_num", "layer_num", "HR@10"])
df_table3.to_csv("table3.csv", index=False)
8 changes: 8 additions & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Download URL for the MovieLens-100K ratings file.
DATA_URL = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"

# Project root directory; data and model paths below are built from it.
MAIN_PATH = '/root/DataPrac/as/as3/Neural-Collaborative-Filtering/'

# MovieLens-1M ratings file and the directory for saved model checkpoints.
DATA_PATH = f"{MAIN_PATH}data/ml-1m/ratings.dat"
MODEL_PATH = f"{MAIN_PATH}models/"

# Base name used when saving the trained NeuMF model.
MODEL = 'ml-1m_Neu_MF'
117 changes: 117 additions & 0 deletions data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import random
import numpy as np
import pandas as pd
import torch
import config

class NCF_Data(object):
    """
    Construct train/test Datasets and DataLoaders for NCF.

    `ratings` must have columns user_id, item_id, rating, timestamp;
    `args` must provide num_ng, num_ng_test, batch_size, seed.
    """
    def __init__(self, args, ratings):
        self.ratings = ratings
        self.num_ng = args.num_ng            # negatives per positive (training)
        self.num_ng_test = args.num_ng_test  # negatives per user (evaluation)
        self.batch_size = args.batch_size

        # Seed BEFORE any sampling so negative sampling is reproducible
        # (originally the seed was set after _negative_sampling had run).
        random.seed(args.seed)

        self.preprocess_ratings = self._reindex(self.ratings)

        # Pools of *reindexed* ids — these must match the ids used by the
        # train/test splits and the negative samples.
        self.user_pool = set(self.preprocess_ratings['user_id'].unique())
        self.item_pool = set(self.preprocess_ratings['item_id'].unique())

        self.train_ratings, self.test_ratings = self._leave_one_out(self.preprocess_ratings)
        self.negatives = self._negative_sampling(self.preprocess_ratings)

    def _reindex(self, ratings):
        """
        Reindex user_id/item_id to dense 0-based ids and binarize ratings.

        Operates on a copy so the caller's DataFrame is left untouched
        (the original mutated `ratings` in place).
        """
        ratings = ratings.copy()

        user_list = list(ratings['user_id'].drop_duplicates())
        user2id = {w: i for i, w in enumerate(user_list)}

        item_list = list(ratings['item_id'].drop_duplicates())
        item2id = {w: i for i, w in enumerate(item_list)}

        ratings['user_id'] = ratings['user_id'].apply(lambda x: user2id[x])
        ratings['item_id'] = ratings['item_id'].apply(lambda x: item2id[x])
        ratings['rating'] = ratings['rating'].apply(lambda x: float(x > 0))
        return ratings

    def _leave_one_out(self, ratings):
        """
        leave-one-out evaluation protocol in paper https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf
        Each user's most recent interaction (by timestamp) becomes the test row.
        """
        ratings['rank_latest'] = ratings.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)
        test = ratings.loc[ratings['rank_latest'] == 1]
        train = ratings.loc[ratings['rank_latest'] > 1]
        assert train['user_id'].nunique() == test['user_id'].nunique(), 'Not Match Train User with Test User'
        return train[['user_id', 'item_id', 'rating']], test[['user_id', 'item_id', 'rating']]

    def _negative_sampling(self, ratings):
        """Per user: the set of non-interacted items plus a fixed eval sample."""
        interact_status = (
            ratings.groupby('user_id')['item_id']
            .apply(set)
            .reset_index()
            .rename(columns={'item_id': 'interacted_items'}))
        interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
        # random.sample() no longer accepts sets (removed in Python 3.11);
        # sorting also gives a deterministic population order under the seed.
        interact_status['negative_samples'] = interact_status['negative_items'].apply(
            lambda x: random.sample(sorted(x), self.num_ng_test))
        return interact_status[['user_id', 'negative_items', 'negative_samples']]

    def get_train_instance(self):
        """DataLoader of (user, item, rating); num_ng fresh negatives per positive."""
        users, items, ratings = [], [], []
        train_ratings = pd.merge(self.train_ratings, self.negatives[['user_id', 'negative_items']], on='user_id')
        # Resampled on every call; sorted() for the same py3.11 reason as above.
        train_ratings['negatives'] = train_ratings['negative_items'].apply(
            lambda x: random.sample(sorted(x), self.num_ng))
        for row in train_ratings.itertuples():
            users.append(int(row.user_id))
            items.append(int(row.item_id))
            ratings.append(float(row.rating))
            for i in range(self.num_ng):
                users.append(int(row.user_id))
                items.append(int(row.negatives[i]))
                ratings.append(float(0))  # negative samples get 0 rating
        dataset = Rating_Datset(
            user_list=users,
            item_list=items,
            rating_list=ratings)
        return torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def get_test_instance(self):
        """DataLoader where each batch is one user: 1 positive + num_ng_test negatives."""
        users, items, ratings = [], [], []
        test_ratings = pd.merge(self.test_ratings, self.negatives[['user_id', 'negative_samples']], on='user_id')
        for row in test_ratings.itertuples():
            users.append(int(row.user_id))
            items.append(int(row.item_id))
            ratings.append(float(row.rating))
            for i in row.negative_samples:
                users.append(int(row.user_id))
                items.append(int(i))
                ratings.append(float(0))
        dataset = Rating_Datset(
            user_list=users,
            item_list=items,
            rating_list=ratings)
        return torch.utils.data.DataLoader(dataset, batch_size=self.num_ng_test + 1, shuffle=False, num_workers=4)


class Rating_Datset(torch.utils.data.Dataset):
    """Wrap three parallel lists (users, items, ratings) as a torch Dataset.

    Each sample is a (user, item, rating) tensor triple with dtypes
    (long, long, float) as expected by the embedding layers and BCELoss.
    """

    def __init__(self, user_list, item_list, rating_list):
        super().__init__()
        self.user_list = user_list
        self.item_list = item_list
        self.rating_list = rating_list

    def __len__(self):
        # All three lists are the same length by construction.
        return len(self.user_list)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.user_list[idx], dtype=torch.long),
            torch.tensor(self.item_list[idx], dtype=torch.long),
            torch.tensor(self.rating_list[idx], dtype=torch.float),
        )
44 changes: 44 additions & 0 deletions evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import numpy as np
import torch


def hit(ng_item, pred_items):
    """Hit ratio for one user: 1 if the held-out item was recommended, else 0."""
    return int(ng_item in pred_items)


def ndcg(ng_item, pred_items):
    """NDCG for a single positive item: 1/log2(rank + 2), or 0 if absent.

    With exactly one relevant item the ideal DCG is 1, so the discounted
    gain at the item's 0-based rank is the full score.
    """
    if ng_item not in pred_items:
        return 0
    rank = pred_items.index(ng_item)
    return np.reciprocal(np.log2(rank + 2))


def metrics(model, test_loader, top_k, device):
    """Mean HR@top_k and NDCG@top_k over a leave-one-out test loader.

    Each batch from `test_loader` holds a single user: the positive item
    first, followed by that user's sampled negatives.
    """
    hits, ndcgs = [], []

    for user, item, _ in test_loader:
        user = user.to(device)
        item = item.to(device)

        scores = model(user, item)

        # Rank all candidate items of this user by predicted score.
        _, top_idx = torch.topk(scores, top_k, dim=0)
        recommends = torch.take(item, top_idx).cpu().numpy().T.tolist()

        # GMF/MLP scores are (B, 1), so the list comes back nested as
        # [[...]]; unwrap to a flat list of item ids.
        if isinstance(recommends[0], list):
            recommends = recommends[0]

        # Leave-one-out: the first entry of the batch is the positive item.
        pos_item = item[0].item()
        hits.append(hit(pos_item, recommends))
        ndcgs.append(ndcg(pos_item, recommends))

    return np.mean(hits), np.mean(ndcgs)
125 changes: 125 additions & 0 deletions experiment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import os
import time
import argparse
import pandas as pd
import numpy as np


import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data


import model as Model
import config
import util
import data_utils
import evaluate



def experiment(method='NeuMF',
               seed=42,
               lr=0.001,
               dropout=0.2,
               batch_size=256,
               epochs=30,
               top_k=10,
               embed_size=32,
               layers=[64,32,16,8],
               num_ng=4,
               num_ng_test=100,
               out=True):
    """Train one NCF variant on MovieLens-1M and evaluate with HR/NDCG.

    Parameters mirror the original NCF paper's hyper-parameters; `method`
    selects the model ('GMF', 'MLP' or 'NeuMF'). Returns a tuple
    (best_hr, best_ndcg) — the best metrics seen over all epochs.
    Raises ValueError for an unknown `method` (the original silently fell
    through and crashed later with NameError on `model`).
    """
    if method not in ("GMF", "MLP", "NeuMF"):
        raise ValueError(f"unknown method: {method!r}; expected GMF, MLP or NeuMF")

    # set device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    util.seed_everything(seed)

    # Bundle hyper-parameters into a namespace; downstream code (models,
    # data pipeline) expects attribute-style access. Copy `layers` so the
    # shared mutable default can never be aliased by callees.
    args = argparse.Namespace(
        method=method, seed=seed, lr=lr, dropout=dropout,
        batch_size=batch_size, epochs=epochs, top_k=top_k,
        embed_size=embed_size, layers=list(layers),
        num_ng=num_ng, num_ng_test=num_ng_test, out=out)

    # MovieLens-1M uses '::' separators; engine='python' is required for a
    # multi-character sep.
    ml_1m = pd.read_csv(
        config.DATA_PATH,
        sep="::",
        names = ['user_id', 'item_id', 'rating', 'timestamp'],
        engine='python')

    # +1 so the embedding tables cover the largest reindexed id.
    num_users = ml_1m['user_id'].nunique()+1
    num_items = ml_1m['item_id'].nunique()+1

    data = data_utils.NCF_Data(args, ml_1m)
    train_loader = data.get_train_instance()
    test_loader = data.get_test_instance()

    # model, loss function, optimizer
    if method == "GMF":
        model = Model.Generalized_Matrix_Factorization(args, num_users, num_items)
    elif method == "MLP":
        model = Model.Multi_Layer_Perceptron(args, num_users, num_items)
    else:  # NeuMF
        model = Model.NeuMF(args, num_users, num_items)
    model = model.to(device)
    loss_function = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # train, evaluation
    best_hr = 0
    best_ndcg = 0
    for epoch in range(1, args.epochs+1):
        model.train()

        for user, item, label in train_loader:
            user = user.to(device)
            item = item.to(device)
            label = label.to(device)

            optimizer.zero_grad()
            prediction = model(user, item)

            # GMF/MLP emit shape (B, 1); squeeze to match label shape (B,).
            if args.method == 'GMF' or args.method == 'MLP':
                prediction = prediction.squeeze()

            loss = loss_function(prediction, label)
            loss.backward()
            optimizer.step()

        model.eval()
        # Evaluation needs no gradient graph — saves memory and time.
        with torch.no_grad():
            HR, NDCG = evaluate.metrics(model, test_loader, args.top_k, device)
        print("HR = {:.3f}, NDCG = {:.3f}".format(HR, NDCG))
        if HR > best_hr: best_hr = HR
        if NDCG > best_ndcg: best_ndcg = NDCG

    print("Best HR = {:.3f}, NDCG = {:.3f}".format(best_hr, best_ndcg))
    return best_hr, best_ndcg
3 changes: 3 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from experiment import experiment

# Entry-point guard: importing this module must not launch training.
if __name__ == "__main__":
    # Train the full NeuMF model for 10 epochs with default hyper-parameters.
    experiment(method="NeuMF", epochs=10)
Loading