diff --git a/configs/train/siren_train_config.json b/configs/train/siren_train_config.json new file mode 100644 index 00000000..23223e5f --- /dev/null +++ b/configs/train/siren_train_config.json @@ -0,0 +1,167 @@ +{ + "experiment_name": "siren_ml1m_test", + "dataset": { + "type": "negative_ratings_graph", + "positive_domain": "positive", + "graph_dir_path": "../data/ml-1m", + "dataset": { + "type": "negative_rating", + "positive_domain": "positive", + "negative_domain": "negative", + "path_to_data_dir": "../data", + "name": "ml-1m", + "max_sequence_length": 50, + "samplers": { + "num_negatives_val": 100, + "type": "negative_ratings_next_item_prediction", + "negative_sampler_type": "negative_ratings_negative_sampler", + "negative_sampler_type_graph": "random", + "positive_domain": "positive", + "offset": 4 + } + } + }, + "dataloader": { + "train": { + "type": "torch", + "batch_size": 256, + "batch_processor": { + "type": "negative_batch" + }, + "drop_last": true, + "shuffle": true + }, + "validation": { + "type": "torch", + "batch_size": 256, + "batch_processor": { + "type": "negative_batch" + }, + "drop_last": false, + "shuffle": false + } + }, + "model": { + "type": "siren", + "user_prefix": "user.graph", + "positive_prefix": "positive.graph", + "negative_prefix": "negative.graph", + "candidate_prefix": "candidates.graph", + "mlp_layers": 2, + "embedding_dim": 64, + "num_layers": 3, + "dropout": 0.5, + "initializer_range": 0.02 + }, + "optimizer": { + "type": "basic", + "optimizer": { + "type": "adam", + "lr": 0.001 + } + }, + "loss": { + "type": "composite", + "losses": [ + { + "type": "bpr", + "positive_prefix": "positive_scores", + "negative_prefix": "negative_scores", + "output_prefix": "lightgcn_downstream_loss" + }, + { + "type": "regularization", + "prefix": [ + "item_embeddings" + ], + "weight": 1e-2, + "output_prefix": "lightgcn_regularization_loss" + } + ], + "output_prefix": "lightgcn_loss" + }, + "callback": { + "type": "composite", + "callbacks": [ + 
class NegativeBatchProcessor(BaseBatchProcessor, config_name='negative_batch'):
    """Collate a list of sample dicts into flat tensors, tolerating missing fields.

    Every ``<prefix>.ids`` entry is concatenated over the batch into one flat
    list and the matching ``<prefix>.length`` values are collected alongside
    it. A sample may lack a given ``<prefix>.ids`` field (e.g. a user without
    negative interactions); such a sample contributes nothing to that field.
    """

    def __call__(self, batch):
        """Return ``{key: tensor}`` for all ``*.ids`` / ``*.length`` fields in ``batch``.

        ``ratings.ids`` is converted to a float tensor; every other field to a
        long tensor. Raises ``KeyError`` if a sample has ``<prefix>.ids`` but
        no ``<prefix>.length``.
        """
        # Discover the fields over the WHOLE batch (first-seen order). Looking
        # only at batch[0] would drop an optional field for every sample
        # whenever the first sample happens not to have it.
        ids_keys = []
        known_keys = set()
        for sample in batch:
            for key in sample:
                if key.endswith('.ids') and key not in known_keys:
                    known_keys.add(key)
                    ids_keys.append(key)

        processed_batch = {}
        for ids_key in ids_keys:
            prefix = ids_key[:-len('.ids')]
            length_key = f'{prefix}.length'

            flat_ids, lengths = [], []
            for sample in batch:
                # The field may be absent for users without such interactions.
                if ids_key in sample:
                    flat_ids.extend(sample[ids_key])
                    lengths.append(sample[length_key])
            # NOTE(review): samples lacking the field add no length entry, so
            # the length tensor can be shorter than the batch — confirm that
            # downstream consumers expect this.

            processed_batch[ids_key] = flat_ids
            processed_batch[length_key] = lengths

        return {
            part: torch.tensor(values, dtype=torch.float if part == 'ratings.ids' else torch.long)
            for part, values in processed_batch.items()
        }
max(max_item_idx, validation_max_item_idx) + max_user_idx, max_item_idx = max(max_user_idx, validation_max_user_idx), max(max_item_idx, + validation_max_item_idx) max_sequence_length = max(max_sequence_length, validation_max_sequence_length) test_dataset, test_max_user_idx, test_max_item_idx, test_max_sequence_length = cls._create_dataset( @@ -213,18 +212,21 @@ def create_from_config(cls, config, **kwargs): max_user_idx_by_domain, max_item_idx_by_domain = {}, {} for domain in domains: - train_dataset[domain], train_max_user_idx, train_max_item_idx, train_max_sequence_length = cls._create_dataset( + train_dataset[ + domain], train_max_user_idx, train_max_item_idx, train_max_sequence_length = cls._create_dataset( os.path.join(data_dir_path, domain), 'train_new', config['max_sequence_length'] ) max_user_idx, max_item_idx = max(max_user_idx, train_max_user_idx), max(max_item_idx, train_max_item_idx) max_sequence_length = max(max_sequence_length, train_max_sequence_length) - - validation_dataset[domain], validation_max_user_idx, validation_max_item_idx, validation_max_sequence_length = cls._create_dataset( + + validation_dataset[ + domain], validation_max_user_idx, validation_max_item_idx, validation_max_sequence_length = cls._create_dataset( os.path.join(data_dir_path, domain), 'validation_new', config['max_sequence_length'] ) - max_user_idx, max_item_idx = max(max_user_idx, validation_max_user_idx), max(max_item_idx, validation_max_item_idx) + max_user_idx, max_item_idx = max(max_user_idx, validation_max_user_idx), max(max_item_idx, + validation_max_item_idx) max_sequence_length = max(max_sequence_length, validation_max_sequence_length) - + test_dataset[domain], test_max_user_idx, test_max_item_idx, test_max_sequence_length = cls._create_dataset( os.path.join(data_dir_path, domain), 'test_new', config['max_sequence_length'] ) @@ -238,33 +240,34 @@ def create_from_config(cls, config, **kwargs): logger.info('Max item idx: {}'.format(max_item_idx)) for domain in domains: 
logger.info('{} domain dataset sparsity: {}'.format( - domain, (len(train_dataset[domain]) + len(test_dataset[domain])) / max_user_idx_by_domain[domain] / max_item_idx_by_domain[domain] + domain, (len(train_dataset[domain]) + len(test_dataset[domain])) / max_user_idx_by_domain[domain] / + max_item_idx_by_domain[domain] )) # TODO replace unodomain samplers with multidomain ones train_sampler = TrainSampler.create_from_config( - dict(config['samplers'], + dict(config['samplers'], **{'target_domain': target_domain, 'other_domains': other_domains - }), + }), dataset=train_dataset, num_users=max_user_idx, num_items=max_item_idx ) validation_sampler = ValidationSampler.create_from_config( - dict(config['samplers'], + dict(config['samplers'], **{'target_domain': target_domain, 'other_domains': other_domains - }), + }), dataset=validation_dataset, num_users=max_user_idx, num_items=max_item_idx ) test_sampler = EvalSampler.create_from_config( - dict(config['samplers'], + dict(config['samplers'], **{'target_domain': target_domain, 'other_domains': other_domains - }), + }), dataset=test_dataset, num_users=max_user_idx, num_items=max_item_idx @@ -282,6 +285,7 @@ def create_from_config(cls, config, **kwargs): ) +# TODO скопировать весь этот код и везде где надо вызывать код от positive_domain, как в siren class GraphDataset(BaseDataset, config_name='graph'): def __init__( @@ -324,7 +328,6 @@ def __init__( visited_user_item_pairs.add((user_id, item_id)) - # TODO create separated function if not self._use_train_data_only: for sample in validation_sampler.dataset: user_id = sample['user.ids'][0] @@ -405,7 +408,8 @@ def __init__( # (user, user) graph user2user_connections = csr_matrix( ( - np.ones(len(user2user_interactions_fst)), (user2user_interactions_fst, user2user_interactions_snd)), + np.ones(len(user2user_interactions_fst)), + (user2user_interactions_fst, user2user_interactions_snd)), shape=(self._num_users + 2, self._num_users + 2) ) @@ -447,7 +451,8 @@ def __init__( # 
class NegativeRatingsGraphDataset(BaseDataset, config_name='negative_ratings_graph'):
    """Wrap a dataset and attach a symmetrically normalized user-item graph.

    The graph is built from the unique positive (user, item) pairs yielded by
    the wrapped dataset's samplers and is cached on disk as
    ``general_negative_graph.npz`` inside ``graph_dir_path``.
    """

    def __init__(
            self,
            dataset,
            graph_dir_path,
            positive_domain,
            use_train_data_only=True,
            use_user_graph=False,
            use_item_graph=False
    ):
        """Collect interactions, then load or build the normalized graph.

        :param dataset: wrapped dataset exposing ``num_users``, ``num_items``,
            ``get_samplers()`` and ``meta``.
        :param graph_dir_path: directory used to cache the graph file.
        :param positive_domain: stored on the instance (not used here).
        :param use_train_data_only: when False, validation and test samplers
            also contribute interactions.
        :param use_user_graph: stored on the instance (not used here).
        :param use_item_graph: stored on the instance (not used here).
        """
        self._positive_domain = positive_domain
        self._dataset = dataset
        self._graph_dir_path = graph_dir_path
        self._use_train_data_only = use_train_data_only
        self._use_user_graph = use_user_graph
        self._use_item_graph = use_item_graph

        self._num_users = dataset.num_users
        self._num_items = dataset.num_items

        train_sampler, validation_sampler, test_sampler = dataset.get_samplers()

        samplers = [train_sampler]
        if not self._use_train_data_only:
            samplers.extend([validation_sampler, test_sampler])

        interactions, user_ids, item_ids = self._collect_unique_interactions(samplers)

        self._train_interactions = np.array(interactions)
        self._train_user_interactions = np.array(user_ids)
        self._train_item_interactions = np.array(item_ids)

        # NOTE(review): the cache file name encodes neither the dataset nor
        # use_train_data_only, so a stale file is reused as-is — confirm.
        path_to_graph = os.path.join(graph_dir_path, 'general_negative_graph.npz')
        if os.path.exists(path_to_graph):
            graph = sp.load_npz(path_to_graph)
        else:
            # place ones only when co-occurrence happens
            user2item_connections = csr_matrix(
                (np.ones(len(user_ids)), (user_ids, item_ids)),
                shape=(self._num_users + 2, self._num_items + 2)
            )  # (num_users + 2, num_items + 2), bipartite graph
            graph = self.get_sparse_graph_layer(
                user2item_connections,
                self._num_users + 2,
                self._num_items + 2,
                biparite=True
            )
            # Make sure the cache directory exists before writing into it.
            os.makedirs(os.path.dirname(path_to_graph) or '.', exist_ok=True)
            sp.save_npz(path_to_graph, graph)

        self._graph = self._convert_sp_mat_to_sp_tensor(graph).coalesce().to(DEVICE)

    @staticmethod
    def _collect_unique_interactions(samplers):
        """Return ``(pairs, users, items)`` of first-seen unique positive interactions.

        Each sample must provide ``'user.positive.ids'`` (first element is the
        user id) and ``'item.positive.ids'`` (iterable of item ids).
        """
        pairs, users, items = [], [], []
        visited = set()
        for sampler in samplers:
            for sample in sampler:
                user_id = sample['user.positive.ids'][0]
                for item_id in sample['item.positive.ids']:
                    pair = (user_id, item_id)
                    if pair in visited:
                        continue
                    visited.add(pair)
                    pairs.append(pair)
                    users.append(user_id)
                    items.append(item_id)
        return pairs, users, items

    @classmethod
    def create_from_config(cls, config):
        """Build the wrapped dataset from ``config['dataset']`` and wrap it.

        ``use_train_data_only``, ``use_user_graph`` and ``use_item_graph`` are
        optional config keys that default to the constructor defaults.
        """
        dataset = BaseDataset.create_from_config(config['dataset'])
        return cls(
            dataset=dataset,
            graph_dir_path=config['graph_dir_path'],
            positive_domain=config['positive_domain'],
            use_train_data_only=config.get('use_train_data_only', True),
            use_user_graph=config.get('use_user_graph', False),
            use_item_graph=config.get('use_item_graph', False)
        )

    @staticmethod
    def get_sparse_graph_layer(sparse_matrix, fst_dim, snd_dim, biparite=False):
        """Return ``D^(-0.5) @ A @ D^(-0.5)`` as a CSR matrix.

        With ``biparite=True`` the adjacency ``A`` is the square
        ``(fst_dim + snd_dim)`` block matrix ``[[0, R], [R.T, 0]]`` built from
        ``sparse_matrix`` (``R``); otherwise ``A`` is ``sparse_matrix`` itself.
        Rows without connections get a zero normalization factor.
        """
        mat_dim_size = fst_dim + snd_dim if biparite else fst_dim

        R = sparse_matrix.tolil()  # list of lists (fst_dim, snd_dim)

        if biparite:
            adj_mat = sp.dok_matrix(
                (mat_dim_size, mat_dim_size),
                dtype=np.float32
            ).tolil()
            adj_mat[:fst_dim, fst_dim:] = R  # (num_users, num_items)
            adj_mat[fst_dim:, :fst_dim] = R.T  # (num_items, num_users)
        else:
            adj_mat = R

        adj_mat = adj_mat.todok()

        edges_degree = np.array(adj_mat.sum(axis=1))  # D

        # Zero-degree rows produce inf here; silence the warning and zero them.
        with np.errstate(divide='ignore'):
            d_inv = np.power(edges_degree, -0.5).flatten()  # D^(-0.5)
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)  # make it square matrix

        # D^(-0.5) @ A @ D^(-0.5)
        norm_adj = d_mat.dot(adj_mat).dot(d_mat)

        return norm_adj.tocsr()

    @staticmethod
    def _convert_sp_mat_to_sp_tensor(X):
        """Convert a SciPy sparse matrix into a float32 torch sparse COO tensor."""
        coo = X.tocoo().astype(np.float32)
        # Build indices as int64 directly: going through a float32 tensor
        # (torch.Tensor(...).long()) corrupts indices above 2**24.
        index = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
        data = torch.as_tensor(coo.data, dtype=torch.float32)
        return torch.sparse_coo_tensor(index, data, torch.Size(coo.shape))

    @property
    def num_users(self):
        """Number of users reported by the wrapped dataset."""
        return self._dataset.num_users

    @property
    def num_items(self):
        """Number of items reported by the wrapped dataset."""
        return self._dataset.num_items

    def get_samplers(self):
        """Return the wrapped dataset's (train, validation, test) samplers."""
        return self._dataset.get_samplers()

    @property
    def meta(self):
        """Wrapped dataset's meta extended with the ``'graph'`` sparse tensor."""
        meta = {
            'graph': self._graph,
            **self._dataset.meta
        }
        return meta
len(item_ids[:-2][-max_sequence_length:]) == len(set(item_ids[:-2][-max_sequence_length:])) + # assert len(item_ids[:-2][-max_sequence_length:]) == len(set(item_ids[:-2][-max_sequence_length:])) validation_dataset.append({ 'user.ids': [user_idx], 'user.length': 1, 'item.ids': item_ids[:-1][-max_sequence_length:], 'item.length': len(item_ids[:-1][-max_sequence_length:]) }) - assert len(item_ids[:-1][-max_sequence_length:]) == len(set(item_ids[:-1][-max_sequence_length:])) + # assert len(item_ids[:-1][-max_sequence_length:]) == len(set(item_ids[:-1][-max_sequence_length:])) test_dataset.append({ 'user.ids': [user_idx], 'user.length': 1, 'item.ids': item_ids[-max_sequence_length:], 'item.length': len(item_ids[-max_sequence_length:]) }) - assert len(item_ids[-max_sequence_length:]) == len(set(item_ids[-max_sequence_length:])) + # assert len(item_ids[-max_sequence_length:]) == len(set(item_ids[-max_sequence_length:])) logger.info('Train dataset size: {}'.format(len(train_dataset))) logger.info('Test dataset size: {}'.format(len(test_dataset))) @@ -779,33 +959,34 @@ def create_from_config(cls, config, **kwargs): logger.info('{} domain Train dataset size: {}'.format(domain, len(train_dataset[domain]))) logger.info('{} domain Test dataset size: {}'.format(domain, len(test_dataset[domain]))) logger.info('{} domain dataset sparsity: {}'.format( - domain, (len(train_dataset[domain]) + len(test_dataset[domain])) / max_user_idx_by_domain[domain] / max_item_idx_by_domain[domain] + domain, (len(train_dataset[domain]) + len(test_dataset[domain])) / max_user_idx_by_domain[domain] / + max_item_idx_by_domain[domain] )) # TODO replace unodomain samplers with multidomain ones train_sampler = TrainSampler.create_from_config( - dict(config['samplers'], + dict(config['samplers'], **{'target_domain': target_domain, 'other_domains': other_domains - }), + }), dataset=train_dataset, num_users=max_user_idx, num_items=max_item_idx ) validation_sampler = ValidationSampler.create_from_config( - 
class NegativeRatingsScientificDataset(ScientificDataset, config_name='negative_rating'):
    """Leave-one-out dataset with separate positive and negative rating domains.

    Each domain is read from ``<path_to_data_dir>/<name>/<domain>_data.txt``,
    where every user occupies two consecutive lines: ``user_id item_id ...``
    followed by the space-separated ratings of those items.
    """

    def __init__(
            self,
            train_sampler,
            validation_sampler,
            test_sampler,
            num_users,
            num_items,
            max_sequence_length,
            positive_domain,
            negative_domain,
            negative_items_popularity
    ):
        super().__init__(train_sampler, validation_sampler, test_sampler, num_users, num_items, max_sequence_length)
        self._positive_domain = positive_domain
        self._negative_domain = negative_domain
        self._negative_items_popularity = negative_items_popularity

    @staticmethod
    def _build_sample(user_idx, item_ids, ratings):
        """Return one sample dict for already-truncated item and rating lists."""
        return {
            'user.ids': [user_idx],
            'user.length': 1,
            'item.ids': item_ids,
            'item.length': len(item_ids),
            'ratings.ids': ratings,
            'ratings.length': len(ratings)
        }

    @classmethod
    def create_from_config(cls, config, **kwargs):
        """Read both domain files, build the three splits and their samplers.

        Raises ``ValueError`` if a data file has an item line without a
        following ratings line.
        """
        data_dir_path = os.path.join(config['path_to_data_dir'], config['name'])
        positive_domain = config['positive_domain']
        negative_domain = config['negative_domain']
        domains = [positive_domain, negative_domain]
        max_sequence_length = config['max_sequence_length']
        max_user_idx, max_item_idx = 0, 0

        train_dataset, validation_dataset, test_dataset = {}, {}, {}
        max_user_idx_by_domain, max_item_idx_by_domain = {}, {}

        # Counters are updated in place; rebuilding a dict over the whole key
        # union for every input line was accidentally quadratic.
        negative_count_items = Counter()
        all_count_items = Counter()
        for domain in domains:
            dataset_path = os.path.join(data_dir_path, domain + '_data.txt')
            train_dataset[domain], validation_dataset[domain], test_dataset[domain] = [], [], []
            with open(dataset_path, 'r') as f:
                # Passing the same file iterator twice yields consecutive line pairs.
                for user_items_info, ratings in zip_longest(f, f):
                    if ratings is None:
                        raise ValueError(
                            'Dangling line without ratings in {}'.format(dataset_path)
                        )
                    user_items_info = user_items_info.strip('\n').split(' ')
                    ratings = [float(x) for x in ratings.strip('\n').split(' ')]
                    user_idx = int(user_items_info[0])
                    item_ids = [int(item_id) for item_id in user_items_info[1:]]

                    all_count_items.update(item_ids)
                    if domain == negative_domain:
                        negative_count_items.update(item_ids)

                    max_user_idx = max(max_user_idx, user_idx)
                    max_item_idx = max(max_item_idx, max(item_ids))

                    assert len(item_ids) >= 5
                    assert len(ratings) == len(item_ids)

                    # Train drops the last two interactions, validation the
                    # last one, test keeps all; each is cut to the most recent
                    # max_sequence_length entries.
                    train_dataset[domain].append(cls._build_sample(
                        user_idx,
                        item_ids[:-2][-max_sequence_length:],
                        ratings[:-2][-max_sequence_length:]
                    ))
                    validation_dataset[domain].append(cls._build_sample(
                        user_idx,
                        item_ids[:-1][-max_sequence_length:],
                        ratings[:-1][-max_sequence_length:]
                    ))
                    test_dataset[domain].append(cls._build_sample(
                        user_idx,
                        item_ids[-max_sequence_length:],
                        ratings[-max_sequence_length:]
                    ))

            max_user_idx_by_domain[domain] = max_user_idx
            max_item_idx_by_domain[domain] = max_item_idx

        # Row r of both popularity tensors describes item id r + 1.
        item_id_range = range(1, max_item_idx + 3)

        # A Counter returns 0 for unseen ids, which maps to popularity 0.0.
        # default=1 keeps an empty negative domain from crashing in max().
        max_negative_count_items = max(negative_count_items.values(), default=1)
        negative_items_popularity = torch.tensor(
            [[(negative_count_items[i] / max_negative_count_items) ** 0.5] for i in item_id_range]
        ).to(DEVICE)

        all_items_popularity = torch.tensor(
            [[all_count_items[i] ** 0.75] for i in item_id_range]
        ).to(DEVICE)

        logger.info('Max user idx: {}'.format(max_user_idx))
        logger.info('Max item idx: {}'.format(max_item_idx))
        logger.info('Max sequence length: {}'.format(max_sequence_length))
        for domain in domains:
            logger.info('{} domain Train dataset size: {}'.format(domain, len(train_dataset[domain])))
            logger.info('{} domain Test dataset size: {}'.format(domain, len(test_dataset[domain])))
            logger.info('{} domain dataset sparsity: {}'.format(
                domain, (len(train_dataset[domain]) + len(test_dataset[domain])) / max_user_idx_by_domain[domain] /
                        max_item_idx_by_domain[domain]
            ))

        def make_sampler(sampler_cls, split_dataset):
            # A fresh config dict per sampler so none of them can affect another.
            return sampler_cls.create_from_config(
                dict(config['samplers'],
                     **{'positive_domain': positive_domain,
                        'negative_domain': negative_domain
                        }),
                dataset=split_dataset,
                num_users=max_user_idx,
                num_items=max_item_idx,
                items_popularity=all_items_popularity,
                negative_items_popularity=negative_items_popularity,
                positive_domain=positive_domain
            )

        train_sampler = make_sampler(TrainSampler, train_dataset)
        validation_sampler = make_sampler(ValidationSampler, validation_dataset)
        test_sampler = make_sampler(EvalSampler, test_dataset)

        return cls(
            train_sampler=train_sampler,
            validation_sampler=validation_sampler,
            test_sampler=test_sampler,
            num_users=max_user_idx,
            num_items=max_item_idx,
            max_sequence_length=max_sequence_length,
            positive_domain=positive_domain,
            negative_domain=negative_domain,
            negative_items_popularity=negative_items_popularity
        )

    @property
    def meta(self):
        """Dataset sizes plus the negative-item popularity tensor."""
        return {
            'num_users': self.num_users,
            'num_items': self.num_items,
            'max_sequence_length': self.max_sequence_length,
            'negative_items_popularity': self._negative_items_popularity
        }
class NegativeRatingsNegativeSampler(BaseNegRatingsNegativeSampler, config_name='negative_ratings_negative_sampler'):
    """Draw negatives among non-interacted items, weighted by item popularity."""

    @classmethod
    def create_from_config(cls, config, **kwargs):
        """Build the sampler from keyword arguments; ``config`` is not read."""
        return cls(
            dataset=kwargs['dataset'],
            num_users=kwargs['num_users'],
            num_items=kwargs['num_items'],
            items_popularity=kwargs['items_popularity'],
            negative_items_popularity=kwargs['negative_items_popularity'],
            positive_domain=kwargs['positive_domain']
        )

    def generate_negative_samples(self, sample, num_negatives):
        """Return a long tensor of ``num_negatives`` item ids absent from ``sample['item.ids']``.

        Items are drawn with replacement, with probability proportional to
        their popularity weight. If every candidate has zero weight, the draw
        falls back to a uniform distribution.
        """
        # Local import: the enclosing module imports only selected numpy names.
        from numpy import asarray

        users_items = sample['item.ids']
        none_interactions = setdiff1d(self._all_items, users_items)

        # NOTE(review): weights are looked up by raw item id. If the popularity
        # table stores item id ``i`` at row ``i - 1``, this is off by one —
        # confirm against the code that builds ``items_popularity``.
        weights = self._items_popularity[none_interactions]
        if hasattr(weights, 'detach'):
            # Torch tensor (possibly on an accelerator) -> host numpy array.
            weights = weights.detach().cpu().numpy()
        # numpy requires a 1-D ``p``; float64 keeps the normalized sum within
        # the tolerance ``random.choice`` checks.
        weights = asarray(weights, dtype='float64').reshape(-1)

        total = weights.sum()
        probabilities = weights / total if total > 0 else None

        drawn = random.choice(none_interactions, num_negatives, replace=True, p=probabilities)
        return tensor(drawn).long()
[] + running_idx = 0 + while len(negatives) < num_negatives and running_idx < len(all_items): + negative_idx = all_items[running_idx] + if negative_idx not in self._seen_items[user_id]: + negatives.append(negative_idx) + running_idx += 1 + + return negatives diff --git a/modeling/dataset/samplers/__init__.py b/modeling/dataset/samplers/__init__.py index 7e2b7547..7c342c0d 100644 --- a/modeling/dataset/samplers/__init__.py +++ b/modeling/dataset/samplers/__init__.py @@ -2,9 +2,14 @@ from .base import MultiDomainTrainSampler, MultiDomainValidationSampler, MultiDomainEvalSampler from .cl4srec import Cl4SRecTrainSampler, Cl4SRecValidationSampler, Cl4SRecEvalSampler from .duorec import DuorecTrainSampler, DuoRecValidationSampler, DuoRecEvalSampler -from .next_item_prediction import NextItemPredictionTrainSampler, NextItemPredictionValidationSampler, NextItemPredictionEvalSampler -from .next_item_prediction import MultiDomainNextItemPredictionTrainSampler, MultiDomainNextItemPredictionValidationSampler, MultiDomainNextItemPredictionEvalSampler -from .masked_item_prediction import MaskedItemPredictionTrainSampler, MaskedItemPredictionValidationSampler, MaskedItemPredictionEvalSampler +from .next_item_prediction import NextItemPredictionTrainSampler, NextItemPredictionValidationSampler, \ + NextItemPredictionEvalSampler +from .next_item_prediction import MultiDomainNextItemPredictionTrainSampler, \ + MultiDomainNextItemPredictionValidationSampler, MultiDomainNextItemPredictionEvalSampler +from .next_item_prediction import NegativeRatingsTrainSampler, NegativeRatingsValidationSampler, \ + NegativeRatingsEvalSampler +from .masked_item_prediction import MaskedItemPredictionTrainSampler, MaskedItemPredictionValidationSampler, \ + MaskedItemPredictionEvalSampler from .mclsr import MCLSRTrainSampler, MCLSRValidationSampler, MCLSRPredictionEvalSampler from .pop import PopTrainSampler, PopValidationSampler, PopEvalSampler from .s3rec import S3RecPretrainTrainSampler, 
# TODO use this samplers
class NegativeRatingsTrainSampler(TrainSampler):
    """Train sampler base over a per-domain dataset mapping (positive / negative)."""

    def __init__(self, positive_domain, negative_domain):
        super().__init__()
        self._positive_domain = positive_domain
        self._negative_domain = negative_domain

    @property
    def dataset(self):
        """Samples of the positive domain (same default as ``__len__``)."""
        # A property getter cannot receive arguments; per-domain access is
        # available through get_dataset(domain).
        return self.get_dataset()

    def get_dataset(self, domain=None):
        """Return the samples of ``domain`` (positive domain when None)."""
        if domain is None:
            domain = self._positive_domain
        return self._dataset[domain]

    def __len__(self, domain=None):
        """Number of samples in ``domain`` (positive domain when None)."""
        if domain is None:
            return len(self._dataset[self._positive_domain])
        return len(self._dataset[domain])

    def __getitem__(self, index):
        # TODO move here the logic that SiReN runs every 20 epochs; do it
        #  without epochs, per __getitem__ call for a single user:
        # if EPOCH % 20 - 1 == 0: training_dataset.negs_gen_EP(20)
        # training_dataset.edge_4 = training_dataset.edge_4_tot[:, :, EPOCH % 20 - 1]
        # Fetch u, v, w for the incoming index.
        # TODO ?
        raise NotImplementedError


# TODO use this samplers
class NegativeRatingsValidationSampler(ValidationSampler):
    """Validation sampler base over a per-domain dataset mapping."""

    def __init__(self, positive_domain, negative_domain):
        super().__init__()
        self._positive_domain = positive_domain
        self._negative_domain = negative_domain

    @property
    def dataset(self):
        """Samples of the positive domain (same default as ``__len__``)."""
        # A property getter cannot receive arguments; per-domain access is
        # available through get_dataset(domain).
        return self.get_dataset()

    def get_dataset(self, domain=None):
        """Return the samples of ``domain`` (positive domain when None)."""
        if domain is None:
            domain = self._positive_domain
        return self._dataset[domain]

    def __len__(self, domain=None):
        """Number of samples in ``domain`` (positive domain when None)."""
        if domain is None:
            return len(self._dataset[self._positive_domain])
        return len(self._dataset[domain])

    def __getitem__(self, index):
        raise NotImplementedError


# TODO use this samplers
class NegativeRatingsEvalSampler(EvalSampler):
    """Eval sampler that splits each sequence into history and next-item label."""

    def __init__(self,
                 dataset,
                 num_users,
                 num_items,
                 positive_domain,
                 negative_domain
                 ):
        super().__init__(dataset, num_users, num_items)
        self._positive_domain = positive_domain
        self._negative_domain = negative_domain

    def __len__(self, domain=None):
        """Number of samples in ``domain`` (positive domain when None)."""
        if domain is None:
            return len(self._dataset[self._positive_domain])
        return len(self._dataset[domain])

    def __getitem__(self, index, domain=None):
        """Return history and label for sample ``index`` of ``domain``.

        ``domain`` defaults to the positive domain, matching ``__len__``, so
        plain ``sampler[index]`` works. The last item and its rating become
        the label; everything before them is the input sequence.
        """
        if domain is None:
            domain = self._positive_domain
        sample = copy.deepcopy(self._dataset[domain][index])

        item_sequence = sample['item.ids'][:-1]
        ratings_sequence = sample['ratings.ids'][:-1]

        next_item = sample['item.ids'][-1]
        next_item_rating = sample['ratings.ids'][-1]

        return {
            'user.ids': sample['user.ids'],
            'user.length': sample['user.length'],

            'item.ids': item_sequence,
            'item.length': len(item_sequence),

            'ratings.ids': ratings_sequence,
            'ratings.length': len(ratings_sequence),

            'labels.ids': [next_item],
            'labels.ratings': [next_item_rating],
            'labels.length': 1,
        }
self.get_user_id_to_index_cross_domain_mapping() def get_user_id_to_index_cross_domain_mapping(self): - _user_id_to_index_cross_domain_mapping = {domain:{} for domain in self._other_domains} + _user_id_to_index_cross_domain_mapping = {domain: {} for domain in self._other_domains} for domain in self._other_domains: for index, sample in enumerate(self._dataset[domain]): user_id = sample['user.ids'][0] @@ -152,9 +154,9 @@ def create_from_config(cls, config, **kwargs): for domain in domains: kwargs['dataset'] = datasets[domain] negative_sampler[domain] = BaseNegativeSampler.create_from_config( - {'type': config['negative_sampler_type']}, - **kwargs - ) + {'type': config['negative_sampler_type']}, + **kwargs + ) kwargs['dataset'] = datasets return cls( @@ -197,7 +199,8 @@ def __getitem__(self, index): item_sequence = sample['item.ids'] next_item_sequence = sample['item.ids'][1:] - negative_sequence = self._negative_sampler[domain].generate_negative_samples(sample, len(next_item_sequence)) + negative_sequence = self._negative_sampler[domain].generate_negative_samples(sample, + len(next_item_sequence)) assert len(next_item_sequence) == len(negative_sequence) @@ -215,16 +218,17 @@ def __getitem__(self, index): return result -class MultiDomainNextItemPredictionValidationSampler(MultiDomainValidationSampler, config_name='multi_domain_next_item_prediction'): +class MultiDomainNextItemPredictionValidationSampler(MultiDomainValidationSampler, + config_name='multi_domain_next_item_prediction'): def __init__( - self, - dataset, - num_users, - num_items, - target_domain, - other_domains, - negative_sampler, + self, + dataset, + num_users, + num_items, + target_domain, + other_domains, + negative_sampler, num_negatives=100 ): @@ -237,7 +241,7 @@ def __init__( self._user_id_to_index_cross_domain_mapping = self.get_user_id_to_index_cross_domain_mapping() def get_user_id_to_index_cross_domain_mapping(self): - _user_id_to_index_cross_domain_mapping = {domain:{} for domain in 
self._other_domains} + _user_id_to_index_cross_domain_mapping = {domain: {} for domain in self._other_domains} for domain in self._other_domains: for index, sample in enumerate(self._dataset[domain]): user_id = sample['user.ids'][0] @@ -254,9 +258,9 @@ def create_from_config(cls, config, **kwargs): for domain in domains: kwargs['dataset'] = datasets[domain] negative_sampler[domain] = BaseNegativeSampler.create_from_config( - {'type': config['negative_sampler_type']}, - **kwargs - ) + {'type': config['negative_sampler_type']}, + **kwargs + ) kwargs['dataset'] = datasets return cls( @@ -325,19 +329,19 @@ def __getitem__(self, index): class MultiDomainNextItemPredictionEvalSampler(MultiDomainEvalSampler, config_name='multi_domain_next_item_prediction'): def __init__( - self, - dataset, - num_users, - num_items, - target_domain, + self, + dataset, + num_users, + num_items, + target_domain, other_domains ): - + super().__init__(dataset, num_users, num_items, target_domain, other_domains) self._user_id_to_index_cross_domain_mapping = self.get_user_id_to_index_cross_domain_mapping() def get_user_id_to_index_cross_domain_mapping(self): - _user_id_to_index_cross_domain_mapping = {domain:{} for domain in self._other_domains} + _user_id_to_index_cross_domain_mapping = {domain: {} for domain in self._other_domains} for domain in self._other_domains: for index, sample in enumerate(self._dataset[domain]): user_id = sample['user.ids'][0] @@ -390,3 +394,304 @@ def __getitem__(self, index): }) return result + + +class NegativeRatingsNextItemPredictionTrainSampler(NegativeRatingsTrainSampler, + config_name='negative_ratings_next_item_prediction'): + + def __init__( + self, + dataset, + num_users, + num_items, + positive_domain, + negative_domain, + num_negatives, + negative_sampler, + offset + ): + + super().__init__(positive_domain, negative_domain) + self._dataset = dataset + self._num_users = num_users + self._num_items = num_items + self._num_negatives = num_negatives + 
self._negative_sampler = negative_sampler + self._offset = offset + self._user_id_to_index_cross_domain_mapping = self.get_user_id_to_index_cross_domain_mapping() + + def get_user_id_to_index_cross_domain_mapping(self): + _user_id_to_index_cross_domain_mapping = {self._negative_domain: {}} + for index, sample in enumerate(self._dataset[self._negative_domain]): + user_id = sample['user.ids'][0] + _user_id_to_index_cross_domain_mapping[self._negative_domain][user_id] = index + + return _user_id_to_index_cross_domain_mapping + + @classmethod + def create_from_config(cls, config, **kwargs): + # TODO попробовать оставить обычный random negative sampler + negative_sampler = BaseNegRatingsNegativeSampler.create_from_config({'type': config['negative_sampler_type']}, **kwargs) + # negative_sampler = BaseNegRatingsNegativeSampler.create_from_config({'type': config['negative_sampler_type']}, **kwargs) + + return cls( + dataset=kwargs['dataset'], + num_users=kwargs['num_users'], + num_items=kwargs['num_items'], + positive_domain=config['positive_domain'], + negative_domain=config['negative_domain'], + negative_sampler=negative_sampler, + num_negatives=config.get('num_negatives_train', -1), + offset=config['offset'] + ) + + def __getitem__(self, index): + sample_positive_domain = copy.deepcopy(self._dataset[self._positive_domain][index]) + item_sequence_positive_domain = sample_positive_domain['item.ids'][:-1] + ratings_positive_domain = sample_positive_domain['ratings.ids'][:-1] + next_item_sequence_positive_domain = sample_positive_domain['item.ids'][1:] + next_ratings_sequence_positive_domain = sample_positive_domain['ratings.ids'][1:] + + item_sequence_negative_domain = [] + ratings_negative_domain = [] + if sample_positive_domain['user.ids'][0] in self._user_id_to_index_cross_domain_mapping[self._negative_domain]: + domain_index_negative_domain = self._user_id_to_index_cross_domain_mapping[self._negative_domain][ + sample_positive_domain['user.ids'][0]] + 
sample_negative_domain = copy.deepcopy(self._dataset[self._negative_domain][domain_index_negative_domain]) + item_sequence_negative_domain = sample_negative_domain['item.ids'][:-1] + ratings_negative_domain = sample_negative_domain['ratings.ids'][:-1] + + item_sequence = item_sequence_positive_domain + item_sequence_negative_domain + ratings = ratings_positive_domain + ratings_negative_domain + ratings = [rating - self._offset for rating in ratings] + + len_item_sequence = len(item_sequence) + + if self._num_negatives == -1: + negatives = self._negative_sampler.generate_negative_samples(sample_positive_domain, len_item_sequence) + negative_sequence = self._negative_sampler.generate_negative_samples(sample_positive_domain, + len(next_item_sequence_positive_domain)) + else: + negatives = self._negative_sampler.generate_negative_samples(sample_positive_domain, self._num_negatives) + negative_sequence = self._negative_sampler.generate_negative_samples(sample_positive_domain, + self._num_negatives) + + result = { + 'user.ids': sample_positive_domain['user.ids'] * len_item_sequence, + 'user.length': len_item_sequence, + + 'item.ids': item_sequence, + 'item.length': len_item_sequence, + + 'item.negatives.ids': item_sequence_negative_domain, + 'item.negatives.length': len(item_sequence_negative_domain), + + 'ratings.ids': ratings, + 'ratings.length': len(ratings), + + 'negatives.ids': negatives, + 'negatives.length': len(negatives), + + 'user.positive.ids': sample_positive_domain['user.ids'], + 'user.positive.length': len(sample_positive_domain['user.ids']), + + 'item.positive.ids': item_sequence_positive_domain, + 'item.positive.length': len(item_sequence_positive_domain), + + 'positive.sequence.ids': next_item_sequence_positive_domain, + 'positive.sequence.length': len(next_item_sequence_positive_domain), + + 'ratings.sequence.ids': next_ratings_sequence_positive_domain, + 'ratings.sequence.length': len(next_ratings_sequence_positive_domain), + + 'negative.sequence.ids': 
negative_sequence, + 'negative.sequence.length': len(negative_sequence) + } + + return result + + +class NegativeRatingsNextItemPredictionValidationSampler(NegativeRatingsValidationSampler, + config_name='negative_ratings_next_item_prediction'): + def __init__( + self, + dataset, + num_users, + num_items, + positive_domain, + negative_domain, + negative_sampler, + num_negatives, + offset + ): + + super().__init__(positive_domain, negative_domain) + self._dataset = dataset + self._num_users = num_users + self._num_items = num_items + self._negative_sampler = negative_sampler + self._num_negatives = num_negatives + self._offset = offset + self._user_id_to_index_cross_domain_mapping = self.get_user_id_to_index_cross_domain_mapping() + + def get_user_id_to_index_cross_domain_mapping(self): + _user_id_to_index_cross_domain_mapping = {self._negative_domain: {}} + for index, sample in enumerate(self._dataset[self._negative_domain]): + user_id = sample['user.ids'][0] + _user_id_to_index_cross_domain_mapping[self._negative_domain][user_id] = index + + return _user_id_to_index_cross_domain_mapping + + @classmethod + def create_from_config(cls, config, **kwargs): + negative_sampler = BaseNegRatingsNegativeSampler.create_from_config({'type': config['negative_sampler_type']}, **kwargs) + # negative_sampler = BaseNegRatingsNegativeSampler.create_from_config({'type': config['negative_sampler_type']}, **kwargs) + + return cls( + dataset=kwargs['dataset'], + num_users=kwargs['num_users'], + num_items=kwargs['num_items'], + negative_sampler=negative_sampler, + num_negatives=config.get('num_negatives_val', -1), + positive_domain=config['positive_domain'], + negative_domain=config['negative_domain'], + offset=config['offset'] + ) + + def __getitem__(self, index): + sample_positive_domain = copy.deepcopy(self._dataset[self._positive_domain][index]) + item_sequence_positive_domain = sample_positive_domain['item.ids'][:-1] + ratings_positive_domain = 
sample_positive_domain['ratings.ids'][:-1] + positive_graph = sample_positive_domain['item.ids'][-1] + + item_sequence_negative_domain = [] + ratings_negative_domain = [] + if sample_positive_domain['user.ids'][0] in self._user_id_to_index_cross_domain_mapping[self._negative_domain]: + domain_index_negative_domain = self._user_id_to_index_cross_domain_mapping[self._negative_domain][ + sample_positive_domain['user.ids'][0]] + sample_negative_domain = copy.deepcopy(self._dataset[self._negative_domain][domain_index_negative_domain]) + item_sequence_negative_domain = sample_negative_domain['item.ids'][:-1] + ratings_negative_domain = sample_negative_domain['ratings.ids'][:-1] + + item_sequence = item_sequence_positive_domain + item_sequence_negative_domain + ratings = ratings_positive_domain + ratings_negative_domain + ratings = [rating - self._offset for rating in ratings] + + len_item_sequence = len(item_sequence) + + if self._num_negatives == -1: + negatives = self._negative_sampler.generate_negative_samples(sample_positive_domain, len_item_sequence) + else: + negatives = self._negative_sampler.generate_negative_samples(sample_positive_domain, self._num_negatives) + + try: + candidates_graph = [positive_graph] + negatives.tolist() + except Exception as e: + candidates_graph = [positive_graph] + negatives + + result = { + 'user.ids': sample_positive_domain['user.ids'] * len_item_sequence, + 'user.length': len_item_sequence, + + 'item.ids': item_sequence, + 'item.length': len_item_sequence, + + 'ratings.ids': ratings, + 'ratings.length': len(ratings), + + 'user.positive.ids': sample_positive_domain['user.ids'], + 'user.positive.length': len(sample_positive_domain['user.ids']), + + 'item.positive.ids': item_sequence_positive_domain, + 'item.positive.length': len(item_sequence_positive_domain), + + 'candidates.positive.ids': candidates_graph, + 'candidates.positive.length': len(candidates_graph), + + 'labels.positive.ids': [0], + 'labels.positive.length': 1 + } + + 
return result + + +class NegativeRatingsNextItemPredictionEvalSampler(NegativeRatingsEvalSampler, + config_name='negative_ratings_next_item_prediction'): + def __init__( + self, + dataset, + num_users, + num_items, + positive_domain, + negative_domain, + offset + ): + + super().__init__(dataset, num_users, num_items, positive_domain, negative_domain) + self._dataset = dataset + self._num_users = num_users + self._num_items = num_items + self._offset = offset + self._user_id_to_index_cross_domain_mapping = self.get_user_id_to_index_cross_domain_mapping() + + def get_user_id_to_index_cross_domain_mapping(self): + _user_id_to_index_cross_domain_mapping = {self._negative_domain: {}} + for index, sample in enumerate(self._dataset[self._negative_domain]): + user_id = sample['user.ids'][0] + _user_id_to_index_cross_domain_mapping[self._negative_domain][user_id] = index + + return _user_id_to_index_cross_domain_mapping + + @classmethod + def create_from_config(cls, config, **kwargs): + return cls( + dataset=kwargs['dataset'], + num_users=kwargs['num_users'], + num_items=kwargs['num_items'], + positive_domain=config['positive_domain'], + negative_domain=config['negative_domain'], + offset=config['offset'] + ) + + def __getitem__(self, index): + sample_positive_domain = copy.deepcopy(self._dataset[self._positive_domain][index]) + item_sequence_positive_domain = sample_positive_domain['item.ids'][:-1] + ratings_positive_domain = sample_positive_domain['ratings.ids'][:-1] + + next_item_graph = sample_positive_domain['item.ids'][-1] + + item_sequence_negative_domain = [] + ratings_negative_domain = [] + if sample_positive_domain['user.ids'][0] in self._user_id_to_index_cross_domain_mapping[self._negative_domain]: + domain_index_negative_domain = self._user_id_to_index_cross_domain_mapping[self._negative_domain][ + sample_positive_domain['user.ids'][0]] + sample_negative_domain = copy.deepcopy(self._dataset[self._negative_domain][domain_index_negative_domain]) + 
item_sequence_negative_domain = sample_negative_domain['item.ids'][:-1] + ratings_negative_domain = sample_negative_domain['ratings.ids'][:-1] + + item_sequence = item_sequence_positive_domain + item_sequence_negative_domain + ratings = ratings_positive_domain + ratings_negative_domain + ratings = [rating - self._offset for rating in ratings] + + len_item_sequence = len(item_sequence) + result = { + 'user.ids': sample_positive_domain['user.ids'] * len_item_sequence, + 'user.length': len_item_sequence, + + 'item.ids': item_sequence, + 'item.length': len_item_sequence, + + 'ratings.ids': ratings, + 'ratings.length': len(ratings), + + 'user.positive.ids': sample_positive_domain['user.ids'], + 'user.positive.length': len(sample_positive_domain['user.ids']), + + 'item.positive.ids': item_sequence_positive_domain, + 'item.positive.length': len(item_sequence_positive_domain), + + 'labels.positive.ids': [next_item_graph], + 'labels.positive.length': 1 + } + + return result diff --git a/modeling/infer.py b/modeling/infer.py index 790ab78b..3ed3c300 100644 --- a/modeling/infer.py +++ b/modeling/infer.py @@ -8,14 +8,12 @@ import json import numpy as np import torch -import datetime - logger = create_logger(name=__name__) seed_val = 42 -def inference(dataloader, model, metrics, pred_prefix, labels_prefix, output_path=None, output_params=None): +def inference(dataloader, model, metrics, pred_prefix, labels_prefix, output_path=None): running_metrics = {} for metric_name, metric_function in metrics.items(): running_metrics[metric_name] = [] @@ -25,6 +23,7 @@ def inference(dataloader, model, metrics, pred_prefix, labels_prefix, output_pat with torch.no_grad(): for idx, batch in enumerate(dataloader): + print(idx, len(running_metrics['ndcg@20'])) for key, value in batch.items(): batch[key] = value.to(DEVICE) @@ -42,45 +41,14 @@ def inference(dataloader, model, metrics, pred_prefix, labels_prefix, output_pat logger.debug('Inference procedure has been finished!') 
logger.debug('Metrics are the following:') + # TODO add file inference option for metric_name, metric_value in running_metrics.items(): logger.info('{}: {}'.format(metric_name, np.mean(metric_value))) - if output_path is not None: - line = { - 'datetime': str(datetime.datetime.now().replace(microsecond=0)) - } - - if output_params is not None: - line.update(output_params) - - for metric_name, metric_value in running_metrics.items(): - line[metric_name] = round(np.mean(metric_value), 8) - - with open(output_path, 'a') as output_file: - output_file.write('{}\n'.format('\t'.join([param_value for param_value in line.values()]))) - def main(): fix_random_seed(seed_val) config = parse_args() - - output_path = '../checkpoints/metrics.tsv' - output_params = { - 'experiment_name': config['experiment_name'], - 'model': config['model']['type'] - } - if 'dataset' not in config['dataset']: - output_params['dataset'] = config['dataset']['name'].split('/')[0] - if 'target_domain' in config['dataset']: - output_params['domain'] = config['dataset']['target_domain'] - else: - output_params['domain'] = config['dataset']['name'].split('/')[1] - else: - output_params['dataset'] = config['dataset']['dataset']['name'].split('/')[0] - if 'target_domain' in config['dataset']['dataset']: - output_params['domain'] = config['dataset']['dataset']['target_domain'] - else: - output_params['domain'] = config['dataset']['dataset']['name'].split('/')[1] logger.debug('Inference config: \n{}'.format(json.dumps(config, indent=2))) @@ -98,6 +66,7 @@ def main(): if isinstance(model, TorchModel): model = model.to(DEVICE) checkpoint_path = '../checkpoints/{}_final_state.pth'.format(config['experiment_name']) + print(checkpoint_path) model.load_state_dict(torch.load(checkpoint_path)) metrics = { @@ -105,15 +74,7 @@ def main(): for metric_name, metric_cfg in config['metrics'].items() } - _ = inference( - dataloader=eval_dataloader, - model=model, - metrics=metrics, - pred_prefix=config['pred_prefix'], - 
labels_prefix=config['label_prefix'], - output_path=output_path, - output_params=output_params - ) + inference(eval_dataloader, model, metrics, config['pred_prefix'], config['label_prefix']) if __name__ == '__main__': diff --git a/modeling/loss/base.py b/modeling/loss/base.py index 550c890b..20be1726 100644 --- a/modeling/loss/base.py +++ b/modeling/loss/base.py @@ -4,6 +4,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F class BaseLoss(metaclass=MetaParent): @@ -132,6 +133,87 @@ def forward(self, inputs): return loss +class SBPRLoss(TorchLoss, config_name='sbpr'): + + def __init__( + self, + positive_prefix, + negative_prefix, + ratings_prefix, + output_prefix=None + ): + super().__init__() + self._positive_prefix = positive_prefix + self._negative_prefix = negative_prefix + self._output_prefix = output_prefix + self._ratings_prefix = ratings_prefix + + def forward(self, inputs): + pos_scores = inputs[self._positive_prefix] # (all_batch_items) + neg_scores = inputs[self._negative_prefix] # (all_batch_items) + ratings = inputs[self._ratings_prefix] + + assert neg_scores.shape == pos_scores.shape + + # loss = -(pos_scores - neg_scores).sigmoid().log().mean() # (1) + + loss = -(((-1 / 2 * torch.sign(ratings) + 3 / 2).view(len(pos_scores), 1) * pos_scores) - neg_scores).sigmoid().log().mean() + if self._output_prefix is not None: + inputs[self._output_prefix] = loss.cpu().item() + + return loss + + +class ContrastiveLoss(TorchLoss, config_name='contrastive'): + + def __init__( + self, + positive_prefix, + negative_prefix, + # ratings_prefix, + output_prefix=None, + ): + super().__init__() + self._positive_prefix = positive_prefix + self._negative_prefix = negative_prefix + self._output_prefix = output_prefix + # self._ratings_prefix = ratings_prefix + + self.size_average = True + + def distance_metric(self, z1, z2): + sim_matrix = 1 - F.cosine_similarity(z1, z2) + + return sim_matrix + + def forward(self, inputs): + rep_anchor = 
inputs[self._positive_prefix] + rep_other = inputs[self._negative_prefix] + + assert rep_anchor.shape == rep_other.shape + + # ratings = inputs[self._ratings_prefix] + # labels = ratings + margin = 0 + + distances = self.distance_metric(rep_anchor, rep_other) + + # losses = 0.5 * ( + # labels.float() * distances.pow(2) + + # (1 - labels).float() * F.relu(self.margin - distances).pow(2) + # ) + + # losses = 0.5 * F.relu(margin - distances).pow(2) + + losses = 0.5 * distances.pow(2) + loss = losses.mean() if self.size_average else losses.sum() + + if self._output_prefix is not None: + inputs[self._output_prefix] = loss.cpu().item() + + return loss + + class RegularizationLoss(TorchLoss, config_name='regularization'): def __init__(self, prefix, output_prefix=None): @@ -142,7 +224,7 @@ def __init__(self, prefix, output_prefix=None): def forward(self, inputs): loss = 0.0 for prefix in self._prefix: - loss += (1/2) * inputs[prefix].pow(2).mean() + loss += (1 / 2) * inputs[prefix].pow(2).mean() if self._output_prefix is not None: inputs[self._output_prefix] = loss.cpu().item() @@ -234,6 +316,8 @@ def forward(self, inputs): current_embeddings = inputs[self._representation_prefix] # (x, embedding_dim) assert positive_embeddings.shape[0] == negative_embeddings.shape[0] == current_embeddings.shape[0] + # positive_scores = positive_embeddings + # negative_scores = negative_embeddings positive_scores = torch.einsum( 'bd,bd->b', positive_embeddings, diff --git a/modeling/models/__init__.py b/modeling/models/__init__.py index 0672eda0..87bea8a8 100644 --- a/modeling/models/__init__.py +++ b/modeling/models/__init__.py @@ -13,3 +13,6 @@ from .sasrec import SasRecModel from .sasrec_ce import SasRecCeModel from .s3rec import S3RecModel +from .siren import SiReNModel +from .gsnrec3 import GSNRec3Model +from .gsnrec5 import GSNRec5Model \ No newline at end of file diff --git a/modeling/models/base.py b/modeling/models/base.py index 519a24da..41e8530b 100644 --- 
a/modeling/models/base.py +++ b/modeling/models/base.py @@ -123,7 +123,8 @@ def _apply_sequential_encoder(self, events, lengths): embeddings[~mask] = 0 if self._is_causal: - causal_mask = torch.tril(torch.tile(mask.unsqueeze(1), dims=[self._num_heads, seq_len, 1])).bool().to(DEVICE) # (seq_len, seq_len) + causal_mask = torch.tril(torch.tile(mask.unsqueeze(1), dims=[self._num_heads, seq_len, 1])).bool().to( + DEVICE) # (seq_len, seq_len) embeddings = self._encoder( src=embeddings, mask=~causal_mask, diff --git a/modeling/models/diplom/bert4rec.py b/modeling/models/diplom/bert4rec.py new file mode 100644 index 00000000..b0dfa05b --- /dev/null +++ b/modeling/models/diplom/bert4rec.py @@ -0,0 +1,328 @@ +from torch.nn import MultiheadAttention + +from models.base import SequentialTorchModel + +from utils import create_masked_tensor, get_activation_function + +import torch +import torch.nn as nn + + +class Bert4RecModel(SequentialTorchModel, config_name='bert4rec'): + + def __init__( + self, + sequence_prefix, + labels_prefix, + candidate_prefix, + num_items, + max_sequence_length, + embedding_dim, + num_heads, + num_layers, + dim_feedforward, + dropout=0.0, + activation='gelu', + layer_norm_eps=1e-5, + initializer_range=0.02 + ): + super().__init__( + num_items=num_items, + max_sequence_length=max_sequence_length, + embedding_dim=embedding_dim, + num_heads=num_heads, + num_layers=num_layers, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + layer_norm_eps=layer_norm_eps, + is_causal=False + ) + self._sequence_prefix = sequence_prefix + self._labels_prefix = labels_prefix + self._candidate_prefix = candidate_prefix + + self._output_projection = nn.Linear( + in_features=embedding_dim, + out_features=embedding_dim + ) + + self._bias = nn.Parameter( + data=torch.zeros(num_items + 2), + requires_grad=True + ) + + self._init_weights(initializer_range) + + @classmethod + def create_from_config(cls, config, **kwargs): + return cls( + 
sequence_prefix=config['sequence_prefix'], + labels_prefix=config['labels_prefix'], + candidate_prefix=config['candidate_prefix'], + num_items=kwargs['num_items'], + max_sequence_length=kwargs['max_sequence_length'], + embedding_dim=config['embedding_dim'], + num_heads=config.get('num_heads', int(config['embedding_dim'] // 64)), + num_layers=config['num_layers'], + dim_feedforward=config.get('dim_feedforward', 4 * config['embedding_dim']), + dropout=config.get('dropout', 0.0), + initializer_range=config.get('initializer_range', 0.02) + ) + + def forward(self, inputs): + all_sample_events = inputs['{}.ids'.format(self._sequence_prefix)] # (all_batch_events) + all_sample_lengths = inputs['{}.length'.format(self._sequence_prefix)] # (batch_size) + + embeddings, mask = self._apply_sequential_encoder( + all_sample_events, all_sample_lengths + ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) + + embeddings = self._output_projection(embeddings) # (batch_size, seq_len, embedding_dim) + embeddings = torch.nn.functional.gelu(embeddings) # (batch_size, seq_len, embedding_dim) + embeddings = torch.einsum( + 'bsd,nd->bsn', embeddings, self._item_embeddings.weight + ) # (batch_size, seq_len, num_items) + embeddings += self._bias[None, None, :] # (batch_size, seq_len, num_items) + + if self.training: # training mode + all_sample_labels = inputs['{}.ids'.format(self._labels_prefix)] # (all_batch_events) + embeddings = embeddings[mask] # (all_batch_events, num_items) + labels_mask = (all_sample_labels != 0).bool() # (all_batch_events) + + needed_logits = embeddings[labels_mask] # (non_zero_events, num_items) + needed_labels = all_sample_labels[labels_mask] # (non_zero_events) + + return {'logits': needed_logits, 'labels.ids': needed_labels} + else: # eval mode + candidate_scores = self._get_last_embedding(embeddings, mask) # (batch_size, num_items) + candidate_scores[:, 0] = -torch.inf + candidate_scores[:, self._num_items + 1:] = -torch.inf + + if 
class GraphBert4RecModel(SequentialTorchModel, config_name='graph_bert4rec'):
    """Bidirectional sequential recommender fused with graph-propagated item embeddings.

    The sequence is encoded by the inherited sequential encoder (built with
    ``is_causal=False``). In parallel, the item embedding table is propagated
    ``num_hops`` times through ``item_graph``; the propagated vectors of the
    sequence items are combined with the encoder output through one
    cross-attention + feed-forward block, projected back to ``embedding_dim``
    and scored against every item embedding.

    Args:
        sequence_prefix: key prefix of the input sequence (``<prefix>.ids`` /
            ``<prefix>.length``) in the ``inputs`` dict.
        labels_prefix: key prefix of the training labels (``<prefix>.ids``).
        candidate_prefix: key prefix of the optional evaluation candidates.
        common_graph: stored on the instance; not used by the methods below.
        user_graph: stored on the instance; not used by the methods below.
        item_graph: sparse COO adjacency used for item-embedding propagation.
        num_hops: number of sparse propagation steps.
        graph_dropout: probability of dropping a graph edge during training.
        num_items, max_sequence_length, embedding_dim, num_heads, num_layers,
        dim_feedforward, dropout, activation, layer_norm_eps: forwarded to the
            base sequential encoder; several are reused for the fusion block.
        norm_first: selects the pre-norm (True) or post-norm (False) fusion path.
        initializer_range: forwarded to ``_init_weights``.
    """

    def __init__(
            self,
            sequence_prefix,
            labels_prefix,
            candidate_prefix,
            common_graph,
            user_graph,
            item_graph,
            num_hops,
            graph_dropout,
            num_items,
            max_sequence_length,
            embedding_dim,
            num_heads,
            num_layers,
            dim_feedforward,
            dropout=0.0,
            norm_first=True,
            activation='relu',
            layer_norm_eps=1e-9,
            initializer_range=0.02
    ):
        super().__init__(
            num_items=num_items,
            max_sequence_length=max_sequence_length,
            embedding_dim=embedding_dim,
            num_heads=num_heads,
            num_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            layer_norm_eps=layer_norm_eps,
            is_causal=False
        )
        self._sequence_prefix = sequence_prefix
        self._labels_prefix = labels_prefix
        self._candidate_prefix = candidate_prefix

        self._common_graph = common_graph
        self._user_graph = user_graph
        self._item_graph = item_graph
        self._num_hops = num_hops
        self._graph_dropout = graph_dropout

        # Cross-attention between the sequence encoding (queries) and the
        # graph-propagated item embeddings (keys / values).
        self._mha = MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=num_heads,
            dropout=dropout,
            bias=True,
            add_bias_kv=False,
            add_zero_attn=False,
            batch_first=True,
        )

        # Position-wise feed-forward part of the fusion block.
        self.linear1 = nn.Linear(embedding_dim, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, embedding_dim)
        self.activation = get_activation_function(activation)

        self.norm_first = norm_first
        self.norm1 = nn.LayerNorm(embedding_dim, eps=layer_norm_eps)
        self.norm2 = nn.LayerNorm(embedding_dim, eps=layer_norm_eps)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # Maps the concatenated [sequence, graph] representation back to embedding_dim.
        self._output_projection = nn.Linear(
            in_features=2 * embedding_dim,
            out_features=embedding_dim,
        )

        # Per-item additive bias for the output logits.
        self._bias = nn.Parameter(
            data=torch.zeros(num_items + 2),
            requires_grad=True
        )

        self._init_weights(initializer_range)

    @classmethod
    def create_from_config(cls, config, **kwargs):
        """Build the model from a config dict plus dataset-derived keyword arguments.

        ``config`` supplies the prefixes and hyper-parameters; ``kwargs`` must
        contain ``num_items``, ``max_sequence_length``, ``graph``,
        ``user_graph`` and ``item_graph``.
        """
        # The default head count is one head per 64 dimensions, but never
        # fewer than one: ``embedding_dim // 64`` is 0 for embeddings below 64.
        # NOTE(review): the default is not guaranteed to divide embedding_dim
        # (e.g. 200 -> 3 heads); set 'num_heads' explicitly in that case.
        default_num_heads = max(1, int(config['embedding_dim'] // 64))
        return cls(
            sequence_prefix=config['sequence_prefix'],
            labels_prefix=config['labels_prefix'],
            candidate_prefix=config['candidate_prefix'],
            num_items=kwargs['num_items'],
            max_sequence_length=kwargs['max_sequence_length'],
            embedding_dim=config['embedding_dim'],
            num_heads=config.get('num_heads', default_num_heads),
            num_layers=config['num_layers'],
            dim_feedforward=config.get('dim_feedforward', 4 * config['embedding_dim']),
            dropout=config.get('dropout', 0.0),
            initializer_range=config.get('initializer_range', 0.02),
            common_graph=kwargs['graph'],
            user_graph=kwargs['user_graph'],
            item_graph=kwargs['item_graph'],
            num_hops=config['num_hops'],
            graph_dropout=config['graph_dropout'],
        )

    def _ca_block(self, q, k, v, attn_mask=None, key_padding_mask=None):
        """Run multi-head attention of ``q`` over ``k``/``v`` followed by dropout."""
        x = self._mha(
            q, k, v,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False
        )[0]  # (batch_size, seq_len, embedding_dim)
        return self.dropout1(x)  # (batch_size, seq_len, embedding_dim)

    def _ff_block(self, x):
        """Apply linear -> activation -> dropout -> linear -> dropout."""
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)

    def _apply_graph_encoder(self, embeddings, graph):
        """Propagate ``embeddings`` through ``graph`` for ``num_hops`` steps.

        During training, each stored edge is dropped with probability
        ``graph_dropout`` and the surviving values are rescaled by
        ``1 / (1 - graph_dropout)``; in eval mode the graph is used unchanged.
        ``graph`` must be a sparse COO tensor exposing ``indices()`` /
        ``values()``.
        """
        if self.training and self._graph_dropout > 0:
            values = graph.values()
            keep_prob = 1.0 - self._graph_dropout
            # Sample the mask on the graph's own device so the indexing below
            # never mixes devices.
            keep_mask = torch.rand(values.shape[0], device=values.device) < keep_prob
            graph = torch.sparse_coo_tensor(
                graph.indices()[:, keep_mask],
                values[keep_mask] / keep_prob,
                graph.size(),
                device=values.device,
            )

        for _ in range(self._num_hops):
            embeddings = torch.sparse.mm(graph, embeddings)

        return embeddings

    def forward(self, inputs):
        """Score items for every sequence in the batch.

        Training mode returns ``{'logits', 'labels.ids'}`` restricted to the
        positions whose label id is non-zero. Eval mode returns the scores of
        the last position: gathered for the supplied candidates when
        ``<candidate_prefix>.ids`` is present, otherwise over all items with
        index 0 and indices above ``num_items`` set to ``-inf``.
        """
        all_sample_events = inputs['{}.ids'.format(self._sequence_prefix)]  # (all_batch_events)
        all_sample_lengths = inputs['{}.length'.format(self._sequence_prefix)]  # (batch_size)

        embeddings, mask = self._apply_sequential_encoder(
            events=all_sample_events,
            lengths=all_sample_lengths
        )  # (batch_size, seq_len, embedding_dim), (batch_size, seq_len)

        item_graph_embeddings = self._apply_graph_encoder(
            embeddings=self._item_embeddings.weight,
            graph=self._item_graph
        )  # (num_items + 2, embedding_dim)

        graph_embeddings = item_graph_embeddings[all_sample_events]  # (all_batch_events, embedding_dim)

        # The mask returned here is built from the same lengths as `mask`, so
        # it is not needed separately.
        graph_embeddings, _ = create_masked_tensor(
            data=graph_embeddings,
            lengths=all_sample_lengths
        )  # (batch_size, seq_len, embedding_dim)

        if self.norm_first:
            # NOTE(review): norm1 is not applied on this path although norm2
            # is -- confirm this asymmetry is intended.
            graph_embeddings = graph_embeddings + self._ca_block(
                embeddings,
                graph_embeddings,
                graph_embeddings,
                attn_mask=None,
                key_padding_mask=~mask
            )  # (batch_size, seq_len, embedding_dim)
            graph_embeddings = graph_embeddings + self._ff_block(self.norm2(graph_embeddings))
        else:
            graph_embeddings = self.norm1(graph_embeddings + self._ca_block(
                embeddings,
                graph_embeddings,
                graph_embeddings,
                attn_mask=None,
                key_padding_mask=~mask
            ))  # (batch_size, seq_len, embedding_dim)
            graph_embeddings = self.norm2(graph_embeddings + self._ff_block(graph_embeddings))

        fused = torch.cat([embeddings, graph_embeddings], dim=-1)  # (batch_size, seq_len, 2 * embedding_dim)
        fused = self._output_projection(fused)  # (batch_size, seq_len, embedding_dim)
        fused = torch.nn.functional.gelu(fused)  # (batch_size, seq_len, embedding_dim)

        # b - batch_size, s - seq_len, d - embedding_dim, n - num_items + 2
        logits = torch.einsum(
            'bsd,nd->bsn', fused, self._item_embeddings.weight
        )  # (batch_size, seq_len, num_items + 2)
        logits = logits + self._bias[None, None, :]  # (batch_size, seq_len, num_items + 2)

        if self.training:  # training mode
            all_sample_labels = inputs['{}.ids'.format(self._labels_prefix)]  # (all_batch_events)
            logits = logits[mask]  # (all_batch_events, num_items + 2)
            labels_mask = (all_sample_labels != 0).bool()  # (all_batch_events)

            needed_logits = logits[labels_mask]  # (non_zero_events, num_items + 2)
            needed_labels = all_sample_labels[labels_mask]  # (non_zero_events)

            return {'logits': needed_logits, 'labels.ids': needed_labels}

        # eval mode
        last_logits = self._get_last_embedding(logits, mask)  # (batch_size, num_items + 2)

        if '{}.ids'.format(self._candidate_prefix) in inputs:
            candidate_events = inputs['{}.ids'.format(self._candidate_prefix)]  # (all_batch_candidates)
            candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)]  # (batch_size)

            # Assumes every sample carries the same number of candidates.
            candidate_ids = torch.reshape(
                candidate_events,
                (candidate_lengths.shape[0], -1)
            )  # (batch_size, num_candidates)
            candidate_scores = last_logits.gather(dim=1, index=candidate_ids)  # (batch_size, num_candidates)
        else:
            candidate_scores = last_logits  # (batch_size, num_items + 2)
            # Index 0 and everything above num_items are never valid predictions.
            candidate_scores[:, 0] = -torch.inf
            candidate_scores[:, self._num_items + 1:] = -torch.inf

        return candidate_scores
class LightGCNModel(TorchModel, config_name='light_gcn'):
    """Graph collaborative-filtering model over a joint user-item graph.

    User and item embedding tables are stacked, propagated ``num_layers``
    times through the sparse ``graph`` and the per-layer results are
    concatenated along the feature axis. Training returns positive and
    negative user-item scores; evaluation returns the top-scored indices.

    Args:
        user_prefix, positive_prefix, negative_prefix, candidate_prefix: key
            prefixes (``<prefix>.ids`` / ``<prefix>.length``) in ``inputs``.
        graph: sparse COO adjacency over the stacked [users; items] nodes.
        num_users, num_items: table sizes (each table gets two extra rows).
        embedding_dim: width of the ego embeddings.
        num_layers: number of propagation steps.
        dropout: probability of dropping a graph edge during training.
        initializer_range: forwarded to ``_init_weights``.
    """

    # Size of the ranked list returned in eval mode.
    _TOP_K = 20

    def __init__(
            self,
            user_prefix,
            positive_prefix,
            negative_prefix,
            candidate_prefix,
            graph,
            num_users,
            num_items,
            embedding_dim,
            num_layers,
            dropout=0.0,
            initializer_range=0.02
    ):
        super().__init__()
        self._user_prefix = user_prefix
        self._positive_prefix = positive_prefix
        self._negative_prefix = negative_prefix
        self._candidate_prefix = candidate_prefix
        self._graph = graph
        self._num_users = num_users
        self._num_items = num_items
        self._embedding_dim = embedding_dim
        self._num_layers = num_layers
        self._dropout_rate = dropout

        self._user_embeddings = nn.Embedding(
            num_embeddings=self._num_users + 2,
            embedding_dim=self._embedding_dim
        )

        self._item_embeddings = nn.Embedding(
            num_embeddings=self._num_items + 2,
            embedding_dim=self._embedding_dim
        )

        self._init_weights(initializer_range)

    @classmethod
    def create_from_config(cls, config, **kwargs):
        """Build the model from a config dict plus ``graph``, ``num_users``, ``num_items`` kwargs."""
        return cls(
            user_prefix=config['user_prefix'],
            positive_prefix=config['positive_prefix'],
            negative_prefix=config['negative_prefix'],
            candidate_prefix=config['candidate_prefix'],
            graph=kwargs['graph'],
            num_users=kwargs['num_users'],
            num_items=kwargs['num_items'],
            embedding_dim=config['embedding_dim'],
            num_layers=config['num_layers'],
            dropout=config.get('dropout', 0.0),
            initializer_range=config.get('initializer_range', 0.02)
        )

    def _apply_graph_encoder(self):
        """Propagate the stacked embeddings and return per-node multi-layer features.

        Returns:
            ``(user_final, item_final)`` with shapes
            ``(num_users + 2, (num_layers + 1) * embedding_dim)`` and
            ``(num_items + 2, (num_layers + 1) * embedding_dim)``: the ego
            embeddings followed by the L2-normalised output of every layer.
        """
        ego_embeddings = torch.cat(
            (self._user_embeddings.weight, self._item_embeddings.weight), dim=0
        )
        all_embeddings = [ego_embeddings]

        graph = self._graph
        if self.training and self._dropout_rate > 0:  # drop some edges
            values = graph.values()
            keep_prob = 1 - self._dropout_rate
            # Sample the mask on the graph's own device so the indexing below
            # never mixes devices.
            keep_mask = torch.rand(values.shape[0], device=values.device) < keep_prob
            graph = torch.sparse_coo_tensor(
                graph.indices()[:, keep_mask],
                values[keep_mask] / keep_prob,
                graph.size(),
                device=values.device,
            )

        for _ in range(self._num_layers):
            # The un-normalised output feeds the next layer; only the copy
            # kept for the final representation is normalised.
            ego_embeddings = torch.sparse.mm(graph, ego_embeddings)
            all_embeddings.append(F.normalize(ego_embeddings, p=2, dim=1))

        all_embeddings = torch.cat(all_embeddings, dim=-1)
        user_final_embeddings, item_final_embeddings = torch.split(
            all_embeddings, [self._num_users + 2, self._num_items + 2]
        )

        return user_final_embeddings, item_final_embeddings

    def _get_embeddings(self, inputs, prefix, ego_embeddings, final_embeddings):
        """Look up and pad both propagated and ego embeddings for ``<prefix>.ids``.

        Args:
            inputs: batch dict holding ``<prefix>.ids`` and ``<prefix>.length``.
            prefix: key prefix to read.
            ego_embeddings: embedding module called with the ids.
            final_embeddings: propagated table indexed with the ids.

        Returns:
            ``(padded_final, padded_ego, mask)`` as produced by
            ``create_masked_tensor``.
        """
        ids = inputs['{}.ids'.format(prefix)]  # (all_batch_events)
        lengths = inputs['{}.length'.format(prefix)]  # (batch_size)

        final_embeddings = final_embeddings[ids]  # (all_batch_events, (num_layers + 1) * embedding_dim)
        ego_embeddings = ego_embeddings(ids)  # (all_batch_events, embedding_dim)

        padded_embeddings, mask = create_masked_tensor(final_embeddings, lengths)
        padded_ego_embeddings, ego_mask = create_masked_tensor(ego_embeddings, lengths)

        # Both masks derive from the same lengths; this guards the helper's contract.
        assert torch.all(mask == ego_mask)

        return padded_embeddings, padded_ego_embeddings, mask

    def forward(self, inputs):
        """Compute training scores or an evaluation ranking.

        Training mode returns ``positive_scores``, ``negative_scores`` and the
        stacked ego tables under ``item_embeddings`` (users first, then
        items). Eval mode returns the indices of the highest-scored columns,
        at most ``_TOP_K`` of them: positions inside the candidate list when
        ``<candidate_prefix>.ids`` is supplied, item ids otherwise.
        """
        all_final_user_embeddings, all_final_item_embeddings = self._apply_graph_encoder()

        user_embeddings, _, user_mask = self._get_embeddings(
            inputs, self._user_prefix, self._user_embeddings, all_final_user_embeddings
        )
        user_embeddings = user_embeddings[user_mask]  # (all_batch_events, feature_dim)

        if self.training:  # training mode
            positive_embeddings, _, positive_mask = self._get_embeddings(
                inputs, self._positive_prefix, self._item_embeddings, all_final_item_embeddings
            )  # (batch_size, seq_len, feature_dim)
            negative_embeddings, _, negative_mask = self._get_embeddings(
                inputs, self._negative_prefix, self._item_embeddings, all_final_item_embeddings
            )  # (batch_size, seq_len, feature_dim)

            # b - batch_size, s - seq_len, d - feature_dim
            positive_scores = torch.einsum(
                'bd,bsd->bs', user_embeddings, positive_embeddings
            )  # (batch_size, seq_len)
            negative_scores = torch.einsum(
                'bd,bsd->bs', user_embeddings, negative_embeddings
            )  # (batch_size, seq_len)

            return {
                'positive_scores': positive_scores[positive_mask],  # (all_batch_events)
                'negative_scores': negative_scores[negative_mask],  # (all_batch_events)
                'item_embeddings': torch.cat(
                    (self._user_embeddings.weight, self._item_embeddings.weight), dim=0
                )
            }

        # eval mode
        # b - batch_size, n - num_items + 2, d - feature_dim
        candidate_scores = torch.einsum(
            'bd,nd->bn', user_embeddings, all_final_item_embeddings
        )  # (batch_size, num_items + 2)
        # Index 0 and everything above num_items are never valid predictions.
        candidate_scores[:, 0] = -torch.inf
        candidate_scores[:, self._num_items + 1:] = -torch.inf

        if '{}.ids'.format(self._candidate_prefix) in inputs:
            candidate_events = inputs['{}.ids'.format(self._candidate_prefix)]  # (all_batch_candidates)
            candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)]  # (batch_size)

            batch_size = candidate_lengths.shape[0]
            num_candidates = candidate_lengths[0]

            candidate_scores = torch.gather(
                input=candidate_scores,
                dim=1,
                index=torch.reshape(candidate_events, [batch_size, num_candidates])
            )  # (batch_size, num_candidates)

        # topk raises when k exceeds the number of scored columns, so clamp it.
        _, indices = torch.topk(
            candidate_scores,
            k=min(self._TOP_K, candidate_scores.shape[-1]), dim=-1, largest=True
        )  # (batch_size, k)

        return indices
class SasRecModel(SequentialTorchModel, config_name='sasrec'):
    """Causal sequential recommender trained with positive / negative item pairs.

    The inherited sequential encoder is built with ``is_causal=True``.
    Training returns the encoded positions together with the embeddings of
    the positive and negative items; evaluation ranks items by the dot
    product between the last position's encoding and the item embeddings.

    Args:
        sequence_prefix, positive_prefix, negative_prefix, candidate_prefix:
            key prefixes (``<prefix>.ids`` / ``<prefix>.length``) in ``inputs``.
        num_items, max_sequence_length, embedding_dim, num_heads, num_layers,
        dim_feedforward, dropout, activation, layer_norm_eps: forwarded to the
            base sequential encoder.
        initializer_range: forwarded to ``_init_weights``.
    """

    # Size of the ranked list returned in eval mode.
    _TOP_K = 20

    def __init__(
            self,
            sequence_prefix,
            positive_prefix,
            negative_prefix,
            candidate_prefix,
            num_items,
            max_sequence_length,
            embedding_dim,
            num_heads,
            num_layers,
            dim_feedforward,
            dropout=0.0,
            activation='relu',
            layer_norm_eps=1e-9,
            initializer_range=0.02
    ):
        super().__init__(
            num_items=num_items,
            max_sequence_length=max_sequence_length,
            embedding_dim=embedding_dim,
            num_heads=num_heads,
            num_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            layer_norm_eps=layer_norm_eps,
            is_causal=True
        )
        self._sequence_prefix = sequence_prefix
        self._positive_prefix = positive_prefix
        self._negative_prefix = negative_prefix
        self._candidate_prefix = candidate_prefix

        # NOTE(review): the projection and bias below are created but not
        # referenced by forward() in this class -- confirm they are needed.
        self._output_projection = nn.Linear(
            in_features=embedding_dim,
            out_features=embedding_dim
        )

        self._bias = nn.Parameter(
            data=torch.zeros(num_items + 2),
            requires_grad=True
        )

        self._init_weights(initializer_range)

    @classmethod
    def create_from_config(cls, config, **kwargs):
        """Build the model from a config dict plus ``num_items`` / ``max_sequence_length`` kwargs."""
        # The default head count is one head per 64 dimensions, but never
        # fewer than one: ``embedding_dim // 64`` is 0 for embeddings below 64.
        default_num_heads = max(1, int(config['embedding_dim'] // 64))
        return cls(
            sequence_prefix=config['sequence_prefix'],
            positive_prefix=config['positive_prefix'],
            negative_prefix=config['negative_prefix'],
            candidate_prefix=config['candidate_prefix'],
            num_items=kwargs['num_items'],
            max_sequence_length=kwargs['max_sequence_length'],
            embedding_dim=config['embedding_dim'],
            num_heads=config.get('num_heads', default_num_heads),
            num_layers=config['num_layers'],
            dim_feedforward=config.get('dim_feedforward', 4 * config['embedding_dim']),
            dropout=config.get('dropout', 0.0),
            initializer_range=config.get('initializer_range', 0.02)
        )

    def forward(self, inputs):
        """Encode the batch and return training embeddings or an evaluation ranking.

        Training mode returns ``current_embeddings``, ``positive_embeddings``
        and ``negative_embeddings``. Eval mode returns the indices of the
        highest-scored columns, at most ``_TOP_K`` of them: positions inside
        the candidate list when ``<candidate_prefix>.ids`` is supplied, item
        ids otherwise.
        """
        all_sample_events = inputs['{}.ids'.format(self._sequence_prefix)]  # (all_batch_events)
        all_sample_lengths = inputs['{}.length'.format(self._sequence_prefix)]  # (batch_size)

        embeddings, mask = self._apply_sequential_encoder(
            all_sample_events, all_sample_lengths
        )  # (batch_size, seq_len, embedding_dim), (batch_size, seq_len)

        if self.training:  # training mode
            all_positive_sample_events = inputs['{}.ids'.format(self._positive_prefix)]  # (all_batch_events)
            all_negative_sample_events = inputs['{}.ids'.format(self._negative_prefix)]  # (all_batch_events)

            return {
                'current_embeddings': embeddings[mask],  # (all_batch_events, embedding_dim)
                'positive_embeddings': self._item_embeddings(
                    all_positive_sample_events
                ),  # (all_batch_events, embedding_dim)
                'negative_embeddings': self._item_embeddings(
                    all_negative_sample_events
                )  # (all_batch_events, embedding_dim)
            }

        # eval mode
        last_embeddings = self._get_last_embedding(embeddings, mask)  # (batch_size, embedding_dim)

        # b - batch_size, n - num_items + 2, d - embedding_dim
        candidate_scores = torch.einsum(
            'bd,nd->bn',
            last_embeddings,
            self._item_embeddings.weight
        )  # (batch_size, num_items + 2)
        # Index 0 and everything above num_items are never valid predictions.
        candidate_scores[:, 0] = -torch.inf
        candidate_scores[:, self._num_items + 1:] = -torch.inf

        if '{}.ids'.format(self._candidate_prefix) in inputs:
            candidate_events = inputs['{}.ids'.format(self._candidate_prefix)]  # (all_batch_candidates)
            candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)]  # (batch_size)

            batch_size = candidate_lengths.shape[0]
            num_candidates = candidate_lengths[0]

            candidate_scores = torch.gather(
                input=candidate_scores,
                dim=1,
                index=torch.reshape(candidate_events, [batch_size, num_candidates])
            )  # (batch_size, num_candidates)

        # topk raises when k exceeds the number of scored columns, so clamp it.
        _, indices = torch.topk(
            candidate_scores,
            k=min(self._TOP_K, candidate_scores.shape[-1]), dim=-1, largest=True
        )  # (batch_size, k)

        return indices
user_prefix, +# positive_prefix, +# negative_prefix, +# candidate_prefix, +# common_graph, +# user_graph, +# item_graph, +# num_users, +# num_items, +# max_sequence_length, +# embedding_dim, +# num_heads, +# num_layers, +# num_hops, +# dim_feedforward, +# dropout=0.0, +# activation='relu', +# layer_norm_eps=1e-5, +# graph_dropout=0.0, +# alpha=0.5, +# initializer_range=0.02 +# ): +# super().__init__( +# num_items=num_items, +# max_sequence_length=max_sequence_length, +# embedding_dim=embedding_dim, +# num_heads=num_heads, +# num_layers=num_layers, +# dim_feedforward=dim_feedforward, +# dropout=dropout, +# activation=activation, +# layer_norm_eps=layer_norm_eps, +# is_causal=True +# ) +# self._sequence_prefix = sequence_prefix +# self._positive_prefix = positive_prefix +# self._negative_prefix = negative_prefix +# self._user_prefix = user_prefix +# self._candidate_prefix = candidate_prefix +# +# self._num_users = num_users +# self._num_items = num_items +# +# self._embedding_dim = embedding_dim +# +# self._num_hops = num_hops +# self._graph_dropout = graph_dropout +# +# self._alpha = alpha +# +# self._graph = common_graph +# self._user_graph = user_graph +# self._item_graph = item_graph +# +# self._user_embeddings = nn.Embedding( +# num_embeddings=num_users + 2, # add zero embedding + mask embedding +# embedding_dim=embedding_dim +# ) +# +# # Current interest learning +# self._current_interest_learning_encoder = nn.Sequential( +# nn.Linear(in_features=embedding_dim, out_features=4 * embedding_dim, bias=False), +# nn.Tanh(), +# nn.Linear(in_features=4 * embedding_dim, out_features=1, bias=False) +# ) +# +# # General interest learning +# self._general_interest_learning_encoder = nn.Sequential( +# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=False), +# nn.Tanh() +# ) +# +# # Cross-view contrastive learning +# self._sequential_projector = nn.Sequential( +# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=True), +# nn.ELU(), 
+# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=True) +# ) +# self._graph_projector = nn.Sequential( +# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=True), +# nn.ELU(), +# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=True) +# ) +# +# self._user_projection = nn.Sequential( +# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=True), +# nn.ELU(), +# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=True) +# ) +# +# self._item_projection = nn.Sequential( +# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=True), +# nn.ELU(), +# nn.Linear(in_features=embedding_dim, out_features=embedding_dim, bias=True) +# ) +# +# self._init_weights(initializer_range) +# +# @classmethod +# def create_from_config(cls, config, **kwargs): +# return cls( +# sequence_prefix=config['sequence_prefix'], +# user_prefix=config['user_prefix'], +# positive_prefix=config['positive_prefix'], +# negative_prefix=config['negative_prefix'], +# candidate_prefix=config['candidate_prefix'], +# common_graph=kwargs['graph'], +# user_graph=kwargs['user_graph'], +# item_graph=kwargs['item_graph'], +# num_users=kwargs['num_users'], +# num_items=kwargs['num_items'], +# max_sequence_length=kwargs['max_sequence_length'], +# embedding_dim=config['embedding_dim'], +# num_heads=config['num_heads'], +# num_layers=config['num_layers'], +# num_hops=config['num_hops'], +# dim_feedforward=config.get('dim_feedforward', 4 * config['embedding_dim']), +# dropout=config.get('dropout', 0.0), +# activation=config.get('activation', 'relu'), +# layer_norm_eps=config.get('layer_norm_eps', 1e-5), +# graph_dropout=config.get('graph_dropout', 0.0), +# initializer_range=config.get('initializer_range', 0.02) +# ) +# +# def _apply_graph_encoder(self, embeddings, graph): +# if self.training: # training_mode +# size = graph.size() +# index = graph.indices().t() +# values = graph.values() +# dropout_mask = 
torch.rand(len(values)) + self._graph_dropout +# dropout_mask = dropout_mask.int().bool() +# index = index[~dropout_mask] +# values = values[~dropout_mask] / (1.0 - self._graph_dropout) +# graph_dropped = torch.sparse.FloatTensor(index.t(), values, size) +# else: # eval mode +# graph_dropped = graph +# +# for _ in range(self._num_hops): +# embeddings = torch.sparse.mm(graph_dropped, embeddings) +# +# return embeddings +# +# def forward(self, inputs): +# all_sample_events = inputs['{}.ids'.format(self._sequence_prefix)] # (all_batch_events) +# all_sample_lengths = inputs['{}.length'.format(self._sequence_prefix)] # (batch_size) +# user_ids = inputs['{}.ids'.format(self._user_prefix)] # (batch_size) +# +# embeddings, mask = self._apply_sequential_encoder( +# all_sample_events, all_sample_lengths +# ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) +# last_embedding = self._get_last_embedding(embeddings, mask) # (batch_size, embedding_dim) +# +# if self.training: # training mode +# all_positive_sample_events = inputs['{}.ids'.format(self._positive_prefix)] # (all_batch_events) +# all_negative_sample_events = inputs['{}.ids'.format(self._negative_prefix)] # (all_batch_events) +# +# all_sample_embeddings = embeddings[mask] # (all_batch_events, embedding_dim) +# all_positive_sample_embeddings = self._item_embeddings( +# all_positive_sample_events +# ) # (all_batch_events, embedding_dim) +# all_negative_sample_embeddings = self._item_embeddings( +# all_negative_sample_events +# ) # (all_batch_events, embedding_dim) +# +# # General interest learning +# all_embeddings = torch.cat( +# [self._item_embeddings.weight, self._user_embeddings.weight], +# dim=0 +# ) # (num_users + 2 + num_items + 2, embedding_dim) +# common_graph_embeddings = self._apply_graph_encoder( +# embeddings=all_embeddings, +# graph=self._graph +# ) # (num_users + 2 + num_items + 2, embedding_dim) +# common_graph_user_embeddings, common_graph_item_embeddings = torch.split( +# 
common_graph_embeddings, +# [self._num_users + 2, self._num_items + 2] +# ) # (num_users + 2, embedding_dim), (num_items + 2, embedding_dim) +# +# common_graph_user_embeddings = \ +# common_graph_user_embeddings[user_ids] # (batch_size, embedding_dim) +# common_graph_item_embeddings = \ +# common_graph_item_embeddings[all_sample_events] # (all_batch_events, embedding_dim) +# +# common_graph_item_embeddings, _ = create_masked_tensor( +# data=common_graph_item_embeddings, +# lengths=all_sample_lengths +# ) # (batch_size, seq_len, embedding_dim) +# +# graph_attention_matrix = torch.einsum( +# 'bd,bsd->bs', +# self._general_interest_learning_encoder(common_graph_user_embeddings), +# common_graph_item_embeddings +# ) # (batch_size, seq_len) +# graph_attention_matrix[~mask] = -torch.inf +# graph_attention_matrix = torch.softmax(graph_attention_matrix, dim=1) # (batch_size, seq_len) +# +# graph_representation = torch.einsum( +# 'bs,bsd->bd', graph_attention_matrix, common_graph_item_embeddings +# ) # (batch_size, embedding_dim) +# +# # Downstream task +# combined_representation = \ +# self._alpha * all_sample_embeddings + \ +# (1 - self._alpha) * common_graph_item_embeddings[mask] # (all_batch_events, embedding_dim) +# +# # Cross-view contrastive learning +# sequential_representation = self._sequential_projector(last_embedding) # (batch_size, embedding_dim) +# graph_representation = self._graph_projector(graph_representation) # (batch_size, embedding_dim) +# +# # Feature-level Contrastive Learning +# user_graph_user_embeddings = self._apply_graph_encoder( +# embeddings=self._user_embeddings.weight, +# graph=self._user_graph +# ) # (num_users + 2, embedding_dim) +# user_graph_user_embeddings = torch.gather( +# user_graph_user_embeddings, +# dim=0, +# index=user_ids[..., None].tile(1, self._embedding_dim) +# ) # (batch_size, embedding_dim) +# +# user_graph_user_embeddings = self._user_projection( +# user_graph_user_embeddings +# ) # (batch_size, embedding_dim) +# 
common_graph_user_embeddings = self._user_projection( +# common_graph_user_embeddings +# ) # (batch_size, embedding_dim) +# +# item_graph_item_embeddings = self._apply_graph_encoder( +# embeddings=self._item_embeddings.weight, +# graph=self._item_graph +# ) # (num_items + 2, embedding_dim) +# item_graph_item_embeddings = torch.gather( +# item_graph_item_embeddings, +# dim=0, +# index=all_sample_events[..., None].tile(1, self._embedding_dim) +# ) # (all_sample_events, embedding_dim) +# +# item_graph_item_embeddings = self._item_projection( +# item_graph_item_embeddings +# ) # (all_batch_events, embedding_dim) +# common_graph_item_embeddings = self._item_projection( +# common_graph_item_embeddings[mask] +# ) # (all_batch_events, embedding_dim) +# +# return { +# # Downstream task (sequential) +# 'current_embeddings': combined_representation, +# 'positive_embeddings': all_positive_sample_embeddings, +# 'negative_embeddings': all_negative_sample_embeddings, +# +# # Interest-level Contrastive Learning +# 'sequential_representation': sequential_representation, +# 'graph_representation': graph_representation, +# +# # Feature-level Contrastive Learning (users) +# 'user_graph_user_embeddings': user_graph_user_embeddings, +# 'common_graph_user_embeddings': common_graph_user_embeddings, +# +# # Feature-level Contrastive Learning (items) +# 'item_graph_item_embeddings': item_graph_item_embeddings, +# 'common_graph_item_embeddings': common_graph_item_embeddings +# } +# else: # eval mode +# if '{}.ids'.format(self._candidate_prefix) in inputs: +# candidate_events = inputs['{}.ids'.format(self._candidate_prefix)] # (all_batch_candidates) +# candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)] # (batch_size) +# candidate_embeddings = self._item_embeddings(candidate_events) # (all_batch_candidates, embedding_dim) +# candidate_embeddings, _ = create_masked_tensor( +# data=candidate_embeddings, +# lengths=candidate_lengths +# ) # (batch_size, num_candidates, 
embedding_dim) +# candidate_scores = torch.einsum( +# 'bd,bnd->bn', +# last_embedding, +# candidate_embeddings +# ) # (batch_size, num_candidates) +# else: +# candidate_embeddings = self._item_embeddings.weight # (num_items, embedding_dim) +# candidate_scores = torch.einsum( +# 'bd,nd->bn', +# last_embedding, +# candidate_embeddings +# ) # (batch_size, num_items) +# candidate_scores[:, 0] = -torch.inf +# candidate_scores[:, self._num_items + 1:] = -torch.inf +# +# return candidate_scores +# +# +# class GraphSasRecModel(SequentialTorchModel, config_name='graph_sasrec'): +# +# def __init__( +# self, +# sequence_prefix, +# user_prefix, +# positive_prefix, +# negative_prefix, +# candidate_prefix, +# common_graph, +# user_graph, +# item_graph, +# num_users, +# num_items, +# max_sequence_length, +# embedding_dim, +# num_heads, +# num_layers, +# num_hops, +# dim_feedforward, +# dropout=0.0, +# norm_first=True, +# activation='relu', +# layer_norm_eps=1e-5, +# graph_dropout=0.0, +# initializer_range=0.02 +# ): +# super().__init__( +# num_items=num_items, +# max_sequence_length=max_sequence_length, +# embedding_dim=embedding_dim, +# num_heads=num_heads, +# num_layers=num_layers, +# dim_feedforward=dim_feedforward, +# dropout=dropout, +# activation=activation, +# layer_norm_eps=layer_norm_eps, +# is_causal=True +# ) +# self._sequence_prefix = sequence_prefix +# self._positive_prefix = positive_prefix +# self._negative_prefix = negative_prefix +# self._user_prefix = user_prefix +# self._candidate_prefix = candidate_prefix +# +# self._num_users = num_users +# self._num_items = num_items +# +# self._embedding_dim = embedding_dim +# +# self._num_hops = num_hops +# self._graph_dropout = graph_dropout +# +# self._graph = common_graph +# self._user_graph = user_graph +# self._item_graph = item_graph +# +# self._mha = MultiheadAttention( +# embed_dim=embedding_dim, +# num_heads=num_heads, +# dropout=dropout, +# bias=True, +# add_bias_kv=False, +# add_zero_attn=False, +# 
batch_first=True, +# ) +# +# self.linear1 = nn.Linear(embedding_dim, dim_feedforward) +# self.dropout = nn.Dropout(dropout) +# self.linear2 = nn.Linear(dim_feedforward, embedding_dim) +# self.activation = get_activation_function(activation) +# +# self.norm_first = norm_first +# self.norm1 = nn.LayerNorm(embedding_dim, eps=layer_norm_eps) +# self.norm2 = nn.LayerNorm(embedding_dim, eps=layer_norm_eps) +# self.dropout1 = nn.Dropout(dropout) +# self.dropout2 = nn.Dropout(dropout) +# +# self._output_projection = nn.Linear( +# in_features=2 * embedding_dim, +# out_features=embedding_dim, +# ) +# +# self._bias = nn.Parameter( +# data=torch.zeros(num_items + 2), +# requires_grad=True +# ) +# +# self._init_weights(initializer_range) +# +# @classmethod +# def create_from_config(cls, config, **kwargs): +# return cls( +# sequence_prefix=config['sequence_prefix'], +# user_prefix=config['user_prefix'], +# positive_prefix=config['positive_prefix'], +# negative_prefix=config['negative_prefix'], +# candidate_prefix=config['candidate_prefix'], +# common_graph=kwargs['graph'], +# user_graph=kwargs['user_graph'], +# item_graph=kwargs['item_graph'], +# num_users=kwargs['num_users'], +# num_items=kwargs['num_items'], +# max_sequence_length=kwargs['max_sequence_length'], +# embedding_dim=config['embedding_dim'], +# num_heads=config['num_heads'], +# num_layers=config['num_layers'], +# num_hops=config['num_hops'], +# dim_feedforward=config.get('dim_feedforward', 4 * config['embedding_dim']), +# dropout=config.get('dropout', 0.0), +# activation=config.get('activation', 'relu'), +# layer_norm_eps=config.get('layer_norm_eps', 1e-5), +# graph_dropout=config.get('graph_dropout', 0.0), +# initializer_range=config.get('initializer_range', 0.02) +# ) +# +# def _ca_block(self, q, k, v, attn_mask=None, key_padding_mask=None): +# x = self._mha( +# q, k, v, +# attn_mask=attn_mask, +# key_padding_mask=key_padding_mask, +# need_weights=False +# )[0] # (batch_size, seq_len, embedding_dim) +# return 
self.dropout1(x) # (batch_size, seq_len, embedding_dim) +# +# def _ff_block(self, x): +# x = self.linear2(self.dropout(self.activation(self.linear1(x)))) +# return self.dropout2(x) +# +# def _apply_graph_encoder(self, embeddings, graph): +# if self.training: # training_mode +# size = graph.size() +# index = graph.indices().t() +# values = graph.values() +# dropout_mask = torch.rand(len(values)) + self._graph_dropout +# dropout_mask = dropout_mask.int().bool() +# index = index[~dropout_mask] +# values = values[~dropout_mask] / (1.0 - self._graph_dropout) +# graph_dropped = torch.sparse.FloatTensor(index.t(), values, size) +# else: # eval mode +# graph_dropped = graph +# +# for _ in range(self._num_hops): +# embeddings = torch.sparse.mm(graph_dropped, embeddings) +# +# return embeddings +# +# def forward(self, inputs): +# all_sample_events = inputs['{}.ids'.format(self._sequence_prefix)] # (all_batch_events) +# all_sample_lengths = inputs['{}.length'.format(self._sequence_prefix)] # (batch_size) +# +# embeddings, mask = self._apply_sequential_encoder( +# all_sample_events, all_sample_lengths +# ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) +# +# common_graph_embeddings = self._apply_graph_encoder( +# embeddings=self._item_embeddings.weight, +# graph=self._item_graph +# ) # (num_items + 2, embedding_dim) +# +# graph_embeddings = common_graph_embeddings[all_sample_events] # (all_batch_events, embedding_dim) +# +# graph_embeddings, graph_mask = create_masked_tensor( +# data=graph_embeddings, +# lengths=all_sample_lengths +# ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) +# +# if self.norm_first: +# graph_embeddings = graph_embeddings + self.norm1(self._ca_block( +# q=embeddings, +# k=graph_embeddings, +# v=graph_embeddings, +# attn_mask=None, +# key_padding_mask=~mask +# )) # (batch_size, seq_len, embedding_dim) +# graph_embeddings = graph_embeddings + self.norm2(self._ff_block(graph_embeddings)) +# else: +# graph_embeddings = 
self.norm1(graph_embeddings + self._ca_block( +# q=embeddings, +# k=graph_embeddings, +# v=graph_embeddings, +# attn_mask=None, +# key_padding_mask=~mask +# )) # (batch_size, seq_len, embedding_dim) +# graph_embeddings = self.norm2(graph_embeddings + self._ff_block(graph_embeddings)) +# +# embeddings = torch.cat([embeddings, graph_embeddings], dim=-1) +# embeddings = self._output_projection(embeddings) # (batch_size, seq_len, embedding_dim) +# +# last_embedding = self._get_last_embedding(embeddings, mask) # (batch_size, embedding_dim) +# +# if self.training: # training mode +# all_positive_sample_events = inputs['{}.ids'.format(self._positive_prefix)] # (all_batch_events) +# all_negative_sample_events = inputs['{}.ids'.format(self._negative_prefix)] # (all_batch_events) +# +# all_sample_embeddings = embeddings[mask] # (all_batch_events, embedding_dim) +# all_positive_sample_embeddings = self._item_embeddings( +# all_positive_sample_events +# ) # (all_batch_events, embedding_dim) +# all_negative_sample_embeddings = self._item_embeddings( +# all_negative_sample_events +# ) # (all_batch_events, embedding_dim) +# +# return { +# # Downstream task (sequential) +# 'current_embeddings': all_sample_embeddings, +# 'positive_embeddings': all_positive_sample_embeddings, +# 'negative_embeddings': all_negative_sample_embeddings, +# } +# else: # eval mode +# if '{}.ids'.format(self._candidate_prefix) in inputs: +# candidate_events = inputs['{}.ids'.format(self._candidate_prefix)] # (all_batch_candidates) +# candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)] # (batch_size) +# candidate_embeddings = self._item_embeddings(candidate_events) # (all_batch_candidates, embedding_dim) +# candidate_embeddings, _ = create_masked_tensor( +# data=candidate_embeddings, +# lengths=candidate_lengths +# ) # (batch_size, num_candidates, embedding_dim) +# candidate_scores = torch.einsum( +# 'bd,bnd->bn', +# last_embedding, +# candidate_embeddings +# ) # (batch_size, 
num_candidates) +# else: +# candidate_embeddings = self._item_embeddings.weight # (num_items, embedding_dim) +# candidate_scores = torch.einsum( +# 'bd,nd->bn', +# last_embedding, +# candidate_embeddings +# ) # (batch_size, num_items) +# candidate_scores[:, 0] = -torch.inf +# candidate_scores[:, self._num_items + 1:] = -torch.inf +# +# return candidate_scores diff --git a/modeling/models/graph_seq_rec.py b/modeling/models/graph_seq_rec.py deleted file mode 100644 index b227f184..00000000 --- a/modeling/models/graph_seq_rec.py +++ /dev/null @@ -1,241 +0,0 @@ -from models.base import SequentialTorchModel - -from utils import create_masked_tensor, DEVICE - -import torch -import torch.nn as nn - - -class GraphSeqRecModel(SequentialTorchModel, config_name='graph_seq_rec'): - - def __init__( - self, - sequence_prefix, - positive_prefix, - negative_prefix, - candidate_prefix, - common_graph, - user_graph, - item_graph, - num_hops, - graph_dropout, - num_items, - max_sequence_length, - embedding_dim, - num_heads, - num_layers, - dim_feedforward, - dropout=0.0, - use_ce=False, - activation='relu', - layer_norm_eps=1e-9, - initializer_range=0.02 - ): - super().__init__( - num_items=num_items, - max_sequence_length=max_sequence_length, - embedding_dim=embedding_dim, - num_heads=num_heads, - num_layers=num_layers, - dim_feedforward=dim_feedforward, - dropout=dropout, - activation=activation, - layer_norm_eps=layer_norm_eps, - is_causal=True - ) - self._sequence_prefix = sequence_prefix - self._positive_prefix = positive_prefix - self._negative_prefix = negative_prefix - self._candidate_prefix = candidate_prefix - - self._use_ce = use_ce - - self._common_graph = common_graph - self._user_graph = user_graph - self._item_graph = item_graph - self._num_hops = num_hops - self._graph_dropout = graph_dropout - - self._output_projection = nn.Linear( - in_features=embedding_dim, - out_features=embedding_dim - ) - - self._bias = nn.Parameter( - data=torch.zeros(num_items + 2), - 
requires_grad=True - ) - - self._init_weights(initializer_range) - - @classmethod - def create_from_config(cls, config, **kwargs): - return cls( - sequence_prefix=config['sequence_prefix'], - positive_prefix=config['positive_prefix'], - negative_prefix=config['negative_prefix'], - candidate_prefix=config['candidate_prefix'], - common_graph=kwargs['graph'], - user_graph=kwargs['user_graph'], - item_graph=kwargs['item_graph'], - num_hops=config['num_hops'], - graph_dropout=config['graph_dropout'], - num_items=kwargs['num_items'], - max_sequence_length=kwargs['max_sequence_length'], - embedding_dim=config['embedding_dim'], - num_heads=config.get('num_heads', int(config['embedding_dim'] // 64)), - num_layers=config['num_layers'], - dim_feedforward=config.get('dim_feedforward', 4 * config['embedding_dim']), - dropout=config.get('dropout', 0.0), - use_ce=config.get('use_ce', False), - initializer_range=config.get('initializer_range', 0.02) - ) - - def _apply_graph_encoder(self, embeddings, graph): - if self.training: # training_mode - size = graph.size() - index = graph.indices().t() - values = graph.values() - dropout_mask = torch.rand(len(values)) + self._graph_dropout - dropout_mask = dropout_mask.int().bool() - index = index[~dropout_mask] - values = values[~dropout_mask] / (1.0 - self._graph_dropout) - graph_dropped = torch.sparse.FloatTensor(index.t(), values, size) - else: # eval mode - graph_dropped = graph - - for _ in range(self._num_hops): - embeddings = torch.sparse.mm(graph_dropped, embeddings) - - return embeddings - - def forward(self, inputs): - all_sample_events = inputs['{}.ids'.format(self._sequence_prefix)] # (all_batch_events) - lengths = inputs['{}.length'.format(self._sequence_prefix)] # (batch_size) - - common_graph_embeddings = self._apply_graph_encoder( - embeddings=self._item_embeddings.weight, - graph=self._item_graph - ) # (num_items + 2, embedding_dim) - - embeddings = common_graph_embeddings[all_sample_events] # (all_batch_events, 
embedding_dim) - - embeddings, mask = create_masked_tensor( - data=embeddings, - lengths=lengths - ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) - - batch_size = mask.shape[0] - seq_len = mask.shape[1] - - positions = torch.arange( - start=seq_len - 1, end=-1, step=-1, device=mask.device - )[None].tile([batch_size, 1]).long() # (batch_size, seq_len) - positions_mask = positions < lengths[:, None] # (batch_size, max_seq_len) - - positions = positions[positions_mask] # (all_batch_events) - position_embeddings = self._position_embeddings(positions) # (all_batch_events, embedding_dim) - position_embeddings, _ = create_masked_tensor( - data=position_embeddings, - lengths=lengths - ) # (batch_size, seq_len, embedding_dim) - assert torch.allclose(position_embeddings[~mask], embeddings[~mask]) - - embeddings = embeddings + position_embeddings # (batch_size, seq_len, embedding_dim) - - embeddings = self._layernorm(embeddings) # (batch_size, seq_len, embedding_dim) - embeddings = self._dropout(embeddings) # (batch_size, seq_len, embedding_dim) - - embeddings[~mask] = 0 - - if self._is_causal: - causal_mask = torch.tril(torch.tile(mask.unsqueeze(1), dims=[self._num_heads, seq_len, 1])).bool().to(DEVICE) # (seq_len, seq_len) - embeddings = self._encoder( - src=embeddings, - mask=~causal_mask, - ) # (batch_size, seq_len, embedding_dim) - else: - embeddings = self._encoder( - src=embeddings, - src_key_padding_mask=~mask - ) # (batch_size, seq_len, embedding_dim) - - if self._use_ce: - embeddings = self._output_projection(embeddings) # (batch_size, seq_len, embedding_dim) - embeddings = torch.nn.functional.gelu(embeddings) # (batch_size, seq_len, embedding_dim) - embeddings = torch.einsum( - 'bsd,nd->bsn', embeddings, self._item_embeddings.weight - ) # (batch_size, seq_len, num_items) - embeddings += self._bias[None, None, :] # (batch_size, seq_len, num_items) - else: - last_embeddings = self._get_last_embedding(embeddings, mask) # (batch_size, embedding_dim) - 
- if self.training: # training mode - if self._use_ce: - return {'logits': embeddings[mask]} - else: - all_positive_sample_events = inputs['{}.ids'.format(self._positive_prefix)] # (all_batch_events) - all_negative_sample_events = inputs['{}.ids'.format(self._negative_prefix)] # (all_batch_events) - - all_sample_embeddings = embeddings[mask] # (all_batch_events, embedding_dim) - all_positive_sample_embeddings = self._item_embeddings( - all_positive_sample_events - ) # (all_batch_events, embedding_dim) - all_negative_sample_embeddings = self._item_embeddings( - all_negative_sample_events - ) # (all_batch_events, embedding_dim) - - return { - 'current_embeddings': all_sample_embeddings, - 'positive_embeddings': all_positive_sample_embeddings, - 'negative_embeddings': all_negative_sample_embeddings - } - else: # eval mode - if self._use_ce: - last_embeddings = self._get_last_embedding(embeddings, mask) # (batch_size, num_items) - - if '{}.ids'.format(self._candidate_prefix) in inputs: - candidate_events = inputs['{}.ids'.format(self._candidate_prefix)] # (all_batch_candidates) - candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)] # (batch_size) - - candidate_ids = torch.reshape( - candidate_events, - (candidate_lengths.shape[0], candidate_lengths[0]) - ) # (batch_size, num_candidates) - candidate_scores = last_embeddings.gather( - dim=1, index=candidate_ids - ) # (batch_size, num_candidates) - else: - candidate_scores = last_embeddings # (batch_size, num_items + 2) - candidate_scores[:, 0] = -torch.inf - candidate_scores[:, self._num_items + 1:] = -torch.inf - else: - if '{}.ids'.format(self._candidate_prefix) in inputs: - candidate_events = inputs['{}.ids'.format(self._candidate_prefix)] # (all_batch_candidates) - candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)] # (batch_size) - - candidate_embeddings = self._item_embeddings( - candidate_events - ) # (all_batch_candidates, embedding_dim) - - candidate_embeddings, _ = 
create_masked_tensor( - data=candidate_embeddings, - lengths=candidate_lengths - ) # (batch_size, num_candidates, embedding_dim) - - candidate_scores = torch.einsum( - 'bd,bnd->bn', - last_embeddings, - candidate_embeddings - ) # (batch_size, num_candidates) - else: - candidate_embeddings = self._item_embeddings.weight # (num_items, embedding_dim) - candidate_scores = torch.einsum( - 'bd,nd->bn', - last_embeddings, - candidate_embeddings - ) # (batch_size, num_items) - candidate_scores[:, 0] = -torch.inf - candidate_scores[:, self._num_items + 1:] = -torch.inf - - return candidate_scores diff --git a/modeling/models/gsnrec3.py b/modeling/models/gsnrec3.py new file mode 100644 index 00000000..19e087da --- /dev/null +++ b/modeling/models/gsnrec3.py @@ -0,0 +1,389 @@ +from models.base import SequentialTorchModel + +from utils import create_masked_tensor, DEVICE +import torch +import torch.nn as nn +import torch.nn.functional as F + +import numpy as np + + +class GSNRec3Model(SequentialTorchModel, config_name='gsnrec3'): + + def __init__( + self, + user_prefix, + sequence_prefix, + positive_prefix, + negative_prefix, + candidate_prefix, + num_users, + num_items, + max_sequence_length, + negative_items_popularity, + embedding_dim, + num_heads, + num_layers, + mlp_layers, + graph, + dim_feedforward, + ratings_prefix, + dropout_seq, + dropout_graph, + dropout_gsnrec, + dropout_mlp, + activation='relu', + layer_norm_eps=1e-9, + initializer_range=0.02 + ): + super().__init__( + num_items=num_items, + max_sequence_length=max_sequence_length, + embedding_dim=embedding_dim, + num_heads=num_heads, + num_layers=num_layers, + dim_feedforward=dim_feedforward, + dropout=dropout_seq, + activation=activation, + layer_norm_eps=layer_norm_eps, + is_causal=True + ) + self._sequence_prefix = sequence_prefix + self._positive_prefix = positive_prefix + self._negative_prefix = negative_prefix + self._candidate_prefix = candidate_prefix + + self._user_prefix = user_prefix + self._graph = 
graph + self._num_layers = num_layers + self._num_users = num_users + self._dropout_rate = dropout_graph + self._mlp_layers = mlp_layers + self._ratings_prefix = ratings_prefix + self._dropout_gsnrec = dropout_gsnrec + self._dropout_mlp = dropout_mlp + + self._max_sequence_length = max_sequence_length + self._negative_items_popularity = negative_items_popularity + + self._output_projection = nn.Linear( + in_features=embedding_dim, + out_features=embedding_dim + ) + + self._bias = nn.Parameter( + data=torch.zeros(num_items + 2), + requires_grad=True + ) + + self._init_weights(initializer_range) + + self._user_embeddings = nn.Embedding( + num_embeddings=self._num_users + 2, + embedding_dim=self._embedding_dim + ) + + self._item_embeddings = nn.Embedding( + num_embeddings=self._num_items + 2, + embedding_dim=self._embedding_dim + ) + + self.attn = nn.Linear(self._embedding_dim, self._embedding_dim, bias=True) + self.q = nn.Linear(self._embedding_dim, 1, bias=False) + + # self.mlps = nn.ModuleList() + # for _ in range(self._mlp_layers): + # self.mlps.append(nn.Linear(self._embedding_dim, self._embedding_dim, bias=True)) + # nn.init.xavier_normal_(self.mlps[-1].weight.data) + + # self.E2 = nn.Parameter(torch.empty(self._num_users + 2 + self._num_items + 2, self._embedding_dim)) + # nn.init.xavier_normal_(self.E2.data) + + self.scores_gsnrec_linear = nn.Linear(1, self._embedding_dim, bias=True) + self.output_gsnrec_linear = nn.Linear(self._embedding_dim, 1, bias=False) + + self.attn_softmax = nn.Softmax(dim=1) + + self._mlp = nn.Linear(self._embedding_dim, self._embedding_dim, bias=True) + + @classmethod + def create_from_config(cls, config, **kwargs): + return cls( + user_prefix=config['user_prefix'], + sequence_prefix=config['sequence_prefix'], + positive_prefix=config['positive_prefix'], + negative_prefix=config['negative_prefix'], + candidate_prefix=config['candidate_prefix'], + graph=kwargs['graph'], + num_users=kwargs['num_users'], + num_items=kwargs['num_items'], 
+ max_sequence_length=kwargs['max_sequence_length'], + negative_items_popularity=kwargs['negative_items_popularity'], + embedding_dim=config['embedding_dim'], + num_heads=config.get('num_heads', int(config['embedding_dim'] // 64)), + num_layers=config['num_layers'], + mlp_layers=config['mlp_layers'], + dim_feedforward=config.get('dim_feedforward', 4 * config['embedding_dim']), + dropout_seq=config.get('dropout_seq', 0.0), + dropout_graph=config.get('dropout_graph', 0.0), + dropout_gsnrec=config.get('dropout_gsnrec', 0.0), + dropout_mlp=config.get('dropout_mlp', 0.0), + ratings_prefix=config['ratings_prefix'], + initializer_range=config.get('initializer_range', 0.02), + ) + + def _apply_graph_encoder(self): + ego_embeddings = torch.cat((self._user_embeddings.weight, self._item_embeddings.weight), dim=0) + all_embeddings = [ego_embeddings] + + if self._dropout_rate > 0: # drop some edges + if self.training: # training_mode + size = self._graph.size() + index = self._graph.indices().t() + values = self._graph.values() + random_index = torch.rand(len(values)) + (1 - self._dropout_rate) + random_index = random_index.int().bool() + index = index[random_index] + values = values[random_index] / (1 - self._dropout_rate) + graph_dropped = torch.sparse.FloatTensor(index.t(), values, size) + else: # eval mode + graph_dropped = self._graph + else: + graph_dropped = self._graph + + for i in range(self._num_layers): + ego_embeddings = torch.sparse.mm(graph_dropped, ego_embeddings) + norm_embeddings = F.normalize(ego_embeddings, p=2, dim=1) + all_embeddings += [norm_embeddings] + + # all_embeddings = torch.cat(all_embeddings, dim=-1) + all_embeddings = torch.stack(all_embeddings, 0).mean(0) + + user_final_embeddings, item_final_embeddings = torch.split( + all_embeddings, [self._num_users + 2, self._num_items + 2] + ) + + return user_final_embeddings, item_final_embeddings + + def _get_embeddings(self, inputs, prefix, ego_embeddings, final_embeddings): + ids = 
inputs['{}.ids'.format(prefix)] # (all_batch_events) + lengths = inputs['{}.length'.format(prefix)] # (batch_size) + + final_embeddings = final_embeddings[ids] # (all_batch_events, embedding_dim) + ego_embeddings = ego_embeddings(ids) # (all_batch_events, embedding_dim) + + padded_embeddings, mask = create_masked_tensor( + final_embeddings, lengths + ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) + + padded_ego_embeddings, ego_mask = create_masked_tensor( + ego_embeddings, lengths + ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) + + assert torch.all(mask == ego_mask) + + return padded_embeddings, padded_ego_embeddings, mask + + def forward(self, inputs): + all_sample_events = inputs['{}.ids'.format(self._sequence_prefix)] # (all_batch_events) + all_sample_lengths = inputs['{}.length'.format(self._sequence_prefix)] # (batch_size) + + sasrec_embeddings, sasrec_mask = self._apply_sequential_encoder( + all_sample_events, all_sample_lengths + ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) + + if self.training: # training mode + + ### light_gcn - graph part of siren + graph_user_embeddings, graph_item_embeddings = \ + self._apply_graph_encoder() # (num_users + 2, embedding_dim), (num_items + 2, embedding_dim) + + # user_embeddings, user_ego_embeddings, user_mask = self._get_embeddings( + # inputs, self._user_prefix, self._user_embeddings, graph_user_embeddings + # ) + # user_embeddings = user_embeddings[user_mask] # (all_batch_events, embedding_dim) + + # z_p siren + all_graph_embeddings = torch.cat((graph_user_embeddings, graph_item_embeddings), dim=0) + # shapes z_p: + # [num_users + 2 + num_items + 2; embedding_dim] + # например для датасета ml1m и num_layers = 2: [9069; 64], т.е. 
[6022 + 2 + 3043 + 2; 64] + + # print('graph_item_embeddings', graph_item_embeddings.shape) + # print('negative_items_popularity', self._negative_items_popularity.shape) + # print(self._negative_items_popularity) + + # list_minus_ids = inputs['item.negatives.ids'].tolist() + # degree_minus_ids = dict([(i, list_minus_ids.count(i)) for i in range(self._num_items + 2)]) + # max_degree = max(degree_minus_ids.values()) + + # minus_graph_item_embeddings = torch.mul( + # graph_item_embeddings, + # torch.tensor([[(degree_minus_ids[i] / max_degree) ** 0.5] for i in range(self._num_items + 2)]).to(DEVICE) + # ) + + minus_graph_item_embeddings = torch.mul( + graph_item_embeddings, + self._negative_items_popularity + ).to(DEVICE) + + minus_graph_embeddings = torch.cat((graph_user_embeddings, minus_graph_item_embeddings), dim=0) + + x = F.dropout(F.relu(self._mlp(minus_graph_embeddings)), p=0.5, training=self.training) + for i in range(1, self._mlp_layers): + x = F.dropout(F.relu(self._mlp(x)), p=0.5, training=self.training) + minus_graph_embeddings = x + + # print('minus_graph_embeddings', minus_graph_embeddings.shape) + + # # MLP part of siren + # mlp = [self.E2] + + # #x = F.dropout(F.relu(self.mlps[0](self.E2)), p=0.5, training=self.training) + # x = F.dropout(F.relu(self.mlps[0](minus_graph_embeddings)), p=0.5, training=self.training) + # for i in range(1, self._mlp_layers): + # x = self.mlps[i](x) + # x = F.relu(x) + # x = F.dropout(x, p=0.5, training=self.training) + # mlp.append(x) + # # z_n siren + # #print('mlp', len(mlp)) + # #print('mlp :', mlp) + # minus_graph_embeddings = mlp[-1] + # shapes z_n + # [num_users + 2 + num_items + 2; embedding_dim] + # например для датасета ml1m: [9069; 64] + + # Z + w_p = self.q(F.dropout(torch.tanh((self.attn(all_graph_embeddings))), p=self._dropout_mlp, training=self.training)) + w_n = self.q(F.dropout(torch.tanh((self.attn(minus_graph_embeddings))), p=self._dropout_mlp, training=self.training)) + alpha_ = 
self.attn_softmax(torch.cat([w_p, w_n], dim=1)) + + graph_embeddings = alpha_[:, 0].view(len(all_graph_embeddings), 1) * all_graph_embeddings - \ + alpha_[:, 1].view(len(all_graph_embeddings), 1) * minus_graph_embeddings + + graph_user_embeddings, graph_item_embeddings = torch.split(graph_embeddings, [self._num_users + 2, self._num_items + 2]) + + user_embeddings, user_ego_embeddings, user_mask = self._get_embeddings( + inputs, self._user_prefix, self._user_embeddings, graph_user_embeddings + ) + user_embeddings = user_embeddings[user_mask] # (all_batch_events, embedding_dim) + + positive_embeddings, _, positive_mask = self._get_embeddings( + inputs, self._positive_prefix, self._item_embeddings, graph_item_embeddings + ) # (batch_size, seq_len, embedding_dim) + negative_embeddings, _, negative_mask = self._get_embeddings( + inputs, self._negative_prefix, self._item_embeddings, graph_item_embeddings + ) # (batch_size, seq_len, embedding_dim) + + # b - batch_size, s - seq_len, d - embedding_dim + positive_graph_scores = torch.einsum( + 'bd,bsd->bs', + user_embeddings, + positive_embeddings + ) # (batch_size, seq_len) + negative_graph_scores = torch.einsum( + 'bd,bsd->bs', + user_embeddings, + negative_embeddings + ) # (batch_size, seq_len) + + positive_graph_scores = positive_graph_scores[positive_mask] # (all_batch_events) + negative_graph_scores = negative_graph_scores[negative_mask] # (all_batch_events) + + minus_positive_embeddings, _, minus_positive_mask = self._get_embeddings( + inputs, self._positive_prefix, self._item_embeddings, minus_graph_embeddings + ) # (batch_size, seq_len, embedding_dim) + + minus_graph_scores = torch.einsum( + 'bd,bsd->bs', + user_embeddings, + minus_positive_embeddings + ) # (batch_size, seq_len) + + minus_graph_scores = minus_graph_scores[minus_positive_mask].unsqueeze(-1) # (all_batch_events) + + # sasrec + all_positive_sample_events = inputs['{}.ids'.format(self._positive_prefix)] # (all_batch_events) + all_negative_sample_events 
= inputs['{}.ids'.format(self._negative_prefix)] # (all_batch_events) + + all_sample_embeddings = sasrec_embeddings[sasrec_mask] # (all_batch_events, embedding_dim) + all_positive_sample_embeddings = self._item_embeddings( + all_positive_sample_events + ) # (all_batch_events, embedding_dim) + all_negative_sample_embeddings = self._item_embeddings( + all_negative_sample_events + ) # (all_batch_events, embedding_dim) + + positive_sequence_scores = torch.einsum( + 'bd,bd->b', + all_positive_sample_embeddings, + all_sample_embeddings + ) + negative_sequence_scores = torch.einsum( + 'bd,bd->b', + all_negative_sample_embeddings, + all_sample_embeddings + ) + + positive_graph_scores = positive_graph_scores.unsqueeze(-1).unsqueeze(-1) + positive_sequence_scores = positive_sequence_scores.unsqueeze(-1).unsqueeze(-1) + negative_graph_scores = negative_graph_scores.unsqueeze(-1).unsqueeze(-1) + negative_sequence_scores = negative_sequence_scores.unsqueeze(-1).unsqueeze(-1) + + positive_all_scores = torch.cat((positive_sequence_scores, positive_graph_scores), dim=1) + negative_all_scores = torch.cat((negative_sequence_scores, negative_graph_scores), dim=1) + + positive_all_scores = F.dropout(torch.tanh(positive_all_scores), p=self._dropout_gsnrec, training=self.training) + negative_all_scores = F.dropout(torch.tanh(negative_all_scores), p=self._dropout_gsnrec, training=self.training) + + beta_ = self.attn_softmax(torch.cat([positive_all_scores, negative_all_scores], dim=1)) + + len_seq = len(positive_all_scores) + + final_positive_scores = (beta_[:, 0].view(len_seq, 1) * positive_all_scores[:, 0, :] + + beta_[:, 1].view(len_seq, 1) * positive_all_scores[:, 1, :]) + + final_negative_scores = (beta_[:, 2].view(len_seq, 1) * negative_all_scores[:, 0, :] + + beta_[:, 3].view(len_seq, 1) * negative_all_scores[:, 1, :]) + + return { + 'current_embeddings': all_sample_embeddings, + 'positive_embeddings': all_positive_sample_embeddings, + 'negative_embeddings': 
all_negative_sample_embeddings, + 'positive_scores': final_positive_scores, + 'negative_scores': final_negative_scores, + 'minus_scores': minus_graph_scores, + 'item_embeddings': torch.cat((self._user_embeddings.weight, self._item_embeddings.weight), dim=0) + } + else: # eval mode + last_embeddings = self._get_last_embedding(sasrec_embeddings, sasrec_mask) # (batch_size, embedding_dim) + + # b - batch_size, n - num_candidates, d - embedding_dim + candidate_scores = torch.einsum( + 'bd,nd->bn', + last_embeddings, + self._item_embeddings.weight + ) # (batch_size, num_items + 2) + candidate_scores[:, 0] = -torch.inf + candidate_scores[:, self._num_items + 1:] = -torch.inf + + if '{}.ids'.format(self._candidate_prefix) in inputs: + candidate_events = inputs['{}.ids'.format(self._candidate_prefix)] # (all_batch_candidates) + candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)] # (batch_size) + + batch_size = candidate_lengths.shape[0] + num_candidates = candidate_lengths[0] + + candidate_scores = torch.gather( + input=candidate_scores, + dim=1, + index=torch.reshape(candidate_events, [batch_size, num_candidates]) + ) # (batch_size, num_candidates) + + values, indices = torch.topk( + candidate_scores, + k=20, dim=-1, largest=True + ) # (batch_size, 20), (batch_size, 20) + + return indices diff --git a/modeling/models/gsnrec5.py b/modeling/models/gsnrec5.py new file mode 100644 index 00000000..baca7d8a --- /dev/null +++ b/modeling/models/gsnrec5.py @@ -0,0 +1,335 @@ +from models.base import SequentialTorchModel + +from utils import create_masked_tensor, DEVICE +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class GSNRec5Model(SequentialTorchModel, config_name='gsnrec5'): + + def __init__( + self, + user_prefix, + sequence_prefix, + positive_prefix, + negative_prefix, + candidate_prefix, + num_users, + num_items, + max_sequence_length, + embedding_dim, + num_heads, + num_layers, + mlp_layers, + graph, + dim_feedforward, + 
ratings_prefix, + dropout_seq, + dropout_graph, + dropout_gsnrec, + dropout_mlp, + activation='relu', + layer_norm_eps=1e-9, + initializer_range=0.02 + ): + super().__init__( + num_items=num_items, + max_sequence_length=max_sequence_length, + embedding_dim=embedding_dim, + num_heads=num_heads, + num_layers=num_layers, + dim_feedforward=dim_feedforward, + dropout=dropout_seq, + activation=activation, + layer_norm_eps=layer_norm_eps, + is_causal=True + ) + self._sequence_prefix = sequence_prefix + self._positive_prefix = positive_prefix + self._negative_prefix = negative_prefix + self._candidate_prefix = candidate_prefix + + self._user_prefix = user_prefix + self._graph = graph + self._num_layers = num_layers + self._num_users = num_users + self._dropout_rate = dropout_graph + self._mlp_layers = mlp_layers + self._ratings_prefix = ratings_prefix + self._dropout_gsnrec = dropout_gsnrec + self._dropout_mlp = dropout_mlp + + self._output_projection = nn.Linear( + in_features=embedding_dim, + out_features=embedding_dim + ) + + self._bias = nn.Parameter( + data=torch.zeros(num_items + 2), + requires_grad=True + ) + + self._init_weights(initializer_range) + + self._user_embeddings = nn.Embedding( + num_embeddings=self._num_users + 2, + embedding_dim=self._embedding_dim + ) + + self._item_embeddings = nn.Embedding( + num_embeddings=self._num_items + 2, + embedding_dim=self._embedding_dim + ) + + self.attn = nn.Linear(self._embedding_dim, self._embedding_dim, bias=True) + self.q = nn.Linear(self._embedding_dim, 1, bias=False) + + self.mlps = nn.ModuleList() + for _ in range(self._mlp_layers): + self.mlps.append(nn.Linear(self._embedding_dim, self._embedding_dim, bias=True)) + nn.init.xavier_normal_(self.mlps[-1].weight.data) + + self.E2 = nn.Parameter(torch.empty(self._num_users + 2 + self._num_items + 2, self._embedding_dim)) + nn.init.xavier_normal_(self.E2.data) + + self.scores_gsnrec_linear = nn.Linear(1, self._embedding_dim, bias=True) + self.output_gsnrec_linear = 
nn.Linear(self._embedding_dim, 1, bias=False) + + self.attn_softmax = nn.Softmax(dim=1) + + @classmethod + def create_from_config(cls, config, **kwargs): + return cls( + user_prefix=config['user_prefix'], + sequence_prefix=config['sequence_prefix'], + positive_prefix=config['positive_prefix'], + negative_prefix=config['negative_prefix'], + candidate_prefix=config['candidate_prefix'], + graph=kwargs['graph'], + num_users=kwargs['num_users'], + num_items=kwargs['num_items'], + max_sequence_length=kwargs['max_sequence_length'], + embedding_dim=config['embedding_dim'], + num_heads=config.get('num_heads', int(config['embedding_dim'] // 64)), + num_layers=config['num_layers'], + mlp_layers=config['mlp_layers'], + dim_feedforward=config.get('dim_feedforward', 4 * config['embedding_dim']), + dropout_seq=config.get('dropout_seq', 0.0), + dropout_graph=config.get('dropout_graph', 0.0), + dropout_gsnrec=config.get('dropout_gsnrec', 0.0), + dropout_mlp=config.get('dropout_mlp', 0.0), + ratings_prefix=config['ratings_prefix'], + initializer_range=config.get('initializer_range', 0.02), + ) + + def _apply_graph_encoder(self): + ego_embeddings = torch.cat((self._user_embeddings.weight, self._item_embeddings.weight), dim=0) + all_embeddings = [ego_embeddings] + + if self._dropout_rate > 0: # drop some edges + if self.training: # training_mode + size = self._graph.size() + index = self._graph.indices().t() + values = self._graph.values() + random_index = torch.rand(len(values)) + (1 - self._dropout_rate) + random_index = random_index.int().bool() + index = index[random_index] + values = values[random_index] / (1 - self._dropout_rate) + graph_dropped = torch.sparse.FloatTensor(index.t(), values, size) + else: # eval mode + graph_dropped = self._graph + else: + graph_dropped = self._graph + + for i in range(self._num_layers): + ego_embeddings = torch.sparse.mm(graph_dropped, ego_embeddings) + norm_embeddings = F.normalize(ego_embeddings, p=2, dim=1) + all_embeddings += 
[norm_embeddings] + + all_embeddings = torch.stack(all_embeddings, 0).mean(0) + + user_final_embeddings, item_final_embeddings = torch.split( + all_embeddings, [self._num_users + 2, self._num_items + 2] + ) + + return user_final_embeddings, item_final_embeddings + + def _get_embeddings(self, inputs, prefix, ego_embeddings, final_embeddings): + ids = inputs['{}.ids'.format(prefix)] # (all_batch_events) + lengths = inputs['{}.length'.format(prefix)] # (batch_size) + + final_embeddings = final_embeddings[ids] # (all_batch_events, embedding_dim) + ego_embeddings = ego_embeddings(ids) # (all_batch_events, embedding_dim) + + padded_embeddings, mask = create_masked_tensor( + final_embeddings, lengths + ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) + + padded_ego_embeddings, ego_mask = create_masked_tensor( + ego_embeddings, lengths + ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) + + assert torch.all(mask == ego_mask) + + return padded_embeddings, padded_ego_embeddings, mask + + def forward(self, inputs): + all_sample_events = inputs['{}.ids'.format(self._sequence_prefix)] # (all_batch_events) + all_sample_lengths = inputs['{}.length'.format(self._sequence_prefix)] # (batch_size) + + sasrec_embeddings, sasrec_mask = self._apply_sequential_encoder( + all_sample_events, all_sample_lengths + ) # (batch_size, seq_len, embedding_dim), (batch_size, seq_len) + + if self.training: # training mode + + ### light_gcn - graph part of siren + graph_user_embeddings, graph_item_embeddings = \ + self._apply_graph_encoder() # (num_users + 2, embedding_dim), (num_items + 2, embedding_dim) + + user_embeddings, user_ego_embeddings, user_mask = self._get_embeddings( + inputs, self._user_prefix, self._user_embeddings, graph_user_embeddings + ) + user_embeddings = user_embeddings[user_mask] # (all_batch_events, embedding_dim) + + # z_p siren + # all_embeddings_positive = torch.cat((graph_user_embeddings, graph_item_embeddings), dim=0) + # # shapes z_p: + # # 
[num_users + 2 + num_items + 2; embedding_dim * (num_layers + 1) + # # например для датасета ml1m и num_layers = 2: [9069; 192], т.е. [6022 + 2 + 3043 + 2; 64 * 3] + # + # # MLP part of siren + # mlp = [self.E2] + # x = F.dropout(F.relu(self.mlps[0](self.E2)), self._dropout_mlp, training=self.training) + # for i in range(1, self._mlp_layers): + # x = self.mlps[i](x) + # x = F.relu(x) + # x = F.dropout(x, self._dropout_mlp, training=self.training) + # mlp.append(x) + # # z_n siren + # all_embeddings_negative = mlp[-1] + # # shapes z_n + # # [num_users + num_items; embedding_dim] + # # например для датасета ml1m: [9065; 64] + # + # # Z + # w_p = self.q(F.dropout(torch.tanh((self.attn(all_embeddings_positive))), self._dropout_mlp, training=self.training)) + # w_n = self.q(F.dropout(torch.tanh((self.attn(all_embeddings_negative))), self._dropout_mlp, training=self.training)) + # alpha_ = self.attn_softmax(torch.cat([w_p, w_n], dim=1)) + # + # graph_embeddings = alpha_[:, 0].view(len(all_embeddings_positive), 1) * all_embeddings_positive + \ + # alpha_[:, 1].view(len(all_embeddings_positive), 1) * all_embeddings_negative + # + # graph_user_embeddings, graph_item_embeddings = torch.split(graph_embeddings, [self._num_users + 2, self._num_items + 2]) + + positive_embeddings, _, positive_mask = self._get_embeddings( + inputs, self._positive_prefix, self._item_embeddings, graph_item_embeddings + ) # (batch_size, seq_len, embedding_dim) + negative_embeddings, _, negative_mask = self._get_embeddings( + inputs, self._negative_prefix, self._item_embeddings, graph_item_embeddings + ) # (batch_size, seq_len, embedding_dim) + + my_user_embeddings, _, my_user_mask = self._get_embeddings( + inputs, self._user_prefix, self._user_embeddings, graph_user_embeddings + ) # (batch_size, seq_len, embedding_dim) + + my_user_embeddings = my_user_embeddings[my_user_mask] # (all_batch_events, embedding_dim) + # b - batch_size, s - seq_len, d - embedding_dim + positive_graph_scores = 
torch.einsum( + 'bd,bsd->bs', + my_user_embeddings, + positive_embeddings + ) # (batch_size, seq_len) + negative_graph_scores = torch.einsum( + 'bd,bsd->bs', + my_user_embeddings, + negative_embeddings + ) # (batch_size, seq_len) + + positive_graph_scores = positive_graph_scores[positive_mask] # (all_batch_events) + negative_graph_scores = negative_graph_scores[negative_mask] # (all_batch_events) + + # sasrec + all_positive_sample_events = inputs['{}.ids'.format(self._positive_prefix)] # (all_batch_events) + all_negative_sample_events = inputs['{}.ids'.format(self._negative_prefix)] # (all_batch_events) + + all_sample_embeddings = sasrec_embeddings[sasrec_mask] # (all_batch_events, embedding_dim) + all_positive_sample_embeddings = self._item_embeddings( + all_positive_sample_events + ) # (all_batch_events, embedding_dim) + all_negative_sample_embeddings = self._item_embeddings( + all_negative_sample_events + ) # (all_batch_events, embedding_dim) + + positive_sequence_scores = torch.einsum( + 'bd,bd->b', + all_positive_sample_embeddings, + all_sample_embeddings + ) + negative_sequence_scores = torch.einsum( + 'bd,bd->b', + all_negative_sample_embeddings, + all_sample_embeddings + ) + + positive_graph_scores = positive_graph_scores.unsqueeze(-1).unsqueeze(-1) + positive_sequence_scores = positive_sequence_scores.unsqueeze(-1).unsqueeze(-1) + negative_graph_scores = negative_graph_scores.unsqueeze(-1).unsqueeze(-1) + negative_sequence_scores = negative_sequence_scores.unsqueeze(-1).unsqueeze(-1) + + positive_all_scores = torch.cat((positive_sequence_scores, positive_graph_scores), dim=1) + negative_all_scores = torch.cat((negative_sequence_scores, negative_graph_scores), dim=1) + + positive_all_scores = F.dropout(torch.tanh(positive_all_scores), p=self._dropout_gsnrec, training=self.training) + negative_all_scores = F.dropout(torch.tanh(negative_all_scores), p=self._dropout_gsnrec, training=self.training) + + beta_ = self.attn_softmax(torch.cat([positive_all_scores, 
negative_all_scores], dim=1)) + + len_seq = len(positive_all_scores) + + final_positive_scores = (beta_[:, 0].view(len_seq, 1) * positive_all_scores[:, 0, :] + + beta_[:, 1].view(len_seq, 1) * positive_all_scores[:, 1, :]) + + final_negative_scores = (beta_[:, 2].view(len_seq, 1) * negative_all_scores[:, 0, :] + + beta_[:, 3].view(len_seq, 1) * negative_all_scores[:, 1, :]) + + return { + 'current_embeddings': all_sample_embeddings, + 'positive_embeddings': all_positive_sample_embeddings, + 'negative_embeddings': all_negative_sample_embeddings, + 'positive_scores': final_positive_scores, + 'negative_scores': final_negative_scores, + 'item_embeddings': torch.cat((self._user_embeddings.weight, self._item_embeddings.weight), dim=0) + } + else: # eval mode + last_embeddings = self._get_last_embedding(sasrec_embeddings, sasrec_mask) # (batch_size, embedding_dim) + + # b - batch_size, n - num_candidates, d - embedding_dim + candidate_scores = torch.einsum( + 'bd,nd->bn', + last_embeddings, + self._item_embeddings.weight + ) # (batch_size, num_items + 2) + candidate_scores[:, 0] = -torch.inf + candidate_scores[:, self._num_items + 1:] = -torch.inf + + if '{}.ids'.format(self._candidate_prefix) in inputs: + candidate_events = inputs['{}.ids'.format(self._candidate_prefix)] # (all_batch_candidates) + candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)] # (batch_size) + + batch_size = candidate_lengths.shape[0] + num_candidates = candidate_lengths[0] + + candidate_scores = torch.gather( + input=candidate_scores, + dim=1, + index=torch.reshape(candidate_events, [batch_size, num_candidates]) + ) # (batch_size, num_candidates) + + values, indices = torch.topk( + candidate_scores, + k=20, dim=-1, largest=True + ) # (batch_size, 20), (batch_size, 20) + + return indices diff --git a/modeling/models/lightgcn.py b/modeling/models/lightgcn.py index 121d5233..bb4d8ebd 100644 --- a/modeling/models/lightgcn.py +++ b/modeling/models/lightgcn.py @@ -1,6 +1,6 @@ from 
models.base import TorchModel -from utils import create_masked_tensor +from utils import create_masked_tensor, DEVICE import torch import torch.nn as nn @@ -13,8 +13,6 @@ def __init__( self, user_prefix, positive_prefix, - negative_prefix, - candidate_prefix, graph, num_users, num_items, @@ -26,8 +24,6 @@ def __init__( super().__init__() self._user_prefix = user_prefix self._positive_prefix = positive_prefix - self._negative_prefix = negative_prefix - self._candidate_prefix = candidate_prefix self._graph = graph self._num_users = num_users self._num_items = num_items @@ -52,8 +48,6 @@ def create_from_config(cls, config, **kwargs): return cls( user_prefix=config['user_prefix'], positive_prefix=config['positive_prefix'], - negative_prefix=config['negative_prefix'], - candidate_prefix=config['candidate_prefix'], graph=kwargs['graph'], num_users=kwargs['num_users'], num_items=kwargs['num_items'], @@ -120,38 +114,55 @@ def forward(self, inputs): user_embeddings, user_ego_embeddings, user_mask = self._get_embeddings( inputs, self._user_prefix, self._user_embeddings, all_final_user_embeddings ) - user_embeddings = user_embeddings[user_mask] # (all_batch_events, embedding_dim) + user_embeddings = user_embeddings[user_mask] # (batch_size, embedding_dim) if self.training: # training mode - positive_embeddings, _, positive_mask = self._get_embeddings( - inputs, self._positive_prefix, self._item_embeddings, all_final_item_embeddings - ) # (batch_size, seq_len, embedding_dim) - negative_embeddings, _, negative_mask = self._get_embeddings( - inputs, self._negative_prefix, self._item_embeddings, all_final_item_embeddings - ) # (batch_size, seq_len, embedding_dim) - - # b - batch_size, s - seq_len, d - embedding_dim - positive_scores = torch.einsum( - 'bd,bsd->bs', + positive_item_ids = inputs['{}.ids'.format(self._positive_prefix)] # (all_batch_events) + positive_item_lengths = inputs['{}.length'.format(self._positive_prefix)] # (batch_size) + + batch_size = 
positive_item_lengths.shape[0] + max_sequence_length = positive_item_lengths.max().item() + + mask = torch.arange( + end=max_sequence_length, + device=DEVICE + )[None].tile([batch_size, 1]) < positive_item_lengths[:, None] # (batch_size, max_seq_len) + + positive_user_ids = torch.arange( + batch_size, + device=DEVICE + )[None].tile([max_sequence_length, 1]).T # (batch_size, max_seq_len) + positive_user_ids = positive_user_ids[mask] # (all_batch_items) + user_embeddings = user_embeddings[positive_user_ids] # (all_batch_items, embedding_dim) + + all_scores = torch.einsum( + 'ad,nd->an', user_embeddings, - positive_embeddings - ) # (batch_size, seq_len) - negative_scores = torch.einsum( - 'bd,bsd->bs', - user_embeddings, - negative_embeddings - ) # (batch_size, seq_len) + all_final_item_embeddings + ) # (all_batch_items, num_items + 2) + + negative_mask = torch.zeros(self._num_items + 2, dtype=torch.bool, device=DEVICE) # (num_items + 2) + negative_mask[positive_item_ids] = 1 + + positive_scores = torch.gather( + input=all_scores, + dim=1, + index=positive_item_ids[..., None] + ) # (all_batch_items, 1) - positive_scores = positive_scores[positive_mask] # (all_batch_events) - negative_scores = negative_scores[negative_mask] # (all_batch_events) + all_scores = torch.scatter_add( + input=all_scores, + dim=1, + index=positive_item_ids[..., None], + src=torch.ones_like(positive_item_ids[..., None]).float() + ) # (all_batch_items, num_items + 2) return { 'positive_scores': positive_scores, - 'negative_scores': negative_scores, + 'negative_scores': all_scores, 'item_embeddings': torch.cat((self._user_embeddings.weight, self._item_embeddings.weight), dim=0) } else: # eval mode - # b - batch_size, n - num_candidates, d - embedding_dim candidate_scores = torch.einsum( 'bd,nd->bn', user_embeddings, @@ -160,22 +171,9 @@ def forward(self, inputs): candidate_scores[:, 0] = -torch.inf candidate_scores[:, self._num_items + 1:] = -torch.inf - if '{}.ids'.format(self._candidate_prefix) 
in inputs: - candidate_events = inputs['{}.ids'.format(self._candidate_prefix)] # (all_batch_candidates) - candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)] # (batch_size) - - batch_size = candidate_lengths.shape[0] - num_candidates = candidate_lengths[0] - - candidate_scores = torch.gather( - input=candidate_scores, - dim=1, - index=torch.reshape(candidate_events, [batch_size, num_candidates]) - ) # (batch_size, num_candidates) - _, indices = torch.topk( candidate_scores, k=20, dim=-1, largest=True ) # (batch_size, 20) - return indices + return indices \ No newline at end of file diff --git a/modeling/models/sasrec.py b/modeling/models/sasrec.py index fe90563d..5910455f 100644 --- a/modeling/models/sasrec.py +++ b/modeling/models/sasrec.py @@ -1,9 +1,6 @@ from models.base import SequentialTorchModel -from utils import create_masked_tensor - import torch -import torch.nn as nn class SasRecModel(SequentialTorchModel, config_name='sasrec'): @@ -12,8 +9,6 @@ def __init__( self, sequence_prefix, positive_prefix, - negative_prefix, - candidate_prefix, num_items, max_sequence_length, embedding_dim, @@ -39,18 +34,6 @@ def __init__( ) self._sequence_prefix = sequence_prefix self._positive_prefix = positive_prefix - self._negative_prefix = negative_prefix - self._candidate_prefix = candidate_prefix - - self._output_projection = nn.Linear( - in_features=embedding_dim, - out_features=embedding_dim - ) - - self._bias = nn.Parameter( - data=torch.zeros(num_items + 2), - requires_grad=True - ) self._init_weights(initializer_range) @@ -59,8 +42,6 @@ def create_from_config(cls, config, **kwargs): return cls( sequence_prefix=config['sequence_prefix'], positive_prefix=config['positive_prefix'], - negative_prefix=config['negative_prefix'], - candidate_prefix=config['candidate_prefix'], num_items=kwargs['num_items'], max_sequence_length=kwargs['max_sequence_length'], embedding_dim=config['embedding_dim'], @@ -81,20 +62,28 @@ def forward(self, inputs): if 
self.training: # training mode all_positive_sample_events = inputs['{}.ids'.format(self._positive_prefix)] # (all_batch_events) - all_negative_sample_events = inputs['{}.ids'.format(self._negative_prefix)] # (all_batch_events) all_sample_embeddings = embeddings[mask] # (all_batch_events, embedding_dim) all_positive_sample_embeddings = self._item_embeddings( all_positive_sample_events ) # (all_batch_events, embedding_dim) - all_negative_sample_embeddings = self._item_embeddings( - all_negative_sample_events - ) # (all_batch_events, embedding_dim) + + all_embeddings = self._item_embeddings.weight # (num_items + 2, embedding_dim) + + all_scores = torch.einsum( + 'ad,nd->an', + all_sample_embeddings, + all_embeddings + ) # (all_batch_events, num_items + 2) + positive_scores = torch.gather( + input=all_scores, + dim=1, + index=all_positive_sample_events[..., None] + ) # (all_batch_items, 1) return { - 'current_embeddings': all_sample_embeddings, - 'positive_embeddings': all_positive_sample_embeddings, - 'negative_embeddings': all_negative_sample_embeddings + 'positive_scores': positive_scores, + 'negative_scores': all_scores } else: # eval mode last_embeddings = self._get_last_embedding(embeddings, mask) # (batch_size, embedding_dim) @@ -108,23 +97,10 @@ def forward(self, inputs): candidate_scores[:, 0] = -torch.inf candidate_scores[:, self._num_items + 1:] = -torch.inf - if '{}.ids'.format(self._candidate_prefix) in inputs: - candidate_events = inputs['{}.ids'.format(self._candidate_prefix)] # (all_batch_candidates) - candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)] # (batch_size) - - batch_size = candidate_lengths.shape[0] - num_candidates = candidate_lengths[0] - - candidate_scores = torch.gather( - input=candidate_scores, - dim=1, - index=torch.reshape(candidate_events, [batch_size, num_candidates]) - ) # (batch_size, num_candidates) - - values, indices = torch.topk( + _, indices = torch.topk( candidate_scores, k=20, dim=-1, largest=True - ) # 
import torch
import torch.nn as nn
import torch.nn.functional as F

from models.base import TorchModel
from utils import create_masked_tensor


class SiReNModel(TorchModel, config_name='siren'):
    """Sign-aware graph recommender (SiReN-style) built on a LightGCN encoder.

    Two embedding tables over all users and items are produced and fused:

    * a "positive" table obtained by propagating the learnable user/item
      embeddings through ``graph`` (LightGCN-style, layers averaged);
    * a "negative" table obtained by passing the free parameter ``E2``
      through an MLP.

    A shared attention head computes a per-node softmax weight for each table,
    and the weighted sum is the final embedding used for scoring.
    """

    # Dropout probability used in the negative-branch MLP and the attention head.
    _BRANCH_DROPOUT = 0.5
    # Number of items returned per user in eval mode.
    _TOP_K = 20

    def __init__(
            self,
            user_prefix,
            positive_prefix,
            negative_prefix,
            candidate_prefix,
            num_users,
            num_items,
            embedding_dim,
            num_layers,
            mlp_layers,
            graph,
            dropout=0.0,
            initializer_range=0.02,
    ):
        """Build the embedding tables, the attention head and the negative MLP.

        Args:
            user_prefix: key prefix of the user ids in ``inputs``.
            positive_prefix: key prefix of the positive item ids in ``inputs``.
            negative_prefix: key prefix of the negative item ids in ``inputs``.
            candidate_prefix: key prefix of the optional eval candidates.
            num_users: number of users (two extra rows are allocated).
            num_items: number of items (two extra rows are allocated).
            embedding_dim: size of every embedding vector.
            num_layers: number of graph propagation steps.
            mlp_layers: number of linear layers in the negative-branch MLP;
                must be at least 1.
            graph: sparse adjacency used for propagation; presumably a
                coalesced COO tensor over ``num_users + 2 + num_items + 2``
                nodes -- TODO confirm against the dataset code.
            dropout: probability of dropping a graph edge during training.
            initializer_range: forwarded to ``_init_weights`` (defined in the
                base class).

        Raises:
            ValueError: if ``mlp_layers`` is smaller than 1.
        """
        super().__init__()
        if mlp_layers < 1:
            raise ValueError('mlp_layers must be >= 1, got {}'.format(mlp_layers))

        self._user_prefix = user_prefix
        self._positive_prefix = positive_prefix
        self._negative_prefix = negative_prefix
        self._candidate_prefix = candidate_prefix
        self._graph = graph
        self._num_users = num_users
        self._num_items = num_items
        self._embedding_dim = embedding_dim
        self._num_layers = num_layers
        self._dropout_rate = dropout
        self._mlp_layers = mlp_layers

        self._user_embeddings = nn.Embedding(
            num_embeddings=self._num_users + 2,
            embedding_dim=self._embedding_dim
        )

        self._item_embeddings = nn.Embedding(
            num_embeddings=self._num_items + 2,
            embedding_dim=self._embedding_dim
        )

        # NOTE(review): `_output_projection` and `_bias` are not used by
        # `forward`; they are kept so that existing checkpoints still load.
        self._output_projection = nn.Linear(
            in_features=self._embedding_dim,
            out_features=self._embedding_dim
        )

        self._bias = nn.Parameter(
            data=torch.zeros(num_items + 2),
            requires_grad=True
        )

        # Attention model: scores each branch, softmax over the two branches.
        self.attn = nn.Linear(self._embedding_dim, self._embedding_dim, bias=True)
        self.q = nn.Linear(self._embedding_dim, 1, bias=False)
        self.attn_softmax = nn.Softmax(dim=1)

        self._init_weights(initializer_range)

        # The negative branch is created after `_init_weights` and uses its
        # own Xavier initialisation.
        self.mlps = nn.ModuleList()
        for _ in range(self._mlp_layers):
            self.mlps.append(nn.Linear(self._embedding_dim, self._embedding_dim, bias=True))
            nn.init.xavier_normal_(self.mlps[-1].weight.data)

        self.E2 = nn.Parameter(torch.empty(self._num_users + 2 + self._num_items + 2, self._embedding_dim))
        nn.init.xavier_normal_(self.E2.data)

    @classmethod
    def create_from_config(cls, config, **kwargs):
        """Instantiate the model from a config dict plus dataset metadata.

        ``config`` supplies the prefixes and hyper-parameters; ``kwargs`` must
        contain ``graph``, ``num_users`` and ``num_items``.
        """
        return cls(
            user_prefix=config['user_prefix'],
            positive_prefix=config['positive_prefix'],
            negative_prefix=config['negative_prefix'],
            candidate_prefix=config['candidate_prefix'],
            graph=kwargs['graph'],
            num_users=kwargs['num_users'],
            num_items=kwargs['num_items'],
            embedding_dim=config['embedding_dim'],
            num_layers=config['num_layers'],
            mlp_layers=config['mlp_layers'],
            dropout=config['dropout'],
            initializer_range=config.get('initializer_range', 0.02)
        )

    def _sample_graph(self):
        """Return the propagation graph, with random edge dropout in training."""
        if not self.training or self._dropout_rate <= 0:
            return self._graph

        keep_prob = 1 - self._dropout_rate
        values = self._graph.values()
        # Sample the mask on the graph's device to avoid a host/device copy.
        keep = torch.rand(values.shape[0], device=values.device) >= self._dropout_rate
        return torch.sparse_coo_tensor(
            self._graph.indices()[:, keep],
            values[keep] / keep_prob,  # rescale so the expected edge weight is unchanged
            self._graph.size()
        )

    def _apply_graph_encoder(self):
        """Propagate user/item embeddings through the graph.

        Returns:
            (user_embeddings, item_embeddings) with shapes
            (num_users + 2, embedding_dim) and (num_items + 2, embedding_dim):
            the mean of the input embeddings and the L2-normalised output of
            every propagation step.
        """
        graph = self._sample_graph()

        current = torch.cat((self._user_embeddings.weight, self._item_embeddings.weight), dim=0)
        layer_outputs = [current]
        for _ in range(self._num_layers):
            # Propagation continues from the un-normalised output; only the
            # copy that enters the average is normalised.
            current = torch.sparse.mm(graph, current)
            layer_outputs.append(F.normalize(current, p=2, dim=1))

        averaged = torch.stack(layer_outputs, 0).mean(0)

        user_final_embeddings, item_final_embeddings = torch.split(
            averaged, [self._num_users + 2, self._num_items + 2]
        )

        return user_final_embeddings, item_final_embeddings

    def _get_embeddings(self, inputs, prefix, ego_embeddings, final_embeddings):
        """Gather and pad the embeddings of the ids stored under ``prefix``.

        Args:
            inputs: batch dict with ``'<prefix>.ids'`` and ``'<prefix>.length'``.
            prefix: key prefix selecting which ids to read.
            ego_embeddings: ``nn.Embedding`` looked up with the ids.
            final_embeddings: tensor indexed with the ids.

        Returns:
            (padded_final, padded_ego, mask) as produced by
            ``create_masked_tensor``; presumably (batch_size, seq_len,
            embedding_dim) tensors and a (batch_size, seq_len) boolean mask.
        """
        ids = inputs['{}.ids'.format(prefix)]  # (all_batch_events)
        lengths = inputs['{}.length'.format(prefix)]  # (batch_size)

        final_embeddings = final_embeddings[ids]  # (all_batch_events, embedding_dim)
        ego_embeddings = ego_embeddings(ids)  # (all_batch_events, embedding_dim)

        padded_embeddings, mask = create_masked_tensor(final_embeddings, lengths)
        padded_ego_embeddings, ego_mask = create_masked_tensor(ego_embeddings, lengths)

        assert torch.all(mask == ego_mask)

        return padded_embeddings, padded_ego_embeddings, mask

    def _negative_branch(self):
        """Run ``E2`` through the MLP (Linear -> ReLU -> dropout per layer)."""
        hidden = self.E2
        for layer in self.mlps:
            hidden = F.dropout(F.relu(layer(hidden)), p=self._BRANCH_DROPOUT, training=self.training)
        return hidden  # (num_users + 2 + num_items + 2, embedding_dim)

    def _attention_logit(self, embeddings):
        """Return one attention logit per row of ``embeddings``: shape (N, 1)."""
        projected = torch.tanh(self.attn(embeddings))
        return self.q(F.dropout(projected, p=self._BRANCH_DROPOUT, training=self.training))

    def _fuse_embeddings(self):
        """Fuse the positive (graph) and negative (MLP) tables with attention.

        Returns:
            (user_embeddings, item_embeddings) split from the fused table.
        """
        all_user_embeddings, all_item_embeddings = self._apply_graph_encoder()

        # Positive branch: the layers are averaged, so the width stays
        # embedding_dim. Shape: (num_users + 2 + num_items + 2, embedding_dim).
        embeddings_positive = torch.cat((all_user_embeddings, all_item_embeddings), dim=0)
        embeddings_negative = self._negative_branch()

        alpha = self.attn_softmax(torch.cat(
            [self._attention_logit(embeddings_positive), self._attention_logit(embeddings_negative)],
            dim=1
        ))  # (num_nodes, 2), rows sum to 1

        fused = alpha[:, 0:1] * embeddings_positive + alpha[:, 1:2] * embeddings_negative

        return torch.split(fused, [self._num_users + 2, self._num_items + 2])

    def forward(self, inputs):
        """Score items for the users in the batch.

        In training mode returns a dict with ``positive_scores``,
        ``negative_scores`` (one score per positive / negative event) and
        ``item_embeddings`` (the raw user and item embedding weights,
        concatenated). In eval mode returns the indices of the top-scoring
        items per user, taken over the supplied candidates when
        ``'<candidate_prefix>.ids'`` is present and over all items otherwise.
        """
        graph_user_embeddings, graph_item_embeddings = self._fuse_embeddings()

        # Users and items are both looked up in the fused tables so that the
        # dot products compare vectors from the same embedding space.
        user_embeddings, _, user_mask = self._get_embeddings(
            inputs, self._user_prefix, self._user_embeddings, graph_user_embeddings
        )
        user_embeddings = user_embeddings[user_mask]  # (all_batch_events, embedding_dim)

        if self.training:  # training mode
            positive_embeddings, _, positive_mask = self._get_embeddings(
                inputs, self._positive_prefix, self._item_embeddings, graph_item_embeddings
            )  # (batch_size, seq_len, embedding_dim)
            negative_embeddings, _, negative_mask = self._get_embeddings(
                inputs, self._negative_prefix, self._item_embeddings, graph_item_embeddings
            )  # (batch_size, seq_len, embedding_dim)

            # b - batch_size, s - seq_len, d - embedding_dim
            positive_scores = torch.einsum(
                'bd,bsd->bs', user_embeddings, positive_embeddings
            )  # (batch_size, seq_len)
            negative_scores = torch.einsum(
                'bd,bsd->bs', user_embeddings, negative_embeddings
            )  # (batch_size, seq_len)

            return {
                'positive_scores': positive_scores[positive_mask],  # (all_batch_events)
                'negative_scores': negative_scores[negative_mask],  # (all_batch_events)
                'item_embeddings': torch.cat(
                    (self._user_embeddings.weight, self._item_embeddings.weight), dim=0
                )
            }

        # eval mode
        # b - batch_size, n - num_items + 2, d - embedding_dim
        candidate_scores = torch.einsum(
            'bd,nd->bn', user_embeddings, graph_item_embeddings
        )  # (batch_size, num_items + 2)
        # Row 0 and the rows past num_items are the two extra (non-item) slots.
        candidate_scores[:, 0] = -torch.inf
        candidate_scores[:, self._num_items + 1:] = -torch.inf

        candidate_key = '{}.ids'.format(self._candidate_prefix)
        if candidate_key in inputs:
            candidate_events = inputs[candidate_key]  # (all_batch_candidates)
            candidate_lengths = inputs['{}.length'.format(self._candidate_prefix)]  # (batch_size)

            batch_size = candidate_lengths.shape[0]
            # Every user is expected to carry the same number of candidates.
            num_candidates = int(candidate_lengths[0])

            candidate_scores = torch.gather(
                input=candidate_scores,
                dim=1,
                index=torch.reshape(candidate_events, [batch_size, num_candidates])
            )  # (batch_size, num_candidates)

        # Clamp k so that a short candidate list does not make topk fail.
        _, indices = torch.topk(
            candidate_scores,
            k=min(self._TOP_K, candidate_scores.shape[-1]), dim=-1, largest=True
        )  # (batch_size, k)

        return indices
Finish training'.format(epochs_threshold)) break @@ -129,9 +129,8 @@ def main(): optimizer=optimizer, loss_function=loss_function, callback=callback, - epoch_cnt=config.get('train_epochs_num'), - step_cnt=config.get('train_steps_num'), - best_metric=config.get('best_metric') + epoch_cnt=config.get('train_epochs_num', None), + best_metric=config.get('best_metric', None) ) logger.debug('Saving model...') diff --git a/modeling/train_multiple.py b/modeling/train_multiple.py index e08bcbc8..22c8ca5b 100644 --- a/modeling/train_multiple.py +++ b/modeling/train_multiple.py @@ -9,7 +9,7 @@ from train import train from infer import inference -from callbacks import BaseCallback, EvalCallback +from callbacks import BaseCallback, EvalCallback, ValidationCallback from dataset import BaseDataset from dataloader import BaseDataloader from loss import BaseLoss @@ -113,11 +113,11 @@ def main(): best_metric=config.get('best_metric') ) - eval_model = BaseModel.create_from_config(config['model'], **dataset.meta).to(DEVICE) + eval_model = BaseModel.create_from_config(model_param, **dataset.meta).to(DEVICE) eval_model.load_state_dict(best_model_checkpoint) for cl in callback._callbacks: - if isinstance(cl, EvalCallback): + if isinstance(cl, ValidationCallback): metrics = cl._metrics pred_prefix = cl._pred_prefix labels_prefix = cl._labels_prefix