72 changes: 42 additions & 30 deletions scripts/plum/cooc_data.py
@@ -13,21 +13,15 @@ class CoocMappingDataset:
def __init__(
self,
train_sampler,
-        validation_sampler,
-        test_sampler,
num_items,
-        max_sequence_length,
cooccur_counter_mapping=None
):
self._train_sampler = train_sampler
-        self._validation_sampler = validation_sampler
-        self._test_sampler = test_sampler
self._num_items = num_items
-        self._max_sequence_length = max_sequence_length
self._cooccur_counter_mapping = cooccur_counter_mapping

@classmethod
-    def create(cls, inter_json_path, max_sequence_length, sampler_type, window_size):
+    def create(cls, inter_json_path, window_size):
max_item_id = 0
train_dataset, validation_dataset, test_dataset = [], [], []

@@ -43,31 +37,59 @@ def create(cls, inter_json_path, max_sequence_length, sampler_type, window_size)
'user.ids': [user_id],
'item.ids': item_ids[:-2],
})
-            validation_dataset.append({
-                'user.ids': [user_id],
-                'item.ids': item_ids[:-1],
-            })
-            test_dataset.append({
-                'user.ids': [user_id],
-                'item.ids': item_ids,
-            })

cooccur_counter_mapping = cls.build_cooccur_counter_mapping(train_dataset, window_size=window_size)
logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items but max_item_id is {max_item_id}')

train_sampler = train_dataset
-        validation_sampler = validation_dataset
-        test_sampler = test_dataset

return cls(
train_sampler=train_sampler,
-            validation_sampler=validation_sampler,
-            test_sampler=test_sampler,
num_items=max_item_id + 1,
-            max_sequence_length=max_sequence_length,
cooccur_counter_mapping=cooccur_counter_mapping
)

+    @classmethod
+    def create_from_split_part(
+        cls,
+        train_inter_json_path,
+        window_size
+    ):
+
+        max_item_id = 0
+        train_dataset = []
+
+        with open(train_inter_json_path, 'r') as f:
+            train_interactions = json.load(f)
+
+        # Process the TRAIN split
+        for user_id_str, item_ids in train_interactions.items():
+            user_id = int(user_id_str)
+            if item_ids:
+                max_item_id = max(max_item_id, max(item_ids))
+
+            train_dataset.append({
+                'user.ids': [user_id],
+                'item.ids': item_ids,
+            })
+
+        logger.debug(f'Train: {len(train_dataset)} users')
+        logger.debug(f'Max item ID: {max_item_id}')
+
+        cooccur_counter_mapping = cls.build_cooccur_counter_mapping(
+            train_dataset,
+            window_size=window_size
+        )
+
+        logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items')
+
+        return cls(
+            train_sampler=train_dataset,
+            num_items=max_item_id + 1,
+            cooccur_counter_mapping=cooccur_counter_mapping
+        )


@staticmethod
    def build_cooccur_counter_mapping(train_dataset, window_size):  # TODO: pass timestamps and build the window from them
cooccur_counts = defaultdict(Counter)
@@ -80,16 +102,6 @@ def build_cooccur_counter_mapping(train_dataset, window_size):
cooccur_counts[item_i][items[j]] += 1
return cooccur_counts

-    def get_datasets(self):
-        return self._train_sampler, self._validation_sampler, self._test_sampler
-
-    @property
-    def num_items(self):
-        return self._num_items
-
-    @property
-    def max_sequence_length(self):
-        return self._max_sequence_length

@property
def cooccur_counter_mapping(self):
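A minimal sketch of the window-based counting that build_cooccur_counter_mapping performs, easiest to see on a toy sequence. The loop body between the defaultdict and the increment is collapsed in this diff view, so the forward-only window below is an assumption about the hidden lines, not the repo's confirmed code:

from collections import Counter, defaultdict

def build_cooccur_counter_mapping(train_dataset, window_size):
    cooccur_counts = defaultdict(Counter)
    for record in train_dataset:
        items = record['item.ids']
        for i, item_i in enumerate(items):
            # count every item that follows item_i within window_size positions
            for j in range(i + 1, min(i + 1 + window_size, len(items))):
                cooccur_counts[item_i][items[j]] += 1
    return cooccur_counts

# With window_size=2, item 1 co-occurs with items 2 and 3 but not with 4:
mapping = build_cooccur_counter_mapping(
    [{'user.ids': [0], 'item.ids': [1, 2, 3, 4]}], window_size=2)
print(mapping[1])  # Counter({2: 1, 3: 1})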
48 changes: 21 additions & 27 deletions scripts/plum/infer_default.py
@@ -12,8 +12,18 @@

from data import EmbeddingDataset, ProcessEmbeddings
from models import PlumRQVAE
-from transforms import AddWeightedCooccurrenceEmbeddings
-from cooc_data import CoocMappingDataset

+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_plum_rqvae_beauty_ws_2_best_0.0051.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# EVERYTHING ELSE

SEED_VALUE = 42
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
@@ -26,29 +36,16 @@
NUM_CODEBOOKS = 3

BETA = 0.25
-MODEL_PATH = '/home/jovyan/IRec/checkpoints/test_plum_rqvae_beauty_ws_2_best_0.0054.pth'
-
-WINDOW_SIZE = 2
-
-EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
-
-IREC_PATH = '/home/jovyan/IRec/'


def main():
fix_random_seed(SEED_VALUE)

-    data = CoocMappingDataset.create(
-        inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter_new.json'),
-        max_sequence_length=20,
-        sampler_type='sasrec',
-        window_size=WINDOW_SIZE
-    )
-
dataset = EmbeddingDataset(
-        data_path='/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+        data_path=EMBEDDINGS_PATH
)

item_id_to_embedding = {}
all_item_ids = []
for idx in range(len(dataset)):
@@ -57,15 +54,12 @@ def main():
item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
all_item_ids.append(item_id)

-    add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
-        data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
-
dataloader = DataLoader(
dataset,
batch_size=BATCH_SIZE,
shuffle=False,
drop_last=False,
-    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))

model = PlumRQVAE(
input_dim=INPUT_DIM,
@@ -106,8 +100,8 @@ def main():
cb.Logger().every_num_steps(len(dataloader)),

cb.InferenceSaver(
-            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
-            save_path=f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json',
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
format='json'
)
]
@@ -125,9 +119,9 @@ def main():
from collections import defaultdict
import numpy as np

-    with open(f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json', 'r') as f:
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
mappings = json.load(f)

inter = {}
sem_2_ids = defaultdict(list)
for mapping in mappings:
@@ -143,8 +137,8 @@ def main():
inter[item_id].append(collision_solver)
for i in range(len(inter[item_id])):
inter[item_id][i] += CODEBOOK_SIZE * i
-    with open(os.path.join(IREC_PATH, 'results', f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
json.dump(inter, f, indent=2)
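
The loop above resolves collisions by appending an extra token and then offsetting level i into its own token range, so all levels share one vocabulary. A minimal sketch of that offsetting, assuming the collision solver is simply a running index over items that share the same code triple (its assignment is collapsed in this diff view):

from collections import defaultdict

CODEBOOK_SIZE = 256
clusters = {'12': [3, 7, 3], '34': [3, 7, 3]}  # two items with identical cluster codes

seen = defaultdict(int)
inter = {}
for item_id, codes in clusters.items():
    collision_solver = seen[tuple(codes)]  # 0 for the first such item, 1 for the next, ...
    seen[tuple(codes)] += 1
    inter[item_id] = codes + [collision_solver]
    for i in range(len(inter[item_id])):
        inter[item_id][i] += CODEBOOK_SIZE * i  # level i gets the token range [256*i, 256*(i+1))

print(inter)  # {'12': [3, 263, 515, 768], '34': [3, 263, 515, 769]}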


2 changes: 0 additions & 2 deletions scripts/plum/train_plum.py
@@ -41,8 +41,6 @@ def main():

data = CoocMappingDataset.create(
inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter_new.json'),
-        max_sequence_length=20,
-        sampler_type='sasrec',
window_size=WINDOW_SIZE
)

168 changes: 168 additions & 0 deletions scripts/plum/train_plum_timestamp_based.py
@@ -0,0 +1,168 @@
from loguru import logger
import os

import torch

import pickle

import irec.callbacks as cb
from irec.data.dataloader import DataLoader
from irec.data.transforms import Collate, ToTorch, ToDevice
from irec.runners import TrainingRunner

from irec.utils import fix_random_seed

from callbacks import InitCodebooks, FixDeadCentroids
from data import EmbeddingDataset, ProcessEmbeddings
from models import PlumRQVAE
from transforms import AddWeightedCooccurrenceEmbeddings
from cooc_data import CoocMappingDataset

SEED_VALUE = 42
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

NUM_EPOCHS = 500
BATCH_SIZE = 1024

INPUT_DIM = 4096
HIDDEN_DIM = 32
CODEBOOK_SIZE = 256
NUM_CODEBOOKS = 3
BETA = 0.25
LR = 1e-4
WINDOW_SIZE = 2

EXPERIMENT_NAME = f'4-1_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/Beauty_new/splits/exp_data/exp_4.1_inter_semantics_train.json"
EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl"
IREC_PATH = '../../'

def main():
fix_random_seed(SEED_VALUE)

data = CoocMappingDataset.create_from_split_part(
train_inter_json_path=INTER_TRAIN_PATH,
window_size=WINDOW_SIZE
)

dataset = EmbeddingDataset(
data_path=EMBEDDINGS_PATH
)

item_id_to_embedding = {}
all_item_ids = []
for idx in range(len(dataset)):
sample = dataset[idx]
item_id = int(sample['item_id'])
item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
all_item_ids.append(item_id)

add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)

train_dataloader = DataLoader(
dataset,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True,
).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
).map(add_cooc_transform).repeat(NUM_EPOCHS)

valid_dataloader = DataLoader(
dataset,
batch_size=BATCH_SIZE,
shuffle=False,
drop_last=False,
).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)

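    # len(train_dataloader) already includes the NUM_EPOCHS repeats above, so this is the step count of a single pass over the data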
LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)

model = PlumRQVAE(
input_dim=INPUT_DIM,
num_codebooks=NUM_CODEBOOKS,
codebook_size=CODEBOOK_SIZE,
embedding_dim=HIDDEN_DIM,
beta=BETA,
quant_loss_weight=1.0,
contrastive_loss_weight=1.0,
temperature=1.0
).to(DEVICE)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

logger.debug(f'Overall parameters: {total_params:,}')
logger.debug(f'Trainable parameters: {trainable_params:,}')

optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
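    # fused=True assumes a device/PyTorch combination with a fused Adam kernel (historically CUDA-only); drop it if DEVICE falls back to CPU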

callbacks = [
InitCodebooks(valid_dataloader),

cb.BatchMetrics(metrics=lambda model_outputs, batch: {
'loss': model_outputs['loss'],
'recon_loss': model_outputs['recon_loss'],
'rqvae_loss': model_outputs['rqvae_loss'],
'con_loss': model_outputs['con_loss']
}, name='train'),

FixDeadCentroids(valid_dataloader),

cb.MetricAccumulator(
accumulators={
'train/loss': cb.MeanAccumulator(),
'train/recon_loss': cb.MeanAccumulator(),
'train/rqvae_loss': cb.MeanAccumulator(),
'train/con_loss': cb.MeanAccumulator(),
'num_dead/0': cb.MeanAccumulator(),
'num_dead/1': cb.MeanAccumulator(),
'num_dead/2': cb.MeanAccumulator(),
},
reset_every_num_steps=LOG_EVERY_NUM_STEPS
),

cb.Validation(
dataset=valid_dataloader,
callbacks=[
cb.BatchMetrics(metrics=lambda model_outputs, batch: {
'loss': model_outputs['loss'],
'recon_loss': model_outputs['recon_loss'],
'rqvae_loss': model_outputs['rqvae_loss'],
'con_loss': model_outputs['con_loss']
}, name='valid'),
cb.MetricAccumulator(
accumulators={
'valid/loss': cb.MeanAccumulator(),
'valid/recon_loss': cb.MeanAccumulator(),
'valid/rqvae_loss': cb.MeanAccumulator(),
'valid/con_loss': cb.MeanAccumulator()
}
),
],
).every_num_steps(LOG_EVERY_NUM_STEPS),

cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),

cb.EarlyStopping(
metric='valid/recon_loss',
patience=40,
minimize=True,
model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
).every_num_steps(LOG_EVERY_NUM_STEPS),
]

    logger.debug('Everything is ready for the training process!')

runner = TrainingRunner(
model=model,
optimizer=optimizer,
dataset=train_dataloader,
callbacks=callbacks,
)
runner.run()


if __name__ == '__main__':
main()
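
The script name and the TODO in cooc_data.py ("pass timestamps and build the window from them") point toward a time-based co-occurrence window that this file does not implement yet; it still uses the positional window. One possible sketch, where the function name and the timestamps field are assumptions rather than anything in this PR:

from collections import Counter, defaultdict

def build_time_cooccur_counter_mapping(train_dataset, time_window):
    # count co-occurrences among interactions at most time_window seconds apart
    cooccur_counts = defaultdict(Counter)
    for record in train_dataset:
        items = record['item.ids']
        times = record['timestamps']  # hypothetical per-interaction timestamps, sorted ascending
        for i, item_i in enumerate(items):
            j = i + 1
            while j < len(items) and times[j] - times[i] <= time_window:
                cooccur_counts[item_i][items[j]] += 1
                j += 1
    return cooccur_counts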