diff --git a/notebooks/AmazonBeautyDatasetStatistics.ipynb b/notebooks/AmazonBeautyDatasetStatistics.ipynb
index 6d34ff2..379239d 100644
--- a/notebooks/AmazonBeautyDatasetStatistics.ipynb
+++ b/notebooks/AmazonBeautyDatasetStatistics.ipynb
@@ -405,7 +405,7 @@
 ],
 "metadata": {
  "kernelspec": {
-  "display_name": ".venv",
+  "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -419,7 +419,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.12.6"
+  "version": "3.10.12"
  }
 },
 "nbformat": 4,
diff --git a/notebooks/LsvdDownload.ipynb b/notebooks/LsvdDownload.ipynb
new file mode 100644
index 0000000..c57e1af
--- /dev/null
+++ b/notebooks/LsvdDownload.ipynb
@@ -0,0 +1,574 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "SbkKok0dfjjS"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import polars as pl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'1.8.2'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pl.__version__"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "HF_ENDPOINT=\"http://huggingface.proxy\" hf download deepvk/VK-LSVD --repo-type dataset --include \"metadata/*\" --local-dir /home/jovyan/IRec/sigir/lsvd_data/raw\n",
+    "\n",
+    "HF_ENDPOINT=\"http://huggingface.proxy\" hf download deepvk/VK-LSVD --repo-type dataset --include \"subsamples/ur0.01_ir0.01/*\" --local-dir /home/jovyan/IRec/sigir/lsvd_data/raw\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Splitting the subsamples into base, gap, validation, and test parts.\n",
+    "\n",
+    "An original_order column is added so that the interaction order is preserved within each part."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(8, 1, 1, 1, 11, 9)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "subsample_name = 'ur0.01_ir0.01'\n",
+    "content_embedding_size = 256\n",
+    "DATASET_PATH = \"/home/jovyan/IRec/sigir/lsvd_data/raw\"\n",
+    "\n",
+    "metadata_files = ['metadata/users_metadata.parquet',\n",
+    "                  'metadata/items_metadata.parquet',\n",
+    "                  'metadata/item_embeddings.npz']\n",
+    "\n",
+    "BASE_WEEKS = (15, 23)\n",
+    "GAP_WEEKS = (23, 24)  # TODO: increase the gap\n",
+    "VAL_WEEKS = (24, 25)\n",
+    "\n",
+    "base_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'\n",
+    "                           for i in range(BASE_WEEKS[0], BASE_WEEKS[1])]\n",
+    "\n",
+    "gap_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'\n",
+    "                          for i in range(GAP_WEEKS[0], GAP_WEEKS[1])]\n",
+    "\n",
+    "val_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'\n",
+    "                          for i in range(VAL_WEEKS[0], VAL_WEEKS[1])]\n",
+    "\n",
+    "test_interactions_files = [f'subsamples/{subsample_name}/validation/week_25.parquet']\n",
+    "\n",
+    "all_interactions_files = base_interactions_files + gap_interactions_files + val_interactions_files + test_interactions_files\n",
+    "\n",
+    "base_with_gap_interactions_files = base_interactions_files + gap_interactions_files\n",
+    "\n",
+    "len(base_interactions_files), len(gap_interactions_files), len(val_interactions_files), len(test_interactions_files), len(all_interactions_files), len(base_with_gap_interactions_files)"
+   ]
+  },
+  {
"cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def get_parquet_interactions(data_files):\n", + " data_interactions = pl.concat([pl.scan_parquet(f'{DATASET_PATH}/{file}')\n", + " for file in data_files])\n", + " data_interactions = data_interactions.collect(streaming=True)\n", + " data_interactions = data_interactions.with_row_index(\"original_order\")\n", + " return data_interactions\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "base_interactions = get_parquet_interactions(base_interactions_files)\n", + "gap_interactions = get_parquet_interactions(gap_interactions_files)\n", + "val_interactions = get_parquet_interactions(val_interactions_files)\n", + "test_interactions = get_parquet_interactions(test_interactions_files)\n", + "all_data_interactions = get_parquet_interactions(all_interactions_files)\n", + "base_with_gap_interactions = get_parquet_interactions(base_with_gap_interactions_files)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Загрузка и фильтрация эмбеддингов" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "all_data_users = all_data_interactions.select('user_id').unique()\n", + "all_data_items = all_data_interactions.select('item_id').unique()\n", + "\n", + "item_ids = np.load(f\"{DATASET_PATH}/metadata/item_embeddings.npz\")['item_id']\n", + "item_embeddings = np.load(f\"{DATASET_PATH}/metadata/item_embeddings.npz\")['embedding']\n", + "\n", + "mask = np.isin(item_ids, all_data_items.to_numpy())\n", + "item_ids = item_ids[mask]\n", + "item_embeddings = item_embeddings[mask]\n", + "item_embeddings = item_embeddings[:, :content_embedding_size]\n", + "\n", + "users_metadata = pl.read_parquet(f\"{DATASET_PATH}/metadata/users_metadata.parquet\")\n", + "items_metadata = pl.read_parquet(f\"{DATASET_PATH}/metadata/items_metadata.parquet\")\n", + "\n", + "users_metadata = users_metadata.join(all_data_users, on='user_id')\n", + "items_metadata = items_metadata.join(all_data_items, on='item_id')\n", + "items_metadata = items_metadata.join(pl.DataFrame({'item_id': item_ids, \n", + " 'embedding': item_embeddings}), on='item_id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Сжатие айтем айди и ремапинг" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total users: 79074, Total items: 62758\n" + ] + } + ], + "source": [ + "all_data_items = all_data_interactions.select('item_id').unique()\n", + "all_data_users = all_data_interactions.select('user_id').unique()\n", + "\n", + "unique_items_sorted = all_data_items.sort('item_id').with_row_index('new_item_id')\n", + "global_item_mapping = dict(zip(unique_items_sorted['item_id'], unique_items_sorted['new_item_id']))\n", + "\n", + "print(f\"Total users: {all_data_users.shape[0]}, Total items: {len(global_item_mapping)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def remap_interactions(df, mapping):\n", + " return df.with_columns(\n", + " pl.col('item_id')\n", + " .map_elements(lambda x: mapping.get(x, None), return_dtype=pl.UInt32)\n", + " )\n", + "\n", + "base_interactions_remapped = remap_interactions(base_interactions, global_item_mapping)\n", + "gap_interactions_remapped = remap_interactions(gap_interactions, 
+    "test_interactions_remapped = remap_interactions(test_interactions, global_item_mapping)\n",
+    "val_interactions_remapped = remap_interactions(val_interactions, global_item_mapping)\n",
+    "all_data_interactions_remapped = remap_interactions(all_data_interactions, global_item_mapping)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "del base_interactions, gap_interactions, test_interactions, val_interactions, all_data_interactions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_with_gap_interactions_remapped = remap_interactions(base_with_gap_interactions, global_item_mapping)\n",
+    "del base_with_gap_interactions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "items_metadata_remapped = remap_interactions(items_metadata, global_item_mapping)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Grouping by user ID"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "interactions count: (1244323, 13)\n",
+      "users count: (74862, 3)\n",
+      "interactions count: (176791, 13)\n",
+      "users count: (44444, 3)\n",
+      "interactions count: (170111, 13)\n",
+      "users count: (43370, 3)\n",
+      "interactions count: (164151, 13)\n",
+      "users count: (43233, 3)\n",
+      "interactions count: (1755376, 13)\n",
+      "users count: (79074, 3)\n"
+     ]
+    }
+   ],
+   "source": [
+    "def get_grouped_interactions(data_interactions):\n",
+    "    print(f\"interactions count: {data_interactions.shape}\")\n",
+    "    data_res = (\n",
+    "        data_interactions\n",
+    "        .select(['original_order', 'user_id', 'item_id'])\n",
+    "        .group_by('user_id')\n",
+    "        .agg(\n",
+    "            pl.col('item_id')\n",
+    "            .sort_by(pl.col('original_order'))\n",
+    "            .alias('item_ids'),\n",
+    "            pl.col('original_order').alias('timestamps')\n",
+    "        )\n",
+    "        .rename({'user_id': 'uid'})\n",
+    "    )\n",
+    "    print(f\"users count: {data_res.shape}\")\n",
+    "    return data_res\n",
+    "base_interactions_grouped = get_grouped_interactions(base_interactions_remapped)\n",
+    "gap_interactions_grouped = get_grouped_interactions(gap_interactions_remapped)\n",
+    "test_interactions_grouped = get_grouped_interactions(test_interactions_remapped)\n",
+    "val_interactions_grouped = get_grouped_interactions(val_interactions_remapped)\n",
+    "all_data_interactions_grouped = get_grouped_interactions(all_data_interactions_remapped)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "shape: (1, 3)\n",
+       "┌─────────┬─────────────────────────┬───────────────────────────┐\n",
+       "│ uid     ┆ item_ids                ┆ timestamps                │\n",
+       "│ ---     ┆ ---                     ┆ ---                       │\n",
+       "│ u32     ┆ list[u32]               ┆ list[u32]                 │\n",
+       "╞═════════╪═════════════════════════╪═══════════════════════════╡\n",
+       "│ 2655558 ┆ [16621, 42990, … 51285] ┆ [46109, 59132, … 1209536] │\n",
+       "└─────────┴─────────────────────────┴───────────────────────────┘"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "base_interactions_grouped.head(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "del base_interactions_remapped, gap_interactions_remapped, test_interactions_remapped, val_interactions_remapped"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "interactions count: (1421114, 13)\n",
+      "users count: (76483, 3)\n"
+     ]
+    }
+   ],
+   "source": [
+    "base_with_gap_interactions_grouped = get_grouped_interactions(base_with_gap_interactions_remapped)\n",
+    "del base_with_gap_interactions_remapped"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Saving"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved mapping: /home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/global_item_mapping.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "OUTPUT_DIR = \"/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows\"\n",
+    "\n",
+    "mapping_output_path = f\"{OUTPUT_DIR}/global_item_mapping.json\"\n",
+    "\n",
+    "with open(mapping_output_path, 'w') as f:\n",
+    "    json.dump({str(k): v for k, v in global_item_mapping.items()}, f, indent=2)\n",
+    "\n",
+    "print(f\"Saved mapping: {mapping_output_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved file: items_metadata_remapped\n",
+      "Saved file: items_metadata_remapped_old\n",
+      "Saved file: base_interactions_grouped\n",
+      "Saved file: gap_interactions_grouped\n",
+      "Saved file: test_interactions_grouped\n",
+      "Saved file: val_interactions_grouped\n",
+      "Saved file: base_with_gap_interactions_grouped\n",
+      "Saved file: all_data_interactions_grouped\n",
+      "Saved file: all_data_interactions_remapped\n"
+     ]
+    }
+   ],
+   "source": [
+    "def write_parquet(output_dir, data, file_name):\n",
+    "    output_parquet_path = f\"{output_dir}/{file_name}.parquet\"\n",
+    "    data.write_parquet(output_parquet_path)\n",
+    "    print(f\"Saved file: {file_name}\")\n",
+    "\n",
+    "write_parquet(OUTPUT_DIR, items_metadata_remapped, \"items_metadata_remapped\")\n",
+    "write_parquet(OUTPUT_DIR, items_metadata, \"items_metadata_remapped_old\")\n",
+    "\n",
+    "write_parquet(OUTPUT_DIR, base_interactions_grouped, \"base_interactions_grouped\")\n",
+    "write_parquet(OUTPUT_DIR, gap_interactions_grouped, \"gap_interactions_grouped\")\n",
+    "write_parquet(OUTPUT_DIR, test_interactions_grouped, \"test_interactions_grouped\")\n",
+    "write_parquet(OUTPUT_DIR, val_interactions_grouped, \"val_interactions_grouped\")\n",
+    "write_parquet(OUTPUT_DIR, base_with_gap_interactions_grouped, \"base_with_gap_interactions_grouped\")\n",
+    "\n",
\"all_data_interactions_grouped\")\n", + "\n", + "write_parquet(OUTPUT_DIR, all_data_interactions_remapped, \"all_data_interactions_remapped\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(64, 64)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(list(items_metadata_remapped.head(1)['embedding'].item())), len(list(items_metadata.head(1)['embedding'].item()))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(62758, 5)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "items_metadata_remapped.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (1, 5)
item_idauthor_iddurationtrain_interactions_rankembedding
u32u32u8u32array[f32, 64]
012494249771612[-0.503418, 0.201538, … 0.007988]
" + ], + "text/plain": [ + "shape: (1, 5)\n", + "┌─────────┬───────────┬──────────┬─────────────────────────┬─────────────────────────────────┐\n", + "│ item_id ┆ author_id ┆ duration ┆ train_interactions_rank ┆ embedding │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ u32 ┆ u8 ┆ u32 ┆ array[f32, 64] │\n", + "╞═════════╪═══════════╪══════════╪═════════════════════════╪═════════════════════════════════╡\n", + "│ 0 ┆ 1249424 ┆ 9 ┆ 771612 ┆ [-0.503418, 0.201538, … 0.0079… │\n", + "└─────────┴───────────┴──────────┴─────────────────────────┴─────────────────────────────────┘" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "items_metadata_remapped.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
uiditem_idstimestamps
u32list[u32]list[u32]
4465123[28298, 3829, … 28995][257260, 272293, … 1390041]
3043171[8638, 23487, … 15086][6628, 11364, … 1370935]
2757146[56345, 56828, … 37056][194522, 217739, … 1390752]
1148408[40326, 42152][427153, 1367211]
2537065[27766, 39966, … 19887][9428, 35459, … 1214991]
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌─────────┬─────────────────────────┬─────────────────────────────┐\n", + "│ uid ┆ item_ids ┆ timestamps │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ list[u32] ┆ list[u32] │\n", + "╞═════════╪═════════════════════════╪═════════════════════════════╡\n", + "│ 4465123 ┆ [28298, 3829, … 28995] ┆ [257260, 272293, … 1390041] │\n", + "│ 3043171 ┆ [8638, 23487, … 15086] ┆ [6628, 11364, … 1370935] │\n", + "│ 2757146 ┆ [56345, 56828, … 37056] ┆ [194522, 217739, … 1390752] │\n", + "│ 1148408 ┆ [40326, 42152] ┆ [427153, 1367211] │\n", + "│ 2537065 ┆ [27766, 39966, … 19887] ┆ [9428, 35459, … 1214991] │\n", + "└─────────┴─────────────────────────┴─────────────────────────────┘" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "base_with_gap_interactions_grouped.head()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/scripts/plum-lsvd/callbacks.py b/scripts/plum-lsvd/callbacks.py new file mode 100644 index 0000000..43ec460 --- /dev/null +++ b/scripts/plum-lsvd/callbacks.py @@ -0,0 +1,64 @@ +import torch + +import irec.callbacks as cb +from irec.runners import TrainingRunner, TrainingRunnerContext + +class InitCodebooks(cb.TrainingCallback): + def __init__(self, dataloader): + super().__init__() + self._dataloader = dataloader + + @torch.no_grad() + def before_run(self, runner: TrainingRunner): + for i in range(len(runner.model.codebooks)): + X = next(iter(self._dataloader))['embedding'] + idx = torch.randperm(X.shape[0], device=X.device)[:len(runner.model.codebooks[i])] + remainder = runner.model.encoder(X[idx]) + + for j in range(i): + codebook_indices = runner.model.get_codebook_indices(remainder, runner.model.codebooks[j]) + codebook_vectors = runner.model.codebooks[j][codebook_indices] + remainder = remainder - codebook_vectors + + runner.model.codebooks[i].data = remainder.detach() + + +class FixDeadCentroids(cb.TrainingCallback): + def __init__(self, dataloader): + super().__init__() + self._dataloader = dataloader + + def after_step(self, runner: TrainingRunner, context: TrainingRunnerContext): + for i, num_fixed in enumerate(self.fix_dead_codebooks(runner)): + context.metrics[f'num_dead/{i}'] = num_fixed + + @torch.no_grad() + def fix_dead_codebooks(self, runner: TrainingRunner): + num_fixed = [] + for codebook_idx, codebook in enumerate(runner.model.codebooks): + centroid_counts = torch.zeros(codebook.shape[0], dtype=torch.long, device=codebook.device) + random_batch = next(iter(self._dataloader))['embedding'] + + for batch in self._dataloader: + remainder = runner.model.encoder(batch['embedding']) + for l in range(codebook_idx): + ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l]) + remainder = remainder - runner.model.codebooks[l][ind] + + indices = runner.model.get_codebook_indices(remainder, codebook) + centroid_counts.scatter_add_(0, indices, torch.ones_like(indices)) + + dead_mask = (centroid_counts == 0) + num_dead = int(dead_mask.sum().item()) + num_fixed.append(num_dead) + if num_dead == 0: 
+                continue
+
+            remainder = runner.model.encoder(random_batch)
+            for l in range(codebook_idx):
+                ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l])
+                remainder = remainder - runner.model.codebooks[l][ind]
+            remainder = remainder[torch.randperm(remainder.shape[0], device=codebook.device)][:num_dead]
+            codebook[dead_mask] = remainder.detach()
+
+        return num_fixed
diff --git a/scripts/plum-lsvd/cooc_data.py b/scripts/plum-lsvd/cooc_data.py
new file mode 100644
index 0000000..7cea906
--- /dev/null
+++ b/scripts/plum-lsvd/cooc_data.py
@@ -0,0 +1,117 @@
+import json
+from collections import defaultdict, Counter
+
+from data import InteractionsDatasetParquet
+
+
+class CoocMappingDataset:
+    def __init__(
+        self,
+        train_sampler,
+        num_items,
+        cooccur_counter_mapping=None
+    ):
+        self._train_sampler = train_sampler
+        self._num_items = num_items
+        self._cooccur_counter_mapping = cooccur_counter_mapping
+
+    @classmethod
+    def create(cls, inter_json_path, window_size):
+        max_item_id = 0
+        train_dataset = []
+
+        with open(inter_json_path, 'r') as f:
+            user_interactions = json.load(f)
+
+        for user_id_str, item_ids in user_interactions.items():
+            user_id = int(user_id_str)
+            if item_ids:
+                max_item_id = max(max_item_id, max(item_ids))
+            if len(item_ids) < 5:
+                print(f'Core-5 dataset is used, user {user_id} has only {len(item_ids)} items')
+            train_dataset.append({
+                'user_ids': [user_id],
+                'item_ids': item_ids[:-2],
+            })
+
+        cooccur_counter_mapping = cls.build_cooccur_counter_mapping(train_dataset, window_size=window_size)
+        print(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items but max_item_id is {max_item_id}')
+
+        train_sampler = train_dataset
+
+        return cls(
+            train_sampler=train_sampler,
+            num_items=max_item_id + 1,
+            cooccur_counter_mapping=cooccur_counter_mapping
+        )
+
+    @classmethod
+    def create_from_split_part(
+        cls,
+        train_inter_parquet_path,
+        window_size,
+    ):
+        max_item_id = 0
+        train_dataset = []
+
+        train_interactions = InteractionsDatasetParquet(train_inter_parquet_path)
+
+        actions_num = 0
+        for session in train_interactions:
+            user_id, item_ids = int(session['user_id']), session['item_ids']
+            if len(item_ids) > 0:
+                max_item_id = max(max_item_id, max(item_ids))
+            actions_num += len(item_ids)
+            train_dataset.append({
+                'user_ids': [user_id],
+                'item_ids': item_ids,
+            })
+
+        print(f'Train: {len(train_dataset)} users')
+        print(f'Max item ID: {max_item_id}')
+        print(f"Actions num: {actions_num}")
+
+        cooccur_counter_mapping = cls.build_cooccur_counter_mapping(
+            train_dataset,
+            window_size=window_size
+        )
+
+        print(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items')
+
+        return cls(
+            train_sampler=train_dataset,
+            num_items=max_item_id + 1,
+            cooccur_counter_mapping=cooccur_counter_mapping
+        )
+
+    @staticmethod
+    def build_cooccur_counter_mapping(train_dataset, window_size):
+        cooccur_counts = defaultdict(Counter)
+        for session in train_dataset:
+            items = session['item_ids']
+            for i in range(len(items)):
+                item_i = items[i]
+                for j in range(max(0, i - window_size), min(len(items), i + window_size + 1)):
+                    if i != j:
+                        cooccur_counts[item_i][items[j]] += 1
+        max_hist_len = max(len(counter) for counter in cooccur_counts.values()) if cooccur_counts else 0
+        print(f"Max cooccurrence history length is {max_hist_len}")
+        return cooccur_counts
+
+    @property
+    def cooccur_counter_mapping(self):
+        return self._cooccur_counter_mapping
\ No newline at end of file
diff --git a/scripts/plum-lsvd/data.py b/scripts/plum-lsvd/data.py
new file mode 100644
index 0000000..5a780fb
--- /dev/null
+++ b/scripts/plum-lsvd/data.py
@@ -0,0 +1,87 @@
+import numpy as np
+import pickle
+
+from irec.data.base import BaseDataset
+from irec.data.transforms import Transform
+
+import polars as pl
+
+class InteractionsDatasetParquet(BaseDataset):
+    def __init__(self, data_path, max_items=None):
+        self.df = pl.read_parquet(data_path)
+        assert 'uid' in self.df.columns, "Missing 'uid' column"
+        assert 'item_ids' in self.df.columns, "Missing 'item_ids' column"
+        print(f"Dataset loaded: {len(self.df)} users")
+
+        if max_items is not None:
+            self.df = self.df.with_columns(
+                pl.col("item_ids").list.slice(-max_items).alias("item_ids")
+            )
+
+    def __getitem__(self, idx):
+        row = self.df.row(idx, named=True)
+        return {
+            'user_id': row['uid'],
+            'item_ids': np.array(row['item_ids'], dtype=np.uint32),
+        }
+
+    def __len__(self):
+        return len(self.df)
+
+    def __iter__(self):
+        for idx in range(len(self)):
+            yield self[idx]
+
+
+class EmbeddingDatasetParquet(BaseDataset):
+    def __init__(self, data_path):
+        self.df = pl.read_parquet(data_path)
+        self.item_ids = np.array(self.df['item_id'], dtype=np.int64)
+        self.embeddings = np.array(self.df['embedding'].to_list(), dtype=np.float32)
+        print(f"embedding dim: {self.embeddings[0].shape}")
+
+    def __getitem__(self, idx):
+        index = self.item_ids[idx]
+        tensor_emb = self.embeddings[idx]
+        return {
+            'item_id': index,
+            'embedding': tensor_emb,
+            'embedding_dim': len(tensor_emb)
+        }
+
+    def __len__(self):
+        return len(self.embeddings)
+
+
+class EmbeddingDataset(BaseDataset):
+    def __init__(self, data_path):
+        self.data_path = data_path
+        with open(data_path, 'rb') as f:
+            self.data = pickle.load(f)
+
+        self.item_ids = np.array(self.data['item_id'], dtype=np.int64)
+        self.embeddings = np.array(self.data['embedding'], dtype=np.float32)
+
+    def __getitem__(self, idx):
+        index = self.item_ids[idx]
+        tensor_emb = self.embeddings[idx]
+        return {
+            'item_id': index,
+            'embedding': tensor_emb,
+            'embedding_dim': len(tensor_emb)
+        }
+
+    def __len__(self):
+        return len(self.embeddings)
+
+
+class ProcessEmbeddings(Transform):
+    def __init__(self, embedding_dim, keys):
+        self.embedding_dim = embedding_dim
+        self.keys = keys
+
+    def __call__(self, batch):
+        for key in self.keys:
+            batch[key] = batch[key].reshape(-1, self.embedding_dim)
+        return batch
\ No newline at end of file
diff --git a/scripts/plum-lsvd/infer_default.py b/scripts/plum-lsvd/infer_default.py
new file mode 100644
index 0000000..b15fb6d
--- /dev/null
+++ b/scripts/plum-lsvd/infer_default.py
@@ -0,0 +1,146 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_plum_rqvae_beauty_ws_2_best_0.0051.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# MISC
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
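+# Illustrative sketch of the ID layout this script produces at the end: each
+# item gets NUM_CODEBOOKS residual-quantization indices plus one random
+# collision-breaking token, and the token at level i is shifted by
+# CODEBOOK_SIZE * i so the token ranges of different levels never overlap.
+# The helper below is hypothetical and not used by the pipeline itself.
+#
+#   def to_semantic_id(clusters, collision_token, codebook_size):
+#       tokens = list(clusters) + [collision_token]
+#       return [tok + codebook_size * level for level, tok in enumerate(tokens)]
+#
+#   to_semantic_id([3, 7, 1], collision_token=5, codebook_size=256)
+#   # -> [3, 263, 513, 773]
+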
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    dataset = EmbeddingDataset(
+        data_path=EMBEDDINGS_PATH
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+        all_item_ids.append(item_id)
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    callbacks = [
+        cb.LoadModel(MODEL_PATH),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, _: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='valid'),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'valid/loss': cb.MeanAccumulator(),
+                'valid/recon_loss': cb.MeanAccumulator(),
+                'valid/rqvae_loss': cb.MeanAccumulator(),
+                'valid/con_loss': cb.MeanAccumulator(),
+            },
+        ),
+
+        cb.Logger().every_num_steps(len(dataloader)),
+
+        cb.InferenceSaver(
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_collisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum-lsvd/infer_plum_4.1.py b/scripts/plum-lsvd/infer_plum_4.1.py
new file mode 100644
index 0000000..bb70a9d
--- /dev/null
+++ b/scripts/plum-lsvd/infer_plum_4.1.py
@@ -0,0 +1,146 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+
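+# The run below reuses the full PlumRQVAE training forward, so the same loss
+# terms (recon/rqvae/contrastive) are accumulated as validation metrics, and
+# InferenceSaver dumps one record per item with its `clusters` (one codebook
+# index per quantization level).
+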
+# EXPERIMENT WITH FULL HISTORY
+IREC_PATH = '../../'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e35_best_0.0096.pth'
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/items_metadata_remapped.parquet"
+
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+CODEBOOK_SIZE = 512
+K = 2000
+EXPERIMENT_NAME = f'4-1_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_8w_e_35'
+# MISC
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+NUM_CODEBOOKS = 3
+BETA = 0.25
+
+
+
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    dataset = EmbeddingDatasetParquet(
+        data_path=EMBEDDINGS_PATH
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+        all_item_ids.append(item_id)
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    callbacks = [
+        cb.LoadModel(MODEL_PATH),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, _: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='valid'),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'valid/loss': cb.MeanAccumulator(),
+                'valid/recon_loss': cb.MeanAccumulator(),
+                'valid/rqvae_loss': cb.MeanAccumulator(),
+                'valid/con_loss': cb.MeanAccumulator(),
+            },
+        ),
+
+        cb.Logger().every_num_steps(len(dataloader)),
+
+        cb.InferenceSaver(
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_collisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum-lsvd/infer_plum_4.2.py b/scripts/plum-lsvd/infer_plum_4.2.py
new file mode 100644
index 0000000..977c0b5
--- /dev/null
+++ b/scripts/plum-lsvd/infer_plum_4.2.py
@@ -0,0 +1,146 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+
+# EXPERIMENT WITH TRUNCATED HISTORY
+IREC_PATH = '../../'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-2_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e35_best_0.0096.pth'
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/items_metadata_remapped.parquet"
+
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+CODEBOOK_SIZE = 512
+K = 2000
+EXPERIMENT_NAME = f'4-2_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_8w_e_35'
+# MISC
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+NUM_CODEBOOKS = 3
+BETA = 0.25
+
+
+
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    dataset = EmbeddingDatasetParquet(
+        data_path=EMBEDDINGS_PATH
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+        all_item_ids.append(item_id)
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    callbacks = [
+        cb.LoadModel(MODEL_PATH),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, _: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='valid'),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'valid/loss': cb.MeanAccumulator(),
+                'valid/recon_loss': cb.MeanAccumulator(),
+                'valid/rqvae_loss': cb.MeanAccumulator(),
+                'valid/con_loss': cb.MeanAccumulator(),
+            },
+        ),
+
+        cb.Logger().every_num_steps(len(dataloader)),
+
+        cb.InferenceSaver(
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_collisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum-lsvd/infer_rqvae.py b/scripts/plum-lsvd/infer_rqvae.py
new file mode 100644
index 0000000..53a587c
--- /dev/null
+++ b/scripts/plum-lsvd/infer_rqvae.py
@@ -0,0 +1,161 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from collections import Counter
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/items_metadata_remapped.parquet'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/rqvae_vk_lsvd_cz_512_8-weeks_best_0.009.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = 'rqvae_vk_lsvd_cz_512_8-weeks'
+
+# MISC
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    dataset = EmbeddingDatasetParquet(
+        data_path=EMBEDDINGS_PATH
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+        all_item_ids.append(item_id)
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    callbacks = [
+        cb.LoadModel(MODEL_PATH),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, _: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='valid'),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'valid/loss': cb.MeanAccumulator(),
+                'valid/recon_loss': cb.MeanAccumulator(),
+                'valid/rqvae_loss': cb.MeanAccumulator(),
+                'valid/con_loss': cb.MeanAccumulator(),
+            },
+        ),
+
+        cb.Logger().every_num_steps(len(dataloader)),
+
+        cb.InferenceSaver(
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    collision_stats = []
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            collision_stats.append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    if collision_stats:
+        max_col_tok = max(collision_stats)
+        avg_col_tok = np.mean(collision_stats)
+        collision_distribution = Counter(collision_stats)
+
+        print(f"Max collision token: {max_col_tok}")
+        print(f"Avg collision token: {avg_col_tok:.2f}")
+        print(f"Total items with collisions: {len(collision_stats)}")
+        print(f"Collision solver distribution: {dict(collision_distribution)}")
+    else:
+        print("No collisions detected")
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_collisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum-lsvd/models.py b/scripts/plum-lsvd/models.py
new file mode 100644
index 0000000..d475712
--- /dev/null
+++ b/scripts/plum-lsvd/models.py
@@ -0,0 +1,131 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PlumRQVAE(nn.Module):
+    def __init__(
+        self,
+        input_dim,
+        num_codebooks,
+        codebook_size,
+        embedding_dim,
+        beta=0.25,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0,
+    ):
+        super().__init__()
+        self.register_buffer('beta', torch.tensor(beta))
+        self.temperature = temperature
+
+        self.input_dim = input_dim
+        self.num_codebooks = num_codebooks
+        self.codebook_size = codebook_size
+        self.embedding_dim = embedding_dim
+        self.quant_loss_weight = quant_loss_weight
+
+        self.contrastive_loss_weight = contrastive_loss_weight
+
+        self.encoder = self.make_encoding_tower(input_dim, embedding_dim)
+        self.decoder = self.make_encoding_tower(embedding_dim, input_dim)
+
+        self.codebooks = torch.nn.ParameterList()
+        for _ in range(num_codebooks):
+            cb = torch.FloatTensor(codebook_size, embedding_dim)
+            # intentionally left uninitialized here: the InitCodebooks callback
+            # seeds each codebook from encoded data before training starts
+            # nn.init.normal_(cb)
+            self.codebooks.append(cb)
+
+    @staticmethod
+    def make_encoding_tower(d1, d2, bias=False):
+        return torch.nn.Sequential(
+            nn.Linear(d1, d1),
+            nn.ReLU(),
+            nn.Linear(d1, d2),
+            nn.ReLU(),
+            nn.Linear(d2, d2, bias=bias)
+        )
+
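+    # Note on the quantization used below: `remainder + (quantized - remainder).detach()`
+    # is the straight-through estimator: the forward pass sees the quantized
+    # vector, while gradients flow to the continuous remainder, because the
+    # nearest-neighbour argmin itself is not differentiable.
+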
+    @staticmethod
+    def get_codebook_indices(remainder, codebook):
+        dist = torch.cdist(remainder, codebook)
+        return dist.argmin(dim=-1)
+
+    def _quantize_representation(self, latent_vector):
+        latent_restored = 0
+        remainder = latent_vector
+
+        for codebook in self.codebooks:
+            codebook_indices = self.get_codebook_indices(remainder, codebook)
+            quantized = codebook[codebook_indices]
+            codebook_vectors = remainder + (quantized - remainder).detach()
+            latent_restored += codebook_vectors
+            remainder = remainder - codebook_vectors
+
+        return latent_restored
+
+    def contrastive_loss(self, p_i, p_i_star):
+        N_b = p_i.size(0)
+
+        p_i = F.normalize(p_i, p=2, dim=-1)  # TODO: try without normalization
+        p_i_star = F.normalize(p_i_star, p=2, dim=-1)
+
+        similarities = torch.matmul(p_i, p_i_star.T) / self.temperature
+
+        labels = torch.arange(N_b, dtype=torch.long, device=p_i.device)
+
+        loss = F.cross_entropy(similarities, labels)
+
+        return loss
+
+    def forward(self, inputs):
+        latent_vector = self.encoder(inputs['embedding'])
+        item_ids = inputs['item_id']
+
+        latent_restored = 0
+        rqvae_loss = 0
+        clusters = []
+        remainder = latent_vector
+
+        for codebook in self.codebooks:
+            codebook_indices = self.get_codebook_indices(remainder, codebook)
+            clusters.append(codebook_indices)
+
+            quantized = codebook[codebook_indices]
+            codebook_vectors = remainder + (quantized - remainder).detach()
+
+            rqvae_loss += self.beta * torch.nn.functional.mse_loss(remainder, quantized.detach())
+            rqvae_loss += torch.nn.functional.mse_loss(quantized, remainder.detach())
+
+            latent_restored += codebook_vectors
+            remainder = remainder - codebook_vectors
+
+        embeddings_restored = self.decoder(latent_restored)
+        recon_loss = torch.nn.functional.mse_loss(embeddings_restored, inputs['embedding'])
+
+        if 'cooccurrence_embedding' in inputs:
+            cooccurrence_latent = self.encoder(inputs['cooccurrence_embedding'].to(latent_restored.device))
+            cooccurrence_restored = self._quantize_representation(cooccurrence_latent)
+            con_loss = self.contrastive_loss(latent_restored, cooccurrence_restored)
+        else:
+            con_loss = torch.as_tensor(0.0, device=latent_vector.device)
+
+        loss = (
+            recon_loss
+            + self.quant_loss_weight * rqvae_loss
+            + self.contrastive_loss_weight * con_loss
+        ).mean()
+
+        clusters_counts = []
+        for cluster in clusters:
+            clusters_counts.append(torch.bincount(cluster, minlength=self.codebook_size))
+
+        return loss, {
+            'loss': loss.item(),
+            'recon_loss': recon_loss.mean().item(),
+            'rqvae_loss': rqvae_loss.mean().item(),
+            'con_loss': con_loss.item(),
+
+            'clusters_counts': clusters_counts,
+            'clusters': torch.stack(clusters).T,
+            'embedding_hat': embeddings_restored,
+        }
\ No newline at end of file
diff --git a/scripts/plum-lsvd/train_plum_4.1.py b/scripts/plum-lsvd/train_plum_4.1.py
new file mode 100644
index 0000000..85027b3
--- /dev/null
+++ b/scripts/plum-lsvd/train_plum_4.1.py
@@ -0,0 +1,180 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddingsVectorized
+from cooc_data import CoocMappingDataset
+
+# EXPERIMENT WITH FULL HISTORY
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 35
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+K = 2000
+
+EXPERIMENT_NAME = f'4-1_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_8w_e{NUM_EPOCHS}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/base_with_gap_interactions_grouped.parquet"
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/items_metadata_remapped.parquet"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    dataset = EmbeddingDatasetParquet(
+        data_path=EMBEDDINGS_PATH,
+    )
+
+    data = CoocMappingDataset.create_from_split_part(
+        train_inter_parquet_path=INTER_TRAIN_PATH,
+        window_size=WINDOW_SIZE
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'], device=DEVICE)
+        all_item_ids.append(item_id)
+
+    # add_cooc_transform = AddWeightedCooccurrenceEmbeddings(data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids, K)
+    add_cooc_transform = AddWeightedCooccurrenceEmbeddingsVectorized(
+        cooccur_counts=data.cooccur_counter_mapping,
+        item_id_to_embedding=item_id_to_embedding,
+        all_item_ids=all_item_ids,
+        device=DEVICE,
+        max_neighbors=K,
+        seed=42
+    )
+
+    train_dataloader = DataLoader(  # TODO: __call__ runs in the main thread, needs fixing
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=True,
+        drop_last=True,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+        ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+    ).map(add_cooc_transform
+    ).repeat(NUM_EPOCHS)
+
+    valid_dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+    ).map(add_cooc_transform)
+
+    LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
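+    # Since train_dataloader is the one-epoch loader repeated NUM_EPOCHS times,
+    # len(train_dataloader) // NUM_EPOCHS above is the number of steps in one
+    # epoch, so the validation, logging, and early-stopping callbacks below all
+    # fire once per epoch.
+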
+    callbacks = [
+        InitCodebooks(valid_dataloader),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='train'),
+
+        FixDeadCentroids(valid_dataloader),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'train/loss': cb.MeanAccumulator(),
+                'train/recon_loss': cb.MeanAccumulator(),
+                'train/rqvae_loss': cb.MeanAccumulator(),
+                'train/con_loss': cb.MeanAccumulator(),
+                'num_dead/0': cb.MeanAccumulator(),
+                'num_dead/1': cb.MeanAccumulator(),
+                'num_dead/2': cb.MeanAccumulator(),
+            },
+            reset_every_num_steps=LOG_EVERY_NUM_STEPS
+        ),
+
+        cb.Validation(
+            dataset=valid_dataloader,
+            callbacks=[
+                cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+                    'loss': model_outputs['loss'],
+                    'recon_loss': model_outputs['recon_loss'],
+                    'rqvae_loss': model_outputs['rqvae_loss'],
+                    'con_loss': model_outputs['con_loss']
+                }, name='valid'),
+                cb.MetricAccumulator(
+                    accumulators={
+                        'valid/loss': cb.MeanAccumulator(),
+                        'valid/recon_loss': cb.MeanAccumulator(),
+                        'valid/rqvae_loss': cb.MeanAccumulator(),
+                        'valid/con_loss': cb.MeanAccumulator()
+                    }
+                ),
+            ],
+        ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+        cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+        cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+        cb.EarlyStopping(
+            metric='valid/recon_loss',
+            patience=40,
+            minimize=True,
+            model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+        ).every_num_steps(LOG_EVERY_NUM_STEPS),
+    ]
+
+    logger.debug('Everything is ready for training!')
+
+    runner = TrainingRunner(
+        model=model,
+        optimizer=optimizer,
+        dataset=train_dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum-lsvd/train_plum_4.2.py b/scripts/plum-lsvd/train_plum_4.2.py
new file mode 100644
index 0000000..de1864d
--- /dev/null
+++ b/scripts/plum-lsvd/train_plum_4.2.py
@@ -0,0 +1,180 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddingsVectorized
+from cooc_data import CoocMappingDataset
+
+# EXPERIMENT WITH TRUNCATED HISTORY
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 35
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+K = 2000
+
+EXPERIMENT_NAME = f'4-2_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_8w_e{NUM_EPOCHS}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/base_interactions_grouped.parquet"
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/items_metadata_remapped.parquet"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    dataset = EmbeddingDatasetParquet(
+        data_path=EMBEDDINGS_PATH,
+    )
+
+    data = CoocMappingDataset.create_from_split_part(
+        train_inter_parquet_path=INTER_TRAIN_PATH,
+        window_size=WINDOW_SIZE
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'], device=DEVICE)
+        all_item_ids.append(item_id)
+
+    # add_cooc_transform = AddWeightedCooccurrenceEmbeddings(data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids, K)
+    add_cooc_transform = AddWeightedCooccurrenceEmbeddingsVectorized(
+        cooccur_counts=data.cooccur_counter_mapping,
+        item_id_to_embedding=item_id_to_embedding,
+        all_item_ids=all_item_ids,
+        device=DEVICE,
+        max_neighbors=K,
+        seed=42
+    )
+
+    train_dataloader = DataLoader(  # TODO: __call__ runs in the main thread, needs fixing
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=True,
+        drop_last=True,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+        ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+    ).map(add_cooc_transform
+    ).repeat(NUM_EPOCHS)
+    valid_dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+    ).map(add_cooc_transform)
+
+    LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+    callbacks = [
+        InitCodebooks(valid_dataloader),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='train'),
+
+        FixDeadCentroids(valid_dataloader),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'train/loss': cb.MeanAccumulator(),
+                'train/recon_loss': cb.MeanAccumulator(),
+                'train/rqvae_loss': cb.MeanAccumulator(),
+                'train/con_loss': cb.MeanAccumulator(),
+                'num_dead/0': cb.MeanAccumulator(),
+                'num_dead/1': cb.MeanAccumulator(),
+                'num_dead/2': cb.MeanAccumulator(),
+            },
+            reset_every_num_steps=LOG_EVERY_NUM_STEPS
+        ),
+
+        cb.Validation(
+            dataset=valid_dataloader,
+            callbacks=[
+                cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+                    'loss': model_outputs['loss'],
+                    'recon_loss': model_outputs['recon_loss'],
+                    'rqvae_loss': model_outputs['rqvae_loss'],
+                    'con_loss': model_outputs['con_loss']
+                }, name='valid'),
+                cb.MetricAccumulator(
+                    accumulators={
+                        'valid/loss': cb.MeanAccumulator(),
+                        'valid/recon_loss': cb.MeanAccumulator(),
+                        'valid/rqvae_loss': cb.MeanAccumulator(),
+                        'valid/con_loss': cb.MeanAccumulator()
+                    }
+                ),
+            ],
+        ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+        cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+        cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+        cb.EarlyStopping(
+            metric='valid/recon_loss',
+            patience=40,
+            minimize=True,
+            model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+        ).every_num_steps(LOG_EVERY_NUM_STEPS),
+    ]
+
+    logger.debug('Everything is ready for training!')
+
+    runner = TrainingRunner(
+        model=model,
+        optimizer=optimizer,
+        dataset=train_dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum-lsvd/train_rqvae.py b/scripts/plum-lsvd/train_rqvae.py
new file mode 100644
index 0000000..ea41b74
--- /dev/null
+++ b/scripts/plum-lsvd/train_rqvae.py
@@ -0,0 +1,174 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddings
AddWeightedCooccurrenceEmbeddings +from cooc_data import CoocMappingDataset + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +NUM_EPOCHS = 15 +BATCH_SIZE = 1024 + +INPUT_DIM = 64 +HIDDEN_DIM = 32 +CODEBOOK_SIZE = 512 +NUM_CODEBOOKS = 3 +BETA = 0.25 +LR = 1e-4 +WINDOW_SIZE = 3 +MAX_LEN = 500 +K=100 + +# EXPERIMENT_NAME = f'4-1_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_ml_{MAX_LEN}' +EXPERIMENT_NAME = f'rqvae_vk_lsvd_cz_512_8-weeks' +EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/items_metadata_remapped.parquet" +IREC_PATH = '../../' + +# print(INTER_TRAIN_PATH) +def main(): + fix_random_seed(SEED_VALUE) + + dataset = EmbeddingDatasetParquet( + data_path=EMBEDDINGS_PATH + ) + + # data = CoocMappingDataset.create_from_split_part( + # train_inter_parquet_path=INTER_TRAIN_PATH, + # window_size=WINDOW_SIZE, + # max_items=MAX_LEN + # ) + + item_id_to_embedding = {} + all_item_ids = [] + for idx in range(len(dataset)): + sample = dataset[idx] + item_id = int(sample['item_id']) + item_id_to_embedding[item_id] = torch.tensor(sample['embedding'], device=DEVICE) + all_item_ids.append(item_id) + + # add_cooc_transform = AddWeightedCooccurrenceEmbeddings(data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids) + + train_dataloader = DataLoader( #call в основном потоке делается нужно исправить + dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map( + ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + # ).map(add_cooc_transform + ).repeat(NUM_EPOCHS) + + valid_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + # ).map(add_cooc_transform) + ) + + LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS) + + model = PlumRQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0, + contrastive_loss_weight=1.0, + temperature=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True) + + callbacks = [ + InitCodebooks(valid_dataloader), + + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='train'), + + FixDeadCentroids(valid_dataloader), + + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + 'train/recon_loss': cb.MeanAccumulator(), + 'train/rqvae_loss': cb.MeanAccumulator(), + 'train/con_loss': cb.MeanAccumulator(), + 'num_dead/0': cb.MeanAccumulator(), + 'num_dead/1': cb.MeanAccumulator(), + 'num_dead/2': cb.MeanAccumulator(), + }, + reset_every_num_steps=LOG_EVERY_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloader, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': 
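+ # Note: in this pure-RQ-VAE run the co-occurrence transform is commented out, so
+ # batches carry no 'cooccurrence_embedding' and con_loss stays at 0.0 (see
+ # PlumRQVAE.forward in models.py); it is tracked anyway to keep dashboards comparable.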
model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+ logger.debug('Everything is ready for training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main() diff --git a/scripts/plum-lsvd/transforms.py b/scripts/plum-lsvd/transforms.py new file mode 100644 index 0000000..143002b --- /dev/null +++ b/scripts/plum-lsvd/transforms.py @@ -0,0 +1,287 @@
+import numpy as np
+import pickle
+import torch
+from typing import Dict, List
+import time
+from collections import defaultdict, Counter
+
+class AddWeightedCooccurrenceEmbeddings:
+ def __init__(self, cooccur_counts, item_id_to_embedding, all_item_ids, top_k):
+ self.cooccur_counts = cooccur_counts
+ self.item_id_to_embedding = item_id_to_embedding
+ self.all_item_ids = all_item_ids
+ self.call_count = 0
+ self.top_k = top_k
+
+ # Precompute the top_k neighbors for every item_id
+ self._top_k_cache = {}
+ self._build_top_k_cache()
+
+ def _build_top_k_cache(self):
+ """Precomputes the top-k co-occurrence neighbors for every item_id"""
+ for item_id, counter in self.cooccur_counts.items():
+ if counter and len(counter) > 0:
+ # Sort by frequency and keep the top_k
+ top_items = counter.most_common(self.top_k)
+ cooc_ids, freqs = zip(*top_items)
+ freqs_array = np.array(freqs, dtype=np.float32)
+ probs = freqs_array / freqs_array.sum()
+
+ self._top_k_cache[item_id] = {
+ 'cooc_ids': cooc_ids,
+ 'probs': probs
+ }
+
+ def __call__(self, batch):
+ self.call_count += 1
+ item_ids = batch['item_id']
+ cooccurrence_embeddings = []
+
+ for idx, item_id in enumerate(item_ids):
+ item_id_val = int(item_id.item()) if torch.is_tensor(item_id) else int(item_id)
+
+ # Use the precomputed top-k cache
+ if item_id_val in self._top_k_cache:
+ cache_entry = self._top_k_cache[item_id_val]
+ cooc_id = np.random.choice(
+ cache_entry['cooc_ids'],
+ p=cache_entry['probs']
+ )
+ else:
+ cooc_id = np.random.choice(self.all_item_ids)
+ if self.call_count % 500 == 0 and idx < 5:
+ print(f" idx={idx}: item_id={item_id_val} fallback random")
+ if self.call_count % 500 == 0 and idx < 5:
+ print(f" idx={idx}: item_id={item_id_val} cooc_id={cooc_id}")
+ cooc_emb = self.item_id_to_embedding.get(cooc_id, batch['embedding'][0])
+ cooccurrence_embeddings.append(cooc_emb)
+
+ batch['cooccurrence_embedding'] = torch.stack(cooccurrence_embeddings)
+ return batch
+
+# TODO: run SASRec and LETTER as baselines; expected quality ordering: sasrec << tiger < letter < plum
+
+
+class AddWeightedCooccurrenceEmbeddingsVectorized:
+
+ def __init__(
+ self,
+ cooccur_counts: Dict[int, Dict[int, int]],
+ item_id_to_embedding: Dict[int, torch.Tensor],
+ all_item_ids: List[int],
+ device: torch.device,
+ limit_neighbors: bool = True,
+ max_neighbors: int = 256,
+ seed: int = 42,
+ verbose: bool = True
+ ):
+ self.device = device
+ self.call_count = 0
+ self.limit_neighbors = limit_neighbors
+ self.max_neighbors =
max_neighbors + self.seed = seed + self.verbose = verbose + + torch.manual_seed(seed) + np.random.seed(seed) + + if self.verbose: + print(f"\n{'='*80}") + print(f"Initializing AddWeightedCooccurrenceEmbeddingsVectorized") + print(f"{'='*80}") + init_start = time.time() + + all_item_ids_sorted = sorted(all_item_ids) + self.item_id_to_idx = {item_id: idx for idx, item_id in enumerate(all_item_ids_sorted)} + self.idx_to_item_id = torch.tensor(all_item_ids_sorted, device=device, dtype=torch.long) + + if self.verbose: + print(f"[INIT] Sorted {len(all_item_ids)} item IDs and created mappings") + + num_items = len(all_item_ids_sorted) + embedding_dim = next(iter(item_id_to_embedding.values())).shape[0] + + if self.verbose: + print(f"[INIT] Num items: {num_items}, Embedding dim: {embedding_dim}") + + self.embedding_matrix = torch.zeros( + size=(num_items, embedding_dim), + device=device, + dtype=torch.float32, + requires_grad=False + ) + + emb_load_start = time.time() + for item_id, emb in item_id_to_embedding.items(): + idx = self.item_id_to_idx[item_id] + if isinstance(emb, torch.Tensor): + self.embedding_matrix[idx] = emb.to(device).detach() + else: + self.embedding_matrix[idx] = torch.tensor(emb, device=device, dtype=torch.float32) + + if self.verbose: + emb_load_time = time.time() - emb_load_start + print(f"[INIT] Loaded {len(item_id_to_embedding)} embeddings in {emb_load_time*1000:.2f}ms") + + self._build_cooccurrence_tables(cooccur_counts, num_items) + + if self.verbose: + init_time = time.time() - init_start + print(f"[INIT] Total initialization time: {init_time*1000:.2f}ms") + print(f"{'='*80}\n") + + def _build_cooccurrence_tables(self, cooccur_counts: Dict, num_items: int): + if self.verbose: + build_start = time.time() + print(f"\n[BUILD] Building cooccurrence tables...") + + indexed_cooccur_counts = {} + for item_id, neighbors in cooccur_counts.items(): + if item_id in self.item_id_to_idx: + idx = self.item_id_to_idx[item_id] + indexed_neighbors = {} + for neighbor_id, count in neighbors.items(): + if neighbor_id in self.item_id_to_idx: + neighbor_idx = self.item_id_to_idx[neighbor_id] + indexed_neighbors[neighbor_idx] = count + if indexed_neighbors: + indexed_cooccur_counts[idx] = indexed_neighbors + + if self.verbose: + items_with_cooc = len(indexed_cooccur_counts) + print(f"[BUILD] Items with cooccurrences: {items_with_cooc}/{num_items}") + total_pairs = sum(len(neighbors) for neighbors in indexed_cooccur_counts.values()) + print(f"[BUILD] Total cooccurrence pairs: {total_pairs}") + + max_actual_neighbors = 0 + for idx in range(num_items): + counter = indexed_cooccur_counts.get(idx) + if counter and len(counter) > 0: + num_neighbors = len(counter) + if self.limit_neighbors: + num_neighbors = min(num_neighbors, self.max_neighbors) + else: + num_neighbors = num_items + max_actual_neighbors = max(max_actual_neighbors, num_neighbors) + + if self.limit_neighbors: + max_actual_neighbors = min(max_actual_neighbors, self.max_neighbors) + + if self.verbose: + print(f"[BUILD] Max neighbors per item: {max_actual_neighbors}") + + neighbors_matrix = torch.zeros( + (num_items, max_actual_neighbors), + dtype=torch.long, + device=self.device, + requires_grad=False + ) + + probs_matrix = torch.zeros( + (num_items, max_actual_neighbors), + dtype=torch.float32, + device=self.device, + requires_grad=False + ) + + valid_mask = torch.zeros( + (num_items, max_actual_neighbors), + dtype=torch.bool, + device=self.device, + requires_grad=False + ) + + matrix_fill_start = time.time() + + for idx in 
range(num_items):
+ counter = indexed_cooccur_counts.get(idx)
+
+ if counter and len(counter) > 0:
+ # sort neighbors by co-occurrence count (x[1]), not by the (id, count) tuple itself
+ cooc_items = sorted(counter.items(), key=lambda x: x[1], reverse=True)
+ cooc_ids, freqs = zip(*cooc_items)
+ cooc_ids = list(cooc_ids)
+ freqs = np.array(freqs, dtype=np.float32)
+
+ num_neighbors = min(len(cooc_ids), max_actual_neighbors)
+ cooc_ids = cooc_ids[:num_neighbors]
+ freqs = freqs[:num_neighbors]
+
+ probs = freqs / freqs.sum()
+
+ neighbors_matrix[idx, :num_neighbors] = torch.tensor(
+ cooc_ids, dtype=torch.long, device=self.device
+ )
+ probs_matrix[idx, :num_neighbors] = torch.tensor(
+ probs, dtype=torch.float32, device=self.device
+ )
+ valid_mask[idx, :num_neighbors] = True
+
+ else:
+ if max_actual_neighbors >= num_items:
+ neighbors_matrix[idx, :num_items] = torch.arange(num_items, device=self.device)
+ probs_matrix[idx, :num_items] = 1.0 / num_items
+ valid_mask[idx, :num_items] = True
+ else:
+ perm = torch.randperm(num_items, device=self.device)[:max_actual_neighbors]
+ neighbors_matrix[idx] = perm
+ probs_matrix[idx] = 1.0 / max_actual_neighbors
+ valid_mask[idx] = True
+
+ if self.verbose:
+ matrix_fill_time = time.time() - matrix_fill_start
+ print(f"[BUILD] Filled matrices in {matrix_fill_time*1000:.2f}ms")
+
+ self.neighbors_matrix = neighbors_matrix
+ self.probs_matrix = probs_matrix
+ self.valid_mask = valid_mask
+
+ if self.verbose:
+ print(f"[BUILD] neighbors_matrix shape: {neighbors_matrix.shape}")
+ print(f"[BUILD] probs_matrix shape: {probs_matrix.shape}")
+ print(f"[BUILD] valid_mask shape: {valid_mask.shape}")
+ build_time = time.time() - build_start
+ print(f"[BUILD] Total build time: {build_time*1000:.2f}ms")
+
+ def __call__(self, batch):
+ self.call_count += 1
+
+ call_start = time.time()
+
+ item_ids = batch['item_id']
+
+ if not isinstance(item_ids, torch.Tensor):
+ item_ids = torch.tensor(item_ids, device=self.device, dtype=torch.long)
+ else:
+ item_ids = item_ids.to(device=self.device, dtype=torch.long)
+
+ batch_size = item_ids.shape[0]
+
+ indexed_item_ids = torch.tensor(
+ [self.item_id_to_idx.get(int(iid.item()), 0) for iid in item_ids],
+ device=self.device,
+ dtype=torch.long
+ )
+
+ probs = self.probs_matrix[indexed_item_ids]
+ mask = self.valid_mask[indexed_item_ids]
+
+ masked_probs = probs.clone()
+ masked_probs[~mask] = 0.0
+
+ row_sums = masked_probs.sum(dim=1, keepdim=True)
+ row_sums[row_sums == 0] = 1.0
+ masked_probs = masked_probs / row_sums
+
+ neighbor_indices = torch.multinomial(masked_probs, num_samples=1, replacement=True)
+ neighbor_indices = neighbor_indices.squeeze(1)
+
+ cooc_indexed_ids = self.neighbors_matrix[indexed_item_ids, neighbor_indices]
+ cooccurrence_embeddings = self.embedding_matrix[cooc_indexed_ids]
+
+ batch['cooccurrence_embedding'] = cooccurrence_embeddings
+
+ call_time = time.time() - call_start
+ if self.verbose and self.call_count % 1000 == 0:
+ print(f"Call #{self.call_count}: batch_size={batch_size}, {call_time*1000:.2f}ms")
+
+ return batch \ No newline at end of file diff --git a/scripts/plum-yambda/callbacks.py b/scripts/plum-yambda/callbacks.py new file mode 100644 index 0000000..43ec460 --- /dev/null +++ b/scripts/plum-yambda/callbacks.py @@ -0,0 +1,64 @@
+import torch
+
+import irec.callbacks as cb
+from irec.runners import TrainingRunner, TrainingRunnerContext
+
+class InitCodebooks(cb.TrainingCallback):
+ def __init__(self, dataloader):
+ super().__init__()
+ self._dataloader = dataloader
+
+ @torch.no_grad()
+ def before_run(self, runner: TrainingRunner):
+ for i in
range(len(runner.model.codebooks)): + X = next(iter(self._dataloader))['embedding'] + idx = torch.randperm(X.shape[0], device=X.device)[:len(runner.model.codebooks[i])] + remainder = runner.model.encoder(X[idx]) + + for j in range(i): + codebook_indices = runner.model.get_codebook_indices(remainder, runner.model.codebooks[j]) + codebook_vectors = runner.model.codebooks[j][codebook_indices] + remainder = remainder - codebook_vectors + + runner.model.codebooks[i].data = remainder.detach() + + +class FixDeadCentroids(cb.TrainingCallback): + def __init__(self, dataloader): + super().__init__() + self._dataloader = dataloader + + def after_step(self, runner: TrainingRunner, context: TrainingRunnerContext): + for i, num_fixed in enumerate(self.fix_dead_codebooks(runner)): + context.metrics[f'num_dead/{i}'] = num_fixed + + @torch.no_grad() + def fix_dead_codebooks(self, runner: TrainingRunner): + num_fixed = [] + for codebook_idx, codebook in enumerate(runner.model.codebooks): + centroid_counts = torch.zeros(codebook.shape[0], dtype=torch.long, device=codebook.device) + random_batch = next(iter(self._dataloader))['embedding'] + + for batch in self._dataloader: + remainder = runner.model.encoder(batch['embedding']) + for l in range(codebook_idx): + ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l]) + remainder = remainder - runner.model.codebooks[l][ind] + + indices = runner.model.get_codebook_indices(remainder, codebook) + centroid_counts.scatter_add_(0, indices, torch.ones_like(indices)) + + dead_mask = (centroid_counts == 0) + num_dead = int(dead_mask.sum().item()) + num_fixed.append(num_dead) + if num_dead == 0: + continue + + remainder = runner.model.encoder(random_batch) + for l in range(codebook_idx): + ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l]) + remainder = remainder - runner.model.codebooks[l][ind] + remainder = remainder[torch.randperm(remainder.shape[0], device=codebook.device)][:num_dead] + codebook[dead_mask] = remainder.detach() + + return num_fixed diff --git a/scripts/plum-yambda/cooc_data.py b/scripts/plum-yambda/cooc_data.py new file mode 100644 index 0000000..50f2bdd --- /dev/null +++ b/scripts/plum-yambda/cooc_data.py @@ -0,0 +1,108 @@ +import json +import pickle +from collections import defaultdict, Counter + +import numpy as np +from loguru import logger + + +import pickle +from collections import defaultdict, Counter + +class CoocMappingDataset: + def __init__( + self, + train_sampler, + num_items, + cooccur_counter_mapping=None + ): + self._train_sampler = train_sampler + self._num_items = num_items + self._cooccur_counter_mapping = cooccur_counter_mapping + + @classmethod + def create(cls, inter_json_path, window_size): + max_item_id = 0 + train_dataset, validation_dataset, test_dataset = [], [], [] + + with open(inter_json_path, 'r') as f: + user_interactions = json.load(f) + + for user_id_str, item_ids in user_interactions.items(): + user_id = int(user_id_str) + if item_ids: + max_item_id = max(max_item_id, max(item_ids)) + assert len(item_ids) >= 5, f'Core-5 dataset is used, user {user_id} has only {len(item_ids)} items' + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids[:-2], + }) + + cooccur_counter_mapping = cls.build_cooccur_counter_mapping(train_dataset, window_size=window_size) + logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items but max_item_id is {max_item_id}') + + train_sampler = train_dataset + + return cls( + 
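+ # create() keeps item_ids[:-2] per user (the last two interactions are the usual
+ # leave-two-out validation/test hold-out) before counting window-based
+ # co-occurrences; create_from_split_part below consumes an already-split file
+ # and therefore uses the full sequences.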
train_sampler=train_sampler,
+ num_items=max_item_id + 1,
+ cooccur_counter_mapping=cooccur_counter_mapping
+ )
+
+ @classmethod
+ def create_from_split_part(
+ cls,
+ train_inter_json_path,
+ window_size
+ ):
+
+ max_item_id = 0
+ train_dataset = []
+
+ with open(train_inter_json_path, 'r') as f:
+ train_interactions = json.load(f)
+
+ # Process the TRAIN split
+ for user_id_str, item_ids in train_interactions.items():
+ user_id = int(user_id_str)
+ if item_ids:
+ max_item_id = max(max_item_id, max(item_ids))
+
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids,
+ })
+
+ logger.debug(f'Train: {len(train_dataset)} users')
+ logger.debug(f'Max item ID: {max_item_id}')
+
+ cooccur_counter_mapping = cls.build_cooccur_counter_mapping(
+ train_dataset,
+ window_size=window_size
+ )
+
+ logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items')
+
+ return cls(
+ train_sampler=train_dataset,
+ num_items=max_item_id + 1,
+ cooccur_counter_mapping=cooccur_counter_mapping
+ )
+
+
+ @staticmethod
+ def build_cooccur_counter_mapping(train_dataset, window_size): # TODO: pass timestamps and build the co-occurrence window from them
+ cooccur_counts = defaultdict(Counter)
+ for session in train_dataset:
+ items = session['item.ids']
+ for i in range(len(items)):
+ item_i = items[i]
+ for j in range(max(0, i - window_size), min(len(items), i + window_size + 1)):
+ if i != j:
+ cooccur_counts[item_i][items[j]] += 1
+ return cooccur_counts
+
+
+ @property
+ def cooccur_counter_mapping(self):
+ return self._cooccur_counter_mapping diff --git a/scripts/plum-yambda/data.py b/scripts/plum-yambda/data.py new file mode 100644 index 0000000..842adb5 --- /dev/null +++ b/scripts/plum-yambda/data.py @@ -0,0 +1,62 @@
+import numpy as np
+import pickle
+
+import polars as pl
+import torch
+
+from irec.data.base import BaseDataset
+from irec.data.transforms import Transform
+
+class EmbeddingDatasetParquet(BaseDataset):
+ def __init__(self, data_path):
+ self.df = pl.read_parquet(data_path)
+ self.item_ids = np.array(self.df['item_id'], dtype=np.int64)
+ self.embeddings = np.array(self.df['embedding'].to_list(), dtype=np.float32)
+ print(f"embedding dim: {self.embeddings[0].shape}")
+
+ def __getitem__(self, idx):
+ index = self.item_ids[idx]
+ tensor_emb = self.embeddings[idx]
+ return {
+ 'item_id': index,
+ 'embedding': tensor_emb,
+ 'embedding_dim': len(tensor_emb)
+ }
+
+ def __len__(self):
+ return len(self.embeddings)
+
+
+class EmbeddingDataset(BaseDataset):
+ def __init__(self, data_path):
+ self.data_path = data_path
+ with open(data_path, 'rb') as f:
+ self.data = pickle.load(f)
+
+ self.item_ids = np.array(self.data['item_id'], dtype=np.int64)
+ self.embeddings = np.array(self.data['embedding'], dtype=np.float32)
+
+ def __getitem__(self, idx):
+ index = self.item_ids[idx]
+ tensor_emb = self.embeddings[idx]
+ return {
+ 'item_id': index,
+ 'embedding': tensor_emb,
+ 'embedding_dim': len(tensor_emb)
+ }
+
+ def __len__(self):
+ return len(self.embeddings)
+
+
+class ProcessEmbeddings(Transform):
+ def __init__(self, embedding_dim, keys):
+ self.embedding_dim = embedding_dim
+ self.keys = keys
+
+ def __call__(self, batch):
+ for key in self.keys:
+ batch[key] = batch[key].reshape(-1, self.embedding_dim)
+ return batch \ No newline at end of file diff --git a/scripts/plum-yambda/models.py b/scripts/plum-yambda/models.py new file mode 100644 index 0000000..a411519 --- /dev/null +++ b/scripts/plum-yambda/models.py @@ -0,0 +1,135 @@
+import torch
+import
torch.nn as nn
+import torch.nn.functional as F
+
+class PlumRQVAE(nn.Module):
+ def __init__(
+ self,
+ input_dim,
+ num_codebooks,
+ codebook_size,
+ embedding_dim,
+ beta=0.25,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0,  # a 0.0 default would divide by zero in contrastive_loss; scripts pass 1.0 explicitly
+ ):
+ super().__init__()
+ self.register_buffer('beta', torch.tensor(beta))
+ self.temperature = temperature
+
+ self.input_dim = input_dim
+ self.num_codebooks = num_codebooks
+ self.codebook_size = codebook_size
+ self.embedding_dim = embedding_dim
+ self.quant_loss_weight = quant_loss_weight
+
+ self.contrastive_loss_weight = contrastive_loss_weight
+
+ self.encoder = self.make_encoding_tower(input_dim, embedding_dim)
+ self.decoder = self.make_encoding_tower(embedding_dim, input_dim)
+
+ self.codebooks = torch.nn.ParameterList()
+ for _ in range(num_codebooks):
+ cb = torch.FloatTensor(codebook_size, embedding_dim)
+ # nn.init.normal_(cb)  # intentionally not used: the InitCodebooks callback seeds the codebooks from encoded data before training
+ self.codebooks.append(cb)
+
+ @staticmethod
+ def make_encoding_tower(d1, d2, bias=False):
+ return torch.nn.Sequential(
+ nn.Linear(d1, d1),
+ nn.ReLU(),
+ nn.Linear(d1, d2),
+ nn.ReLU(),
+ nn.Linear(d2, d2, bias=bias)
+ )
+
+ @staticmethod
+ def get_codebook_indices(remainder, codebook):
+ dist = torch.cdist(remainder, codebook)
+ return dist.argmin(dim=-1)
+
+ def _quantize_representation(self, latent_vector):
+ latent_restored = 0
+ remainder = latent_vector
+
+ for codebook in self.codebooks:
+ codebook_indices = self.get_codebook_indices(remainder, codebook)
+ quantized = codebook[codebook_indices]
+ # straight-through estimator: forward pass uses the quantized vector, gradients flow to the remainder
+ codebook_vectors = remainder + (quantized - remainder).detach()
+ latent_restored += codebook_vectors
+ remainder = remainder - codebook_vectors
+
+ return latent_restored
+
+ def contrastive_loss(self, p_i, p_i_star):
+ N_b = p_i.size(0)
+
+ p_i = F.normalize(p_i, p=2, dim=-1) # TODO: try without the L2 normalization
+ p_i_star = F.normalize(p_i_star, p=2, dim=-1)
+
+ similarities = torch.matmul(p_i, p_i_star.T) / self.temperature
+
+ labels = torch.arange(N_b, dtype=torch.long, device=p_i.device)
+
+ loss = F.cross_entropy(similarities, labels)
+
+ return loss # only over the last dimension
+
+ def forward(self, inputs):
+ latent_vector = self.encoder(inputs['embedding'])
+ # print(f"latent vector shape: {latent_vector.shape}")
+ # print(f"inputs embedding shape: {inputs['embedding']}")
+ item_ids = inputs['item_id']
+
+ latent_restored = 0
+ rqvae_loss = 0
+ clusters = []
+ remainder = latent_vector
+
+ for codebook in self.codebooks:
+ codebook_indices = self.get_codebook_indices(remainder, codebook)
+ clusters.append(codebook_indices)
+
+ quantized = codebook[codebook_indices]
+ codebook_vectors = remainder + (quantized - remainder).detach()
+
+ rqvae_loss += self.beta * torch.nn.functional.mse_loss(remainder, quantized.detach())
+ rqvae_loss += torch.nn.functional.mse_loss(quantized, remainder.detach())
+
+ latent_restored += codebook_vectors
+ remainder = remainder - codebook_vectors
+
+ embeddings_restored = self.decoder(latent_restored)
+ recon_loss = torch.nn.functional.mse_loss(embeddings_restored, inputs['embedding'])
+
+ if 'cooccurrence_embedding' in inputs:
+ # print(f"cooccurrence_embedding shape: {inputs['cooccurrence_embedding'].shape} device {inputs['cooccurrence_embedding'].device}" )
+ # print(f"latent_restored shape {latent_restored.shape} device {latent_restored.device}")
+ cooccurrence_latent = self.encoder(inputs['cooccurrence_embedding'].to(latent_restored.device))
+ cooccurrence_restored = self._quantize_representation(cooccurrence_latent)
+ con_loss =
self.contrastive_loss(latent_restored, cooccurrence_restored) + else: + con_loss = torch.as_tensor(0.0, device=latent_vector.device) + + loss = ( + recon_loss + + self.quant_loss_weight * rqvae_loss + + self.contrastive_loss_weight * con_loss + ).mean() + + clusters_counts = [] + for cluster in clusters: + clusters_counts.append(torch.bincount(cluster, minlength=self.codebook_size)) + + return loss, { + 'loss': loss.item(), + 'recon_loss': recon_loss.mean().item(), + 'rqvae_loss': rqvae_loss.mean().item(), + 'con_loss': con_loss.item(), + + 'clusters_counts': clusters_counts, + 'clusters': torch.stack(clusters).T, + 'embedding_hat': embeddings_restored, + } \ No newline at end of file diff --git a/scripts/plum-yambda/transforms.py b/scripts/plum-yambda/transforms.py new file mode 100644 index 0000000..bdbfffa --- /dev/null +++ b/scripts/plum-yambda/transforms.py @@ -0,0 +1,247 @@ +import numpy as np +import pickle +import torch +import torch.nn.functional as F +from typing import Dict, List +from irec.data.base import BaseDataset +from irec.data.transforms import Transform + +from cooc_data import CoocMappingDataset + + +class AddWeightedCooccurrenceEmbeddings: + def __init__(self, cooccur_counts, item_id_to_embedding, all_item_ids): + self.cooccur_counts = cooccur_counts + self.item_id_to_embedding = item_id_to_embedding + self.all_item_ids = all_item_ids + self.call_count = 0 + + def __call__(self, batch): + self.call_count += 1 + item_ids = batch['item_id'] + cooccurrence_embeddings = [] + + for idx, item_id in enumerate(item_ids): + item_id_val = int(item_id.item()) if torch.is_tensor(item_id) else int(item_id) + + counter = self.cooccur_counts.get(item_id_val) + if counter and len(counter) > 0: + cooc_ids, freqs = zip(*counter.items()) + freqs_array = np.array(freqs, dtype=np.float32) + probs = freqs_array / freqs_array.sum() + cooc_id = np.random.choice(cooc_ids, p=probs) + + else: + cooc_id = np.random.choice(self.all_item_ids) + if self.call_count % 500 == 0 and idx < 5: + print(f" idx={idx}: item_id={item_id_val} fallback random") + + cooc_emb = self.item_id_to_embedding.get(cooc_id, batch['embedding'][0]) + cooccurrence_embeddings.append(cooc_emb) + + batch['cooccurrence_embedding'] = torch.stack(cooccurrence_embeddings) + return batch + + + +class AddWeightedCooccurrenceEmbeddingsCached: + def __init__(self, cooccur_counts, item_id_to_embedding, all_item_ids): + self.cooccur_counts = cooccur_counts + self.item_id_to_embedding = item_id_to_embedding + self.all_item_ids = all_item_ids + self.call_count = 0 + + self.cooc_probs_cache = {} + self._precompute_probabilities() + + def _precompute_probabilities(self): + for item_id, counter in self.cooccur_counts.items(): + if counter and len(counter) > 0: + cooc_ids, freqs = zip(*counter.items()) + freqs_array = np.array(freqs, dtype=np.float32) + probs = freqs_array / freqs_array.sum() + self.cooc_probs_cache[item_id] = (cooc_ids, probs) + + def __call__(self, batch): + self.call_count += 1 + item_ids = batch['item_id'] + cooccurrence_embeddings = [] + + for idx, item_id in enumerate(item_ids): + item_id_val = int(item_id.item()) if torch.is_tensor(item_id) else int(item_id) + + if item_id_val in self.cooc_probs_cache: + cooc_ids, probs = self.cooc_probs_cache[item_id_val] + cooc_id = np.random.choice(cooc_ids, p=probs) + else: + cooc_id = np.random.choice(self.all_item_ids) + if self.call_count % 10 == 0 and idx < 5: + print(f" idx={idx}: item_id={item_id_val} fallback random") + + cooc_emb = self.item_id_to_embedding.get(cooc_id, 
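+ # .get() falls back to the first embedding of the current batch when cooc_id has
+ # no stored embedding: an arbitrary but shape-compatible stand-in, not a
+ # semantically meaningful neighbor.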
batch['embedding'][0])
+ cooccurrence_embeddings.append(cooc_emb)
+
+ batch['cooccurrence_embedding'] = torch.stack(cooccurrence_embeddings)
+ return batch
+
+class AddWeightedCooccurrenceEmbeddingsVectorized:
+
+ def __init__(
+ self,
+ cooccur_counts: Dict[int, Dict[int, int]],
+ item_id_to_embedding: Dict[int, torch.Tensor],
+ all_item_ids: List[int],
+ device: torch.device,
+ limit_neighbors: bool = True,
+ max_neighbors: int = 256
+ ):
+ """
+ limit_neighbors: if True, cap the neighbor list at max_neighbors (to save memory)
+ max_neighbors: maximum number of neighbors (only used when limit_neighbors=True)
+ """
+ self.device = device
+ self.call_count = 0
+ self.limit_neighbors = limit_neighbors
+ self.max_neighbors = max_neighbors
+
+ max_item_id = max(item_id_to_embedding.keys())
+ embedding_dim = next(iter(item_id_to_embedding.values())).shape[0]
+
+ self.embedding_matrix = torch.zeros(
+ (max_item_id + 1, embedding_dim),
+ device=device,
+ dtype=torch.float32,
+ requires_grad=False
+ )
+
+ print("Building embedding matrix")
+ for item_id, emb in item_id_to_embedding.items():
+ if isinstance(emb, torch.Tensor):
+ self.embedding_matrix[item_id] = emb.detach()
+ else:
+ self.embedding_matrix[item_id] = torch.tensor(emb, device=device, dtype=torch.float32)
+
+ self.all_item_ids_tensor = torch.tensor(
+ all_item_ids,
+ device=device,
+ dtype=torch.long,
+ requires_grad=False
+ )
+
+ print("Building cooccurrence tables")
+ self._build_cooccurrence_tables(cooccur_counts, max_item_id, len(all_item_ids))
+
+ def _build_cooccurrence_tables(self, cooccur_counts: Dict, max_item_id: int, num_all_items: int):
+ """
+ - neighbors_matrix: [max_item_id+1, num_neighbors]
+ - probs_matrix: [max_item_id+1, num_neighbors]
+ If an item_id has no neighbors, neighbors and probs are filled uniformly from all_items
+ """
+ neighbor_counts = {}
+ for item_id in range(max_item_id + 1):
+ counter = cooccur_counts.get(item_id)
+ if counter and len(counter) > 0:
+ num_neighbors = len(counter)
+ if self.limit_neighbors:
+ num_neighbors = min(num_neighbors, self.max_neighbors)
+ else:
+ num_neighbors = num_all_items
+
+ neighbor_counts[item_id] = num_neighbors
+
+ max_num_neighbors = max(neighbor_counts.values())
+ actual_max_neighbors = min(max_num_neighbors, self.max_neighbors) if self.limit_neighbors else max_num_neighbors
+
+ print(f"Max neighbors per item: {actual_max_neighbors}")
+
+ neighbors_matrix = torch.zeros(
+ (max_item_id + 1, actual_max_neighbors),
+ dtype=torch.long,
+ device=self.device,
+ requires_grad=False
+ )
+
+ probs_matrix = torch.zeros(
+ (max_item_id + 1, actual_max_neighbors),
+ dtype=torch.float32,
+ device=self.device,
+ requires_grad=False
+ )
+
+ num_items_with_cooc = 0
+
+ # Fill the matrices
+ for item_id in range(max_item_id + 1):
+ counter = cooccur_counts.get(item_id)
+
+ if counter and len(counter) > 0:
+ # === Has neighbors: use the real co-occurrence probabilities ===
+ num_items_with_cooc += 1
+
+ # Extract neighbors and their counts, sorted by frequency
+ cooc_ids, freqs = zip(*sorted(counter.items(), key=lambda x: x[1], reverse=True))
+ cooc_ids = list(cooc_ids)
+ freqs = np.array(freqs, dtype=np.float32)
+
+ # Keep only the top neighbors
+ num_neighbors = min(len(cooc_ids), actual_max_neighbors)
+ cooc_ids = cooc_ids[:num_neighbors]
+ freqs = freqs[:num_neighbors]
+
+ # Normalize to a probability distribution
+ probs = freqs / freqs.sum()
+
+ neighbors_matrix[item_id, :num_neighbors] = torch.tensor(
+ cooc_ids, dtype=torch.long, device=self.device
+ )
+ probs_matrix[item_id, :num_neighbors] = torch.tensor(
+ probs,
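+ # e.g. a neighbor Counter of {7: 3, 2: 1} yields freqs [3., 1.] and, after the
+ # freqs / freqs.sum() normalization above, probs [0.75, 0.25] (illustrative values).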
dtype=torch.float32, device=self.device
+ )
+
+ else:
+ # No neighbors: uniform distribution over all_items
+ if actual_max_neighbors >= num_all_items:
+ # All items fit into the row
+ neighbors_matrix[item_id, :num_all_items] = self.all_item_ids_tensor
+ probs_matrix[item_id, :num_all_items] = 1.0 / num_all_items
+ else:
+ # Sample a random subset
+ indices = torch.randperm(num_all_items, device=self.device)[:actual_max_neighbors]
+ neighbors_matrix[item_id] = self.all_item_ids_tensor[indices]
+ probs_matrix[item_id] = 1.0 / actual_max_neighbors
+
+ self.neighbors_matrix = neighbors_matrix
+ self.probs_matrix = probs_matrix
+
+ print(f"Cooccurrence tables built: {num_items_with_cooc}/{max_item_id + 1} items have real neighbors")
+
+ def __call__(self, batch):
+ self.call_count += 1
+
+ item_ids = batch['item_id'] # [batch_size]
+ batch_size = item_ids.shape[0]
+
+ # Gather sampling probabilities for the items in the batch
+ probs = self.probs_matrix[item_ids] # [batch_size, max_neighbors]
+
+ # Sample one neighbor index per item
+ # torch.multinomial draws from max_neighbors categories according to the probabilities
+ # Result: [batch_size, 1] with indices in the range [0, max_neighbors)
+ neighbor_indices = torch.multinomial(probs, num_samples=1, replacement=True)
+ neighbor_indices = neighbor_indices.squeeze(1) # [batch_size]
+
+ # neighbors_matrix[item_ids, neighbor_indices] -> [batch_size]
+ cooc_ids = self.neighbors_matrix[item_ids, neighbor_indices]
+
+ # Embedding lookup
+ cooccurrence_embeddings = self.embedding_matrix[cooc_ids] # [batch_size, embedding_dim]
+
+ batch['cooccurrence_embedding'] = cooccurrence_embeddings
+
+ # if self.call_count % 500 == 0:
+ # print(
+ # f"Call #{self.call_count}: {batch_size} samples, "
+ # f"cooc_embeddings shape: {cooccurrence_embeddings.shape}"
+ # )
+
+ return batch \ No newline at end of file diff --git a/scripts/plum-yambda/yambda_4_1_train_plum.py b/scripts/plum-yambda/yambda_4_1_train_plum.py new file mode 100644 index 0000000..86e0c2b --- /dev/null +++ b/scripts/plum-yambda/yambda_4_1_train_plum.py @@ -0,0 +1,186 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddingsVectorized
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 35
+BATCH_SIZE = 1024
+
+MAX_NEIGHBOURS_COUNT = 1000
+
+INPUT_DIM = 128
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'4-1_filtered_yambda_gpu_week_ws_{WINDOW_SIZE}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/data/Yambda/week-splits/merged_for_exps_filtered/exp_4-1_0.9_inter_semantics_train.json" # TODO: maybe cut off the oldest interactions (or maybe not)
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ data = CoocMappingDataset.create_from_split_part(
+ train_inter_json_path=INTER_TRAIN_PATH,
+ window_size=WINDOW_SIZE
+ )
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
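+ # The loop below materializes the id -> embedding dict item by item. A vectorized
+ # sketch of the same step (assuming EmbeddingDatasetParquet keeps the raw arrays
+ # as .item_ids / .embeddings, as data.py defines them):
+ #   ids = dataset.item_ids.tolist()
+ #   embs = torch.from_numpy(dataset.embeddings).to(DEVICE)
+ #   item_id_to_embedding = dict(zip(ids, embs))
+ #   all_item_ids = ids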
item_id_to_embedding = {} + all_item_ids = [] + for idx in range(len(dataset)): + sample = dataset[idx] + item_id = int(sample['item_id']) + item_id_to_embedding[item_id] = torch.tensor(sample['embedding'], device=DEVICE) + all_item_ids.append(item_id) + + add_cooc_transform = AddWeightedCooccurrenceEmbeddingsVectorized( + cooccur_counts=data.cooccur_counter_mapping, + item_id_to_embedding=item_id_to_embedding, + all_item_ids=all_item_ids, + device=DEVICE, + limit_neighbors=True, + max_neighbors = MAX_NEIGHBOURS_COUNT + ) + + train_dataloader = DataLoader( #call в основном потоке делается нужно исправить + dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map( + ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + ).map(add_cooc_transform + ).repeat(NUM_EPOCHS) + + valid_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + ).map(add_cooc_transform) + + LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS) + + model = PlumRQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0, + contrastive_loss_weight=1.0, + temperature=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True) + + callbacks = [ + InitCodebooks(valid_dataloader), + + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='train'), + + FixDeadCentroids(valid_dataloader), + + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + 'train/recon_loss': cb.MeanAccumulator(), + 'train/rqvae_loss': cb.MeanAccumulator(), + 'train/con_loss': cb.MeanAccumulator(), + 'num_dead/0': cb.MeanAccumulator(), + 'num_dead/1': cb.MeanAccumulator(), + 'num_dead/2': cb.MeanAccumulator(), + }, + reset_every_num_steps=LOG_EVERY_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloader, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='valid'), + cb.MetricAccumulator( + accumulators={ + 'valid/loss': cb.MeanAccumulator(), + 'valid/recon_loss': cb.MeanAccumulator(), + 'valid/rqvae_loss': cb.MeanAccumulator(), + 'valid/con_loss': cb.MeanAccumulator() + } + ), + ], + ).every_num_steps(LOG_EVERY_NUM_STEPS), + + cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')), + + cb.Profiler( + wait=10, + warmup=10, + active=10, + logdir=os.path.join(IREC_PATH, 'tensorboard_logs') + ), + + cb.EarlyStopping( + metric='valid/recon_loss', + patience=40, + minimize=True, + model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME) + ).every_num_steps(LOG_EVERY_NUM_STEPS), + ] + + logger.debug('Everything is ready for 
training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/plum-yambda/yambda_infer_4.1_default.py b/scripts/plum-yambda/yambda_infer_4.1_default.py new file mode 100644 index 0000000..1485fde --- /dev/null +++ b/scripts/plum-yambda/yambda_infer_4.1_default.py @@ -0,0 +1,145 @@ +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.dataloader import DataLoader +from irec.data.transforms import Collate, ToTorch, ToDevice +from irec.runners import InferenceRunner + +from irec.utils import fix_random_seed + +from data import EmbeddingDatasetParquet, ProcessEmbeddings +from models import PlumRQVAE + +# ПУТИ +IREC_PATH = '/home/jovyan/IRec/' +EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet" +MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_filtered_yambda_gpu_quantile_ws_2_best_0.0026.pth' +RESULTS_PATH = os.path.join(IREC_PATH, 'results_sigir_yambda') + +WINDOW_SIZE = 2 +EXPERIMENT_NAME = f'4-1_filtered_yambda_gpu_quantile_ws_{WINDOW_SIZE}' + +# ОСТАЛЬНОЕ + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +BATCH_SIZE = 1024 + +INPUT_DIM = 128 +HIDDEN_DIM = 32 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 3 + +BETA = 0.25 + + + +def main(): + fix_random_seed(SEED_VALUE) + + dataset = EmbeddingDatasetParquet( + data_path=EMBEDDINGS_PATH + ) + + item_id_to_embedding = {} + all_item_ids = [] + for idx in range(len(dataset)): + sample = dataset[idx] + item_id = int(sample['item_id']) + item_id_to_embedding[item_id] = torch.tensor(sample['embedding']) + all_item_ids.append(item_id) + + dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])) + + model = PlumRQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0, + contrastive_loss_weight=1.0, + temperature=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + callbacks = [ + cb.LoadModel(MODEL_PATH), + + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='valid'), + + cb.MetricAccumulator( + accumulators={ + 'valid/loss': cb.MeanAccumulator(), + 'valid/recon_loss': cb.MeanAccumulator(), + 'valid/rqvae_loss': cb.MeanAccumulator(), + 'valid/con_loss': cb.MeanAccumulator(), + }, + ), + + cb.Logger().every_num_steps(len(dataloader)), + + cb.InferenceSaver( + metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']}, + save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), + format='json' + ) + ] + + logger.debug('Everything is ready for training process!') + + runner = InferenceRunner( + model=model, + dataset=dataloader, + callbacks=callbacks, + ) + runner.run() + + import json + from collections import defaultdict + import numpy as 
np + + with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f: + mappings = json.load(f) + + inter = {} + sem_2_ids = defaultdict(list) + for mapping in mappings: + item_id = mapping['item_id'] + clusters = mapping['clusters'] + inter[int(item_id)] = clusters + sem_2_ids[tuple(clusters)].append(int(item_id)) + + for semantics, items in sem_2_ids.items(): + assert len(items) <= CODEBOOK_SIZE, str(len(items)) + collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist() + for item_id, collision_solver in zip(items, collision_solvers): + inter[item_id].append(collision_solver) + for i in range(len(inter[item_id])): + inter[item_id][i] += CODEBOOK_SIZE * i + + with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f: + json.dump(inter, f, indent=2) + + +if __name__ == '__main__': + main() diff --git a/scripts/plum/beauty-exps/4_1_train_plum.py b/scripts/plum/beauty-exps/4_1_train_plum.py new file mode 100644 index 0000000..357fc19 --- /dev/null +++ b/scripts/plum/beauty-exps/4_1_train_plum.py @@ -0,0 +1,169 @@ +from loguru import logger +import os + +import torch + +import pickle + +import irec.callbacks as cb +from irec.data.dataloader import DataLoader +from irec.data.transforms import Collate, ToTorch, ToDevice +from irec.runners import TrainingRunner + +from irec.utils import fix_random_seed + +from callbacks import InitCodebooks, FixDeadCentroids +from data import EmbeddingDataset, ProcessEmbeddings +from models import PlumRQVAE +from transforms import AddWeightedCooccurrenceEmbeddings +from cooc_data import CoocMappingDataset + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +NUM_EPOCHS = 500 +BATCH_SIZE = 1024 + +INPUT_DIM = 4096 +HIDDEN_DIM = 32 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 3 +BETA = 0.25 +LR = 1e-4 +WINDOW_SIZE = 2 + +EXPERIMENT_NAME = f'4-1_yambda_quantile_ws_{WINDOW_SIZE}' +INTER_TRAIN_PATH = "/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json" +EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl" +IREC_PATH = '../../' + +print(INTER_TRAIN_PATH) +def main(): + fix_random_seed(SEED_VALUE) + + data = CoocMappingDataset.create_from_split_part( + train_inter_json_path=INTER_TRAIN_PATH, + window_size=WINDOW_SIZE + ) + + dataset = EmbeddingDataset( + data_path=EMBEDDINGS_PATH + ) + + item_id_to_embedding = {} + all_item_ids = [] + for idx in range(len(dataset)): + sample = dataset[idx] + item_id = int(sample['item_id']) + item_id_to_embedding[item_id] = torch.tensor(sample['embedding']) + all_item_ids.append(item_id) + + add_cooc_transform = AddWeightedCooccurrenceEmbeddings( + data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids) + + train_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map( + ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + ).map(add_cooc_transform).repeat(NUM_EPOCHS) + + valid_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform) + + LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS) + + model = PlumRQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + 
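+ # With NUM_CODEBOOKS = 3 levels of CODEBOOK_SIZE = 256 centroids each, the
+ # quantizer can express 256**3 = 16,777,216 distinct 3-token semantic IDs before
+ # the collision-resolution token is appended at inference time (see infer_4.1_default.py).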
embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0, + contrastive_loss_weight=1.0, + temperature=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True) + + callbacks = [ + InitCodebooks(valid_dataloader), + + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='train'), + + FixDeadCentroids(valid_dataloader), + + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + 'train/recon_loss': cb.MeanAccumulator(), + 'train/rqvae_loss': cb.MeanAccumulator(), + 'train/con_loss': cb.MeanAccumulator(), + 'num_dead/0': cb.MeanAccumulator(), + 'num_dead/1': cb.MeanAccumulator(), + 'num_dead/2': cb.MeanAccumulator(), + }, + reset_every_num_steps=LOG_EVERY_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloader, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='valid'), + cb.MetricAccumulator( + accumulators={ + 'valid/loss': cb.MeanAccumulator(), + 'valid/recon_loss': cb.MeanAccumulator(), + 'valid/rqvae_loss': cb.MeanAccumulator(), + 'valid/con_loss': cb.MeanAccumulator() + } + ), + ], + ).every_num_steps(LOG_EVERY_NUM_STEPS), + + cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')), + + cb.EarlyStopping( + metric='valid/recon_loss', + patience=40, + minimize=True, + model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME) + ).every_num_steps(LOG_EVERY_NUM_STEPS), + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/plum/beauty-exps/4_2_train_plum.py b/scripts/plum/beauty-exps/4_2_train_plum.py new file mode 100644 index 0000000..96cfda9 --- /dev/null +++ b/scripts/plum/beauty-exps/4_2_train_plum.py @@ -0,0 +1,169 @@ +from loguru import logger +import os + +import torch + +import pickle + +import irec.callbacks as cb +from irec.data.dataloader import DataLoader +from irec.data.transforms import Collate, ToTorch, ToDevice +from irec.runners import TrainingRunner + +from irec.utils import fix_random_seed + +from callbacks import InitCodebooks, FixDeadCentroids +from data import EmbeddingDataset, ProcessEmbeddings +from models import PlumRQVAE +from transforms import AddWeightedCooccurrenceEmbeddings +from cooc_data import CoocMappingDataset + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +NUM_EPOCHS = 500 +BATCH_SIZE = 1024 + +INPUT_DIM = 4096 +HIDDEN_DIM = 32 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 3 +BETA = 0.25 +LR = 1e-4 +WINDOW_SIZE = 2 + +EXPERIMENT_NAME = f'4-2_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}' +INTER_TRAIN_PATH = 
"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json" +EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl" +IREC_PATH = '../../' + +print(INTER_TRAIN_PATH) +def main(): + fix_random_seed(SEED_VALUE) + + data = CoocMappingDataset.create_from_split_part( + train_inter_json_path=INTER_TRAIN_PATH, + window_size=WINDOW_SIZE + ) + + dataset = EmbeddingDataset( + data_path=EMBEDDINGS_PATH + ) + + item_id_to_embedding = {} + all_item_ids = [] + for idx in range(len(dataset)): + sample = dataset[idx] + item_id = int(sample['item_id']) + item_id_to_embedding[item_id] = torch.tensor(sample['embedding']) + all_item_ids.append(item_id) + + add_cooc_transform = AddWeightedCooccurrenceEmbeddings( + data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids) + + train_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map( + ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + ).map(add_cooc_transform).repeat(NUM_EPOCHS) + + valid_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform) + + LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS) + + model = PlumRQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0, + contrastive_loss_weight=1.0, + temperature=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True) + + callbacks = [ + InitCodebooks(valid_dataloader), + + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='train'), + + FixDeadCentroids(valid_dataloader), + + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + 'train/recon_loss': cb.MeanAccumulator(), + 'train/rqvae_loss': cb.MeanAccumulator(), + 'train/con_loss': cb.MeanAccumulator(), + 'num_dead/0': cb.MeanAccumulator(), + 'num_dead/1': cb.MeanAccumulator(), + 'num_dead/2': cb.MeanAccumulator(), + }, + reset_every_num_steps=LOG_EVERY_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloader, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='valid'), + cb.MetricAccumulator( + accumulators={ + 'valid/loss': cb.MeanAccumulator(), + 'valid/recon_loss': cb.MeanAccumulator(), + 'valid/rqvae_loss': cb.MeanAccumulator(), + 'valid/con_loss': cb.MeanAccumulator() + } + ), + ], + ).every_num_steps(LOG_EVERY_NUM_STEPS), + + cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')), + + cb.EarlyStopping( + metric='valid/recon_loss', + patience=40, + minimize=True, + 
model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME) + ).every_num_steps(LOG_EVERY_NUM_STEPS), + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/plum/beauty-exps/4_3_train_plum.py b/scripts/plum/beauty-exps/4_3_train_plum.py new file mode 100644 index 0000000..ac6cfb6 --- /dev/null +++ b/scripts/plum/beauty-exps/4_3_train_plum.py @@ -0,0 +1,169 @@ +from loguru import logger +import os + +import torch + +import pickle + +import irec.callbacks as cb +from irec.data.dataloader import DataLoader +from irec.data.transforms import Collate, ToTorch, ToDevice +from irec.runners import TrainingRunner + +from irec.utils import fix_random_seed + +from callbacks import InitCodebooks, FixDeadCentroids +from data import EmbeddingDataset, ProcessEmbeddings +from models import PlumRQVAE +from transforms import AddWeightedCooccurrenceEmbeddings +from cooc_data import CoocMappingDataset + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +NUM_EPOCHS = 500 +BATCH_SIZE = 1024 + +INPUT_DIM = 4096 +HIDDEN_DIM = 32 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 3 +BETA = 0.25 +LR = 1e-4 +WINDOW_SIZE = 2 + +EXPERIMENT_NAME = f'4-3_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}' +INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json" +EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl" +IREC_PATH = '../../' + +print(INTER_TRAIN_PATH) +def main(): + fix_random_seed(SEED_VALUE) + + data = CoocMappingDataset.create_from_split_part( + train_inter_json_path=INTER_TRAIN_PATH, + window_size=WINDOW_SIZE + ) + + dataset = EmbeddingDataset( + data_path=EMBEDDINGS_PATH + ) + + item_id_to_embedding = {} + all_item_ids = [] + for idx in range(len(dataset)): + sample = dataset[idx] + item_id = int(sample['item_id']) + item_id_to_embedding[item_id] = torch.tensor(sample['embedding']) + all_item_ids.append(item_id) + + add_cooc_transform = AddWeightedCooccurrenceEmbeddings( + data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids) + + train_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map( + ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + ).map(add_cooc_transform).repeat(NUM_EPOCHS) + + valid_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform) + + LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS) + + model = PlumRQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0, + contrastive_loss_weight=1.0, + temperature=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True) + + callbacks = [ + InitCodebooks(valid_dataloader), + 
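+ # InitCodebooks seeds each codebook from encoded residuals of a random batch
+ # (a data-dependent init; the raw codebook tensors are created uninitialized in
+ # models.py), and FixDeadCentroids below re-seeds centroids that receive no
+ # assignments, keeping all CODEBOOK_SIZE entries in use.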
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='train'), + + FixDeadCentroids(valid_dataloader), + + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + 'train/recon_loss': cb.MeanAccumulator(), + 'train/rqvae_loss': cb.MeanAccumulator(), + 'train/con_loss': cb.MeanAccumulator(), + 'num_dead/0': cb.MeanAccumulator(), + 'num_dead/1': cb.MeanAccumulator(), + 'num_dead/2': cb.MeanAccumulator(), + }, + reset_every_num_steps=LOG_EVERY_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloader, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='valid'), + cb.MetricAccumulator( + accumulators={ + 'valid/loss': cb.MeanAccumulator(), + 'valid/recon_loss': cb.MeanAccumulator(), + 'valid/rqvae_loss': cb.MeanAccumulator(), + 'valid/con_loss': cb.MeanAccumulator() + } + ), + ], + ).every_num_steps(LOG_EVERY_NUM_STEPS), + + cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')), + + cb.EarlyStopping( + metric='valid/recon_loss', + patience=40, + minimize=True, + model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME) + ).every_num_steps(LOG_EVERY_NUM_STEPS), + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/plum/beauty-exps/infer_4.1_default.py b/scripts/plum/beauty-exps/infer_4.1_default.py new file mode 100644 index 0000000..fff61d3 --- /dev/null +++ b/scripts/plum/beauty-exps/infer_4.1_default.py @@ -0,0 +1,145 @@ +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.dataloader import DataLoader +from irec.data.transforms import Collate, ToTorch, ToDevice +from irec.runners import InferenceRunner + +from irec.utils import fix_random_seed + +from data import EmbeddingDataset, ProcessEmbeddings +from models import PlumRQVAE + +# ПУТИ +IREC_PATH = '/home/jovyan/IRec/' +EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl' +MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_updated_quantile_plum_rqvae_beauty_ws_2_best_0.0052.pth' +RESULTS_PATH = os.path.join(IREC_PATH, 'results_sigir') + +WINDOW_SIZE = 2 +EXPERIMENT_NAME = f'4-1_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}' + +# ОСТАЛЬНОЕ + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +BATCH_SIZE = 1024 + +INPUT_DIM = 4096 +HIDDEN_DIM = 32 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 3 + +BETA = 0.25 + + + +def main(): + fix_random_seed(SEED_VALUE) + + dataset = EmbeddingDataset( + data_path=EMBEDDINGS_PATH + ) + + item_id_to_embedding = {} + all_item_ids = [] + for idx in range(len(dataset)): + sample = dataset[idx] + item_id = int(sample['item_id']) + item_id_to_embedding[item_id] = torch.tensor(sample['embedding']) + all_item_ids.append(item_id) + + dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + 
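+ # shuffle=False and drop_last=False: every item passes through exactly once, so
+ # each one receives exactly one semantic-ID assignment in the saved clusters file.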
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    callbacks = [
+        cb.LoadModel(MODEL_PATH),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, _: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='valid'),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'valid/loss': cb.MeanAccumulator(),
+                'valid/recon_loss': cb.MeanAccumulator(),
+                'valid/rqvae_loss': cb.MeanAccumulator(),
+                'valid/con_loss': cb.MeanAccumulator(),
+            },
+        ),
+
+        cb.Logger().every_num_steps(len(dataloader)),
+
+        cb.InferenceSaver(
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
+    # Resolve semantic-id collisions: items that share all NUM_CODEBOOKS codewords receive
+    # an extra, pairwise-distinct codeword drawn from a permutation of 0..CODEBOOK_SIZE-1;
+    # afterwards every level i is offset by CODEBOOK_SIZE * i so the levels occupy
+    # disjoint id ranges in one flat vocabulary.
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum/beauty-exps/infer_4.2_default.py b/scripts/plum/beauty-exps/infer_4.2_default.py
new file mode 100644
index 0000000..c5c7c02
--- /dev/null
+++ b/scripts/plum/beauty-exps/infer_4.2_default.py
@@ -0,0 +1,145 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-2_updated_quantile_plum_rqvae_beauty_ws_2_best_0.0051.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results_sigir')
+
+WINDOW_SIZE = 2
+EXPERIMENT_NAME = f'4-2_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# OTHER SETTINGS
+
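+# NOTE: the model dimensions below are tied to the checkpoint in MODEL_PATH; if they
+# drift from the training run, cb.LoadModel would presumably fail on a state-dict
+# shape mismatch when restoring the weights.
+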
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    dataset = EmbeddingDataset(
+        data_path=EMBEDDINGS_PATH
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+        all_item_ids.append(item_id)
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    callbacks = [
+        cb.LoadModel(MODEL_PATH),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, _: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='valid'),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'valid/loss': cb.MeanAccumulator(),
+                'valid/recon_loss': cb.MeanAccumulator(),
+                'valid/rqvae_loss': cb.MeanAccumulator(),
+                'valid/con_loss': cb.MeanAccumulator(),
+            },
+        ),
+
+        cb.Logger().every_num_steps(len(dataloader)),
+
+        cb.InferenceSaver(
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
+    # Same collision-resolution pass as in infer_4.1_default.py.
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum/beauty-exps/infer_4.3_default.py b/scripts/plum/beauty-exps/infer_4.3_default.py
new file mode 100644
index 0000000..c7fca80
--- /dev/null
+++ b/scripts/plum/beauty-exps/infer_4.3_default.py
@@ -0,0 +1,145 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-3_updated_quantile_plum_rqvae_beauty_ws_2_best_0.005.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results_sigir')
+
+WINDOW_SIZE = 2
+EXPERIMENT_NAME = f'4-3_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    dataset = EmbeddingDataset(
+        data_path=EMBEDDINGS_PATH
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+        all_item_ids.append(item_id)
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    callbacks = [
+        cb.LoadModel(MODEL_PATH),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, _: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='valid'),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'valid/loss': cb.MeanAccumulator(),
+                'valid/recon_loss': cb.MeanAccumulator(),
+                'valid/rqvae_loss': cb.MeanAccumulator(),
+                'valid/con_loss': cb.MeanAccumulator(),
+            },
+        ),
+
+        cb.Logger().every_num_steps(len(dataloader)),
+
+        cb.InferenceSaver(
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
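+    # Same collision-resolution pass as in infer_4.1_default.py: each colliding item
+    # gets a distinct extra codeword, then every level is offset into its own id range.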
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum/beauty-exps/infer_default.py b/scripts/plum/beauty-exps/infer_default.py
new file mode 100644
index 0000000..af8df34
--- /dev/null
+++ b/scripts/plum/beauty-exps/infer_default.py
@@ -0,0 +1,152 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddings
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/test_plum_rqvae_beauty_ws_2_best_0.0054.pth'
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+IREC_PATH = '/home/jovyan/IRec/'
+
+
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    data = CoocMappingDataset.create(
+        inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter_new.json'),
+        window_size=WINDOW_SIZE
+    )
+
+    dataset = EmbeddingDataset(
+        data_path='/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+    )
+
+    item_id_to_embedding = {}
+    all_item_ids = []
+    for idx in range(len(dataset)):
+        sample = dataset[idx]
+        item_id = int(sample['item_id'])
+        item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+        all_item_ids.append(item_id)
+
+    add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
+        data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        shuffle=False,
+        drop_last=False,
+    ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)
+
+    model = PlumRQVAE(
+        input_dim=INPUT_DIM,
+        num_codebooks=NUM_CODEBOOKS,
+        codebook_size=CODEBOOK_SIZE,
+        embedding_dim=HIDDEN_DIM,
+        beta=BETA,
+        quant_loss_weight=1.0,
+        contrastive_loss_weight=1.0,
+        temperature=1.0
+    ).to(DEVICE)
+
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+    logger.debug(f'Overall parameters: {total_params:,}')
+    logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+    callbacks = [
+        cb.LoadModel(MODEL_PATH),
+
+        cb.BatchMetrics(metrics=lambda model_outputs, _: {
+            'loss': model_outputs['loss'],
+            'recon_loss': model_outputs['recon_loss'],
+            'rqvae_loss': model_outputs['rqvae_loss'],
+            'con_loss': model_outputs['con_loss']
+        }, name='valid'),
+
+        cb.MetricAccumulator(
+            accumulators={
+                'valid/loss': cb.MeanAccumulator(),
+                'valid/recon_loss': cb.MeanAccumulator(),
+                'valid/rqvae_loss': cb.MeanAccumulator(),
+                'valid/con_loss': cb.MeanAccumulator(),
+            },
+        ),
+
+        cb.Logger().every_num_steps(len(dataloader)),
+
+        cb.InferenceSaver(
+            metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+            save_path=f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json',
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json', 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    with open(os.path.join(IREC_PATH, 'results', f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/plum/cooc_data.py b/scripts/plum/cooc_data.py
index b11e6f0..50f2bdd 100644
--- a/scripts/plum/cooc_data.py
+++ b/scripts/plum/cooc_data.py
@@ -13,21 +13,15 @@ class CoocMappingDataset:
     def __init__(
         self,
         train_sampler,
-        validation_sampler,
-        test_sampler,
         num_items,
-        max_sequence_length,
         cooccur_counter_mapping=None
     ):
         self._train_sampler = train_sampler
-        self._validation_sampler = validation_sampler
-        self._test_sampler = test_sampler
         self._num_items = num_items
-        self._max_sequence_length = max_sequence_length
         self._cooccur_counter_mapping = cooccur_counter_mapping
 
     @classmethod
-    def create(cls, inter_json_path, max_sequence_length, sampler_type, window_size):
+    def create(cls, inter_json_path, window_size):
         max_item_id = 0
         train_dataset, validation_dataset, test_dataset = [], [], []
@@ -43,31 +37,59 @@ def create(cls, inter_json_path, max_sequence_length, sampler_type, window_size)
                 'user.ids': [user_id],
                 'item.ids': item_ids[:-2],
             })
-            validation_dataset.append({
-                'user.ids': [user_id],
-                'item.ids': item_ids[:-1],
-            })
-            test_dataset.append({
-                'user.ids': [user_id],
-                'item.ids': item_ids,
-            })
 
         cooccur_counter_mapping = cls.build_cooccur_counter_mapping(train_dataset, window_size=window_size)
         logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items but max_item_id is {max_item_id}')
 
         train_sampler = train_dataset
-        validation_sampler = validation_dataset
-        test_sampler = test_dataset
 
         return cls(
             train_sampler=train_sampler,
-            validation_sampler=validation_sampler,
-            test_sampler=test_sampler,
             num_items=max_item_id + 1,
-            max_sequence_length=max_sequence_length,
             cooccur_counter_mapping=cooccur_counter_mapping
         )
 
+    @classmethod
+    def create_from_split_part(
+        cls,
+        train_inter_json_path,
+        window_size
+    ):
+
+        max_item_id = 0
+        train_dataset = []
+
+        with open(train_inter_json_path, 'r') as f:
+            train_interactions = json.load(f)
+
+        # Process the TRAIN split
+        for user_id_str, item_ids in train_interactions.items():
+            user_id = int(user_id_str)
+            if item_ids:
+                max_item_id = max(max_item_id, max(item_ids))
+
+            train_dataset.append({
+                'user.ids': [user_id],
+                'item.ids': item_ids,
+            })
+
+        logger.debug(f'Train: {len(train_dataset)} users')
+        logger.debug(f'Max item ID: {max_item_id}')
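+
+        # Window-based co-occurrence: items that appear within window_size positions of
+        # each other in the same user sequence are counted as co-occurring (see
+        # build_cooccur_counter_mapping below).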
+
+        cooccur_counter_mapping = cls.build_cooccur_counter_mapping(
+            train_dataset,
+            window_size=window_size
+        )
+
+        logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items')
+
+        return cls(
+            train_sampler=train_dataset,
+            num_items=max_item_id + 1,
+            cooccur_counter_mapping=cooccur_counter_mapping
+        )
+
+
     @staticmethod
     def build_cooccur_counter_mapping(train_dataset, window_size): #TODO: pass timestamps and build the window from them
         cooccur_counts = defaultdict(Counter)
@@ -80,16 +102,6 @@ def build_cooccur_counter_mapping(train_dataset, window_size): #TODO: pass timestamps and build the window from them
                     cooccur_counts[item_i][items[j]] += 1
         return cooccur_counts
 
-    def get_datasets(self):
-        return self._train_sampler, self._validation_sampler, self._test_sampler
-
-    @property
-    def num_items(self):
-        return self._num_items
-
-    @property
-    def max_sequence_length(self):
-        return self._max_sequence_length
 
     @property
     def cooccur_counter_mapping(self):
diff --git a/scripts/plum/data.py b/scripts/plum/data.py
index 0ffef82..9c15b70 100644
--- a/scripts/plum/data.py
+++ b/scripts/plum/data.py
@@ -5,6 +5,29 @@ from irec.data.transforms import Transform
+import polars as pl
+import torch
+
+class EmbeddingDatasetParquet(BaseDataset):
+    def __init__(self, data_path):
+        self.df = pl.read_parquet(data_path)
+        self.item_ids = np.array(self.df['item_id'], dtype=np.int64)
+        self.embeddings = np.array(self.df['embedding'].to_list(), dtype=np.float32)
+        print(f"embedding dim: {self.embeddings[0].shape}")
+
+    def __getitem__(self, idx):
+        index = self.item_ids[idx]
+        tensor_emb = self.embeddings[idx]
+        return {
+            'item_id': index,
+            'embedding': tensor_emb,
+            'embedding_dim': len(tensor_emb)
+        }
+
+    def __len__(self):
+        return len(self.embeddings)
+
+
 class EmbeddingDataset(BaseDataset):
     def __init__(self, data_path):
         self.data_path = data_path
diff --git a/scripts/plum/infer_default.py b/scripts/plum/infer_default.py
index af8df34..b15fb6d 100644
--- a/scripts/plum/infer_default.py
+++ b/scripts/plum/infer_default.py
@@ -12,8 +12,18 @@
 from data import EmbeddingDataset, ProcessEmbeddings
 from models import PlumRQVAE
-from transforms import AddWeightedCooccurrenceEmbeddings
-from cooc_data import CoocMappingDataset
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_plum_rqvae_beauty_ws_2_best_0.0051.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# OTHER SETTINGS
 
 SEED_VALUE = 42
 DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
@@ -26,29 +36,16 @@
 NUM_CODEBOOKS = 3
 
 BETA = 0.25
-MODEL_PATH = '/home/jovyan/IRec/checkpoints/test_plum_rqvae_beauty_ws_2_best_0.0054.pth'
 
-WINDOW_SIZE = 2
-
-EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
-
-IREC_PATH = '/home/jovyan/IRec/'
 
 
 def main():
     fix_random_seed(SEED_VALUE)
 
-    data = CoocMappingDataset.create(
-        inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter_new.json'),
-        max_sequence_length=20,
-        sampler_type='sasrec',
-        window_size=WINDOW_SIZE
-    )
-
     dataset = EmbeddingDataset(
-        data_path='/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+        data_path=EMBEDDINGS_PATH
     )
-
+
     item_id_to_embedding = {}
     all_item_ids = []
     for idx in range(len(dataset)):
@@ -57,15 +54,12 @@ def main():
         item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
         all_item_ids.append(item_id)
 
-    
add_cooc_transform = AddWeightedCooccurrenceEmbeddings( - data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids) - dataloader = DataLoader( dataset, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, - ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform) + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])) model = PlumRQVAE( input_dim=INPUT_DIM, @@ -106,8 +100,8 @@ def main(): cb.Logger().every_num_steps(len(dataloader)), cb.InferenceSaver( - metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']}, - save_path=f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json', + metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']}, + save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), format='json' ) ] @@ -125,9 +119,9 @@ def main(): from collections import defaultdict import numpy as np - with open(f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json', 'r') as f: + with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f: mappings = json.load(f) - + inter = {} sem_2_ids = defaultdict(list) for mapping in mappings: @@ -143,8 +137,8 @@ def main(): inter[item_id].append(collision_solver) for i in range(len(inter[item_id])): inter[item_id][i] += CODEBOOK_SIZE * i - - with open(os.path.join(IREC_PATH, 'results', f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f: + + with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f: json.dump(inter, f, indent=2) diff --git a/scripts/plum/train_plum.py b/scripts/plum/train_plum.py index 5a00bc3..ffa9e43 100644 --- a/scripts/plum/train_plum.py +++ b/scripts/plum/train_plum.py @@ -41,8 +41,6 @@ def main(): data = CoocMappingDataset.create( inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter_new.json'), - max_sequence_length=20, - sampler_type='sasrec', window_size=WINDOW_SIZE ) diff --git a/scripts/plum/train_plum_timestamp_based.py b/scripts/plum/train_plum_timestamp_based.py new file mode 100644 index 0000000..e755d95 --- /dev/null +++ b/scripts/plum/train_plum_timestamp_based.py @@ -0,0 +1,168 @@ +from loguru import logger +import os + +import torch + +import pickle + +import irec.callbacks as cb +from irec.data.dataloader import DataLoader +from irec.data.transforms import Collate, ToTorch, ToDevice +from irec.runners import TrainingRunner + +from irec.utils import fix_random_seed + +from callbacks import InitCodebooks, FixDeadCentroids +from data import EmbeddingDataset, ProcessEmbeddings +from models import PlumRQVAE +from transforms import AddWeightedCooccurrenceEmbeddings +from cooc_data import CoocMappingDataset + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +NUM_EPOCHS = 500 +BATCH_SIZE = 1024 + +INPUT_DIM = 4096 +HIDDEN_DIM = 32 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 3 +BETA = 0.25 +LR = 1e-4 +WINDOW_SIZE = 2 + +EXPERIMENT_NAME = f'4-1_plum_rqvae_beauty_ws_{WINDOW_SIZE}' +INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/Beauty_new/splits/exp_data/exp_4.1_inter_semantics_train.json" +EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl" +IREC_PATH = '../../' + +def main(): + fix_random_seed(SEED_VALUE) + + data = CoocMappingDataset.create_from_split_part( + 
train_inter_json_path=INTER_TRAIN_PATH, + window_size=WINDOW_SIZE + ) + + dataset = EmbeddingDataset( + data_path=EMBEDDINGS_PATH + ) + + item_id_to_embedding = {} + all_item_ids = [] + for idx in range(len(dataset)): + sample = dataset[idx] + item_id = int(sample['item_id']) + item_id_to_embedding[item_id] = torch.tensor(sample['embedding']) + all_item_ids.append(item_id) + + add_cooc_transform = AddWeightedCooccurrenceEmbeddings( + data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids) + + train_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map( + ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + ).map(add_cooc_transform).repeat(NUM_EPOCHS) + + valid_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform) + + LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS) + + model = PlumRQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0, + contrastive_loss_weight=1.0, + temperature=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True) + + callbacks = [ + InitCodebooks(valid_dataloader), + + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='train'), + + FixDeadCentroids(valid_dataloader), + + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + 'train/recon_loss': cb.MeanAccumulator(), + 'train/rqvae_loss': cb.MeanAccumulator(), + 'train/con_loss': cb.MeanAccumulator(), + 'num_dead/0': cb.MeanAccumulator(), + 'num_dead/1': cb.MeanAccumulator(), + 'num_dead/2': cb.MeanAccumulator(), + }, + reset_every_num_steps=LOG_EVERY_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloader, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + 'con_loss': model_outputs['con_loss'] + }, name='valid'), + cb.MetricAccumulator( + accumulators={ + 'valid/loss': cb.MeanAccumulator(), + 'valid/recon_loss': cb.MeanAccumulator(), + 'valid/rqvae_loss': cb.MeanAccumulator(), + 'valid/con_loss': cb.MeanAccumulator() + } + ), + ], + ).every_num_steps(LOG_EVERY_NUM_STEPS), + + cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')), + + cb.EarlyStopping( + metric='valid/recon_loss', + patience=40, + minimize=True, + model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME) + ).every_num_steps(LOG_EVERY_NUM_STEPS), + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + 
main() diff --git a/scripts/plum/transforms.py b/scripts/plum/transforms.py index 0af1dda..8887115 100644 --- a/scripts/plum/transforms.py +++ b/scripts/plum/transforms.py @@ -2,12 +2,6 @@ import pickle import torch -from irec.data.base import BaseDataset -from irec.data.transforms import Transform - -from cooc_data import CoocMappingDataset - - class AddWeightedCooccurrenceEmbeddings: def __init__(self, cooccur_counts, item_id_to_embedding, all_item_ids): self.cooccur_counts = cooccur_counts @@ -32,7 +26,7 @@ def __call__(self, batch): else: cooc_id = np.random.choice(self.all_item_ids) - if self.call_count % 10 == 0 and idx < 5: + if self.call_count % 500 == 0 and idx < 5: print(f" idx={idx}: item_id={item_id_val} fallback random") cooc_emb = self.item_id_to_embedding.get(cooc_id, batch['embedding'][0]) diff --git a/scripts/rqvae-yambda/callbacks.py b/scripts/rqvae-yambda/callbacks.py new file mode 100644 index 0000000..43ec460 --- /dev/null +++ b/scripts/rqvae-yambda/callbacks.py @@ -0,0 +1,64 @@ +import torch + +import irec.callbacks as cb +from irec.runners import TrainingRunner, TrainingRunnerContext + +class InitCodebooks(cb.TrainingCallback): + def __init__(self, dataloader): + super().__init__() + self._dataloader = dataloader + + @torch.no_grad() + def before_run(self, runner: TrainingRunner): + for i in range(len(runner.model.codebooks)): + X = next(iter(self._dataloader))['embedding'] + idx = torch.randperm(X.shape[0], device=X.device)[:len(runner.model.codebooks[i])] + remainder = runner.model.encoder(X[idx]) + + for j in range(i): + codebook_indices = runner.model.get_codebook_indices(remainder, runner.model.codebooks[j]) + codebook_vectors = runner.model.codebooks[j][codebook_indices] + remainder = remainder - codebook_vectors + + runner.model.codebooks[i].data = remainder.detach() + + +class FixDeadCentroids(cb.TrainingCallback): + def __init__(self, dataloader): + super().__init__() + self._dataloader = dataloader + + def after_step(self, runner: TrainingRunner, context: TrainingRunnerContext): + for i, num_fixed in enumerate(self.fix_dead_codebooks(runner)): + context.metrics[f'num_dead/{i}'] = num_fixed + + @torch.no_grad() + def fix_dead_codebooks(self, runner: TrainingRunner): + num_fixed = [] + for codebook_idx, codebook in enumerate(runner.model.codebooks): + centroid_counts = torch.zeros(codebook.shape[0], dtype=torch.long, device=codebook.device) + random_batch = next(iter(self._dataloader))['embedding'] + + for batch in self._dataloader: + remainder = runner.model.encoder(batch['embedding']) + for l in range(codebook_idx): + ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l]) + remainder = remainder - runner.model.codebooks[l][ind] + + indices = runner.model.get_codebook_indices(remainder, codebook) + centroid_counts.scatter_add_(0, indices, torch.ones_like(indices)) + + dead_mask = (centroid_counts == 0) + num_dead = int(dead_mask.sum().item()) + num_fixed.append(num_dead) + if num_dead == 0: + continue + + remainder = runner.model.encoder(random_batch) + for l in range(codebook_idx): + ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l]) + remainder = remainder - runner.model.codebooks[l][ind] + remainder = remainder[torch.randperm(remainder.shape[0], device=codebook.device)][:num_dead] + codebook[dead_mask] = remainder.detach() + + return num_fixed diff --git a/scripts/rqvae-yambda/data.py b/scripts/rqvae-yambda/data.py new file mode 100644 index 0000000..6c213ee --- /dev/null +++ b/scripts/rqvae-yambda/data.py 
@@ -0,0 +1,35 @@ +import numpy as np +import polars as pl + +from irec.data.base import BaseDataset +from irec.data.transforms import Transform + + +class EmbeddingDatasetParquet(BaseDataset): + def __init__(self, data_path): + self.df = pl.read_parquet(data_path) + self.item_ids = np.array(self.df['item_id'], dtype=np.int64) + self.embeddings = np.array(self.df['embedding'].to_list(), dtype=np.float32) + print(f"embedding dim: {self.embeddings[0].shape}") + + def __getitem__(self, idx): + index = self.item_ids[idx] + tensor_emb = self.embeddings[idx] + return { + 'item_id': index, + 'embedding': tensor_emb, + 'embedding_dim': len(tensor_emb) + } + + def __len__(self): + return len(self.embeddings) + +class ProcessEmbeddings(Transform): + def __init__(self, embedding_dim, keys): + self.embedding_dim = embedding_dim + self.keys = keys + + def __call__(self, batch): + for key in self.keys: + batch[key] = batch[key].reshape(-1, self.embedding_dim) + return batch \ No newline at end of file diff --git a/scripts/rqvae-yambda/infer_yambda.py b/scripts/rqvae-yambda/infer_yambda.py new file mode 100644 index 0000000..7daf42f --- /dev/null +++ b/scripts/rqvae-yambda/infer_yambda.py @@ -0,0 +1,128 @@ +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.dataloader import DataLoader +from irec.data.transforms import Collate, ToTorch, ToDevice +from irec.runners import InferenceRunner + +from irec.utils import fix_random_seed + +from data import EmbeddingDatasetParquet, ProcessEmbeddings +from models import RQVAE + + +IREC_PATH = '/home/jovyan/IRec/' +EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet" +MODEL_PATH = '/home/jovyan/IRec/checkpoints/rqvae_yambda_hd_128_cz_512_best_0.0014.pth' +RESULTS_PATH = '/home/jovyan/IRec/rqvae-yambda-sem-ids' +EXPERIMENT_NAME = 'rqvae_yambda_hd_128_cz_512' + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +BATCH_SIZE = 1024 + +INPUT_DIM = 128 +HIDDEN_DIM = 128 +CODEBOOK_SIZE = 512 +NUM_CODEBOOKS = 3 + +BETA = 0.25 + + +def main(): + fix_random_seed(SEED_VALUE) + + dataset = EmbeddingDatasetParquet( + data_path=EMBEDDINGS_PATH + ) + + dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])) + + model = RQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + callbacks = [ + cb.LoadModel(MODEL_PATH), + + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + }, name='valid'), + + cb.MetricAccumulator( + accumulators={ + 'valid/loss': cb.MeanAccumulator(), + 'valid/recon_loss': cb.MeanAccumulator(), + 'valid/rqvae_loss': cb.MeanAccumulator(), + }, + ), + + cb.Logger().every_num_steps(len(dataloader)), + + cb.InferenceSaver( + metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']}, + save_path=os.path.join(RESULTS_PATH, 
f'{EXPERIMENT_NAME}_clusters.json'),
+            format='json'
+        )
+    ]
+
+    logger.debug('Everything is ready for inference!')
+
+    runner = InferenceRunner(
+        model=model,
+        dataset=dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+    import json
+    from collections import defaultdict
+    import numpy as np
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+        mappings = json.load(f)
+
+    inter = {}
+    sem_2_ids = defaultdict(list)
+    for mapping in mappings:
+        item_id = mapping['item_id']
+        clusters = mapping['clusters']
+        inter[int(item_id)] = clusters
+        sem_2_ids[tuple(clusters)].append(int(item_id))
+
+    # Resolve semantic-id collisions with an extra, distinct codeword per colliding item,
+    # then offset every level into its own id range (as in the plum infer scripts).
+    for semantics, items in sem_2_ids.items():
+        assert len(items) <= CODEBOOK_SIZE, str(len(items))
+        collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+        for item_id, collision_solver in zip(items, collision_solvers):
+            inter[item_id].append(collision_solver)
+            for i in range(len(inter[item_id])):
+                inter[item_id][i] += CODEBOOK_SIZE * i
+
+    with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+        json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/rqvae-yambda/models.py b/scripts/rqvae-yambda/models.py
new file mode 100644
index 0000000..87c2241
--- /dev/null
+++ b/scripts/rqvae-yambda/models.py
@@ -0,0 +1,91 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class RQVAE(nn.Module):
+    def __init__(
+        self,
+        input_dim,
+        num_codebooks,
+        codebook_size,
+        embedding_dim,
+        beta=0.25,
+        quant_loss_weight=1.0,
+    ):
+        super().__init__()
+        self.register_buffer('beta', torch.tensor(beta))
+
+        self.input_dim = input_dim
+        self.num_codebooks = num_codebooks
+        self.codebook_size = codebook_size
+        self.embedding_dim = embedding_dim
+        self.quant_loss_weight = quant_loss_weight
+
+
+        self.encoder = self.make_encoding_tower(input_dim, embedding_dim)
+        self.decoder = self.make_encoding_tower(embedding_dim, input_dim)
+
+        # The codebooks are left uninitialized here: the InitCodebooks callback
+        # (callbacks.py) seeds them from encoded samples before training starts.
+        self.codebooks = torch.nn.ParameterList()
+        for _ in range(num_codebooks):
+            cb = torch.FloatTensor(codebook_size, embedding_dim)
+            #nn.init.normal_(cb)
+            self.codebooks.append(cb)
+
+    @staticmethod
+    def make_encoding_tower(d1, d2, bias=False):
+        return torch.nn.Sequential(
+            nn.Linear(d1, d1),
+            nn.ReLU(),
+            nn.Linear(d1, d2),
+            nn.ReLU(),
+            nn.Linear(d2, d2, bias=bias)
+        )
+
+    @staticmethod
+    def get_codebook_indices(remainder, codebook):
+        dist = torch.cdist(remainder, codebook)
+        return dist.argmin(dim=-1)
+
+    def forward(self, inputs):
+        latent_vector = self.encoder(inputs['embedding'])
+        item_ids = inputs['item_id']
+
+        latent_restored = 0
+        rqvae_loss = 0
+        clusters = []
+        remainder = latent_vector
+
+        for codebook in self.codebooks:
+            codebook_indices = self.get_codebook_indices(remainder, codebook)
+            clusters.append(codebook_indices)
+
+            quantized = codebook[codebook_indices]
+            # Straight-through estimator: the forward pass uses the quantized vector,
+            # while the gradient flows through the continuous remainder.
+            codebook_vectors = remainder + (quantized - remainder).detach()
+
+            # Commitment term (weighted by beta) plus the codebook update term.
+            rqvae_loss += self.beta * torch.nn.functional.mse_loss(remainder, quantized.detach())
+            rqvae_loss += torch.nn.functional.mse_loss(quantized, remainder.detach())
+
+            latent_restored += codebook_vectors
+            remainder = remainder - codebook_vectors
+
+        embeddings_restored = self.decoder(latent_restored)
+        recon_loss = torch.nn.functional.mse_loss(embeddings_restored, inputs['embedding'])
+
+        loss = (
+            recon_loss
+            + self.quant_loss_weight * rqvae_loss
+        ).mean()
+
+        clusters_counts = []
+        for cluster in clusters:
+            clusters_counts.append(torch.bincount(cluster, 
minlength=self.codebook_size)) + + return loss, { + 'loss': loss.item(), + 'recon_loss': recon_loss.mean().item(), + 'rqvae_loss': rqvae_loss.mean().item(), + + 'clusters_counts': clusters_counts, + 'clusters': torch.stack(clusters).T, + 'embedding_hat': embeddings_restored, + } \ No newline at end of file diff --git a/scripts/rqvae-yambda/train_yambda.py b/scripts/rqvae-yambda/train_yambda.py new file mode 100644 index 0000000..71582ae --- /dev/null +++ b/scripts/rqvae-yambda/train_yambda.py @@ -0,0 +1,151 @@ +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.dataloader import DataLoader +from irec.data.transforms import Collate, ToTorch, ToDevice +from irec.runners import TrainingRunner + +from irec.utils import fix_random_seed + +from callbacks import InitCodebooks, FixDeadCentroids +from data import EmbeddingDatasetParquet, ProcessEmbeddings +from models import RQVAE + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + +NUM_EPOCHS = 100 +BATCH_SIZE = 1024 + +INPUT_DIM = 128 +HIDDEN_DIM = 128 +CODEBOOK_SIZE = 512 +NUM_CODEBOOKS = 3 +BETA = 0.25 +LR = 1e-4 + +EXPERIMENT_NAME = 'rqvae_yambda_hd_128_cz_512' +EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet" +IREC_PATH = '../../' + +print(EXPERIMENT_NAME, EMBEDDINGS_PATH) +def main(): + fix_random_seed(SEED_VALUE) + + dataset = EmbeddingDatasetParquet( + data_path=EMBEDDINGS_PATH + ) + + train_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + num_workers=8, + shuffle=True, + drop_last=True, + persistent_workers=True, + pin_memory=True + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map( + ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']) + ).repeat(NUM_EPOCHS) + + valid_dataloader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, + drop_last=False, + ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])) + + LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS) + + model = RQVAE( + input_dim=INPUT_DIM, + num_codebooks=NUM_CODEBOOKS, + codebook_size=CODEBOOK_SIZE, + embedding_dim=HIDDEN_DIM, + beta=BETA, + quant_loss_weight=1.0 + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True) + + callbacks = [ + InitCodebooks(valid_dataloader), + + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'] + }, name='train'), + + FixDeadCentroids(valid_dataloader), + + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + 'train/recon_loss': cb.MeanAccumulator(), + 'train/rqvae_loss': cb.MeanAccumulator(), + 'num_dead/0': cb.MeanAccumulator(), + 'num_dead/1': cb.MeanAccumulator(), + 'num_dead/2': cb.MeanAccumulator(), + }, + reset_every_num_steps=LOG_EVERY_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloader, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, batch: { + 'loss': model_outputs['loss'], + 'recon_loss': model_outputs['recon_loss'], + 'rqvae_loss': model_outputs['rqvae_loss'], + }, name='valid'), + cb.MetricAccumulator( + 
accumulators={
+                        'valid/loss': cb.MeanAccumulator(),
+                        'valid/recon_loss': cb.MeanAccumulator(),
+                        'valid/rqvae_loss': cb.MeanAccumulator(),
+                    }
+                ),
+            ],
+        ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+        cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+        cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+        cb.Profiler(
+            wait=10,
+            warmup=10,
+            active=10,
+            logdir=os.path.join(IREC_PATH, 'tensorboard_logs')
+        ),
+
+        cb.EarlyStopping(
+            metric='valid/recon_loss',
+            patience=40,
+            minimize=True,
+            model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+        ).every_num_steps(LOG_EVERY_NUM_STEPS),
+    ]
+
+    logger.debug('Everything is ready for training process!')
+
+    runner = TrainingRunner(
+        model=model,
+        optimizer=optimizer,
+        dataset=train_dataloader,
+        callbacks=callbacks,
+    )
+    runner.run()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/tiger-lsvd/data.py b/scripts/tiger-lsvd/data.py
new file mode 100644
index 0000000..26123db
--- /dev/null
+++ b/scripts/tiger-lsvd/data.py
@@ -0,0 +1,428 @@
+from collections import defaultdict
+import json
+from loguru import logger
+import numpy as np
+from pathlib import Path
+
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+import polars as pl
+from irec.data.base import BaseDataset
+
+
+class InteractionsDatasetParquet(BaseDataset):
+    def __init__(self, data_path, max_items=None):
+        self.df = pl.read_parquet(data_path)
+        assert 'uid' in self.df.columns, "Missing 'uid' column"
+        assert 'item_ids' in self.df.columns, "Missing 'item_ids' column"
+        print(f"Dataset loaded: {len(self.df)} users")
+
+        if max_items is not None:
+            self.df = self.df.with_columns(
+                pl.col("item_ids").list.slice(-max_items).alias("item_ids")
+            )
+
+    def __getitem__(self, idx):
+        row = self.df.row(idx, named=True)
+        return {
+            'user_id': row['uid'],
+            'item_ids': np.array(row['item_ids'], dtype=np.uint32),
+        }
+
+    def __len__(self):
+        return len(self.df)
+
+    def __iter__(self):
+        for idx in range(len(self)):
+            yield self[idx]
+
+
+
+class Dataset:
+    def __init__(
+        self,
+        train_sampler,
+        validation_sampler,
+        test_sampler,
+        num_items,
+        max_sequence_length
+    ):
+        self._train_sampler = train_sampler
+        self._validation_sampler = validation_sampler
+        self._test_sampler = test_sampler
+        self._num_items = num_items
+        self._max_sequence_length = max_sequence_length
+
+    @classmethod
+    def create_timestamp_based_parquet(
+        cls,
+        train_parquet_path,
+        validation_parquet_path,
+        test_parquet_path,
+        max_sequence_length,
+        sampler_type,
+        min_sample_len=2,
+        is_extended=False,
+        max_train_events=50
+    ):
+        """
+        Loads data from parquet files with a timestamp-based split.
+
+        Expected parquet structure:
+        - uid: int (user id)
+        - item_ids: list[int] (list of item ids)
+
+        Same as create_timestamp_based, but for the parquet format.
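+
+        Each validation/test item is expanded into its own evaluation sample whose
+        history is everything that precedes it; see the expansion loops below.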
+        """
+        max_item_id = 0
+        train_dataset, validation_dataset, test_dataset = [], [], []
+
+        print(f"started to load datasets from parquet with max train length {max_train_events}")
+
+        # Load the parquet files
+        train_df = pl.read_parquet(train_parquet_path)
+        validation_df = pl.read_parquet(validation_parquet_path)
+        test_df = pl.read_parquet(test_parquet_path)
+
+        # Check that the required columns are present
+        for df, name in [(train_df, "train"), (validation_df, "validation"), (test_df, "test")]:
+            assert 'uid' in df.columns, f"Missing 'uid' column in {name}"
+            assert 'item_ids' in df.columns, f"Missing 'item_ids' column in {name}"
+
+        # Build dicts for fast per-user access
+        train_data = {str(row['uid']): row['item_ids'] for row in train_df.iter_rows(named=True)}
+        validation_data = {str(row['uid']): row['item_ids'] for row in validation_df.iter_rows(named=True)}
+        test_data = {str(row['uid']): row['item_ids'] for row in test_df.iter_rows(named=True)}
+
+        all_users = set(train_data.keys()) | set(validation_data.keys()) | set(test_data.keys())
+        print(f"all users count: {len(all_users)}")
+
+        us_count = 0
+        for user_id_str in all_users:
+            if us_count % 100 == 0:
+                print(f"user id {us_count}/{len(all_users)}: {user_id_str}")
+
+            user_id = int(user_id_str)
+
+            # Get this user's sequence for each split
+            train_items = list(train_data.get(user_id_str, []))
+            validation_items = list(validation_data.get(user_id_str, []))
+            test_items = list(test_data.get(user_id_str, []))
+
+            # Truncate train to the last max_train_events events
+            train_items = train_items[-max_train_events:] if len(train_items) > max_train_events else train_items
+
+            full_sequence = train_items + validation_items + test_items
+            if full_sequence:
+                max_item_id = max(max_item_id, max(full_sequence))
+
+            if us_count % 100 == 0:
+                print(f"full sequence len: {len(full_sequence)}")
+
+            us_count += 1
+            if len(full_sequence) < 4:
+                print(f'Core-4 dataset is used, user {user_id} has only {len(full_sequence)} items')
+                continue
+
+            if is_extended:
+                # sample = [1, 2]
+                # sample = [1, 2, 3]
+                # sample = [1, 2, 3, 4]
+                # sample = [1, 2, 3, 4, 5]
+                # sample = [1, 2, 3, 4, 5, 6]
+                # sample = [1, 2, 3, 4, 5, 6, 7]
+                # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+                for prefix_length in range(min_sample_len, len(train_items) + 1):
+                    train_dataset.append({
+                        'user.ids': [user_id],
+                        'item.ids': train_items[:prefix_length],
+                    })
+            else:
+                # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+                train_dataset.append({
+                    'user.ids': [user_id],
+                    'item.ids': train_items,
+                })
+
+            # validation
+
+            # expand each validation item into its own sample
+            # Example: Train=[1,2], Valid=[3,4]
+            # sample = [1, 2, 3]
+            # sample = [1, 2, 3, 4]
+
+            current_history = train_items.copy()
+            valid_small_history = 0
+            for item in validation_items:
+                # the eval dataset cuts off the target itself later
+                sample_sequence = current_history + [item]
+
+                if len(sample_sequence) >= min_sample_len:
+                    validation_dataset.append({
+                        'user.ids': [user_id],
+                        'item.ids': sample_sequence,
+                    })
+                else:
+                    valid_small_history += 1
+                current_history.append(item)
+
+            # expand each test item into its own sample
+            # Example: Train=[1,2], Valid=[3,4], Test=[5, 6]
+            # sample = [1, 2, 3, 4, 5]
+            # sample = [1, 2, 3, 4, 5, 6]
+            current_history = train_items + validation_items
+            test_small_history = 0
+            for item in test_items:
+                sample_sequence = current_history + [item]
+                if len(sample_sequence) >= min_sample_len:
+                    test_dataset.append({
+                        'user.ids': [user_id],
+                        'item.ids': sample_sequence,
+                    })
+                else:
+                    
test_small_history += 1 + current_history.append(item) + + print(f"Train dataset size: {len(train_dataset)}") + print(f"Validation dataset size: {len(validation_dataset)} with skipped {valid_small_history}") + print(f"Test dataset size: {len(test_dataset)} with skipped {test_small_history}") + + logger.debug(f'Train dataset size: {len(train_dataset)}') + logger.debug(f'Validation dataset size: {len(validation_dataset)}') + logger.debug(f'Test dataset size: {len(test_dataset)}') + + train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length) + validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length) + test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length) + + return cls( + train_sampler=train_sampler, + validation_sampler=validation_sampler, + test_sampler=test_sampler, + num_items=max_item_id + 1, # +1 added because our ids are 0-indexed + max_sequence_length=max_sequence_length + ) + + @classmethod + def create(cls, inter_json_path, max_sequence_length, sampler_type, is_extended=False): + max_item_id = 0 + train_dataset, validation_dataset, test_dataset = [], [], [] + + with open(inter_json_path, 'r') as f: + user_interactions = json.load(f) + + for user_id_str, item_ids in user_interactions.items(): + user_id = int(user_id_str) + + if item_ids: + max_item_id = max(max_item_id, max(item_ids)) + + assert len(item_ids) >= 5, f'Core-5 dataset is used, user {user_id} has only {len(item_ids)} items' + + # sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (leave one out scheme, 8 - train, 9 - valid, 10 - test) + if is_extended: + # sample = [1, 2] + # sample = [1, 2, 3] + # sample = [1, 2, 3, 4] + # sample = [1, 2, 3, 4, 5] + # sample = [1, 2, 3, 4, 5, 6] + # sample = [1, 2, 3, 4, 5, 6, 7] + # sample = [1, 2, 3, 4, 5, 6, 7, 8] + for prefix_length in range(2, len(item_ids) - 2 + 1): + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids[:prefix_length], + }) + else: + # sample = [1, 2, 3, 4, 5, 6, 7, 8] + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids[:-2], + }) + + # sample = [1, 2, 3, 4, 5, 6, 7, 8, 9] + validation_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids[:-1], + }) + + # sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + test_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids, + }) + + logger.debug(f'Train dataset size: {len(train_dataset)}') + logger.debug(f'Validation dataset size: {len(validation_dataset)}') + logger.debug(f'Test dataset size: {len(test_dataset)}') + logger.debug(f'Max item id: {max_item_id}') + + train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length) + validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length) + test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length) + + return cls( + train_sampler=train_sampler, + validation_sampler=validation_sampler, + test_sampler=test_sampler, + num_items=max_item_id + 1, # +1 added because our ids are 0-indexed + max_sequence_length=max_sequence_length + ) + + def get_datasets(self): + return self._train_sampler, self._validation_sampler, self._test_sampler + + @property + def num_items(self): + return self._num_items + + @property + def max_sequence_length(self): + return self._max_sequence_length + + +class TrainDataset(BaseDataset): + def __init__(self, dataset, prediction_type, max_sequence_length): + self._dataset = dataset + self._prediction_type = 
prediction_type + self._max_sequence_length = max_sequence_length + + self._transforms = { + 'sasrec': self._all_items_transform, + 'tiger': self._last_item_transform + } + + def _all_items_transform(self, sample): + item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1] + next_item_sequence = sample['item.ids'][-self._max_sequence_length:][1:] + return { + 'user.ids': np.array(sample['user.ids'], dtype=np.int64), + 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64), + 'item.ids': np.array(item_sequence, dtype=np.int64), + 'item.length': np.array([len(item_sequence)], dtype=np.int64), + 'labels.ids': np.array(next_item_sequence, dtype=np.int64), + 'labels.length': np.array([len(next_item_sequence)], dtype=np.int64) + } + + def _last_item_transform(self, sample): + item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1] + last_item = sample['item.ids'][-self._max_sequence_length:][-1] + return { + 'user.ids': np.array(sample['user.ids'], dtype=np.int64), + 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64), + 'item.ids': np.array(item_sequence, dtype=np.int64), + 'item.length': np.array([len(item_sequence)], dtype=np.int64), + 'labels.ids': np.array([last_item], dtype=np.int64), + 'labels.length': np.array([1], dtype=np.int64), + } + + def __getitem__(self, index): + return self._transforms[self._prediction_type](self._dataset[index]) + + def __len__(self): + return len(self._dataset) + + +class EvalDataset(BaseDataset): + def __init__(self, dataset, max_sequence_length): + self._dataset = dataset + self._max_sequence_length = max_sequence_length + + @property + def dataset(self): + return self._dataset + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, index): + sample = self._dataset[index] + + item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1] + next_item = sample['item.ids'][-self._max_sequence_length:][-1] + + return { + 'user.ids': np.array(sample['user.ids'], dtype=np.int64), + 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64), + 'item.ids': np.array(item_sequence, dtype=np.int64), + 'item.length': np.array([len(item_sequence)], dtype=np.int64), + 'labels.ids': np.array([next_item], dtype=np.int64), + 'labels.length': np.array([1], dtype=np.int64), + 'visited.ids': np.array(sample['item.ids'][:-1], dtype=np.int64), + 'visited.length': np.array([len(sample['item.ids'][:-1])], dtype=np.int64), + } + + +class ArrowBatchDataset(BaseDataset): + def __init__(self, batch_dir, device='cuda', preload=False): + self.batch_dir = Path(batch_dir) + self.device = device + + all_files = list(self.batch_dir.glob('batch_*_len_*.arrow')) + + batch_files_map = defaultdict(list) + for f in all_files: + batch_id = int(f.stem.split('_')[1]) + batch_files_map[batch_id].append(f) + + for batch_id in batch_files_map: + batch_files_map[batch_id].sort() + + self.batch_indices = sorted(batch_files_map.keys()) + + if preload: + print(f"Preloading {len(self.batch_indices)} batches...") + self.cached_batches = [] + + for idx in range(len(self.batch_indices)): + batch = self._load_batch(batch_files_map[self.batch_indices[idx]]) + self.cached_batches.append(batch) + else: + self.cached_batches = None + self.batch_files_map = batch_files_map + + def _load_batch(self, arrow_files): + batch = {} + + for arrow_file in arrow_files: + table = feather.read_table(arrow_file) + metadata = table.schema.metadata or {} + + for col_name in table.column_names: + col = table.column(col_name) + + shape_key = 
f'{col_name}_shape'
+                dtype_key = f'{col_name}_dtype'
+
+                if shape_key.encode() in metadata:
+                    # shape/dtype were stored as stringified metadata next to each
+                    # column; eval() restores the original shape tuple.
+                    shape = eval(metadata[shape_key.encode()].decode())
+                    dtype = np.dtype(metadata[dtype_key.encode()].decode())
+
+                    # Check the column type
+                    if pa.types.is_list(col.type) or pa.types.is_large_list(col.type):
+                        arr = np.array(col.to_pylist(), dtype=dtype)
+                    else:
+                        arr = col.to_numpy().reshape(shape).astype(dtype)
+                else:
+                    if pa.types.is_list(col.type) or pa.types.is_large_list(col.type):
+                        arr = np.array(col.to_pylist())
+                    else:
+                        arr = col.to_numpy()
+
+                batch[col_name] = torch.from_numpy(arr.copy()).to(self.device)
+
+        return batch
+
+    def __len__(self):
+        return len(self.batch_indices)
+
+    def __getitem__(self, idx):
+        if self.cached_batches is not None:
+            return self.cached_batches[idx]
+        else:
+            batch_id = self.batch_indices[idx]
+            arrow_files = self.batch_files_map[batch_id]
+            return self._load_batch(arrow_files)
diff --git a/scripts/tiger-lsvd/lsvd_train_4.1_plum.py b/scripts/tiger-lsvd/lsvd_train_4.1_plum.py
new file mode 100644
index 0000000..96a08d7
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_train_4.1_plum.py
@@ -0,0 +1,230 @@
+from functools import partial
+import json
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.transforms import Collate, ToDevice
+from irec.data.dataloader import DataLoader
+from irec.runners import TrainingRunner
+from irec.utils import fix_random_seed
+
+from data import ArrowBatchDataset
+from models import TigerModel, CorrectItemsLogitsProcessor
+
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/4-1_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/eval_batches/')
+
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'tiger_4-1_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35'
+
+# OTHER SETTINGS
+SEED_VALUE = 42
+DEVICE = 'cuda'
+
+NUM_EPOCHS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+EMBEDDING_DIM = 128
+CODEBOOK_SIZE = 512
+NUM_POSITIONS = 80
+NUM_USER_HASH = 2000
+NUM_HEADS = 6
+NUM_LAYERS = 4
+FEEDFORWARD_DIM = 1024
+KV_DIM = 64
+DROPOUT = 0.2
+NUM_BEAMS = 30
+TOP_K = 20
+NUM_CODEBOOKS = 4
+LR = 0.0001
+
+USE_MICROBATCHING = True
+MICROBATCH_SIZE = 256
+
+torch.set_float32_matmul_precision('high')
+torch._dynamo.config.capture_scalar_outputs = True
+
+import torch._inductor.config as config
+config.triton.cudagraph_skip_dynamic_graphs = True
+
+
+def main():
+    fix_random_seed(SEED_VALUE)
+
+    with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+        mappings = json.load(f)
+
+    train_dataloader = DataLoader(
+        ArrowBatchDataset(
+            TRAIN_BATCHES_DIR,
+            device='cpu',
+            preload=True
+        ),
+        batch_size=1,
+        shuffle=True,
+        num_workers=0,
+        pin_memory=True,
+        collate_fn=Collate()
+    ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
+
+    valid_dataloder = ArrowBatchDataset(
+        VALID_BATCHES_DIR,
+        device=DEVICE,
+        preload=True
+    )
+
+    eval_dataloder = ArrowBatchDataset(
+        EVAL_BATCHES_DIR,
+        device=DEVICE,
+        preload=True
+    )
+
+    model = TigerModel(
+        embedding_dim=EMBEDDING_DIM,
+        codebook_size=CODEBOOK_SIZE,
+        sem_id_len=NUM_CODEBOOKS,
+        
user_ids_count=NUM_USER_HASH, + num_positions=NUM_POSITIONS, + num_heads=NUM_HEADS, + num_encoder_layers=NUM_LAYERS, + num_decoder_layers=NUM_LAYERS, + dim_feedforward=FEEDFORWARD_DIM, + num_beams=NUM_BEAMS, + num_return_sequences=TOP_K, + activation='relu', + d_kv=KV_DIM, + dropout=DROPOUT, + layer_norm_eps=1e-6, + initializer_range=0.02, + logits_processor=partial( + CorrectItemsLogitsProcessor, + NUM_CODEBOOKS, + CODEBOOK_SIZE, + mappings, + NUM_BEAMS + ), + use_microbatching=USE_MICROBATCHING, + microbatch_size=MICROBATCH_SIZE + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.AdamW( + model.parameters(), + lr=LR, + ) + + EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS) + + callbacks = [ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + }, name='train'), + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + }, + reset_every_num_steps=EPOCH_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _:{ + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='validation'), + cb.MetricAccumulator( + accumulators={ + 'validation/loss': cb.MeanAccumulator(), + 'validation/recall@5': cb.MeanAccumulator(), + 'validation/recall@10': cb.MeanAccumulator(), + 'validation/recall@20': cb.MeanAccumulator(), + 'validation/ndcg@5': cb.MeanAccumulator(), + 'validation/ndcg@10': cb.MeanAccumulator(), + 'validation/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Validation( + dataset=eval_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='eval'), + cb.MetricAccumulator( + accumulators={ + 'eval/loss': cb.MeanAccumulator(), + 'eval/recall@5': cb.MeanAccumulator(), + 'eval/recall@10': cb.MeanAccumulator(), + 'eval/recall@20': cb.MeanAccumulator(), + 'eval/ndcg@5': cb.MeanAccumulator(), + 'eval/ndcg@10': cb.MeanAccumulator(), + 'eval/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS * 4), + + cb.Logger().every_num_steps(EPOCH_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR), + + cb.EarlyStopping( + metric='validation/ndcg@20', + patience=40 * 4, + minimize=False, + model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME) + ).every_num_steps(EPOCH_NUM_STEPS) + + # cb.Profiler( + # wait=10, + # warmup=10, + # active=10, + # logdir=TENSORBOARD_LOGDIR + # ), + # cb.StopAfterNumSteps(40) + + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + 
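+        # For reference (comment added for clarity): train_dataloader was built
+        # with .repeat(NUM_EPOCHS), so the runner consumes one long stream of
+        # batches; the EPOCH_NUM_STEPS-based callbacks above are what emulate
+        # epoch boundaries.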
dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger-lsvd/lsvd_train_4.2_plum.py b/scripts/tiger-lsvd/lsvd_train_4.2_plum.py new file mode 100644 index 0000000..991f662 --- /dev/null +++ b/scripts/tiger-lsvd/lsvd_train_4.2_plum.py @@ -0,0 +1,230 @@ +from functools import partial +import json +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.transforms import Collate, ToDevice +from irec.data.dataloader import DataLoader +from irec.runners import TrainingRunner +from irec.utils import fix_random_seed + +from data import ArrowBatchDataset +from models import TigerModel, CorrectItemsLogitsProcessor + + +# ПУТИ +IREC_PATH = '../../' +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/4-2_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/eval_batches/') + + +TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs') +CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints') + +EXPERIMENT_NAME = 'tiger_4-2_vk_lsvd_ods_base_cb_512_ws_2_k_2000_8w_e_35' + +# ОСТАЛЬНОЕ +SEED_VALUE = 42 +DEVICE = 'cuda' + +NUM_EPOCHS = 300 +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +EMBEDDING_DIM = 128 +CODEBOOK_SIZE = 512 +NUM_POSITIONS = 80 +NUM_USER_HASH = 2000 +NUM_HEADS = 6 +NUM_LAYERS = 4 +FEEDFORWARD_DIM = 1024 +KV_DIM = 64 +DROPOUT = 0.2 +NUM_BEAMS = 30 +TOP_K = 20 +NUM_CODEBOOKS = 4 +LR = 0.0001 + +USE_MICROBATCHING = True +MICROBATCH_SIZE = 256 + +torch.set_float32_matmul_precision('high') +torch._dynamo.config.capture_scalar_outputs = True + +import torch._inductor.config as config +config.triton.cudagraph_skip_dynamic_graphs = True + + +def main(): + fix_random_seed(SEED_VALUE) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataloader = DataLoader( + ArrowBatchDataset( + TRAIN_BATCHES_DIR, + device='cpu', + preload=True + ), + batch_size=1, + shuffle=True, + num_workers=0, + pin_memory=True, + collate_fn=Collate() + ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS) + + valid_dataloder = ArrowBatchDataset( + VALID_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + eval_dataloder = ArrowBatchDataset( + EVAL_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + model = TigerModel( + embedding_dim=EMBEDDING_DIM, + codebook_size=CODEBOOK_SIZE, + sem_id_len=NUM_CODEBOOKS, + user_ids_count=NUM_USER_HASH, + num_positions=NUM_POSITIONS, + num_heads=NUM_HEADS, + num_encoder_layers=NUM_LAYERS, + num_decoder_layers=NUM_LAYERS, + dim_feedforward=FEEDFORWARD_DIM, + num_beams=NUM_BEAMS, + num_return_sequences=TOP_K, + activation='relu', + d_kv=KV_DIM, + dropout=DROPOUT, + layer_norm_eps=1e-6, + initializer_range=0.02, + logits_processor=partial( + CorrectItemsLogitsProcessor, + NUM_CODEBOOKS, + CODEBOOK_SIZE, + mappings, + NUM_BEAMS + ), + use_microbatching=USE_MICROBATCHING, + microbatch_size=MICROBATCH_SIZE + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = 
torch.optim.AdamW( + model.parameters(), + lr=LR, + ) + + EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS) + + callbacks = [ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + }, name='train'), + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + }, + reset_every_num_steps=EPOCH_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _:{ + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='validation'), + cb.MetricAccumulator( + accumulators={ + 'validation/loss': cb.MeanAccumulator(), + 'validation/recall@5': cb.MeanAccumulator(), + 'validation/recall@10': cb.MeanAccumulator(), + 'validation/recall@20': cb.MeanAccumulator(), + 'validation/ndcg@5': cb.MeanAccumulator(), + 'validation/ndcg@10': cb.MeanAccumulator(), + 'validation/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Validation( + dataset=eval_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='eval'), + cb.MetricAccumulator( + accumulators={ + 'eval/loss': cb.MeanAccumulator(), + 'eval/recall@5': cb.MeanAccumulator(), + 'eval/recall@10': cb.MeanAccumulator(), + 'eval/recall@20': cb.MeanAccumulator(), + 'eval/ndcg@5': cb.MeanAccumulator(), + 'eval/ndcg@10': cb.MeanAccumulator(), + 'eval/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS * 4), + + cb.Logger().every_num_steps(EPOCH_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR), + + cb.EarlyStopping( + metric='validation/ndcg@20', + patience=40 * 4, + minimize=False, + model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME) + ).every_num_steps(EPOCH_NUM_STEPS) + + # cb.Profiler( + # wait=10, + # warmup=10, + # active=10, + # logdir=TENSORBOARD_LOGDIR + # ), + # cb.StopAfterNumSteps(40) + + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger-lsvd/lsvd_train_rqvae.py b/scripts/tiger-lsvd/lsvd_train_rqvae.py new file mode 100644 index 0000000..aadf225 --- /dev/null +++ b/scripts/tiger-lsvd/lsvd_train_rqvae.py @@ -0,0 +1,230 @@ +from functools import partial +import json +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.transforms import Collate, ToDevice +from irec.data.dataloader import DataLoader +from irec.runners import TrainingRunner +from irec.utils import fix_random_seed + +from data import ArrowBatchDataset +from models import TigerModel, CorrectItemsLogitsProcessor + + +# ПУТИ +IREC_PATH = '../../' +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 
'results/rqvae_vk_lsvd_cz_512_8-weeks_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/eval_batches/') + + +TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs') +CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints') + +EXPERIMENT_NAME = 'tiger_rqvae_vk_lsvd_cz_512_8-weeks' + +# ОСТАЛЬНОЕ +SEED_VALUE = 42 +DEVICE = 'cuda' + +NUM_EPOCHS = 300 +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +EMBEDDING_DIM = 128 +CODEBOOK_SIZE = 512 +NUM_POSITIONS = 80 +NUM_USER_HASH = 2000 +NUM_HEADS = 6 +NUM_LAYERS = 4 +FEEDFORWARD_DIM = 1024 +KV_DIM = 64 +DROPOUT = 0.2 +NUM_BEAMS = 30 +TOP_K = 20 +NUM_CODEBOOKS = 4 +LR = 0.0001 + +USE_MICROBATCHING = True +MICROBATCH_SIZE = 256 + +torch.set_float32_matmul_precision('high') +torch._dynamo.config.capture_scalar_outputs = True + +import torch._inductor.config as config +config.triton.cudagraph_skip_dynamic_graphs = True + + +def main(): + fix_random_seed(SEED_VALUE) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataloader = DataLoader( + ArrowBatchDataset( + TRAIN_BATCHES_DIR, + device='cpu', + preload=True + ), + batch_size=1, + shuffle=True, + num_workers=0, + pin_memory=True, + collate_fn=Collate() + ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS) + + valid_dataloder = ArrowBatchDataset( + VALID_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + eval_dataloder = ArrowBatchDataset( + EVAL_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + model = TigerModel( + embedding_dim=EMBEDDING_DIM, + codebook_size=CODEBOOK_SIZE, + sem_id_len=NUM_CODEBOOKS, + user_ids_count=NUM_USER_HASH, + num_positions=NUM_POSITIONS, + num_heads=NUM_HEADS, + num_encoder_layers=NUM_LAYERS, + num_decoder_layers=NUM_LAYERS, + dim_feedforward=FEEDFORWARD_DIM, + num_beams=NUM_BEAMS, + num_return_sequences=TOP_K, + activation='relu', + d_kv=KV_DIM, + dropout=DROPOUT, + layer_norm_eps=1e-6, + initializer_range=0.02, + logits_processor=partial( + CorrectItemsLogitsProcessor, + NUM_CODEBOOKS, + CODEBOOK_SIZE, + mappings, + NUM_BEAMS + ), + use_microbatching=USE_MICROBATCHING, + microbatch_size=MICROBATCH_SIZE + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.AdamW( + model.parameters(), + lr=LR, + ) + + EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS) + + callbacks = [ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + }, name='train'), + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + }, + reset_every_num_steps=EPOCH_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _:{ + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='validation'), + 
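+                # For reference (comment added for clarity): BatchMetrics above
+                # emits per-batch values; the MeanAccumulator entries below
+                # average them over the whole validation pass before logging.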
                cb.MetricAccumulator(
                    accumulators={
                        'validation/loss': cb.MeanAccumulator(),
                        'validation/recall@5': cb.MeanAccumulator(),
                        'validation/recall@10': cb.MeanAccumulator(),
                        'validation/recall@20': cb.MeanAccumulator(),
                        'validation/ndcg@5': cb.MeanAccumulator(),
                        'validation/ndcg@10': cb.MeanAccumulator(),
                        'validation/ndcg@20': cb.MeanAccumulator(),
                    },
                ),
            ],
        ).every_num_steps(EPOCH_NUM_STEPS),

        cb.Validation(
            dataset=eval_dataloder,
            callbacks=[
                cb.BatchMetrics(metrics=lambda model_outputs, _: {
                    'loss': model_outputs['loss'].item(),
                    'recall@5': model_outputs['recall@5'].tolist(),
                    'recall@10': model_outputs['recall@10'].tolist(),
                    'recall@20': model_outputs['recall@20'].tolist(),
                    'ndcg@5': model_outputs['ndcg@5'].tolist(),
                    'ndcg@10': model_outputs['ndcg@10'].tolist(),
                    'ndcg@20': model_outputs['ndcg@20'].tolist(),
                }, name='eval'),
                cb.MetricAccumulator(
                    accumulators={
                        'eval/loss': cb.MeanAccumulator(),
                        'eval/recall@5': cb.MeanAccumulator(),
                        'eval/recall@10': cb.MeanAccumulator(),
                        'eval/recall@20': cb.MeanAccumulator(),
                        'eval/ndcg@5': cb.MeanAccumulator(),
                        'eval/ndcg@10': cb.MeanAccumulator(),
                        'eval/ndcg@20': cb.MeanAccumulator(),
                    },
                ),
            ],
        ).every_num_steps(EPOCH_NUM_STEPS * 4),

        cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
        cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),

        cb.EarlyStopping(
            metric='validation/ndcg@20',
            patience=40 * 4,
            minimize=False,
            model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
        ).every_num_steps(EPOCH_NUM_STEPS)

        # cb.Profiler(
        #     wait=10,
        #     warmup=10,
        #     active=10,
        #     logdir=TENSORBOARD_LOGDIR
        # ),
        # cb.StopAfterNumSteps(40)

    ]

    logger.debug('Everything is ready for training process!')

    runner = TrainingRunner(
        model=model,
        optimizer=optimizer,
        dataset=train_dataloader,
        callbacks=callbacks,
    )
    runner.run()


if __name__ == '__main__':
    main()
diff --git a/scripts/tiger-lsvd/lsvd_varka_4.1_plum.py b/scripts/tiger-lsvd/lsvd_varka_4.1_plum.py
new file mode 100644
index 0000000..cc35507
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_varka_4.1_plum.py
@@ -0,0 +1,304 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+print("tiger no arrow varka 4.1")
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/base_with_gap_interactions_grouped.parquet"
+INTERACTIONS_VALID_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/val_interactions_grouped.parquet"
+INTERACTIONS_TEST_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/test_interactions_grouped.parquet"
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/4-1_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/eval_batches/')
+
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_TRAIN_EVENTS = 500
+MAX_SEQ_LEN = 20
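+# Worked example (comment added for clarity; derived from the constants below):
+# with CODEBOOK_SIZE = 512, NUM_CODEBOOKS = 4 and NUM_USER_HASH = 2000 the
+# unified token space has 512 * 4 + 2000 + 10 = 4058 ids:
+#   [0, 2048)    semantic-id tokens; codebook c owns [c * 512, (c + 1) * 512)
+#   [2048, 4048) hashed user tokens
+#   [4048, 4058) utility tokens: decoder-start = 4055, EOS = 4056, PAD = 4057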
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10  # 10 for utilities
+# utility token ids at the top of the unified vocabulary (plain ints)
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+class TigerProcessing(Transform):
+    def __call__(self, batch):
+        input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+        batch_size = attention_mask.shape[0]
+
+        # padded positions become PAD; the attention mask keeps them inert
+        input_semantic_ids[~attention_mask] = PAD_TOKEN_ID
+
+        input_semantic_ids = np.concatenate([
+            input_semantic_ids,
+            NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+        ], axis=-1)
+
+        attention_mask = np.concatenate([
+            attention_mask,
+            np.ones((batch_size, 1), dtype=attention_mask.dtype)
+        ], axis=-1)
+
+        batch['input.data'] = input_semantic_ids
+        batch['input.mask'] = attention_mask
+
+        target_semantic_ids = batch['labels.semantic.padded']
+        target_semantic_ids = np.concatenate([
+            np.ones(
+                (batch_size, 1),
+                dtype=np.int64,
+            ) * DECODER_START_TOKEN_ID,
+            target_semantic_ids
+        ], axis=-1)
+
+        batch['output.data'] = target_semantic_ids
+
+        return batch
+
+
+class ToMasked(Transform):
+    def __init__(self, prefix, is_right_aligned=False):
+        self._prefix = prefix
+        self._is_right_aligned = is_right_aligned
+
+    def __call__(self, batch):
+        data = batch[f'{self._prefix}.ids']
+        lengths = batch[f'{self._prefix}.length']
+
+        batch_size = lengths.shape[0]
+        max_sequence_length = int(lengths.max())
+
+        if len(data.shape) == 1:  # only indices
+            padded_tensor = np.zeros(
+                (batch_size, max_sequence_length),
+                dtype=data.dtype
+            )  # (batch_size, max_seq_len)
+        else:
+            assert len(data.shape) == 2  # embeddings
+            padded_tensor = np.zeros(
+                (batch_size, max_sequence_length, data.shape[-1]),
+                dtype=data.dtype
+            )  # (batch_size, max_seq_len, emb_dim)
+
+        mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+        if self._is_right_aligned:
+            mask = np.flip(mask, axis=-1)
+
+        padded_tensor[mask] = data
+
+        batch[f'{self._prefix}.padded'] = padded_tensor
+        batch[f'{self._prefix}.mask'] = mask
+
+        return batch
+
+
+class SemanticIdsMapper(Transform):
+    def __init__(self, mapping, names=()):
+        super().__init__()
+        self._mapping = mapping
+        self._names = names
+
+        max_item_id = max(int(k) for k in mapping.keys())
+        min_item_id = min(int(k) for k in mapping.keys())
+        print(f"Mapping keys: {len(mapping)}, item id range: [{min_item_id}, {max_item_id}]")
+        data = []
+        for i in range(max_item_id + 1):
+            if str(i) in mapping:
+                data.append(mapping[str(i)])
+            else:
+                data.append([-1] * NUM_CODEBOOKS)
+
+        self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+        self._semantic_length = self._mapping_tensor.shape[-1]
+
+        missing_count = (max_item_id + 1) - len(mapping)
+        print(f"Mapping: {len(mapping)} items, {missing_count} missing (-1 filled)")
+
+    def __call__(self, batch):
+        for name in self._names:
+            if f'{name}.ids' in batch:
+                ids = batch[f'{name}.ids']
+                lengths = batch[f'{name}.length']
+                assert ids.min() >= 0
+                assert ids.max() < self._mapping_tensor.shape[0]
+                semantic_ids = self._mapping_tensor[ids].flatten()
+
+                assert (semantic_ids != -1).all(), \
+                    f"Missing mappings detected in {name}! Invalid positions: {(semantic_ids == -1).sum()} out of {len(semantic_ids)}"
+
+                batch[f'{name}.semantic.ids'] = semantic_ids.numpy()
+                batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+        return batch
+
+
+class UserHashing(Transform):
+    def __init__(self, hash_size):
+        super().__init__()
+        self._hash_size = hash_size
+
+    def __call__(self, batch):
+        batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+        return batch
+
+
+def save_batches_to_arrow(batches, output_dir):
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=False)
+
+    for batch_idx, batch in enumerate(batches):
+        length_groups = defaultdict(dict)
+        metadata_groups = defaultdict(dict)
+
+        for key, value in batch.items():
+            length = len(value)
+
+            metadata_groups[length][f'{key}_shape'] = str(value.shape)
+            metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+            if value.ndim == 1:
+                # 1D array - store as-is
+                length_groups[length][key] = value
+            elif value.ndim == 2:
+                # 2D array - store as a list of lists
+                length_groups[length][key] = value.tolist()
+            else:
+                # >2D array - flatten; the shape survives in the metadata
+                length_groups[length][key] = value.flatten()
+
+        for length, fields in length_groups.items():
+            # pa.array handles both flat numpy arrays and lists of lists
+            arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+            table = pa.table(arrow_dict)
+            if length in metadata_groups:
+                table = table.replace_schema_metadata(metadata_groups[length])
+
+            feather.write_feather(
+                table,
+                output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+                compression='lz4'
+            )
+
+
+def main():
+    with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+        mappings = json.load(f)
+    print("varka may start dying here")
+    data = Dataset.create_timestamp_based_parquet(
+        train_parquet_path=INTERACTIONS_TRAIN_PATH,
+        validation_parquet_path=INTERACTIONS_VALID_PATH,
+        test_parquet_path=INTERACTIONS_TEST_PATH,
+        max_sequence_length=MAX_SEQ_LEN,
+        sampler_type='tiger',
+        min_sample_len=2,
+        is_extended=True,
+        max_train_events=MAX_TRAIN_EVENTS
+    )
+
+    train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+    print("varka did not die")
+    train_dataloader = DataLoader(
+        dataset=train_dataset,
+        batch_size=TRAIN_BATCH_SIZE,
+        shuffle=True,
+        drop_last=True
+    ) \
+        .map(Collate()) \
+        .map(UserHashing(NUM_USER_HASH)) \
+        .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+        .map(ToMasked('item.semantic', is_right_aligned=True)) \
+        .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+        .map(TigerProcessing())
+
+    valid_dataloader = DataLoader(
+        dataset=valid_dataset,
+        batch_size=VALID_BATCH_SIZE,
+        shuffle=False,
+        drop_last=False
+    ) \
+        .map(Collate()) \
+        .map(UserHashing(NUM_USER_HASH)) \
+        .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+        .map(ToMasked('item.semantic', is_right_aligned=True)) \
+        .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+        .map(ToMasked('visited', is_right_aligned=True)) \
+        .map(TigerProcessing())
+
+    eval_dataloader = DataLoader(
+        dataset=eval_dataset,
+        batch_size=VALID_BATCH_SIZE,
+        shuffle=False,
+        drop_last=False
+    ) \
+        .map(Collate()) \
        .map(UserHashing(NUM_USER_HASH)) \
        .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
        .map(ToMasked('item.semantic', is_right_aligned=True)) \
        .map(ToMasked('labels.semantic', is_right_aligned=True)) \
        .map(ToMasked('visited', is_right_aligned=True)) \
        .map(TigerProcessing())

    train_batches = []
    for train_batch in train_dataloader:
        train_batches.append(train_batch)
    save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)

    valid_batches = []
    for valid_batch in valid_dataloader:
        valid_batches.append(valid_batch)
    save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)

    eval_batches = []
    for eval_batch in eval_dataloader:
        eval_batches.append(eval_batch)
    save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)



if __name__ == '__main__':
    main()
diff --git a/scripts/tiger-lsvd/lsvd_varka_4.2_plum.py b/scripts/tiger-lsvd/lsvd_varka_4.2_plum.py
new file mode 100644
index 0000000..7de54e4
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_varka_4.2_plum.py
@@ -0,0 +1,304 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+print("tiger no arrow varka 4.2")
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/base_with_gap_interactions_grouped.parquet"
+INTERACTIONS_VALID_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/val_interactions_grouped.parquet"
+INTERACTIONS_TEST_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/test_interactions_grouped.parquet"
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/4-2_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/eval_batches/')
+
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_TRAIN_EVENTS = 500
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10  # 10 for utilities
+# utility token ids at the top of the unified vocabulary (plain ints)
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+class TigerProcessing(Transform):
+    def __call__(self, batch):
+        input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+        batch_size = attention_mask.shape[0]
+
+        # padded positions become PAD; the attention mask keeps them inert
+        input_semantic_ids[~attention_mask] = PAD_TOKEN_ID
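+        # Sketch of the encoder input assembled below (comment added for
+        # clarity): for a user with items [i1, i2] and NUM_CODEBOOKS = 4 the
+        # row becomes
+        #   [sid(i1, 0..3), sid(i2, 0..3), 2048 + murmurhash(user) % 2000]
+        # i.e. the flattened semantic ids followed by one hashed-user token.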
+ + input_semantic_ids = np.concatenate([ + input_semantic_ids, + NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None] + ], axis=-1) + + attention_mask = np.concatenate([ + attention_mask, + np.ones((batch_size, 1), dtype=attention_mask.dtype) + ], axis=-1) + + batch['input.data'] = input_semantic_ids + batch['input.mask'] = attention_mask + + target_semantic_ids = batch['labels.semantic.padded'] + target_semantic_ids = np.concatenate([ + np.ones( + (batch_size, 1), + dtype=np.int64, + ) * DECODER_START_TOKEN_ID, + target_semantic_ids + ], axis=-1) + + batch['output.data'] = target_semantic_ids + + return batch + + +class ToMasked(Transform): + def __init__(self, prefix, is_right_aligned=False): + self._prefix = prefix + self._is_right_aligned = is_right_aligned + + def __call__(self, batch): + data = batch[f'{self._prefix}.ids'] + lengths = batch[f'{self._prefix}.length'] + + batch_size = lengths.shape[0] + max_sequence_length = int(lengths.max()) + + if len(data.shape) == 1: # only indices + padded_tensor = np.zeros( + (batch_size, max_sequence_length), + dtype=data.dtype + ) # (batch_size, max_seq_len) + else: + assert len(data.shape) == 2 # embeddings + padded_tensor = np.zeros( + (batch_size, max_sequence_length, data.shape[-1]), + dtype=data.dtype + ) # (batch_size, max_seq_len, emb_dim) + + mask = np.arange(max_sequence_length)[None] < lengths[:, None] + + if self._is_right_aligned: + mask = np.flip(mask, axis=-1) + + padded_tensor[mask] = data + + batch[f'{self._prefix}.padded'] = padded_tensor + batch[f'{self._prefix}.mask'] = mask + + return batch + + +class SemanticIdsMapper(Transform): + def __init__(self, mapping, names=[]): + super().__init__() + self._mapping = mapping + self._names = names + + max_item_id = max(int(k) for k in mapping.keys()) + print(len(list(mapping.keys())), min(int(k) for k in mapping.keys()) , max(int(k) for k in mapping.keys())) + # print(mapping["280052"]) #304781 + # assert False + data = [] + for i in range(max_item_id + 1): + if str(i) in mapping: + data.append(mapping[str(i)]) + else: + data.append([-1] * NUM_CODEBOOKS) + + self._mapping_tensor = torch.tensor(data, dtype=torch.long) + self._semantic_length = self._mapping_tensor.shape[-1] + + missing_count = (max_item_id + 1) - len(mapping) + print(f"Mapping: {len(mapping)} items, {missing_count} missing (-1 filled)") + + def __call__(self, batch): + for name in self._names: + if f'{name}.ids' in batch: + ids = batch[f'{name}.ids'] + lengths = batch[f'{name}.length'] + assert ids.min() >= 0 + assert ids.max() < self._mapping_tensor.shape[0] + semantic_ids = self._mapping_tensor[ids].flatten() + + assert (semantic_ids != -1).all(), \ + f"Missing mappings detected in {name}! 
Invalid positions: {(semantic_ids == -1).sum()} out of {len(semantic_ids)}" + + batch[f'{name}.semantic.ids'] = semantic_ids.numpy() + batch[f'{name}.semantic.length'] = lengths * self._semantic_length + + return batch + + +class UserHashing(Transform): + def __init__(self, hash_size): + super().__init__() + self._hash_size = hash_size + + def __call__(self, batch): + batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64) + return batch + + +def save_batches_to_arrow(batches, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + for batch_idx, batch in enumerate(batches): + length_groups = defaultdict(dict) + metadata_groups = defaultdict(dict) + + for key, value in batch.items(): + length = len(value) + + metadata_groups[length][f'{key}_shape'] = str(value.shape) + metadata_groups[length][f'{key}_dtype'] = str(value.dtype) + + if value.ndim == 1: + # 1D массив - сохраняем как есть + length_groups[length][key] = value + elif value.ndim == 2: + # 2D массив - используем list of lists + length_groups[length][key] = value.tolist() + else: + # >2D массив - flatten и сохраняем shape + length_groups[length][key] = value.flatten() + + for length, fields in length_groups.items(): + arrow_dict = {} + for k, v in fields.items(): + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], list): + # List of lists (2D) + arrow_dict[k] = pa.array(v) + else: + arrow_dict[k] = pa.array(v) + + table = pa.table(arrow_dict) + if length in metadata_groups: + table = table.replace_schema_metadata(metadata_groups[length]) + + feather.write_feather( + table, + output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + compression='lz4' + ) + + # arrow_dict = {k: pa.array(v) for k, v in fields.items()} + # table = pa.table(arrow_dict) + + # feather.write_feather( + # table, + # output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + # compression='lz4' + # ) + + +def main(): + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + print("варка может начать умирать") + data = Dataset.create_timestamp_based_parquet( + train_parquet_path=INTERACTIONS_TRAIN_PATH, + validation_parquet_path=INTERACTIONS_VALID_PATH, + test_parquet_path=INTERACTIONS_TEST_PATH, + max_sequence_length=MAX_SEQ_LEN, + sampler_type='tiger', + min_sample_len=2, + is_extended=True, + max_train_events=MAX_TRAIN_EVENTS + ) + + train_dataset, valid_dataset, eval_dataset = data.get_datasets() + print("варка не умерла") + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=TRAIN_BATCH_SIZE, + shuffle=True, + drop_last=True + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(TigerProcessing()) + + valid_dataloader = DataLoader( + dataset=valid_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + 
        .map(UserHashing(NUM_USER_HASH)) \
        .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
        .map(ToMasked('item.semantic', is_right_aligned=True)) \
        .map(ToMasked('labels.semantic', is_right_aligned=True)) \
        .map(ToMasked('visited', is_right_aligned=True)) \
        .map(TigerProcessing())

    train_batches = []
    for train_batch in train_dataloader:
        train_batches.append(train_batch)
    save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)

    valid_batches = []
    for valid_batch in valid_dataloader:
        valid_batches.append(valid_batch)
    save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)

    eval_batches = []
    for eval_batch in eval_dataloader:
        eval_batches.append(eval_batch)
    save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)



if __name__ == '__main__':
    main()
diff --git a/scripts/tiger-lsvd/lsvd_varka_rqvae.py b/scripts/tiger-lsvd/lsvd_varka_rqvae.py
new file mode 100644
index 0000000..bb5ecc0
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_varka_rqvae.py
@@ -0,0 +1,304 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+print("tiger no arrow varka rqvae")
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/base_with_gap_interactions_grouped.parquet"
+INTERACTIONS_VALID_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/val_interactions_grouped.parquet"
+INTERACTIONS_TEST_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/test_interactions_grouped.parquet"
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/rqvae_vk_lsvd_cz_512_8-weeks_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/eval_batches/')
+
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_TRAIN_EVENTS = 500
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10  # 10 for utilities
+# utility token ids at the top of the unified vocabulary (plain ints)
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+class TigerProcessing(Transform):
+    def __call__(self, batch):
+        input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+        batch_size = attention_mask.shape[0]
+
+        # padded positions become PAD; the attention mask keeps them inert
+        input_semantic_ids[~attention_mask] = PAD_TOKEN_ID
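+        # For reference (comment added for clarity): the target side below
+        # mirrors this layout; 'labels.semantic.padded' gets prefixed with
+        # DECODER_START_TOKEN_ID so the decoder learns to emit the next item's
+        # NUM_CODEBOOKS semantic ids after the start token.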
+ + input_semantic_ids = np.concatenate([ + input_semantic_ids, + NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None] + ], axis=-1) + + attention_mask = np.concatenate([ + attention_mask, + np.ones((batch_size, 1), dtype=attention_mask.dtype) + ], axis=-1) + + batch['input.data'] = input_semantic_ids + batch['input.mask'] = attention_mask + + target_semantic_ids = batch['labels.semantic.padded'] + target_semantic_ids = np.concatenate([ + np.ones( + (batch_size, 1), + dtype=np.int64, + ) * DECODER_START_TOKEN_ID, + target_semantic_ids + ], axis=-1) + + batch['output.data'] = target_semantic_ids + + return batch + + +class ToMasked(Transform): + def __init__(self, prefix, is_right_aligned=False): + self._prefix = prefix + self._is_right_aligned = is_right_aligned + + def __call__(self, batch): + data = batch[f'{self._prefix}.ids'] + lengths = batch[f'{self._prefix}.length'] + + batch_size = lengths.shape[0] + max_sequence_length = int(lengths.max()) + + if len(data.shape) == 1: # only indices + padded_tensor = np.zeros( + (batch_size, max_sequence_length), + dtype=data.dtype + ) # (batch_size, max_seq_len) + else: + assert len(data.shape) == 2 # embeddings + padded_tensor = np.zeros( + (batch_size, max_sequence_length, data.shape[-1]), + dtype=data.dtype + ) # (batch_size, max_seq_len, emb_dim) + + mask = np.arange(max_sequence_length)[None] < lengths[:, None] + + if self._is_right_aligned: + mask = np.flip(mask, axis=-1) + + padded_tensor[mask] = data + + batch[f'{self._prefix}.padded'] = padded_tensor + batch[f'{self._prefix}.mask'] = mask + + return batch + + +class SemanticIdsMapper(Transform): + def __init__(self, mapping, names=[]): + super().__init__() + self._mapping = mapping + self._names = names + + max_item_id = max(int(k) for k in mapping.keys()) + print(len(list(mapping.keys())), min(int(k) for k in mapping.keys()) , max(int(k) for k in mapping.keys())) + # print(mapping["280052"]) #304781 + # assert False + data = [] + for i in range(max_item_id + 1): + if str(i) in mapping: + data.append(mapping[str(i)]) + else: + data.append([-1] * NUM_CODEBOOKS) + + self._mapping_tensor = torch.tensor(data, dtype=torch.long) + self._semantic_length = self._mapping_tensor.shape[-1] + + missing_count = (max_item_id + 1) - len(mapping) + print(f"Mapping: {len(mapping)} items, {missing_count} missing (-1 filled)") + + def __call__(self, batch): + for name in self._names: + if f'{name}.ids' in batch: + ids = batch[f'{name}.ids'] + lengths = batch[f'{name}.length'] + assert ids.min() >= 0 + assert ids.max() < self._mapping_tensor.shape[0] + semantic_ids = self._mapping_tensor[ids].flatten() + + assert (semantic_ids != -1).all(), \ + f"Missing mappings detected in {name}! 
Invalid positions: {(semantic_ids == -1).sum()} out of {len(semantic_ids)}" + + batch[f'{name}.semantic.ids'] = semantic_ids.numpy() + batch[f'{name}.semantic.length'] = lengths * self._semantic_length + + return batch + + +class UserHashing(Transform): + def __init__(self, hash_size): + super().__init__() + self._hash_size = hash_size + + def __call__(self, batch): + batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64) + return batch + + +def save_batches_to_arrow(batches, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + for batch_idx, batch in enumerate(batches): + length_groups = defaultdict(dict) + metadata_groups = defaultdict(dict) + + for key, value in batch.items(): + length = len(value) + + metadata_groups[length][f'{key}_shape'] = str(value.shape) + metadata_groups[length][f'{key}_dtype'] = str(value.dtype) + + if value.ndim == 1: + # 1D массив - сохраняем как есть + length_groups[length][key] = value + elif value.ndim == 2: + # 2D массив - используем list of lists + length_groups[length][key] = value.tolist() + else: + # >2D массив - flatten и сохраняем shape + length_groups[length][key] = value.flatten() + + for length, fields in length_groups.items(): + arrow_dict = {} + for k, v in fields.items(): + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], list): + # List of lists (2D) + arrow_dict[k] = pa.array(v) + else: + arrow_dict[k] = pa.array(v) + + table = pa.table(arrow_dict) + if length in metadata_groups: + table = table.replace_schema_metadata(metadata_groups[length]) + + feather.write_feather( + table, + output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + compression='lz4' + ) + + # arrow_dict = {k: pa.array(v) for k, v in fields.items()} + # table = pa.table(arrow_dict) + + # feather.write_feather( + # table, + # output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + # compression='lz4' + # ) + + +def main(): + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + print("варка может начать умирать") + data = Dataset.create_timestamp_based_parquet( + train_parquet_path=INTERACTIONS_TRAIN_PATH, + validation_parquet_path=INTERACTIONS_VALID_PATH, + test_parquet_path=INTERACTIONS_TEST_PATH, + max_sequence_length=MAX_SEQ_LEN, + sampler_type='tiger', + min_sample_len=2, + is_extended=True, + max_train_events=MAX_TRAIN_EVENTS + ) + + train_dataset, valid_dataset, eval_dataset = data.get_datasets() + print("варка не умерла") + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=TRAIN_BATCH_SIZE, + shuffle=True, + drop_last=True + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(TigerProcessing()) + + valid_dataloader = DataLoader( + dataset=valid_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + 
        .map(UserHashing(NUM_USER_HASH)) \
        .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
        .map(ToMasked('item.semantic', is_right_aligned=True)) \
        .map(ToMasked('labels.semantic', is_right_aligned=True)) \
        .map(ToMasked('visited', is_right_aligned=True)) \
        .map(TigerProcessing())

    train_batches = []
    for train_batch in train_dataloader:
        train_batches.append(train_batch)
    save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)

    valid_batches = []
    for valid_batch in valid_dataloader:
        valid_batches.append(valid_batch)
    save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)

    eval_batches = []
    for eval_batch in eval_dataloader:
        eval_batches.append(eval_batch)
    save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)



if __name__ == '__main__':
    main()
diff --git a/scripts/tiger-lsvd/models.py b/scripts/tiger-lsvd/models.py
new file mode 100644
index 0000000..f89419e
--- /dev/null
+++ b/scripts/tiger-lsvd/models.py
@@ -0,0 +1,223 @@
+import torch
+from transformers import T5ForConditionalGeneration, T5Config, LogitsProcessor
+
+from irec.models import TorchModel
+
+
+class CorrectItemsLogitsProcessor(LogitsProcessor):
+    def __init__(self, num_codebooks, codebook_size, mapping, num_beams, visited_items):
+        self.num_codebooks = num_codebooks
+        self.codebook_size = codebook_size
+        self.num_beams = num_beams
+
+        semantic_ids = []
+        for i in range(len(mapping)):
+            assert len(mapping[str(i)]) == num_codebooks, 'All semantic ids must have the same length'
+            semantic_ids.append(mapping[str(i)])
+
+        self.index_semantic_ids = torch.tensor(semantic_ids, dtype=torch.long, device=visited_items.device)  # (num_items, semantic_ids)
+
+        batch_size, _ = visited_items.shape
+
+        self.index_semantic_ids = torch.tile(self.index_semantic_ids[None], dims=[batch_size, 1, 1])  # (batch_size, num_items, semantic_ids)
+
+        # Zero out the semantic ids of already-visited items so their codes no
+        # longer match any generated prefix and they cannot be recommended again
+        index = visited_items[..., None].tile(dims=[1, 1, num_codebooks])  # (batch_size, num_visited, semantic_ids)
+        self.index_semantic_ids = torch.scatter(
+            input=self.index_semantic_ids,
+            dim=1,
+            index=index,
+            src=torch.zeros_like(index)
+        )  # (batch_size, num_items, semantic_ids)
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # Which codebook the next token must come from, inferred from the block
+        # of the last generated token (wraps to 0 after the final codebook)
+        next_sid_codebook_num = (torch.minimum((input_ids[:, -1].max() // self.codebook_size), torch.as_tensor(self.num_codebooks - 1)).item() + 1) % self.num_codebooks
+        a = torch.tile(self.index_semantic_ids[:, None, :, next_sid_codebook_num], dims=[1, self.num_beams, 1])  # (batch_size, num_beams, num_items)
+        a = a.reshape(a.shape[0] * a.shape[1], a.shape[2])  # (batch_size * num_beams, num_items)
+
+        if next_sid_codebook_num != 0:
+            b = torch.tile(self.index_semantic_ids[:, None, :, :next_sid_codebook_num], dims=[1, self.num_beams, 1, 1])  # (batch_size, num_beams, num_items, prefix_len)
+            b = b.reshape(b.shape[0] * b.shape[1], b.shape[2], b.shape[3])  # (batch_size * num_beams, num_items, prefix_len)
+
+            current_prefixes = input_ids[:, -next_sid_codebook_num:]  # (batch_size * num_beams, prefix_len)
+            possible_next_items_mask = (
+                torch.eq(current_prefixes[:, None, :], b).long().sum(dim=-1) == next_sid_codebook_num
+            )  # (batch_size * num_beams, num_items)
+            # Redirect impossible items to an id outside the allowed block so
+            # the scatter below never marks their tokens as valid
+            a[~possible_next_items_mask] = (next_sid_codebook_num + 1) * self.codebook_size
+
+        scores_mask = torch.zeros_like(scores).bool()  # (batch_size * num_beams, vocab_size)
+        scores_mask = torch.scatter_add(
+            input=scores_mask,
+            dim=-1,
+            index=a,
+            src=torch.ones_like(a).bool()
+        )
+
+        scores[:, :next_sid_codebook_num * self.codebook_size] = 
-torch.inf + scores[:, (next_sid_codebook_num + 1) * self.codebook_size:] = -torch.inf + scores[~(scores_mask.bool())] = -torch.inf + + return scores + + +class TigerModel(TorchModel): + def __init__( + self, + embedding_dim, + codebook_size, + sem_id_len, + num_positions, + user_ids_count, + num_heads, + num_encoder_layers, + num_decoder_layers, + dim_feedforward, + num_beams=100, + num_return_sequences=20, + d_kv=64, + layer_norm_eps=1e-6, + activation='relu', + dropout=0.1, + initializer_range=0.02, + logits_processor=None, + use_microbatching=False, + microbatch_size=128 + ): + super().__init__() + self._embedding_dim = embedding_dim + self._codebook_size = codebook_size + self._num_positions = num_positions + self._num_heads = num_heads + self._num_encoder_layers = num_encoder_layers + self._num_decoder_layers = num_decoder_layers + self._dim_feedforward = dim_feedforward + self._num_beams = num_beams + self._num_return_sequences = num_return_sequences + self._d_kv = d_kv + self._layer_norm_eps = layer_norm_eps + self._activation = activation + self._dropout = dropout + self._sem_id_len = sem_id_len + self.user_ids_count = user_ids_count + self.logits_processor = logits_processor + self._use_microbatching = use_microbatching + self._microbatch_size = microbatch_size + + unified_vocab_size = codebook_size * self._sem_id_len + self.user_ids_count + 10 # 10 for utilities + self.config = T5Config( + vocab_size=unified_vocab_size, + d_model=self._embedding_dim, + d_kv=self._d_kv, + d_ff=self._dim_feedforward, + num_layers=self._num_encoder_layers, + num_decoder_layers=self._num_decoder_layers, + num_heads=self._num_heads, + dropout_rate=self._dropout, + is_encoder_decoder=True, + use_cache=False, + pad_token_id=unified_vocab_size - 1, + eos_token_id=unified_vocab_size - 2, + decoder_start_token_id=unified_vocab_size - 3, + layer_norm_epsilon=self._layer_norm_eps, + feed_forward_proj=self._activation, + tie_word_embeddings=False + ) + self.model = T5ForConditionalGeneration(config=self.config) + self._init_weights(initializer_range) + + self.model = torch.compile( + self.model, + mode='reduce-overhead', + fullgraph=False, + dynamic=True + ) + + def forward(self, inputs): + input_semantic_ids = inputs['input.data'] + attention_mask = inputs['input.mask'] + target_semantic_ids = inputs['output.data'] + + decoder_input_ids = target_semantic_ids[:, :-1].contiguous() + labels = target_semantic_ids[:, 1:].contiguous() + + model_output = self.model( + input_ids=input_semantic_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + labels=labels + ) + loss = model_output['loss'] + + metrics = {'loss': loss.detach()} + + if not self.training and not self._use_microbatching: + visited_batch = inputs['visited.padded'] + + output = self.model.generate( + input_ids=input_semantic_ids, + attention_mask=attention_mask, + num_beams=self._num_beams, + num_return_sequences=self._num_return_sequences, + max_length=self._sem_id_len + 1, + decoder_start_token_id=self.config.decoder_start_token_id, + eos_token_id=self.config.eos_token_id, + pad_token_id=self.config.pad_token_id, + do_sample=False, + early_stopping=False, + logits_processor=[self.logits_processor(visited_items=visited_batch)] if self.logits_processor is not None else [], + ) + + predictions = output[:, 1:].reshape(-1, self._num_return_sequences, self._sem_id_len) + + all_hits = (torch.eq(predictions, labels[:, None]).sum(dim=-1)) # (batch_size, top_k) + elif not self.training and self._use_microbatching: + visited_batch = 
inputs['visited.padded']
            batch_size = input_semantic_ids.shape[0]

            # Generate in microbatches instead of the full batch to bound memory
            inference_batch_size = self._microbatch_size

            all_predictions = []
            all_labels = []
            for batch_idx in range(0, batch_size, inference_batch_size):
                batch_end = min(batch_idx + inference_batch_size, batch_size)
                batch_slice = slice(batch_idx, batch_end)

                input_ids_batch = input_semantic_ids[batch_slice]
                attention_mask_batch = attention_mask[batch_slice]
                visited_batch_subset = visited_batch[batch_slice]
                labels_batch = labels[batch_slice]

                with torch.inference_mode():
                    output = self.model.generate(
                        input_ids=input_ids_batch,
                        attention_mask=attention_mask_batch,
                        num_beams=self._num_beams,
                        num_return_sequences=self._num_return_sequences,
                        max_length=self._sem_id_len + 1,
                        decoder_start_token_id=self.config.decoder_start_token_id,
                        eos_token_id=self.config.eos_token_id,
                        pad_token_id=self.config.pad_token_id,
                        do_sample=False,
                        early_stopping=False,
                        logits_processor=[self.logits_processor(visited_items=visited_batch_subset)] if self.logits_processor is not None else [],
                    )

                predictions_batch = output[:, 1:].reshape(-1, self._num_return_sequences, self._sem_id_len)
                all_predictions.append(predictions_batch)
                all_labels.append(labels_batch)

            predictions = torch.cat(all_predictions, dim=0)  # (batch_size, num_return_sequences, sem_id_len)
            labels_full = torch.cat(all_labels, dim=0)  # (batch_size, sem_id_len)
            all_hits = (torch.eq(predictions, labels_full[:, None]).sum(dim=-1))  # (batch_size, top_k)

        if not self.training:
            for k in [5, 10, 20]:
                # a prediction counts as a hit only if all sem_id_len tokens match
                hits = (all_hits[:, :k] == self._sem_id_len).float()  # (batch_size, k)
                recall = hits.sum(dim=-1)  # (batch_size)
                discount_factor = 1 / torch.log2(torch.arange(1, k + 1, 1).float() + 1.).to(hits.device)  # (k)

                metrics[f'recall@{k}'] = recall.cpu().float()
                metrics[f'ndcg@{k}'] = torch.einsum('bk,k->b', hits, discount_factor).cpu().float()

        return loss, metrics
\ No newline at end of file
diff --git a/scripts/tiger-yambda/data.py b/scripts/tiger-yambda/data.py
new file mode 100644
index 0000000..87ff07d
--- /dev/null
+++ b/scripts/tiger-yambda/data.py
@@ -0,0 +1,498 @@
+from collections import defaultdict
+import json
+from loguru import logger
+import numpy as np
+from pathlib import Path
+
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.base import BaseDataset
+
+
+class Dataset:
+    def __init__(
+        self,
+        train_sampler,
+        validation_sampler,
+        test_sampler,
+        num_items,
+        max_sequence_length
+    ):
+        self._train_sampler = train_sampler
+        self._validation_sampler = validation_sampler
+        self._test_sampler = test_sampler
+        self._num_items = num_items
+        self._max_sequence_length = max_sequence_length
+
+    @classmethod
+    def create_timestamp_based(
+        cls,
+        train_json_path,
+        validation_json_path,
+        test_json_path,
+        max_sequence_length,
+        sampler_type,
+        min_sample_len=2,
+        is_extended=False,
+        max_train_events=50
+    ):
+        max_item_id = 0
+        train_dataset, validation_dataset, test_dataset = [], [], []
+        print("started to load datasets")
+        with open(train_json_path, 'r') as f:
+            train_data = json.load(f)
+        with open(validation_json_path, 'r') as f:
+            validation_data = json.load(f)
+        with open(test_json_path, 'r') as f:
+            test_data = json.load(f)
+
+        all_users = set(train_data.keys()) | set(validation_data.keys()) | set(test_data.keys())
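+        # For reference (comment added for clarity): below, train keeps growing
+        # prefixes of the user history, while each validation/test event becomes
+        # one sample whose input is the full preceding history; e.g.
+        # Train=[1,2], Valid=[3,4] yields validation samples [1,2,3] and [1,2,3,4].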
+        print(f"all users count: {len(all_users)}")
+        us_count = 0
+        for user_id_str in all_users:
+            if us_count % 100 == 0:
+                print(f"user id {us_count}/{len(all_users)}: {user_id_str}")
+            user_id = int(user_id_str)
+
+            train_items = train_data.get(user_id_str, [])
+            validation_items = validation_data.get(user_id_str, [])
+            test_items = test_data.get(user_id_str, [])
+
+            full_sequence = train_items + validation_items + test_items
+            if full_sequence:
+                max_item_id = max(max_item_id, max(full_sequence))
+
+            if us_count % 100 == 0:
+                print(f"full sequence len: {len(full_sequence)}")
+            us_count += 1
+            assert len(full_sequence) >= 2, f'Core-5 dataset is used, user {user_id} has only {len(full_sequence)} items'
+
+            # Keep only the last max_train_events training events
+            train_items = train_items[-max_train_events:] if len(train_items) > max_train_events else train_items
+
+            if is_extended:
+                # every prefix of the train sequence becomes its own sample:
+                # sample = [1, 2]
+                # sample = [1, 2, 3]
+                # ...
+                # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+                for prefix_length in range(min_sample_len, len(train_items) + 1):
+                    train_dataset.append({
+                        'user.ids': [user_id],
+                        'item.ids': train_items[:prefix_length],
+                    })
+            else:
+                # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+                train_dataset.append({
+                    'user.ids': [user_id],
+                    'item.ids': train_items,
+                })
+
+            # validation: expand each validation item into its own sample
+            # Example: Train=[1,2], Valid=[3,4]
+            # sample = [1, 2, 3]
+            # sample = [1, 2, 3, 4]
+            current_history = train_items.copy()
+            valid_small_history = 0
+            for item in validation_items:
+                # EvalDataset slices off the target itself later
+                sample_sequence = current_history + [item]
+
+                if len(sample_sequence) >= min_sample_len:
+                    validation_dataset.append({
+                        'user.ids': [user_id],
+                        'item.ids': sample_sequence,
+                    })
+                else:
+                    valid_small_history += 1
+                current_history.append(item)
+
+            # test: expand each test item into its own sample
+            # Example: Train=[1,2], Valid=[3,4], Test=[5, 6]
+            # sample = [1, 2, 3, 4, 5]
+            # sample = [1, 2, 3, 4, 5, 6]
+            current_history = train_items + validation_items
+            test_small_history = 0
+            for item in test_items:
+                sample_sequence = current_history + [item]
+                if len(sample_sequence) >= min_sample_len:
+                    test_dataset.append({
+                        'user.ids': [user_id],
+                        'item.ids': sample_sequence,
+                    })
+                else:
+                    test_small_history += 1
+                current_history.append(item)
+
+        print(f"Train dataset size: {len(train_dataset)}")
+        print(f"Validation dataset size: {len(validation_dataset)} with skipped {valid_small_history}")
+        print(f"Test dataset size: {len(test_dataset)} with skipped {test_small_history}")
+
+        logger.debug(f'Train dataset size: {len(train_dataset)}')
+        logger.debug(f'Validation dataset size: {len(validation_dataset)}')
+        logger.debug(f'Test dataset size: {len(test_dataset)}')
+
+        train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length)
+        validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length)
+        test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length)
+
+        return cls(
+            train_sampler=train_sampler,
+            validation_sampler=validation_sampler,
+            test_sampler=test_sampler,
+            num_items=max_item_id + 1,  # +1 because item ids are 0-indexed
+            max_sequence_length=max_sequence_length
+        )
+
+    @classmethod
+    def create_timestamp_based_with_one_valid(
+        cls,
+        train_json_path,
+        validation_json_path,
test_json_path, + max_sequence_length, + sampler_type, + min_sample_len=2, + is_extended=False, + max_train_events=50, + max_valid_events=50 + ): + max_item_id = 0 + train_dataset, validation_dataset, test_dataset = [], [], [] + print("started to load datasets") + with open(train_json_path, 'r') as f: + train_data = json.load(f) + with open(validation_json_path, 'r') as f: + validation_data = json.load(f) + with open(test_json_path, 'r') as f: + test_data = json.load(f) + + all_users = set(train_data.keys()) | set(validation_data.keys()) | set(test_data.keys()) + print(f"all users count: {len(all_users)}") + us_count = 0 + for user_id_str in all_users: + if us_count % 100 == 0: + print(f"user id {us_count}/{len(all_users)}: {user_id_str}") + user_id = int(user_id_str) + + train_items = train_data.get(user_id_str, []) + validation_items = validation_data.get(user_id_str, []) + test_items = test_data.get(user_id_str, []) + + full_sequence = train_items + validation_items + test_items + if full_sequence: + max_item_id = max(max_item_id, max(full_sequence)) + + if us_count % 100 == 0: + print(f"full sequence len: {len(full_sequence)}") + + assert len(full_sequence) >= 2, f'Core-5 dataset is used, user {user_id} has only {len(full_sequence)} items' + + # Truncate train to the last max_train_events events + train_items = train_items[-max_train_events:] if len(train_items) > max_train_events else train_items + + if is_extended: + # sample = [1, 2] + # sample = [1, 2, 3] + # sample = [1, 2, 3, 4] + # sample = [1, 2, 3, 4, 5] + # sample = [1, 2, 3, 4, 5, 6] + # sample = [1, 2, 3, 4, 5, 6, 7] + # sample = [1, 2, 3, 4, 5, 6, 7, 8] + for prefix_length in range(min_sample_len, len(train_items) + 1): + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': train_items[:prefix_length], + }) + else: + # sample = [1, 2, 3, 4, 5, 6, 7, 8] + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': train_items, + }) + + # validation + + # expand each validation item into its own sample + # Example: Train=[1,2], Valid=[3,4] + # sample = [1, 2, 3] + # sample = [1, 2, 3, 4] + + current_history = train_items.copy() + if us_count % 100 == 0: + print(f"validation data length {len(validation_items[:max_valid_events])}") + us_count += 1 + for item in validation_items[:max_valid_events]: + # the eval dataset slices the target off the end later + sample_sequence = current_history + [item] + + if len(sample_sequence) >= min_sample_len: + validation_dataset.append({ + 'user.ids': [user_id], + 'item.ids': sample_sequence, + }) + current_history.append(item) + + # expand each test item into its own sample + # Example: Train=[1,2], Valid=[3,4], Test=[5, 6] + # sample = [1, 2, 3, 4, 5] + # sample = [1, 2, 3, 4, 5, 6] + current_history = train_items + validation_items + for item in test_items: + sample_sequence = current_history + [item] + if len(sample_sequence) >= min_sample_len: + test_dataset.append({ + 'user.ids': [user_id], + 'item.ids': sample_sequence, + }) + current_history.append(item) + + print(f"Train dataset size: {len(train_dataset)}") + print(f"Validation dataset size: {len(validation_dataset)}") + print(f"Test dataset size: {len(test_dataset)}") + + logger.debug(f'Train dataset size: {len(train_dataset)}') + logger.debug(f'Validation dataset size: {len(validation_dataset)}') + logger.debug(f'Test dataset size: {len(test_dataset)}') + + train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length) + validation_sampler = EvalDataset(validation_dataset,
max_sequence_length=max_sequence_length) + test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length) + + return cls( + train_sampler=train_sampler, + validation_sampler=validation_sampler, + test_sampler=test_sampler, + num_items=max_item_id + 1, # +1 added because our ids are 0-indexed + max_sequence_length=max_sequence_length + ) + + @classmethod + def create(cls, inter_json_path, max_sequence_length, sampler_type, is_extended=False): + max_item_id = 0 + train_dataset, validation_dataset, test_dataset = [], [], [] + + with open(inter_json_path, 'r') as f: + user_interactions = json.load(f) + + for user_id_str, item_ids in user_interactions.items(): + user_id = int(user_id_str) + + if item_ids: + max_item_id = max(max_item_id, max(item_ids)) + + assert len(item_ids) >= 5, f'Core-5 dataset is used, user {user_id} has only {len(item_ids)} items' + + # sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (leave one out scheme, 8 - train, 9 - valid, 10 - test) + if is_extended: + # sample = [1, 2] + # sample = [1, 2, 3] + # sample = [1, 2, 3, 4] + # sample = [1, 2, 3, 4, 5] + # sample = [1, 2, 3, 4, 5, 6] + # sample = [1, 2, 3, 4, 5, 6, 7] + # sample = [1, 2, 3, 4, 5, 6, 7, 8] + for prefix_length in range(2, len(item_ids) - 2 + 1): + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids[:prefix_length], + }) + else: + # sample = [1, 2, 3, 4, 5, 6, 7, 8] + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids[:-2], + }) + + # sample = [1, 2, 3, 4, 5, 6, 7, 8, 9] + validation_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids[:-1], + }) + + # sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + test_dataset.append({ + 'user.ids': [user_id], + 'item.ids': item_ids, + }) + + logger.debug(f'Train dataset size: {len(train_dataset)}') + logger.debug(f'Validation dataset size: {len(validation_dataset)}') + logger.debug(f'Test dataset size: {len(test_dataset)}') + logger.debug(f'Max item id: {max_item_id}') + + train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length) + validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length) + test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length) + + return cls( + train_sampler=train_sampler, + validation_sampler=validation_sampler, + test_sampler=test_sampler, + num_items=max_item_id + 1, # +1 added because our ids are 0-indexed + max_sequence_length=max_sequence_length + ) + + def get_datasets(self): + return self._train_sampler, self._validation_sampler, self._test_sampler + + @property + def num_items(self): + return self._num_items + + @property + def max_sequence_length(self): + return self._max_sequence_length + + +class TrainDataset(BaseDataset): + def __init__(self, dataset, prediction_type, max_sequence_length): + self._dataset = dataset + self._prediction_type = prediction_type + self._max_sequence_length = max_sequence_length + + self._transforms = { + 'sasrec': self._all_items_transform, + 'tiger': self._last_item_transform + } + + def _all_items_transform(self, sample): + item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1] + next_item_sequence = sample['item.ids'][-self._max_sequence_length:][1:] + return { + 'user.ids': np.array(sample['user.ids'], dtype=np.int64), + 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64), + 'item.ids': np.array(item_sequence, dtype=np.int64), + 'item.length': np.array([len(item_sequence)], dtype=np.int64), + 'labels.ids': 
np.array(next_item_sequence, dtype=np.int64), + 'labels.length': np.array([len(next_item_sequence)], dtype=np.int64) + } + + def _last_item_transform(self, sample): + item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1] + last_item = sample['item.ids'][-self._max_sequence_length:][-1] + return { + 'user.ids': np.array(sample['user.ids'], dtype=np.int64), + 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64), + 'item.ids': np.array(item_sequence, dtype=np.int64), + 'item.length': np.array([len(item_sequence)], dtype=np.int64), + 'labels.ids': np.array([last_item], dtype=np.int64), + 'labels.length': np.array([1], dtype=np.int64), + } + + def __getitem__(self, index): + return self._transforms[self._prediction_type](self._dataset[index]) + + def __len__(self): + return len(self._dataset) + + +class EvalDataset(BaseDataset): + def __init__(self, dataset, max_sequence_length): + self._dataset = dataset + self._max_sequence_length = max_sequence_length + + @property + def dataset(self): + return self._dataset + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, index): + sample = self._dataset[index] + + item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1] + next_item = sample['item.ids'][-self._max_sequence_length:][-1] + + return { + 'user.ids': np.array(sample['user.ids'], dtype=np.int64), + 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64), + 'item.ids': np.array(item_sequence, dtype=np.int64), + 'item.length': np.array([len(item_sequence)], dtype=np.int64), + 'labels.ids': np.array([next_item], dtype=np.int64), + 'labels.length': np.array([1], dtype=np.int64), + 'visited.ids': np.array(sample['item.ids'][:-1], dtype=np.int64), + 'visited.length': np.array([len(sample['item.ids'][:-1])], dtype=np.int64), + } + + +class ArrowBatchDataset(BaseDataset): + def __init__(self, batch_dir, device='cuda', preload=False): + self.batch_dir = Path(batch_dir) + self.device = device + + all_files = list(self.batch_dir.glob('batch_*_len_*.arrow')) + + batch_files_map = defaultdict(list) + for f in all_files: + batch_id = int(f.stem.split('_')[1]) + batch_files_map[batch_id].append(f) + + for batch_id in batch_files_map: + batch_files_map[batch_id].sort() + + self.batch_indices = sorted(batch_files_map.keys()) + + if preload: + print(f"Preloading {len(self.batch_indices)} batches...") + self.cached_batches = [] + + for idx in range(len(self.batch_indices)): + batch = self._load_batch(batch_files_map[self.batch_indices[idx]]) + self.cached_batches.append(batch) + else: + self.cached_batches = None + self.batch_files_map = batch_files_map + + def _load_batch(self, arrow_files): + batch = {} + + for arrow_file in arrow_files: + table = feather.read_table(arrow_file) + metadata = table.schema.metadata or {} + + for col_name in table.column_names: + col = table.column(col_name) + + shape_key = f'{col_name}_shape' + dtype_key = f'{col_name}_dtype' + + if shape_key.encode() in metadata: + shape = eval(metadata[shape_key.encode()].decode()) # parse the shape string written by save_batches_to_arrow + dtype = np.dtype(metadata[dtype_key.encode()].decode()) + + # Check the column type + if pa.types.is_list(col.type) or pa.types.is_large_list(col.type): + arr = np.array(col.to_pylist(), dtype=dtype) + else: + arr = col.to_numpy().reshape(shape).astype(dtype) + else: + if pa.types.is_list(col.type) or pa.types.is_large_list(col.type): + arr = np.array(col.to_pylist()) + else: + arr = col.to_numpy() + + batch[col_name] = torch.from_numpy(arr.copy()).to(self.device) + + return batch + + def
__len__(self): + return len(self.batch_indices) + + def __getitem__(self, idx): + if self.cached_batches is not None: + return self.cached_batches[idx] + else: + batch_id = self.batch_indices[idx] + arrow_files = self.batch_files_map[batch_id] + return self._load_batch(arrow_files) diff --git a/scripts/tiger-yambda/models.py b/scripts/tiger-yambda/models.py new file mode 100644 index 0000000..8fd0f76 --- /dev/null +++ b/scripts/tiger-yambda/models.py @@ -0,0 +1,223 @@ +import torch +from transformers import T5ForConditionalGeneration, T5Config, LogitsProcessor + +from irec.models import TorchModel + + +class CorrectItemsLogitsProcessor(LogitsProcessor): + def __init__(self, num_codebooks, codebook_size, mapping, num_beams, visited_items): + self.num_codebooks = num_codebooks + self.codebook_size = codebook_size + self.num_beams = num_beams + + semantic_ids = [] + for i in range(len(mapping)): + assert len(mapping[str(i)]) == num_codebooks, 'All semantic ids must have the same length' + semantic_ids.append(mapping[str(i)]) + + self.index_semantic_ids = torch.tensor(semantic_ids, dtype=torch.long, device=visited_items.device) # (num_items, semantic_ids) + + batch_size, _ = visited_items.shape + + self.index_semantic_ids = torch.tile(self.index_semantic_ids[None], dims=[batch_size, 1, 1]) # (batch_size, num_items, semantic_ids) + + index = visited_items[..., None].tile(dims=[1, 1, num_codebooks]) # (batch_size, num_rated, semantic_ids) + self.index_semantic_ids = torch.scatter( + input=self.index_semantic_ids, + dim=1, + index=index, + src=torch.zeros_like(index) + ) # (batch_size, num_items, semantic_ids) + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + next_sid_codebook_num = (torch.minimum((input_ids[:, -1].max() // self.codebook_size), torch.as_tensor(self.num_codebooks - 1)).item() + 1) % self.num_codebooks + a = torch.tile(self.index_semantic_ids[:, None, :, next_sid_codebook_num], dims=[1, self.num_beams, 1]) # (batch_size, num_beams, num_items) + a = a.reshape(a.shape[0] * a.shape[1], a.shape[2]) # (batch_size * num_beams, num_items) + + if next_sid_codebook_num != 0: + b = torch.tile(self.index_semantic_ids[:, None, :, :next_sid_codebook_num], dims=[1, self.num_beams, 1, 1]) # (batch_size, num_beams, num_items, sid_len) + b = b.reshape(b.shape[0] * b.shape[1], b.shape[2], b.shape[3]) # (batch_size * num_beams, num_items, sid_len) + + current_prefixes = input_ids[:, -next_sid_codebook_num:] # (batch_size * num_beams, sid_len) + possible_next_items_mask = ( + torch.eq(current_prefixes[:, None, :], b).long().sum(dim=-1) == next_sid_codebook_num + ) # (batch_size * num_beams, num_items) + a[~possible_next_items_mask] = (next_sid_codebook_num + 1) * self.codebook_size + + scores_mask = torch.zeros_like(scores).bool() # (batch_size * num_beams, num_items) + scores_mask = torch.scatter_add( + input=scores_mask, + dim=-1, + index=a, + src=torch.ones_like(a).bool() + ) + + scores[:, :next_sid_codebook_num * self.codebook_size] = -torch.inf + scores[:, (next_sid_codebook_num + 1) * self.codebook_size:] = -torch.inf + scores[~(scores_mask.bool())] = -torch.inf + + return scores + + +class TigerModel(TorchModel): + def __init__( + self, + embedding_dim, + codebook_size, + sem_id_len, + num_positions, + user_ids_count, + num_heads, + num_encoder_layers, + num_decoder_layers, + dim_feedforward, + num_beams=100, + num_return_sequences=20, + d_kv=64, + layer_norm_eps=1e-6, + activation='relu', + dropout=0.1, + initializer_range=0.02, +
logits_processor=None, + use_microbatching=False, + microbatch_size=128 + ): + super().__init__() + self._embedding_dim = embedding_dim + self._codebook_size = codebook_size + self._num_positions = num_positions + self._num_heads = num_heads + self._num_encoder_layers = num_encoder_layers + self._num_decoder_layers = num_decoder_layers + self._dim_feedforward = dim_feedforward + self._num_beams = num_beams + self._num_return_sequences = num_return_sequences + self._d_kv = d_kv + self._layer_norm_eps = layer_norm_eps + self._activation = activation + self._dropout = dropout + self._sem_id_len = sem_id_len + self.user_ids_count = user_ids_count + self.logits_processor = logits_processor + self._use_microbatching = use_microbatching + self._microbatch_size = microbatch_size + + unified_vocab_size = codebook_size * self._sem_id_len + self.user_ids_count + 10 # 10 for utilities + self.config = T5Config( + vocab_size=unified_vocab_size, + d_model=self._embedding_dim, + d_kv=self._d_kv, + d_ff=self._dim_feedforward, + num_layers=self._num_encoder_layers, + num_decoder_layers=self._num_decoder_layers, + num_heads=self._num_heads, + dropout_rate=self._dropout, + is_encoder_decoder=True, + use_cache=False, + pad_token_id=unified_vocab_size - 1, + eos_token_id=unified_vocab_size - 2, + decoder_start_token_id=unified_vocab_size - 3, + layer_norm_epsilon=self._layer_norm_eps, + feed_forward_proj=self._activation, + tie_word_embeddings=False + ) + self.model = T5ForConditionalGeneration(config=self.config) + self._init_weights(initializer_range) + + self.model = torch.compile( + self.model, + mode='reduce-overhead', + fullgraph=False, + dynamic=True + ) + + def forward(self, inputs): + input_semantic_ids = inputs['input.data'] + attention_mask = inputs['input.mask'] + target_semantic_ids = inputs['output.data'] + + decoder_input_ids = target_semantic_ids[:, :-1].contiguous() + labels = target_semantic_ids[:, 1:].contiguous() + + model_output = self.model( + input_ids=input_semantic_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + labels=labels + ) + loss = model_output['loss'] + + metrics = {'loss': loss.detach()} + + if not self.training and not self._use_microbatching: + visited_batch = inputs['visited.padded'] + + output = self.model.generate( + input_ids=input_semantic_ids, + attention_mask=attention_mask, + num_beams=self._num_beams, + num_return_sequences=self._num_return_sequences, + max_length=self._sem_id_len + 1, + decoder_start_token_id=self.config.decoder_start_token_id, + eos_token_id=self.config.eos_token_id, + pad_token_id=self.config.pad_token_id, + do_sample=False, + early_stopping=False, + logits_processor=[self.logits_processor(visited_items=visited_batch)] if self.logits_processor is not None else [], + ) + + predictions = output[:, 1:].reshape(-1, self._num_return_sequences, self._sem_id_len) + + all_hits = (torch.eq(predictions, labels[:, None]).sum(dim=-1)) # (batch_size, num_return_sequences) + elif not self.training and self._use_microbatching: + visited_batch = inputs['visited.padded'] + batch_size = input_semantic_ids.shape[0] + + inference_batch_size = self._microbatch_size # use the microbatch size instead of the full batch_size + + all_predictions = [] + all_labels = [] + for batch_idx in range(0, batch_size, inference_batch_size): + batch_end = min(batch_idx + inference_batch_size, batch_size) + batch_slice = slice(batch_idx, batch_end) + + input_ids_batch =
input_semantic_ids[batch_slice] + attention_mask_batch = attention_mask[batch_slice] + visited_batch_subset = visited_batch[batch_slice] + labels_batch = labels[batch_slice] + + with torch.inference_mode(): + output = self.model.generate( + input_ids=input_ids_batch, + attention_mask=attention_mask_batch, + num_beams=self._num_beams, + num_return_sequences=self._num_return_sequences, + max_length=self._sem_id_len + 1, + decoder_start_token_id=self.config.decoder_start_token_id, + eos_token_id=self.config.eos_token_id, + pad_token_id=self.config.pad_token_id, + do_sample=False, + early_stopping=False, + logits_processor=[self.logits_processor(visited_items=visited_batch_subset)] if self.logits_processor is not None else [], + ) + + predictions_batch = output[:, 1:].reshape(-1, self._num_return_sequences, self._sem_id_len) + all_predictions.append(predictions_batch) + all_labels.append(labels_batch) + + predictions = torch.cat(all_predictions, dim=0) # (batch_size, num_return_sequences, sem_id_len) + labels_full = torch.cat(all_labels, dim=0) # (batch_size, sem_id_len) + all_hits = (torch.eq(predictions, labels_full[:, None]).sum(dim=-1)) # (batch_size, num_return_sequences) + + if not self.training: + for k in [5, 10, 20]: + hits = (all_hits[:, :k] == self._sem_id_len).float() # (batch_size, k) + recall = hits.sum(dim=-1) # (batch_size) + discount_factor = 1 / torch.log2(torch.arange(1, k + 1, 1).float() + 1.).to(hits.device) # (k) + + metrics[f'recall@{k}'] = recall.cpu().float() + metrics[f'ndcg@{k}'] = torch.einsum('bk,k->b', hits, discount_factor).cpu().float() + + return loss, metrics \ No newline at end of file diff --git a/scripts/tiger-yambda/yambda_train_4.1_plum.py b/scripts/tiger-yambda/yambda_train_4.1_plum.py new file mode 100644 index 0000000..607c0e3 --- /dev/null +++ b/scripts/tiger-yambda/yambda_train_4.1_plum.py @@ -0,0 +1,230 @@ +from functools import partial +import json +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.transforms import Collate, ToDevice +from irec.data.dataloader import DataLoader +from irec.runners import TrainingRunner +from irec.utils import fix_random_seed + +from data import ArrowBatchDataset +from models import TigerModel, CorrectItemsLogitsProcessor + + +# PATHS +IREC_PATH = '../../' +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir_yambda/4-1_filtered_yambda_gpu_quantile_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_eval_batches/') + + +TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs') +CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints') + +EXPERIMENT_NAME = 'TEST_tiger_yambda_filtered_day-split_plum_ws_2_dp_0.2_max_300_256_1024' + +# OTHER SETTINGS +SEED_VALUE = 42 +DEVICE = 'cuda' + +NUM_EPOCHS = 300 +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +EMBEDDING_DIM = 128 +CODEBOOK_SIZE = 256 +NUM_POSITIONS = 20 +NUM_USER_HASH = 2000 +NUM_HEADS = 6 +NUM_LAYERS = 4 +FEEDFORWARD_DIM = 1024 +KV_DIM = 64 +DROPOUT = 0.2 +NUM_BEAMS = 30 +TOP_K = 20 +NUM_CODEBOOKS = 4 +LR = 0.0001 + +USE_MICROBATCHING = True +MICROBATCH_SIZE = 128 + +torch.set_float32_matmul_precision('high') +torch._dynamo.config.capture_scalar_outputs = True +
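The training scripts below consume the `recall@k` and `ndcg@k` values produced inside `TigerModel.forward`. As a standalone reference, here is a minimal sketch of how that metric block turns ranked beam-search output into recall@k and NDCG@k; `beam_metrics` and the toy tensors are illustrative, not part of this diff:

```python
import torch

def beam_metrics(predictions, labels, sem_id_len, ks=(5, 10, 20)):
    # predictions: (batch, num_return_sequences, sem_id_len), beams sorted by score
    # labels:      (batch, sem_id_len) ground-truth semantic id
    # a beam counts as a hit only if all sem_id_len codes match the label
    matches = torch.eq(predictions, labels[:, None, :]).sum(dim=-1)  # (batch, beams)
    out = {}
    for k in ks:
        hits = (matches[:, :k] == sem_id_len).float()  # (batch, k), 0/1 per rank
        out[f'recall@{k}'] = hits.sum(dim=-1)          # one relevant item per sample
        discount = 1 / torch.log2(torch.arange(1, k + 1).float() + 1.0)
        out[f'ndcg@{k}'] = hits @ discount             # DCG equals NDCG when exactly one item is relevant
    return out

toy_preds = torch.zeros(2, 20, 4, dtype=torch.long)
toy_labels = torch.ones(2, 4, dtype=torch.long)
toy_preds[0, 3] = 1  # user 0: exact match at rank 4
print(beam_metrics(toy_preds, toy_labels, sem_id_len=4))
# user 0: recall@5 = 1.0, ndcg@5 = 1/log2(5) ~ 0.43; user 1: all zeros
```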
+import torch._inductor.config as config +config.triton.cudagraph_skip_dynamic_graphs = True + + +def main(): + fix_random_seed(SEED_VALUE) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataloader = DataLoader( + ArrowBatchDataset( + TRAIN_BATCHES_DIR, + device='cpu', + preload=True + ), + batch_size=1, + shuffle=True, + num_workers=0, + pin_memory=True, + collate_fn=Collate() + ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS) + + valid_dataloder = ArrowBatchDataset( + VALID_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + eval_dataloder = ArrowBatchDataset( + EVAL_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + model = TigerModel( + embedding_dim=EMBEDDING_DIM, + codebook_size=CODEBOOK_SIZE, + sem_id_len=NUM_CODEBOOKS, + user_ids_count=NUM_USER_HASH, + num_positions=NUM_POSITIONS, + num_heads=NUM_HEADS, + num_encoder_layers=NUM_LAYERS, + num_decoder_layers=NUM_LAYERS, + dim_feedforward=FEEDFORWARD_DIM, + num_beams=NUM_BEAMS, + num_return_sequences=TOP_K, + activation='relu', + d_kv=KV_DIM, + dropout=DROPOUT, + layer_norm_eps=1e-6, + initializer_range=0.02, + logits_processor=partial( + CorrectItemsLogitsProcessor, + NUM_CODEBOOKS, + CODEBOOK_SIZE, + mappings, + NUM_BEAMS + ), + use_microbatching=USE_MICROBATCHING, + microbatch_size=MICROBATCH_SIZE + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.AdamW( + model.parameters(), + lr=LR, + ) + + EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS) + + callbacks = [ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + }, name='train'), + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + }, + reset_every_num_steps=EPOCH_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _:{ + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='validation'), + cb.MetricAccumulator( + accumulators={ + 'validation/loss': cb.MeanAccumulator(), + 'validation/recall@5': cb.MeanAccumulator(), + 'validation/recall@10': cb.MeanAccumulator(), + 'validation/recall@20': cb.MeanAccumulator(), + 'validation/ndcg@5': cb.MeanAccumulator(), + 'validation/ndcg@10': cb.MeanAccumulator(), + 'validation/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Validation( + dataset=eval_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='eval'), + cb.MetricAccumulator( + accumulators={ + 'eval/loss': cb.MeanAccumulator(), + 'eval/recall@5': cb.MeanAccumulator(), + 'eval/recall@10': cb.MeanAccumulator(), + 'eval/recall@20': cb.MeanAccumulator(), + 
'eval/ndcg@5': cb.MeanAccumulator(), + 'eval/ndcg@10': cb.MeanAccumulator(), + 'eval/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS * 4), + + cb.Logger().every_num_steps(EPOCH_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR), + + cb.EarlyStopping( + metric='validation/ndcg@20', + patience=40 * 4, + minimize=False, + model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME) + ).every_num_steps(EPOCH_NUM_STEPS) + + # cb.Profiler( + # wait=10, + # warmup=10, + # active=10, + # logdir=TENSORBOARD_LOGDIR + # ), + # cb.StopAfterNumSteps(40) + + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger-yambda/yambda_varka_4.1_plum.py b/scripts/tiger-yambda/yambda_varka_4.1_plum.py new file mode 100644 index 0000000..9c00704 --- /dev/null +++ b/scripts/tiger-yambda/yambda_varka_4.1_plum.py @@ -0,0 +1,304 @@ +from collections import defaultdict +import json +import murmurhash +import numpy as np +import os +from pathlib import Path + +import pyarrow as pa +import pyarrow.feather as feather + +import torch + +from irec.data.transforms import Collate, Transform +from irec.data.dataloader import DataLoader + +from data import Dataset + +print("tiger no arrow varka 4.1") + +# PATHS + +IREC_PATH = '../../' +INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'data/Yambda/day-splits/merged_for_exps_filtered/exp_4_0.9_inter_tiger_train.json') +INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'data/Yambda/day-splits/merged_for_exps_filtered/valid_set.json') +INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'data/Yambda/day-splits/merged_for_exps_filtered/test_set.json') + +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir_yambda/4-1_filtered_yambda_gpu_quantile_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_eval_batches/') + + +# OTHER SETTINGS + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + +MAX_TRAIN_EVENTS = 300 +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +NUM_USER_HASH = 2000 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 4 + +UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities +PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1 +EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2 +DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3 + + +class TigerProcessing(Transform): + def __call__(self, batch): + input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask'] + batch_size = attention_mask.shape[0] + + input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # fill padded positions with the PAD token
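As an aside on the constants defined above: the encoder and decoder share one flat vocabulary in which each codebook gets its own id range, hashed user ids follow, and utility tokens sit at the very top. A small illustrative sketch (the helper functions are hypothetical; the values mirror this script):

```python
CODEBOOK_SIZE = 256
NUM_CODEBOOKS = 4
NUM_USER_HASH = 2000
UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10  # 3034

def semantic_token(position, code):
    # codebook `position` occupies ids [position * CODEBOOK_SIZE, (position + 1) * CODEBOOK_SIZE)
    assert 0 <= position < NUM_CODEBOOKS and 0 <= code < CODEBOOK_SIZE
    return position * CODEBOOK_SIZE + code

def user_token(hashed_user_id):
    # hashed-user tokens start right after all codebook tokens
    return NUM_CODEBOOKS * CODEBOOK_SIZE + hashed_user_id

# utility tokens occupy the top of the vocabulary
PAD, EOS, DECODER_START = UNIFIED_VOCAB_SIZE - 1, UNIFIED_VOCAB_SIZE - 2, UNIFIED_VOCAB_SIZE - 3

print(semantic_token(0, 17), semantic_token(3, 17))  # 17 785
print(user_token(1234))                              # 2258
print(PAD, EOS, DECODER_START)                       # 3033 3032 3031
```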
+ + input_semantic_ids = np.concatenate([ + input_semantic_ids, + NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None] + ], axis=-1) + + attention_mask = np.concatenate([ + attention_mask, + np.ones((batch_size, 1), dtype=attention_mask.dtype) + ], axis=-1) + + batch['input.data'] = input_semantic_ids + batch['input.mask'] = attention_mask + + target_semantic_ids = batch['labels.semantic.padded'] + target_semantic_ids = np.concatenate([ + np.ones( + (batch_size, 1), + dtype=np.int64, + ) * DECODER_START_TOKEN_ID, + target_semantic_ids + ], axis=-1) + + batch['output.data'] = target_semantic_ids + + return batch + + +class ToMasked(Transform): + def __init__(self, prefix, is_right_aligned=False): + self._prefix = prefix + self._is_right_aligned = is_right_aligned + + def __call__(self, batch): + data = batch[f'{self._prefix}.ids'] + lengths = batch[f'{self._prefix}.length'] + + batch_size = lengths.shape[0] + max_sequence_length = int(lengths.max()) + + if len(data.shape) == 1: # only indices + padded_tensor = np.zeros( + (batch_size, max_sequence_length), + dtype=data.dtype + ) # (batch_size, max_seq_len) + else: + assert len(data.shape) == 2 # embeddings + padded_tensor = np.zeros( + (batch_size, max_sequence_length, data.shape[-1]), + dtype=data.dtype + ) # (batch_size, max_seq_len, emb_dim) + + mask = np.arange(max_sequence_length)[None] < lengths[:, None] + + if self._is_right_aligned: + mask = np.flip(mask, axis=-1) + + padded_tensor[mask] = data + + batch[f'{self._prefix}.padded'] = padded_tensor + batch[f'{self._prefix}.mask'] = mask + + return batch + + +class SemanticIdsMapper(Transform): + def __init__(self, mapping, names=[]): + super().__init__() + self._mapping = mapping + self._names = names + + max_item_id = max(int(k) for k in mapping.keys()) + data = [] + for i in range(max_item_id + 1): + if str(i) in mapping: + data.append(mapping[str(i)]) + else: + data.append([-1] * NUM_CODEBOOKS) + + self._mapping_tensor = torch.tensor(data, dtype=torch.long) + self._semantic_length = self._mapping_tensor.shape[-1] + + missing_count = (max_item_id + 1) - len(mapping) + print(f"Mapping: {len(mapping)} items, {missing_count} missing (-1 filled)") + + def __call__(self, batch): + for name in self._names: + if f'{name}.ids' in batch: + ids = batch[f'{name}.ids'] + lengths = batch[f'{name}.length'] + assert ids.min() >= 0 + assert ids.max() < self._mapping_tensor.shape[0] + semantic_ids = self._mapping_tensor[ids].flatten() + + assert (semantic_ids != -1).all(), \ + f"Missing mappings detected in {name}!
Invalid positions: {(semantic_ids == -1).sum()} out of {len(semantic_ids)}" + + batch[f'{name}.semantic.ids'] = semantic_ids.numpy() + batch[f'{name}.semantic.length'] = lengths * self._semantic_length + + return batch + + +class UserHashing(Transform): + def __init__(self, hash_size): + super().__init__() + self._hash_size = hash_size + + def __call__(self, batch): + batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64) + return batch + + +def save_batches_to_arrow(batches, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + for batch_idx, batch in enumerate(batches): + length_groups = defaultdict(dict) + metadata_groups = defaultdict(dict) + + for key, value in batch.items(): + length = len(value) + + metadata_groups[length][f'{key}_shape'] = str(value.shape) + metadata_groups[length][f'{key}_dtype'] = str(value.dtype) + + if value.ndim == 1: + # 1D array - store as-is + length_groups[length][key] = value + elif value.ndim == 2: + # 2D array - store as a list of lists + length_groups[length][key] = value.tolist() + else: + # >2D array - flatten and keep the shape in metadata + length_groups[length][key] = value.flatten() + + for length, fields in length_groups.items(): + arrow_dict = {} + for k, v in fields.items(): + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], list): + # List of lists (2D) + arrow_dict[k] = pa.array(v) + else: + arrow_dict[k] = pa.array(v) + + table = pa.table(arrow_dict) + if length in metadata_groups: + table = table.replace_schema_metadata(metadata_groups[length]) + + feather.write_feather( + table, + output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + compression='lz4' + ) + + +def main(): + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + data = Dataset.create_timestamp_based( + train_json_path=INTERACTIONS_TRAIN_PATH, + validation_json_path=INTERACTIONS_VALID_PATH, + test_json_path=INTERACTIONS_TEST_PATH, + max_sequence_length=MAX_SEQ_LEN, + sampler_type='tiger', + min_sample_len=2, + is_extended=True, + max_train_events=MAX_TRAIN_EVENTS + ) + + train_dataset, valid_dataset, eval_dataset = data.get_datasets() + + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=TRAIN_BATCH_SIZE, + shuffle=True, + drop_last=True + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(TigerProcessing()) + + valid_dataloader = DataLoader( + dataset=valid_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item',
'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + train_batches = [] + for train_batch in train_dataloader: + train_batches.append(train_batch) + save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR) + + valid_batches = [] + for valid_batch in valid_dataloader: + valid_batches.append(valid_batch) + save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR) + + eval_batches = [] + for eval_batch in eval_dataloader: + eval_batches.append(eval_batch) + save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR) + + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger/beauty_exps/train_4.1_plum.py b/scripts/tiger/beauty_exps/train_4.1_plum.py new file mode 100644 index 0000000..8daf273 --- /dev/null +++ b/scripts/tiger/beauty_exps/train_4.1_plum.py @@ -0,0 +1,225 @@ +from functools import partial +import json +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.transforms import Collate, ToDevice +from irec.data.dataloader import DataLoader +from irec.runners import TrainingRunner +from irec.utils import fix_random_seed + +from data import ArrowBatchDataset +from models import TigerModel, CorrectItemsLogitsProcessor + + +# ПУТИ +IREC_PATH = '../../' +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-1_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_eval_batches/') + +TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs') +CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints') + +EXPERIMENT_NAME = 'tiger_beauty_updated_quantile_4-1_plum_ws_2_dp_0.2' + +# ОСТАЛЬНОЕ +SEED_VALUE = 42 +DEVICE = 'cuda' + +NUM_EPOCHS = 300 +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +EMBEDDING_DIM = 128 +CODEBOOK_SIZE = 256 +NUM_POSITIONS = 20 +NUM_USER_HASH = 2000 +NUM_HEADS = 6 +NUM_LAYERS = 4 +FEEDFORWARD_DIM = 1024 +KV_DIM = 64 +DROPOUT = 0.2 +NUM_BEAMS = 30 +TOP_K = 20 +NUM_CODEBOOKS = 4 +LR = 0.0001 + + +torch.set_float32_matmul_precision('high') +torch._dynamo.config.capture_scalar_outputs = True + +import torch._inductor.config as config +config.triton.cudagraph_skip_dynamic_graphs = True + + +def main(): + fix_random_seed(SEED_VALUE) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataloader = DataLoader( + ArrowBatchDataset( + TRAIN_BATCHES_DIR, + device='cpu', + preload=True + ), + batch_size=1, + shuffle=True, + num_workers=0, + pin_memory=True, + collate_fn=Collate() + ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS) + + valid_dataloder = ArrowBatchDataset( + VALID_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + eval_dataloder = ArrowBatchDataset( + EVAL_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + model = TigerModel( + embedding_dim=EMBEDDING_DIM, + codebook_size=CODEBOOK_SIZE, + sem_id_len=NUM_CODEBOOKS, + user_ids_count=NUM_USER_HASH, + num_positions=NUM_POSITIONS, + num_heads=NUM_HEADS, + num_encoder_layers=NUM_LAYERS, + num_decoder_layers=NUM_LAYERS, + dim_feedforward=FEEDFORWARD_DIM, + num_beams=NUM_BEAMS, + num_return_sequences=TOP_K, + activation='relu', + d_kv=KV_DIM, + 
dropout=DROPOUT, + layer_norm_eps=1e-6, + initializer_range=0.02, + logits_processor=partial( + CorrectItemsLogitsProcessor, + NUM_CODEBOOKS, + CODEBOOK_SIZE, + mappings, + NUM_BEAMS + ) + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.AdamW( + model.parameters(), + lr=LR, + ) + + EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS) + + callbacks = [ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + }, name='train'), + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + }, + reset_every_num_steps=EPOCH_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _:{ + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='validation'), + cb.MetricAccumulator( + accumulators={ + 'validation/loss': cb.MeanAccumulator(), + 'validation/recall@5': cb.MeanAccumulator(), + 'validation/recall@10': cb.MeanAccumulator(), + 'validation/recall@20': cb.MeanAccumulator(), + 'validation/ndcg@5': cb.MeanAccumulator(), + 'validation/ndcg@10': cb.MeanAccumulator(), + 'validation/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Validation( + dataset=eval_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='eval'), + cb.MetricAccumulator( + accumulators={ + 'eval/loss': cb.MeanAccumulator(), + 'eval/recall@5': cb.MeanAccumulator(), + 'eval/recall@10': cb.MeanAccumulator(), + 'eval/recall@20': cb.MeanAccumulator(), + 'eval/ndcg@5': cb.MeanAccumulator(), + 'eval/ndcg@10': cb.MeanAccumulator(), + 'eval/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Logger().every_num_steps(EPOCH_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR), + + cb.EarlyStopping( + metric='eval/ndcg@20', + patience=40, + minimize=False, + model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME) + ).every_num_steps(EPOCH_NUM_STEPS) + + # cb.Profiler( + # wait=10, + # warmup=10, + # active=10, + # logdir=TENSORBOARD_LOGDIR + # ), + # cb.StopAfterNumSteps(40) + + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger/beauty_exps/train_4.2_plum.py b/scripts/tiger/beauty_exps/train_4.2_plum.py new file mode 100644 index 0000000..580bcb5 --- /dev/null +++ b/scripts/tiger/beauty_exps/train_4.2_plum.py @@ -0,0 +1,225 @@ +from functools import partial +import 
json +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.transforms import Collate, ToDevice +from irec.data.dataloader import DataLoader +from irec.runners import TrainingRunner +from irec.utils import fix_random_seed + +from data import ArrowBatchDataset +from models import TigerModel, CorrectItemsLogitsProcessor + + +# ПУТИ +IREC_PATH = '../../' +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-2_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_eval_batches/') + +TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs') +CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints') + +EXPERIMENT_NAME = 'tiger_beauty_updated_quantile_4-2_plum_ws_2_dp_0.2' + +# ОСТАЛЬНОЕ +SEED_VALUE = 42 +DEVICE = 'cuda' + +NUM_EPOCHS = 300 +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +EMBEDDING_DIM = 128 +CODEBOOK_SIZE = 256 +NUM_POSITIONS = 20 +NUM_USER_HASH = 2000 +NUM_HEADS = 6 +NUM_LAYERS = 4 +FEEDFORWARD_DIM = 1024 +KV_DIM = 64 +DROPOUT = 0.2 +NUM_BEAMS = 30 +TOP_K = 20 +NUM_CODEBOOKS = 4 +LR = 0.0001 + + +torch.set_float32_matmul_precision('high') +torch._dynamo.config.capture_scalar_outputs = True + +import torch._inductor.config as config +config.triton.cudagraph_skip_dynamic_graphs = True + + +def main(): + fix_random_seed(SEED_VALUE) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataloader = DataLoader( + ArrowBatchDataset( + TRAIN_BATCHES_DIR, + device='cpu', + preload=True + ), + batch_size=1, + shuffle=True, + num_workers=0, + pin_memory=True, + collate_fn=Collate() + ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS) + + valid_dataloder = ArrowBatchDataset( + VALID_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + eval_dataloder = ArrowBatchDataset( + EVAL_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + model = TigerModel( + embedding_dim=EMBEDDING_DIM, + codebook_size=CODEBOOK_SIZE, + sem_id_len=NUM_CODEBOOKS, + user_ids_count=NUM_USER_HASH, + num_positions=NUM_POSITIONS, + num_heads=NUM_HEADS, + num_encoder_layers=NUM_LAYERS, + num_decoder_layers=NUM_LAYERS, + dim_feedforward=FEEDFORWARD_DIM, + num_beams=NUM_BEAMS, + num_return_sequences=TOP_K, + activation='relu', + d_kv=KV_DIM, + dropout=DROPOUT, + layer_norm_eps=1e-6, + initializer_range=0.02, + logits_processor=partial( + CorrectItemsLogitsProcessor, + NUM_CODEBOOKS, + CODEBOOK_SIZE, + mappings, + NUM_BEAMS + ) + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.AdamW( + model.parameters(), + lr=LR, + ) + + EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS) + + callbacks = [ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + }, name='train'), + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + }, + reset_every_num_steps=EPOCH_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _:{ + 'loss': 
model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='validation'), + cb.MetricAccumulator( + accumulators={ + 'validation/loss': cb.MeanAccumulator(), + 'validation/recall@5': cb.MeanAccumulator(), + 'validation/recall@10': cb.MeanAccumulator(), + 'validation/recall@20': cb.MeanAccumulator(), + 'validation/ndcg@5': cb.MeanAccumulator(), + 'validation/ndcg@10': cb.MeanAccumulator(), + 'validation/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Validation( + dataset=eval_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='eval'), + cb.MetricAccumulator( + accumulators={ + 'eval/loss': cb.MeanAccumulator(), + 'eval/recall@5': cb.MeanAccumulator(), + 'eval/recall@10': cb.MeanAccumulator(), + 'eval/recall@20': cb.MeanAccumulator(), + 'eval/ndcg@5': cb.MeanAccumulator(), + 'eval/ndcg@10': cb.MeanAccumulator(), + 'eval/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Logger().every_num_steps(EPOCH_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR), + + cb.EarlyStopping( + metric='eval/ndcg@20', + patience=40, + minimize=False, + model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME) + ).every_num_steps(EPOCH_NUM_STEPS) + + # cb.Profiler( + # wait=10, + # warmup=10, + # active=10, + # logdir=TENSORBOARD_LOGDIR + # ), + # cb.StopAfterNumSteps(40) + + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger/beauty_exps/train_4.3_plum.py b/scripts/tiger/beauty_exps/train_4.3_plum.py new file mode 100644 index 0000000..f98e9fd --- /dev/null +++ b/scripts/tiger/beauty_exps/train_4.3_plum.py @@ -0,0 +1,225 @@ +from functools import partial +import json +from loguru import logger +import os + +import torch + +import irec.callbacks as cb +from irec.data.transforms import Collate, ToDevice +from irec.data.dataloader import DataLoader +from irec.runners import TrainingRunner +from irec.utils import fix_random_seed + +from data import ArrowBatchDataset +from models import TigerModel, CorrectItemsLogitsProcessor + + +# ПУТИ +IREC_PATH = '../../' +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-3_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_eval_batches/') + +TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs') +CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints') + +EXPERIMENT_NAME = 
'tiger_beauty_updated_quantile_4-3_plum_ws_2_dp_0.2' + +# ОСТАЛЬНОЕ +SEED_VALUE = 42 +DEVICE = 'cuda' + +NUM_EPOCHS = 300 +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +EMBEDDING_DIM = 128 +CODEBOOK_SIZE = 256 +NUM_POSITIONS = 20 +NUM_USER_HASH = 2000 +NUM_HEADS = 6 +NUM_LAYERS = 4 +FEEDFORWARD_DIM = 1024 +KV_DIM = 64 +DROPOUT = 0.2 +NUM_BEAMS = 30 +TOP_K = 20 +NUM_CODEBOOKS = 4 +LR = 0.0001 + + +torch.set_float32_matmul_precision('high') +torch._dynamo.config.capture_scalar_outputs = True + +import torch._inductor.config as config +config.triton.cudagraph_skip_dynamic_graphs = True + + +def main(): + fix_random_seed(SEED_VALUE) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataloader = DataLoader( + ArrowBatchDataset( + TRAIN_BATCHES_DIR, + device='cpu', + preload=True + ), + batch_size=1, + shuffle=True, + num_workers=0, + pin_memory=True, + collate_fn=Collate() + ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS) + + valid_dataloder = ArrowBatchDataset( + VALID_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + eval_dataloder = ArrowBatchDataset( + EVAL_BATCHES_DIR, + device=DEVICE, + preload=True + ) + + model = TigerModel( + embedding_dim=EMBEDDING_DIM, + codebook_size=CODEBOOK_SIZE, + sem_id_len=NUM_CODEBOOKS, + user_ids_count=NUM_USER_HASH, + num_positions=NUM_POSITIONS, + num_heads=NUM_HEADS, + num_encoder_layers=NUM_LAYERS, + num_decoder_layers=NUM_LAYERS, + dim_feedforward=FEEDFORWARD_DIM, + num_beams=NUM_BEAMS, + num_return_sequences=TOP_K, + activation='relu', + d_kv=KV_DIM, + dropout=DROPOUT, + layer_norm_eps=1e-6, + initializer_range=0.02, + logits_processor=partial( + CorrectItemsLogitsProcessor, + NUM_CODEBOOKS, + CODEBOOK_SIZE, + mappings, + NUM_BEAMS + ) + ).to(DEVICE) + + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + + logger.debug(f'Overall parameters: {total_params:,}') + logger.debug(f'Trainable parameters: {trainable_params:,}') + + optimizer = torch.optim.AdamW( + model.parameters(), + lr=LR, + ) + + EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS) + + callbacks = [ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + }, name='train'), + cb.MetricAccumulator( + accumulators={ + 'train/loss': cb.MeanAccumulator(), + }, + reset_every_num_steps=EPOCH_NUM_STEPS + ), + + cb.Validation( + dataset=valid_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _:{ + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='validation'), + cb.MetricAccumulator( + accumulators={ + 'validation/loss': cb.MeanAccumulator(), + 'validation/recall@5': cb.MeanAccumulator(), + 'validation/recall@10': cb.MeanAccumulator(), + 'validation/recall@20': cb.MeanAccumulator(), + 'validation/ndcg@5': cb.MeanAccumulator(), + 'validation/ndcg@10': cb.MeanAccumulator(), + 'validation/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Validation( + dataset=eval_dataloder, + callbacks=[ + cb.BatchMetrics(metrics=lambda model_outputs, _: { + 'loss': model_outputs['loss'].item(), + 'recall@5': model_outputs['recall@5'].tolist(), + 'recall@10': 
model_outputs['recall@10'].tolist(), + 'recall@20': model_outputs['recall@20'].tolist(), + 'ndcg@5': model_outputs['ndcg@5'].tolist(), + 'ndcg@10': model_outputs['ndcg@10'].tolist(), + 'ndcg@20': model_outputs['ndcg@20'].tolist(), + }, name='eval'), + cb.MetricAccumulator( + accumulators={ + 'eval/loss': cb.MeanAccumulator(), + 'eval/recall@5': cb.MeanAccumulator(), + 'eval/recall@10': cb.MeanAccumulator(), + 'eval/recall@20': cb.MeanAccumulator(), + 'eval/ndcg@5': cb.MeanAccumulator(), + 'eval/ndcg@10': cb.MeanAccumulator(), + 'eval/ndcg@20': cb.MeanAccumulator(), + }, + ), + ], + ).every_num_steps(EPOCH_NUM_STEPS), + + cb.Logger().every_num_steps(EPOCH_NUM_STEPS), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR), + + cb.EarlyStopping( + metric='eval/ndcg@20', + patience=40, + minimize=False, + model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME) + ).every_num_steps(EPOCH_NUM_STEPS) + + # cb.Profiler( + # wait=10, + # warmup=10, + # active=10, + # logdir=TENSORBOARD_LOGDIR + # ), + # cb.StopAfterNumSteps(40) + + ] + + logger.debug('Everything is ready for training process!') + + runner = TrainingRunner( + model=model, + optimizer=optimizer, + dataset=train_dataloader, + callbacks=callbacks, + ) + runner.run() + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger/beauty_exps/varka_4.1_plum.py b/scripts/tiger/beauty_exps/varka_4.1_plum.py new file mode 100644 index 0000000..302e04e --- /dev/null +++ b/scripts/tiger/beauty_exps/varka_4.1_plum.py @@ -0,0 +1,287 @@ +from collections import defaultdict +import json +import murmurhash +import numpy as np +import os +from pathlib import Path + +import pyarrow as pa +import pyarrow.feather as feather + +import torch + +from irec.data.transforms import Collate, Transform +from irec.data.dataloader import DataLoader + +from data import Dataset + + + +# PATHS + +IREC_PATH = '../../' +INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json') +INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/valid_set.json') +INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json') + +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-1_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_eval_batches/') + + +# OTHER SETTINGS + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +NUM_USER_HASH = 2000 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 4 + +UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities +PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1 +EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2 +DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3 + + + +class TigerProcessing(Transform): + def __call__(self, batch): + input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask'] + batch_size = attention_mask.shape[0] + + input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # fill padded positions with the PAD token
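Before the transforms are defined below, a quick standalone illustration (toy data, not part of the diff) of the right-aligned padding that `ToMasked` performs, which is what makes the PAD fill above land at the front of short sequences:

```python
import numpy as np

lengths = np.array([2, 4])
flat = np.array([1, 2, 10, 20, 30, 40])  # two sequences concatenated: [1, 2] and [10, 20, 30, 40]

max_len = int(lengths.max())
mask = np.arange(max_len)[None] < lengths[:, None]  # True where a real token goes
mask = np.flip(mask, axis=-1)                       # right-align: padding moves to the front

padded = np.zeros((len(lengths), max_len), dtype=flat.dtype)
padded[mask] = flat  # fills True positions row by row

print(padded)
# [[ 0  0  1  2]
#  [10 20 30 40]]
```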
+ + input_semantic_ids = np.concatenate([ + input_semantic_ids, + NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None] + ], axis=-1) + + attention_mask = np.concatenate([ + attention_mask, + np.ones((batch_size, 1), dtype=attention_mask.dtype) + ], axis=-1) + + batch['input.data'] = input_semantic_ids + batch['input.mask'] = attention_mask + + target_semantic_ids = batch['labels.semantic.padded'] + target_semantic_ids = np.concatenate([ + np.ones( + (batch_size, 1), + dtype=np.int64, + ) * DECODER_START_TOKEN_ID, + target_semantic_ids + ], axis=-1) + + batch['output.data'] = target_semantic_ids + + return batch + + +class ToMasked(Transform): + def __init__(self, prefix, is_right_aligned=False): + self._prefix = prefix + self._is_right_aligned = is_right_aligned + + def __call__(self, batch): + data = batch[f'{self._prefix}.ids'] + lengths = batch[f'{self._prefix}.length'] + + batch_size = lengths.shape[0] + max_sequence_length = int(lengths.max()) + + if len(data.shape) == 1: # only indices + padded_tensor = np.zeros( + (batch_size, max_sequence_length), + dtype=data.dtype + ) # (batch_size, max_seq_len) + else: + assert len(data.shape) == 2 # embeddings + padded_tensor = np.zeros( + (batch_size, max_sequence_length, data.shape[-1]), + dtype=data.dtype + ) # (batch_size, max_seq_len, emb_dim) + + mask = np.arange(max_sequence_length)[None] < lengths[:, None] + + if self._is_right_aligned: + mask = np.flip(mask, axis=-1) + + padded_tensor[mask] = data + + batch[f'{self._prefix}.padded'] = padded_tensor + batch[f'{self._prefix}.mask'] = mask + + return batch + + +class SemanticIdsMapper(Transform): + def __init__(self, mapping, names=[]): + super().__init__() + self._mapping = mapping + self._names = names + + data = [] + for i in range(len(mapping)): + data.append(mapping[str(i)]) + self._mapping_tensor = torch.tensor(data, dtype=torch.long) + self._semantic_length = self._mapping_tensor.shape[-1] + + def __call__(self, batch): + for name in self._names: + if f'{name}.ids' in batch: + ids = batch[f'{name}.ids'] + lengths = batch[f'{name}.length'] + assert ids.min() >= 0 + assert ids.max() < self._mapping_tensor.shape[0] + batch[f'{name}.semantic.ids'] = self._mapping_tensor[ids].flatten().numpy() + batch[f'{name}.semantic.length'] = lengths * self._semantic_length + + return batch + + +class UserHashing(Transform): + def __init__(self, hash_size): + super().__init__() + self._hash_size = hash_size + + def __call__(self, batch): + batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64) + return batch + + +def save_batches_to_arrow(batches, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + for batch_idx, batch in enumerate(batches): + length_groups = defaultdict(dict) + metadata_groups = defaultdict(dict) + + for key, value in batch.items(): + length = len(value) + + metadata_groups[length][f'{key}_shape'] = str(value.shape) + metadata_groups[length][f'{key}_dtype'] = str(value.dtype) + + if value.ndim == 1: + # 1D array - store as-is + length_groups[length][key] = value + elif value.ndim == 2: + # 2D array - store as a list of lists + length_groups[length][key] = value.tolist() + else: + # >2D array - flatten and keep the shape in metadata + length_groups[length][key] = value.flatten() + + for length, fields in length_groups.items(): + arrow_dict = {} + for k, v in fields.items(): + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], list): + # List of lists
+ for length, fields in length_groups.items(): + arrow_dict = {} + for k, v in fields.items(): + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], list): + # List of lists (2D) + arrow_dict[k] = pa.array(v) + else: + arrow_dict[k] = pa.array(v) + + table = pa.table(arrow_dict) + if length in metadata_groups: + table = table.replace_schema_metadata(metadata_groups[length]) + + feather.write_feather( + table, + output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + compression='lz4' + ) + + # arrow_dict = {k: pa.array(v) for k, v in fields.items()} + # table = pa.table(arrow_dict) + + # feather.write_feather( + # table, + # output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + # compression='lz4' + # ) + + +def main(): + data = Dataset.create_timestamp_based( + train_json_path=INTERACTIONS_TRAIN_PATH, + validation_json_path=INTERACTIONS_VALID_PATH, + test_json_path=INTERACTIONS_TEST_PATH, + max_sequence_length=MAX_SEQ_LEN, + sampler_type='tiger', + min_sample_len=2, + is_extended=True + ) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataset, valid_dataset, eval_dataset = data.get_datasets() + + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=TRAIN_BATCH_SIZE, + shuffle=True, + drop_last=True + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(TigerProcessing()) + + valid_dataloader = DataLoader( + dataset=valid_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + train_batches = [] + for train_batch in train_dataloader: + train_batches.append(train_batch) + save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR) + + valid_batches = [] + for valid_batch in valid_dataloader: + valid_batches.append(valid_batch) + save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR) + + eval_batches = [] + for eval_batch in eval_dataloader: + eval_batches.append(eval_batch) + save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR) + + + +if __name__ == '__main__': + main()
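The varka_4.x scripts that follow are copies of this one differing only in their PATHS block, and all of them persist batches through save_batches_to_arrow, which stashes every array's shape and dtype in the Feather schema metadata. For reference, here is a minimal loading sketch (editor's illustration only — the hypothetical load_arrow_batch below is not part of the diff; ArrowBatchDataset in scripts/tiger/data.py is the actual consumer used by train.py). It assumes nothing beyond what the writer above stores:

import ast
from pathlib import Path

import numpy as np
import pyarrow.feather as feather


def load_arrow_batch(path):
    """Rebuild {key: np.ndarray} from one file produced by save_batches_to_arrow."""
    table = feather.read_table(path)
    meta = {k.decode(): v.decode() for k, v in (table.schema.metadata or {}).items()}
    batch = {}
    for key in table.column_names:
        shape = ast.literal_eval(meta[f'{key}_shape'])  # e.g. '(256, 21)' -> (256, 21)
        dtype = np.dtype(meta[f'{key}_dtype'])
        # 2D columns come back as lists of lists, 1D/flattened ones as flat lists
        batch[key] = np.asarray(table.column(key).to_pylist(), dtype=dtype).reshape(shape)
    return batch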
diff --git a/scripts/tiger/beauty_exps/varka_4.2_plum.py b/scripts/tiger/beauty_exps/varka_4.2_plum.py new file mode 100644 index 0000000..b00fef2 --- /dev/null +++ b/scripts/tiger/beauty_exps/varka_4.2_plum.py @@ -0,0 +1,288 @@ +from collections import defaultdict +import json +import murmurhash +import numpy as np +import os +from pathlib import Path + +import pyarrow as pa +import pyarrow.feather as feather + +import torch + +from irec.data.transforms import Collate, Transform +from irec.data.dataloader import DataLoader + +from data import Dataset + + + +# PATHS + +IREC_PATH = '../../' +INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json') +INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/valid_set.json') +INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json') + +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-2_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_eval_batches/') + + + +# MISC + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +NUM_USER_HASH = 2000 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 4 + +UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities +PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1 # trailing commas removed here as well (see varka_4.1_plum.py) +EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2 +DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3 + + + +class TigerProcessing(Transform): + def __call__(self, batch): + input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask'] + batch_size = attention_mask.shape[0] + + input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # overwrite masked (padding) positions with PAD + + input_semantic_ids = np.concatenate([ + input_semantic_ids, + NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None] + ], axis=-1) + + attention_mask = np.concatenate([ + attention_mask, + np.ones((batch_size, 1), dtype=attention_mask.dtype) + ], axis=-1) + + batch['input.data'] = input_semantic_ids + batch['input.mask'] = attention_mask + + target_semantic_ids = batch['labels.semantic.padded'] + target_semantic_ids = np.concatenate([ + np.ones( + (batch_size, 1), + dtype=np.int64, + ) * DECODER_START_TOKEN_ID, + target_semantic_ids + ], axis=-1) + + batch['output.data'] = target_semantic_ids + + return batch + + +class ToMasked(Transform): + def __init__(self, prefix, is_right_aligned=False): + self._prefix = prefix + self._is_right_aligned = is_right_aligned + + def __call__(self, batch): + data = batch[f'{self._prefix}.ids'] + lengths = batch[f'{self._prefix}.length'] + + batch_size = lengths.shape[0] + max_sequence_length = int(lengths.max()) + + if len(data.shape) == 1: # only indices + padded_tensor = np.zeros( + (batch_size, max_sequence_length), + dtype=data.dtype + ) # (batch_size, max_seq_len) + else: + assert len(data.shape) == 2 # embeddings + padded_tensor = np.zeros( + (batch_size, max_sequence_length, data.shape[-1]), + dtype=data.dtype + ) # (batch_size, max_seq_len, emb_dim) + + mask = np.arange(max_sequence_length)[None] < lengths[:, None] + + if self._is_right_aligned: + mask = np.flip(mask, axis=-1) + + padded_tensor[mask] = data + + batch[f'{self._prefix}.padded'] = padded_tensor + batch[f'{self._prefix}.mask'] = mask + + return batch + + +class SemanticIdsMapper(Transform): + def __init__(self, mapping, names=[]): + super().__init__() + self._mapping = mapping + self._names = names + + data = [] + for i in range(len(mapping)): + data.append(mapping[str(i)]) + self._mapping_tensor = torch.tensor(data, dtype=torch.long) + self._semantic_length = self._mapping_tensor.shape[-1] + + def __call__(self, batch): + for name in self._names: + if f'{name}.ids' in batch: + ids = batch[f'{name}.ids'] + lengths = batch[f'{name}.length'] + assert ids.min() >= 0 + assert ids.max() < self._mapping_tensor.shape[0]
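 + # Editor's note: every item id expands to NUM_CODEBOOKS semantic tokens below, so the + # flattened id sequence is 4x the item sequence and the lengths are scaled to match.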
+ batch[f'{name}.semantic.ids'] = self._mapping_tensor[ids].flatten().numpy() + batch[f'{name}.semantic.length'] = lengths * self._semantic_length + + return batch + + +class UserHashing(Transform): + def __init__(self, hash_size): + super().__init__() + self._hash_size = hash_size + + def __call__(self, batch): + batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64) + return batch + + +def save_batches_to_arrow(batches, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + for batch_idx, batch in enumerate(batches): + length_groups = defaultdict(dict) + metadata_groups = defaultdict(dict) + + for key, value in batch.items(): + length = len(value) + + metadata_groups[length][f'{key}_shape'] = str(value.shape) + metadata_groups[length][f'{key}_dtype'] = str(value.dtype) + + if value.ndim == 1: + # 1D array - store as is + length_groups[length][key] = value + elif value.ndim == 2: + # 2D array - store as a list of lists + length_groups[length][key] = value.tolist() + else: + # >2D array - flatten and keep the shape in the metadata + length_groups[length][key] = value.flatten() + + for length, fields in length_groups.items(): + arrow_dict = {} + for k, v in fields.items(): + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], list): + # List of lists (2D) + arrow_dict[k] = pa.array(v) + else: + arrow_dict[k] = pa.array(v) + + table = pa.table(arrow_dict) + if length in metadata_groups: + table = table.replace_schema_metadata(metadata_groups[length]) + + feather.write_feather( + table, + output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + compression='lz4' + ) + + # arrow_dict = {k: pa.array(v) for k, v in fields.items()} + # table = pa.table(arrow_dict) + + # feather.write_feather( + # table, + # output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + # compression='lz4' + # ) + + +def main(): + data = Dataset.create_timestamp_based( + train_json_path=INTERACTIONS_TRAIN_PATH, + validation_json_path=INTERACTIONS_VALID_PATH, + test_json_path=INTERACTIONS_TEST_PATH, + max_sequence_length=MAX_SEQ_LEN, + sampler_type='tiger', + min_sample_len=2, + is_extended=True + ) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataset, valid_dataset, eval_dataset = data.get_datasets() + + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=TRAIN_BATCH_SIZE, + shuffle=True, + drop_last=True + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(TigerProcessing()) + + valid_dataloader = DataLoader( + dataset=valid_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', 
is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + train_batches = [] + for train_batch in train_dataloader: + train_batches.append(train_batch) + save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR) + + valid_batches = [] + for valid_batch in valid_dataloader: + valid_batches.append(valid_batch) + save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR) + + eval_batches = [] + for eval_batch in eval_dataloader: + eval_batches.append(eval_batch) + save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR) + + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger/beauty_exps/varka_4.3_plum.py b/scripts/tiger/beauty_exps/varka_4.3_plum.py new file mode 100644 index 0000000..2a96339 --- /dev/null +++ b/scripts/tiger/beauty_exps/varka_4.3_plum.py @@ -0,0 +1,289 @@ +from collections import defaultdict +import json +import murmurhash +import numpy as np +import os +from pathlib import Path + +import pyarrow as pa +import pyarrow.feather as feather + +import torch + +from irec.data.transforms import Collate, Transform +from irec.data.dataloader import DataLoader + +from data import Dataset + + + +# PATHS + +IREC_PATH = '../../' +INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json') +INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/valid_set.json') +INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json') + +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-3_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_eval_batches/') + + + + +# MISC + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +NUM_USER_HASH = 2000 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 4 + +UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities +PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1 # trailing commas removed here as well (see varka_4.1_plum.py) +EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2 +DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3 + + + +class TigerProcessing(Transform): + def __call__(self, batch): + input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask'] + batch_size = attention_mask.shape[0] + + input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # overwrite masked (padding) positions with PAD + + input_semantic_ids = np.concatenate([ + input_semantic_ids, + NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None] + ], axis=-1) + + attention_mask = np.concatenate([ + attention_mask, + np.ones((batch_size, 1), dtype=attention_mask.dtype) + ], axis=-1) + + batch['input.data'] = input_semantic_ids + batch['input.mask'] = attention_mask + + target_semantic_ids = batch['labels.semantic.padded'] + target_semantic_ids = np.concatenate([ + np.ones( + (batch_size, 1), + dtype=np.int64, + ) * DECODER_START_TOKEN_ID, + target_semantic_ids + ], axis=-1) + + batch['output.data'] = target_semantic_ids + + return batch
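 + + # Editor's note — a worked example of ToMasked(is_right_aligned=True) as implemented below: + # for lengths [2, 3] and max length 3 the left-aligned mask [[T, T, F], [T, T, T]] is + # flipped to [[F, T, T], [T, T, T]], so shorter sequences are packed against the right + # edge; their leading positions stay zero until TigerProcessing replaces them with PAD.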
 + + +class ToMasked(Transform): + def __init__(self, prefix, is_right_aligned=False): + self._prefix = prefix + self._is_right_aligned = is_right_aligned + + def __call__(self, batch): + data = batch[f'{self._prefix}.ids'] + lengths = batch[f'{self._prefix}.length'] + + batch_size = lengths.shape[0] + max_sequence_length = int(lengths.max()) + + if len(data.shape) == 1: # only indices + padded_tensor = np.zeros( + (batch_size, max_sequence_length), + dtype=data.dtype + ) # (batch_size, max_seq_len) + else: + assert len(data.shape) == 2 # embeddings + padded_tensor = np.zeros( + (batch_size, max_sequence_length, data.shape[-1]), + dtype=data.dtype + ) # (batch_size, max_seq_len, emb_dim) + + mask = np.arange(max_sequence_length)[None] < lengths[:, None] + + if self._is_right_aligned: + mask = np.flip(mask, axis=-1) + + padded_tensor[mask] = data + + batch[f'{self._prefix}.padded'] = padded_tensor + batch[f'{self._prefix}.mask'] = mask + + return batch + + +class SemanticIdsMapper(Transform): + def __init__(self, mapping, names=[]): + super().__init__() + self._mapping = mapping + self._names = names + + data = [] + for i in range(len(mapping)): + data.append(mapping[str(i)]) + self._mapping_tensor = torch.tensor(data, dtype=torch.long) + self._semantic_length = self._mapping_tensor.shape[-1] + + def __call__(self, batch): + for name in self._names: + if f'{name}.ids' in batch: + ids = batch[f'{name}.ids'] + lengths = batch[f'{name}.length'] + assert ids.min() >= 0 + assert ids.max() < self._mapping_tensor.shape[0] + batch[f'{name}.semantic.ids'] = self._mapping_tensor[ids].flatten().numpy() + batch[f'{name}.semantic.length'] = lengths * self._semantic_length + + return batch + + +class UserHashing(Transform): + def __init__(self, hash_size): + super().__init__() + self._hash_size = hash_size + + def __call__(self, batch): + batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64) + return batch + + +def save_batches_to_arrow(batches, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + for batch_idx, batch in enumerate(batches): + length_groups = defaultdict(dict) + metadata_groups = defaultdict(dict) + + for key, value in batch.items(): + length = len(value) + + metadata_groups[length][f'{key}_shape'] = str(value.shape) + metadata_groups[length][f'{key}_dtype'] = str(value.dtype) + + if value.ndim == 1: + # 1D array - store as is + length_groups[length][key] = value + elif value.ndim == 2: + # 2D array - store as a list of lists + length_groups[length][key] = value.tolist() + else: + # >2D array - flatten and keep the shape in the metadata + length_groups[length][key] = value.flatten() + + for length, fields in length_groups.items(): + arrow_dict = {} + for k, v in fields.items(): + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], list): + # List of lists 
(2D) + arrow_dict[k] = pa.array(v) + else: + arrow_dict[k] = pa.array(v) + + table = pa.table(arrow_dict) + if length in metadata_groups: + table = table.replace_schema_metadata(metadata_groups[length]) + + feather.write_feather( + table, + output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + compression='lz4' + ) + + # arrow_dict = {k: pa.array(v) for k, v in fields.items()} + # table = pa.table(arrow_dict) + + # feather.write_feather( + # table, + # output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + # compression='lz4' + # ) + + +def main(): + data = Dataset.create_timestamp_based( + train_json_path=INTERACTIONS_TRAIN_PATH, + validation_json_path=INTERACTIONS_VALID_PATH, + test_json_path=INTERACTIONS_TEST_PATH, + max_sequence_length=MAX_SEQ_LEN, + sampler_type='tiger', + min_sample_len=2, + is_extended=True + ) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataset, valid_dataset, eval_dataset = data.get_datasets() + + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=TRAIN_BATCH_SIZE, + shuffle=True, + drop_last=True + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(TigerProcessing()) + + valid_dataloader = DataLoader( + dataset=valid_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + train_batches = [] + for train_batch in train_dataloader: + train_batches.append(train_batch) + save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR) + + valid_batches = [] + for valid_batch in valid_dataloader: + valid_batches.append(valid_batch) + save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR) + + eval_batches = [] + for eval_batch in eval_dataloader: + eval_batches.append(eval_batch) + save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR) + + + +if __name__ == '__main__': + main() diff --git a/scripts/tiger/data.py b/scripts/tiger/data.py index 188993a..a34accd 100644 --- a/scripts/tiger/data.py +++ b/scripts/tiger/data.py @@ -28,6 +28,116 @@ def __init__( self._num_items = num_items self._max_sequence_length = max_sequence_length + @classmethod + def create_timestamp_based( + cls, + train_json_path, + validation_json_path, + test_json_path, + max_sequence_length, + sampler_type, + min_sample_len=2, + is_extended=False + ): + max_item_id = 0 + train_dataset, validation_dataset, test_dataset = [], [], [] + + with open(train_json_path, 'r') as f: + train_data = json.load(f) + with open(validation_json_path, 'r') as f: + validation_data = json.load(f) + with open(test_json_path, 'r') as f: + test_data = json.load(f) + + all_users = 
set(train_data.keys()) | set(validation_data.keys()) | set(test_data.keys()) + print(f"all users count: {len(all_users)}") + for user_id_str in all_users: + user_id = int(user_id_str) + + train_items = train_data.get(user_id_str, []) + validation_items = validation_data.get(user_id_str, []) + test_items = test_data.get(user_id_str, []) + + full_sequence = train_items + validation_items + test_items + if full_sequence: + max_item_id = max(max_item_id, max(full_sequence)) + + assert len(full_sequence) >= 5, f'Core-5 dataset is used, user {user_id} has only {len(full_sequence)} items' + + if is_extended: + # sample = [1, 2] + # sample = [1, 2, 3] + # sample = [1, 2, 3, 4] + # sample = [1, 2, 3, 4, 5] + # sample = [1, 2, 3, 4, 5, 6] + # sample = [1, 2, 3, 4, 5, 6, 7] + # sample = [1, 2, 3, 4, 5, 6, 7, 8] + for prefix_length in range(min_sample_len, len(train_items) + 1): + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': train_items[:prefix_length], + }) + else: + # sample = [1, 2, 3, 4, 5, 6, 7, 8] + train_dataset.append({ + 'user.ids': [user_id], + 'item.ids': train_items, + }) + + # expand each validation item into its own sample + # Example: Train=[1,2], Valid=[3,4] + # sample = [1, 2, 3] + # sample = [1, 2, 3, 4] + + current_history = train_items.copy() + for item in validation_items: + # the eval dataset cuts off the target itself later + sample_sequence = current_history + [item] + + if len(sample_sequence) >= min_sample_len: + validation_dataset.append({ + 'user.ids': [user_id], + 'item.ids': sample_sequence, + }) + current_history.append(item) + + # expand each test item into its own sample + # Example: Train=[1,2], Valid=[3,4], Test=[5, 6] + # sample = [1, 2, 3, 4, 5] + # sample = [1, 2, 3, 4, 5, 6] + current_history = train_items + validation_items + + for item in test_items: + # the eval dataset cuts off the target itself later + sample_sequence = current_history + [item] + + if len(sample_sequence) >= min_sample_len: + test_dataset.append({ + 'user.ids': [user_id], + 'item.ids': sample_sequence, + }) + + current_history.append(item) + + logger.debug(f'Train dataset size: {len(train_dataset)}') + logger.debug(f'Validation dataset size: {len(validation_dataset)}') + logger.debug(f'Test dataset size: {len(test_dataset)}') + print(f'Train dataset size: {len(train_dataset)}') + print(f'Validation dataset size: {len(validation_dataset)}') + print(f'Test dataset size: {len(test_dataset)}') + + train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length) + validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length) + test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length) + + return cls( + train_sampler=train_sampler, + validation_sampler=validation_sampler, + test_sampler=test_sampler, + num_items=max_item_id + 1, # +1 added because our ids are 0-indexed + max_sequence_length=max_sequence_length + ) + @classmethod def create(cls, inter_json_path, max_sequence_length, sampler_type, is_extended=False): max_item_id = 0
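As a quick reference for the split semantics added above (editor's illustration with made-up item ids): with min_sample_len=2 and is_extended=True, which is how every script in this diff calls create_timestamp_based, one user's splits expand as follows:

# One user's chronological splits:
train_items      = [1, 2, 3]
validation_items = [4, 5]
test_items       = [6]

# Train: every train prefix with at least min_sample_len items.
train_samples = [[1, 2], [1, 2, 3]]

# Validation / test: the running history plus one held-out item per sample;
# the eval dataset later slices the last item off as the target.
validation_samples = [[1, 2, 3, 4], [1, 2, 3, 4, 5]]
test_samples = [[1, 2, 3, 4, 5, 6]]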
diff --git a/scripts/tiger/train.py b/scripts/tiger/train.py index f436dd4..1a2d347 100644 --- a/scripts/tiger/train.py +++ b/scripts/tiger/train.py @@ -14,10 +14,23 @@ from data import ArrowBatchDataset from models import TigerModel, CorrectItemsLogitsProcessor + +# PATHS +IREC_PATH = '../../' +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-1_plum_rqvae_beauty_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_eval_batches/') + +TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs') +CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints') + +EXPERIMENT_NAME = 'tiger_beauty_4-1_plum_ws_2_dp_0.2' + +# MISC SEED_VALUE = 42 DEVICE = 'cuda' -EXPERIMENT_NAME = 'tiger_beauty' NUM_EPOCHS = 300 MAX_SEQ_LEN = 20 TRAIN_BATCH_SIZE = 256 @@ -30,13 +43,12 @@ NUM_LAYERS = 4 FEEDFORWARD_DIM = 1024 KV_DIM = 64 -DROPOUT = 0.1 +DROPOUT = 0.2 NUM_BEAMS = 30 TOP_K = 20 NUM_CODEBOOKS = 4 -LR = 3e-4 +LR = 0.0001 -IREC_PATH = '../../' torch.set_float32_matmul_precision('high') torch._dynamo.config.capture_scalar_outputs = True @@ -48,30 +60,30 @@ def main(): fix_random_seed(SEED_VALUE) - with open(os.path.join(IREC_PATH, 'results/rqvae_beauty_best_clusters_colisionless.json'), 'r') as f: + with open(SEMANTIC_MAPPING_PATH, 'r') as f: mappings = json.load(f) - + train_dataloader = DataLoader( ArrowBatchDataset( - os.path.join(IREC_PATH, 'data/Beauty/tiger_train_batches/'), - device='cpu', + TRAIN_BATCHES_DIR, + device='cpu', preload=True ), - batch_size=1, - shuffle=True, + batch_size=1, + shuffle=True, num_workers=0, - pin_memory=True, + pin_memory=True, collate_fn=Collate() ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS) valid_dataloder = ArrowBatchDataset( - os.path.join(IREC_PATH, 'data/Beauty/tiger_valid_batches/'), + VALID_BATCHES_DIR, device=DEVICE, preload=True ) eval_dataloder = ArrowBatchDataset( - os.path.join(IREC_PATH, 'data/Beauty/tiger_eval_batches/'), + EVAL_BATCHES_DIR, device=DEVICE, preload=True ) @@ -177,22 +189,22 @@ def main(): ), ], ).every_num_steps(EPOCH_NUM_STEPS), - + cb.Logger().every_num_steps(EPOCH_NUM_STEPS), - cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')), + cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR), cb.EarlyStopping( - metric='eval/ndcg@20', + metric='eval/ndcg@20', patience=40, minimize=False, - model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME) + model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME) ).every_num_steps(EPOCH_NUM_STEPS) # cb.Profiler( # wait=10, # warmup=10, # active=10, - # logdir=os.path.join(IREC_PATH, 'tensorboard_logs') + # logdir=TENSORBOARD_LOGDIR # ), # cb.StopAfterNumSteps(40) diff --git a/scripts/tiger/varka.py b/scripts/tiger/varka.py index ed47595..4dc3e02 100644 --- a/scripts/tiger/varka.py +++ b/scripts/tiger/varka.py @@ -15,6 +15,20 @@ from data import Dataset + + +# PATHS + +IREC_PATH = '../../' +INTERACTIONS_PATH = os.path.join(IREC_PATH, 'data/Beauty/inter.json') +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/rqvae_beauty_best_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_eval_batches/') + + +# MISC + SEED_VALUE = 42 DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') @@ -32,8 +46,6 @@ DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3, -IREC_PATH = '../../' class TigerProcessing(Transform): def __call__(self, batch): @@ -42,12 +54,12 @@ def __call__(self, batch): input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # TODO ??? 
- input_semantic_ids = np.concat([ + input_semantic_ids = np.concatenate([ input_semantic_ids, NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None] ], axis=-1) - attention_mask = np.concat([ + attention_mask = np.concatenate([ attention_mask, np.ones((batch_size, 1), dtype=attention_mask.dtype) ], axis=-1) @@ -56,7 +68,7 @@ def __call__(self, batch): batch['input.mask'] = attention_mask target_semantic_ids = batch['labels.semantic.padded'] - target_semantic_ids = np.concat([ + target_semantic_ids = np.concatenate([ np.ones( (batch_size, 1), dtype=np.int64, @@ -73,7 +85,7 @@ class ToMasked(Transform): def __init__(self, prefix, is_right_aligned=False): self._prefix = prefix self._is_right_aligned = is_right_aligned - + def __call__(self, batch): data = batch[f'{self._prefix}.ids'] lengths = batch[f'{self._prefix}.length'] @@ -92,7 +104,7 @@ def __call__(self, batch): (batch_size, max_sequence_length, data.shape[-1]), dtype=data.dtype ) # (batch_size, max_seq_len, emb_dim) - + mask = np.arange(max_sequence_length)[None] < lengths[:, None] if self._is_right_aligned: @@ -117,10 +129,10 @@ def __init__(self, mapping, names=[]): data.append(mapping[str(i)]) self._mapping_tensor = torch.tensor(data, dtype=torch.long) self._semantic_length = self._mapping_tensor.shape[-1] - + def __call__(self, batch): for name in self._names: - if f'{name}.ids' in batch: + if f'{name}.ids' in batch: ids = batch[f'{name}.ids'] lengths = batch[f'{name}.length'] assert ids.min() >= 0 @@ -135,7 +147,7 @@ class UserHashing(Transform): def __init__(self, hash_size): super().__init__() self._hash_size = hash_size - + def __call__(self, batch): batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64) return batch @@ -144,7 +156,7 @@ def __call__(self, batch): def save_batches_to_arrow(batches, output_dir): output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=False) - + for batch_idx, batch in enumerate(batches): length_groups = defaultdict(dict) metadata_groups = defaultdict(dict) @@ -164,7 +176,7 @@ def save_batches_to_arrow(batches, output_dir): else: # >2D array - flatten and keep the shape in the metadata length_groups[length][key] = value.flatten() - + for length, fields in length_groups.items(): arrow_dict = {} for k, v in fields.items(): @@ -173,11 +185,11 @@ def save_batches_to_arrow(batches, output_dir): arrow_dict[k] = pa.array(v) else: arrow_dict[k] = pa.array(v) - + table = pa.table(arrow_dict) if length in metadata_groups: table = table.replace_schema_metadata(metadata_groups[length]) - + feather.write_feather( table, output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", @@ -186,7 +198,7 @@ def save_batches_to_arrow(batches, output_dir): # arrow_dict = {k: pa.array(v) for k, v in fields.items()} # table = pa.table(arrow_dict) - + # feather.write_feather( # table, # output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", @@ -196,15 +208,15 @@ def save_batches_to_arrow(batches, output_dir): def main(): data = Dataset.create( - inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter.json'), + inter_json_path=INTERACTIONS_PATH, max_sequence_length=MAX_SEQ_LEN, sampler_type='tiger', is_extended=True ) - with open(os.path.join(IREC_PATH, 'results/rqvae_beauty_best_clusters_colisionless.json'), 'r') as f: + with open(SEMANTIC_MAPPING_PATH, 'r') as f: mappings = json.load(f) - + train_dataset, valid_dataset, eval_dataset = data.get_datasets() train_dataloader = DataLoader( @@ -219,7 +231,7 @@ def main(): 
.map(ToMasked('item.semantic', is_right_aligned=True)) \ .map(ToMasked('labels.semantic', is_right_aligned=True)) \ .map(TigerProcessing()) - + valid_dataloader = DataLoader( dataset=valid_dataset, batch_size=VALID_BATCH_SIZE, @@ -251,17 +263,18 @@ def main(): train_batches = [] for train_batch in train_dataloader: train_batches.append(train_batch) - save_batches_to_arrow(train_batches, os.path.join(IREC_PATH, 'data/Beauty/tiger_train_batches/')) - + save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR) + valid_batches = [] for valid_batch in valid_dataloader: valid_batches.append(valid_batch) - save_batches_to_arrow(valid_batches, os.path.join(IREC_PATH, 'data/Beauty/tiger_valid_batches/')) - + save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR) + eval_batches = [] for eval_batch in eval_dataloader: eval_batches.append(eval_batch) - save_batches_to_arrow(eval_batches, os.path.join(IREC_PATH, 'data/Beauty/tiger_eval_batches/')) + save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR) + if __name__ == '__main__': diff --git a/scripts/tiger/varka_timestamp_based.py b/scripts/tiger/varka_timestamp_based.py new file mode 100644 index 0000000..11343ea --- /dev/null +++ b/scripts/tiger/varka_timestamp_based.py @@ -0,0 +1,287 @@ +from collections import defaultdict +import json +import murmurhash +import numpy as np +import os +from pathlib import Path + +import pyarrow as pa +import pyarrow.feather as feather + +import torch + +from irec.data.transforms import Collate, Transform +from irec.data.dataloader import DataLoader + +from data import Dataset + + + +# PATHS + +IREC_PATH = '../../' +INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/splits/exp_data/exp_4_inter_tiger_train.json') +INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/splits/exp_data/valid_skip_set.json') +INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/splits/exp_data/test_set.json') + +SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-1_plum_rqvae_beauty_ws_2_clusters_colisionless.json') +TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_train_batches/') +VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_valid_batches/') +EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_eval_batches/') + + +# MISC + +SEED_VALUE = 42 +DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + + +MAX_SEQ_LEN = 20 +TRAIN_BATCH_SIZE = 256 +VALID_BATCH_SIZE = 1024 +NUM_USER_HASH = 2000 +CODEBOOK_SIZE = 256 +NUM_CODEBOOKS = 4 + +UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities +PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1 # trailing commas removed here as well (see varka_4.1_plum.py) +EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2 +DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3 + + + +class TigerProcessing(Transform): + def __call__(self, batch): + input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask'] + batch_size = attention_mask.shape[0] + + input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # overwrite masked (padding) positions with PAD
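 + # Editor's note: the user token appended below has id NUM_CODEBOOKS * CODEBOOK_SIZE + hash, + # i.e. it lands in the [1024, 3024) range reserved for hashed users, and it is always + # attended to (a column of ones is appended to the mask).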
+ + input_semantic_ids = np.concatenate([ + input_semantic_ids, + NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None] + ], axis=-1) + + attention_mask = np.concatenate([ + attention_mask, + np.ones((batch_size, 1), dtype=attention_mask.dtype) + ], axis=-1) + + batch['input.data'] = input_semantic_ids + batch['input.mask'] = attention_mask + + target_semantic_ids = batch['labels.semantic.padded'] + target_semantic_ids = np.concatenate([ + np.ones( + (batch_size, 1), + dtype=np.int64, + ) * DECODER_START_TOKEN_ID, + target_semantic_ids + ], axis=-1) + + batch['output.data'] = target_semantic_ids + + return batch + + +class ToMasked(Transform): + def __init__(self, prefix, is_right_aligned=False): + self._prefix = prefix + self._is_right_aligned = is_right_aligned + + def __call__(self, batch): + data = batch[f'{self._prefix}.ids'] + lengths = batch[f'{self._prefix}.length'] + + batch_size = lengths.shape[0] + max_sequence_length = int(lengths.max()) + + if len(data.shape) == 1: # only indices + padded_tensor = np.zeros( + (batch_size, max_sequence_length), + dtype=data.dtype + ) # (batch_size, max_seq_len) + else: + assert len(data.shape) == 2 # embeddings + padded_tensor = np.zeros( + (batch_size, max_sequence_length, data.shape[-1]), + dtype=data.dtype + ) # (batch_size, max_seq_len, emb_dim) + + mask = np.arange(max_sequence_length)[None] < lengths[:, None] + + if self._is_right_aligned: + mask = np.flip(mask, axis=-1) + + padded_tensor[mask] = data + + batch[f'{self._prefix}.padded'] = padded_tensor + batch[f'{self._prefix}.mask'] = mask + + return batch + + +class SemanticIdsMapper(Transform): + def __init__(self, mapping, names=[]): + super().__init__() + self._mapping = mapping + self._names = names + + data = [] + for i in range(len(mapping)): + data.append(mapping[str(i)]) + self._mapping_tensor = torch.tensor(data, dtype=torch.long) + self._semantic_length = self._mapping_tensor.shape[-1] + + def __call__(self, batch): + for name in self._names: + if f'{name}.ids' in batch: + ids = batch[f'{name}.ids'] + lengths = batch[f'{name}.length'] + assert ids.min() >= 0 + assert ids.max() < self._mapping_tensor.shape[0] + batch[f'{name}.semantic.ids'] = self._mapping_tensor[ids].flatten().numpy() + batch[f'{name}.semantic.length'] = lengths * self._semantic_length + + return batch + + +class UserHashing(Transform): + def __init__(self, hash_size): + super().__init__() + self._hash_size = hash_size + + def __call__(self, batch): + batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64) + return batch + + +def save_batches_to_arrow(batches, output_dir): + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + for batch_idx, batch in enumerate(batches): + length_groups = defaultdict(dict) + metadata_groups = defaultdict(dict) + + for key, value in batch.items(): + length = len(value) + + metadata_groups[length][f'{key}_shape'] = str(value.shape) + metadata_groups[length][f'{key}_dtype'] = str(value.dtype) + + if value.ndim == 1: + # 1D array - store as is + length_groups[length][key] = value + elif value.ndim == 2: + # 2D array - store as a list of lists + length_groups[length][key] = value.tolist() + else: + # >2D array - flatten and keep the shape in the metadata + length_groups[length][key] = value.flatten() + + for length, fields in length_groups.items(): + arrow_dict = {} + for k, v in fields.items(): + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], list): + # List of lists 
(2D) + arrow_dict[k] = pa.array(v) + else: + arrow_dict[k] = pa.array(v) + + table = pa.table(arrow_dict) + if length in metadata_groups: + table = table.replace_schema_metadata(metadata_groups[length]) + + feather.write_feather( + table, + output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + compression='lz4' + ) + + # arrow_dict = {k: pa.array(v) for k, v in fields.items()} + # table = pa.table(arrow_dict) + + # feather.write_feather( + # table, + # output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow", + # compression='lz4' + # ) + + +def main(): + data = Dataset.create_timestamp_based( + train_json_path=INTERACTIONS_TRAIN_PATH, + validation_json_path=INTERACTIONS_VALID_PATH, + test_json_path=INTERACTIONS_TEST_PATH, + max_sequence_length=MAX_SEQ_LEN, + sampler_type='tiger', + min_sample_len=2, + is_extended=True + ) + + with open(SEMANTIC_MAPPING_PATH, 'r') as f: + mappings = json.load(f) + + train_dataset, valid_dataset, eval_dataset = data.get_datasets() + + train_dataloader = DataLoader( + dataset=train_dataset, + batch_size=TRAIN_BATCH_SIZE, + shuffle=True, + drop_last=True + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(TigerProcessing()) + + valid_dataloader = DataLoader( + dataset=valid_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + eval_dataloader = DataLoader( + dataset=eval_dataset, + batch_size=VALID_BATCH_SIZE, + shuffle=False, + drop_last=False + ) \ + .map(Collate()) \ + .map(UserHashing(NUM_USER_HASH)) \ + .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \ + .map(ToMasked('item.semantic', is_right_aligned=True)) \ + .map(ToMasked('labels.semantic', is_right_aligned=True)) \ + .map(ToMasked('visited', is_right_aligned=True)) \ + .map(TigerProcessing()) + + train_batches = [] + for train_batch in train_dataloader: + train_batches.append(train_batch) + save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR) + + valid_batches = [] + for valid_batch in valid_dataloader: + valid_batches.append(valid_batch) + save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR) + + eval_batches = [] + for eval_batch in eval_dataloader: + eval_batches.append(eval_batch) + save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR) + + + +if __name__ == '__main__': + main() diff --git a/sigir/Beauty/DatasetProcessing.ipynb b/sigir/Beauty/DatasetProcessing.ipynb new file mode 100644 index 0000000..b49f4ab --- /dev/null +++ b/sigir/Beauty/DatasetProcessing.ipynb @@ -0,0 +1,856 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3bdb292f", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "import polars as pl\n", + "\n", + "from transformers import LlamaModel, LlamaTokenizer\n", + "\n", + "import torch\n", + "from torch.utils.data import DataLoader\n", + "\n", + "from tqdm import tqdm as tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": 
"66d9b312", + "metadata": {}, + "outputs": [], + "source": [ + "interactions_dataset_path = '../data/Beauty/Beauty_5.json'\n", + "metadata_path = '../data/Beauty/metadata.json'\n", + "\n", + "interactions_output_json_path = '../data/Beauty_new/inter_new.json'\n", + "interactions_output_parquet_path = '../data/Beauty_new/inter_new.parquet'\n", + "embeddings_output_path = '../data/Beauty_new/content_embeddings.pkl'\n", + "item_ids_mapping_output_path = '../data/Beauty_new/item_ids_mapping.json'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6ed4dffb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of events: 198502\n" + ] + } + ], + "source": [ + "df = defaultdict(list)\n", + "\n", + "with open(interactions_dataset_path, 'r') as f:\n", + " for line in f.readlines():\n", + " review = json.loads(line)\n", + " df['user_id'].append(review['reviewerID'])\n", + " df['item_id'].append(review['asin'])\n", + " df['timestamp'].append(review['unixReviewTime'])\n", + "\n", + "print(f'Number of events: {len(df[\"user_id\"])}')\n", + "\n", + "df = pl.from_dict(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c26746c4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
user_iditem_idtimestamp
strstri64
"A1YJEY40YUW4SE""7806397051"1391040000
"A60XNB876KYML""7806397051"1397779200
"A3G6XNM240RMWA""7806397051"1378425600
"A1PQFP6SAJ6D80""7806397051"1386460800
"A38FVHZTNQ271F""7806397051"1382140800
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌────────────────┬────────────┬────────────┐\n", + "│ user_id ┆ item_id ┆ timestamp │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ i64 │\n", + "╞════════════════╪════════════╪════════════╡\n", + "│ A1YJEY40YUW4SE ┆ 7806397051 ┆ 1391040000 │\n", + "│ A60XNB876KYML ┆ 7806397051 ┆ 1397779200 │\n", + "│ A3G6XNM240RMWA ┆ 7806397051 ┆ 1378425600 │\n", + "│ A1PQFP6SAJ6D80 ┆ 7806397051 ┆ 1386460800 │\n", + "│ A38FVHZTNQ271F ┆ 7806397051 ┆ 1382140800 │\n", + "└────────────────┴────────────┴────────────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "adcf5713", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df = df.clone()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0bbf9ba", + "metadata": {}, + "outputs": [], + "source": [ + "# Processing dataset to get core-5 state in case full dataset is provided\n", + "is_changed = True\n", + "threshold = 5\n", + "good_users = set()\n", + "good_items = set()\n", + "\n", + "while is_changed:\n", + " user_counts = filtered_df.group_by('user_id').agg(\n", + " pl.len().alias('user_count'),\n", + " )\n", + " item_counts = filtered_df.group_by('item_id').agg(\n", + " pl.len().alias('item_count'),\n", + " )\n", + "\n", + " good_users = user_counts.filter(pl.col('user_count') >= threshold).select(\n", + " 'user_id',\n", + " )\n", + " good_items = item_counts.filter(pl.col('item_count') >= threshold).select(\n", + " 'item_id',\n", + " )\n", + "\n", + " old_size = len(filtered_df)\n", + "\n", + " new_df = filtered_df.join(good_users, on='user_id', how='inner')\n", + " new_df = new_df.join(good_items, on='item_id', how='inner')\n", + "\n", + " new_size = len(new_df)\n", + "\n", + " filtered_df = new_df\n", + " is_changed = old_size != new_size\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "218a9348", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
user_iditem_idtimestamp
i64i64i64
001391040000
101397779200
201378425600
301386460800
401382140800
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌─────────┬─────────┬────────────┐\n", + "│ user_id ┆ item_id ┆ timestamp │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 │\n", + "╞═════════╪═════════╪════════════╡\n", + "│ 0 ┆ 0 ┆ 1391040000 │\n", + "│ 1 ┆ 0 ┆ 1397779200 │\n", + "│ 2 ┆ 0 ┆ 1378425600 │\n", + "│ 3 ┆ 0 ┆ 1386460800 │\n", + "│ 4 ┆ 0 ┆ 1382140800 │\n", + "└─────────┴─────────┴────────────┘" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_values = filtered_df[\"user_id\"].unique(maintain_order=True).to_list()\n", + "user_ids_mapping = {value: i for i, value in enumerate(unique_values)}\n", + "\n", + "filtered_df = filtered_df.with_columns(\n", + " pl.col(\"user_id\").replace_strict(user_ids_mapping)\n", + ")\n", + "\n", + "unique_values = filtered_df[\"item_id\"].unique(maintain_order=True).to_list()\n", + "item_ids_mapping = {value: i for i, value in enumerate(unique_values)}\n", + "\n", + "filtered_df = filtered_df.with_columns(\n", + " pl.col(\"item_id\").replace_strict(item_ids_mapping)\n", + ")\n", + "\n", + "filtered_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "34604fe6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
old_item_idnew_item_id
stri64
"7806397051"0
"9759091062"1
"9788072216"2
"9790790961"3
"9790794231"4
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────────────┬─────────────┐\n", + "│ old_item_id ┆ new_item_id │\n", + "│ --- ┆ --- │\n", + "│ str ┆ i64 │\n", + "╞═════════════╪═════════════╡\n", + "│ 7806397051 ┆ 0 │\n", + "│ 9759091062 ┆ 1 │\n", + "│ 9788072216 ┆ 2 │\n", + "│ 9790790961 ┆ 3 │\n", + "│ 9790794231 ┆ 4 │\n", + "└─────────────┴─────────────┘" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_ids_mapping_df = pl.from_dict({\n", + " 'old_item_id': list(item_ids_mapping.keys()),\n", + " 'new_item_id': list(item_ids_mapping.values())\n", + "})\n", + "item_ids_mapping_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "99b54db807b9495c", + "metadata": {}, + "outputs": [], + "source": [ + "with open(item_ids_mapping_output_path, 'w') as f:\n", + " json.dump({str(k): v for k, v in item_ids_mapping.items()}, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "6017e65c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
user_iditem_idtimestamp
i64i64i64
001391040000
101397779200
201378425600
301386460800
401382140800
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌─────────┬─────────┬────────────┐\n", + "│ user_id ┆ item_id ┆ timestamp │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ i64 ┆ i64 │\n", + "╞═════════╪═════════╪════════════╡\n", + "│ 0 ┆ 0 ┆ 1391040000 │\n", + "│ 1 ┆ 0 ┆ 1397779200 │\n", + "│ 2 ┆ 0 ┆ 1378425600 │\n", + "│ 3 ┆ 0 ┆ 1386460800 │\n", + "│ 4 ┆ 0 ┆ 1382140800 │\n", + "└─────────┴─────────┴────────────┘" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filtered_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "9efd1983", + "metadata": {}, + "outputs": [], + "source": [ + "filtered_df = filtered_df.sort([\"user_id\", \"timestamp\"])\n", + "\n", + "grouped_filtered_df = filtered_df.group_by(\"user_id\", maintain_order=True).agg(pl.all())" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "fd51c525", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 2)
old_item_idnew_item_id
stri64
"7806397051"0
"9759091062"1
"9788072216"2
"9790790961"3
"9790794231"4
" + ], + "text/plain": [ + "shape: (5, 2)\n", + "┌─────────────┬─────────────┐\n", + "│ old_item_id ┆ new_item_id │\n", + "│ --- ┆ --- │\n", + "│ str ┆ i64 │\n", + "╞═════════════╪═════════════╡\n", + "│ 7806397051 ┆ 0 │\n", + "│ 9759091062 ┆ 1 │\n", + "│ 9788072216 ┆ 2 │\n", + "│ 9790790961 ┆ 3 │\n", + "│ 9790794231 ┆ 4 │\n", + "└─────────────┴─────────────┘" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_ids_mapping_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "8b0821da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
user_iditem_idtimestamp
i64list[i64]list[i64]
0[6845, 7872, … 0][1318896000, 1318896000, … 1391040000]
1[815, 10405, … 232][1392422400, 1396224000, … 1397779200]
2[6049, 0, … 6608][1378425600, 1378425600, … 1400284800]
3[5521, 5160, … 0][1379116800, 1380931200, … 1386460800]
4[0, 10469, … 11389][1382140800, 1383523200, … 1388966400]
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌─────────┬─────────────────────┬─────────────────────────────────┐\n", + "│ user_id ┆ item_id ┆ timestamp │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ list[i64] ┆ list[i64] │\n", + "╞═════════╪═════════════════════╪═════════════════════════════════╡\n", + "│ 0 ┆ [6845, 7872, … 0] ┆ [1318896000, 1318896000, … 139… │\n", + "│ 1 ┆ [815, 10405, … 232] ┆ [1392422400, 1396224000, … 139… │\n", + "│ 2 ┆ [6049, 0, … 6608] ┆ [1378425600, 1378425600, … 140… │\n", + "│ 3 ┆ [5521, 5160, … 0] ┆ [1379116800, 1380931200, … 138… │\n", + "│ 4 ┆ [0, 10469, … 11389] ┆ [1382140800, 1383523200, … 138… │\n", + "└─────────┴─────────────────────┴─────────────────────────────────┘" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_filtered_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "dc222d59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Users count: 22363\n", + "Items count: 12101\n", + "Actions count: 198502\n", + "Avg user history len: 8.876358270357287\n" + ] + } + ], + "source": [ + "print('Users count:', filtered_df.select('user_id').unique().shape[0])\n", + "print('Items count:', filtered_df.select('item_id').unique().shape[0])\n", + "print('Actions count:', filtered_df.shape[0])\n", + "print('Avg user history len:', np.mean(list(map(lambda x: x[0], grouped_filtered_df.select(pl.col('item_id').list.len()).rows()))))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a272855d-84b2-4414-ba9f-62647e1151cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (5, 3)\n", + "┌─────┬─────────────────────┬─────────────────────────────────┐\n", + "│ uid ┆ item_ids ┆ timestamps │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ list[i64] ┆ list[i64] │\n", + "╞═════╪═════════════════════╪═════════════════════════════════╡\n", + "│ 0 ┆ [6845, 7872, … 0] ┆ [1318896000, 1318896000, … 139… │\n", + "│ 1 ┆ [815, 10405, … 232] ┆ [1392422400, 1396224000, … 139… │\n", + "│ 2 ┆ [6049, 0, … 6608] ┆ [1378425600, 1378425600, … 140… │\n", + "│ 3 ┆ [5521, 5160, … 0] ┆ [1379116800, 1380931200, … 138… │\n", + "│ 4 ┆ [0, 10469, … 11389] ┆ [1382140800, 1383523200, … 138… │\n", + "└─────┴─────────────────────┴─────────────────────────────────┘\n" + ] + } + ], + "source": [ + "inter_new = grouped_filtered_df.select([\n", + " pl.col(\"user_id\").alias(\"uid\"),\n", + " pl.col(\"item_id\").alias(\"item_ids\"),\n", + " pl.col(\"timestamp\").alias(\"timestamps\")\n", + "])\n", + "\n", + "print(inter_new.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "de5a853a-8ee2-42dd-a71a-6cc6f90d526c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Файл успешно сохранен: ../data/Beauty_new/inter_new.parquet\n" + ] + } + ], + "source": [ + "output_path_parquet = interactions_output_parquet_path\n", + "inter_new.write_parquet(output_path_parquet)\n", + "\n", + "print(f\"Файл успешно сохранен: {output_path_parquet}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d07a2e91", + "metadata": {}, + "outputs": [], + "source": [ + "json_data = {}\n", + "for user_id, item_ids, _ in grouped_filtered_df.iter_rows():\n", + " json_data[user_id] = item_ids\n", + "\n", + "with open(interactions_output_json_path, 'w') as f:\n", + " json.dump(json_data, f, indent=2)" + ] + }, + { + "cell_type": 
"markdown", + "id": "237523fa", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Content embedding creation" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6361c7a5", + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[19], line 5\u001b[0m, in \u001b[0;36mgetDF\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlines\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 6\u001b[0m df[i] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28meval\u001b[39m(line)\n", + "File \u001b[0;32m/usr/lib/python3.10/codecs.py:319\u001b[0m, in \u001b[0;36mBufferedIncrementalDecoder.decode\u001b[0;34m(self, input, final)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m\n\u001b[0;32m--> 319\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, final\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m 320\u001b[0m \u001b[38;5;66;03m# decode input (taking the buffer into account)\u001b[39;00m\n\u001b[1;32m 321\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuffer \u001b[38;5;241m+\u001b[39m \u001b[38;5;28minput\u001b[39m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: ", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[19], line 11\u001b[0m\n\u001b[1;32m 7\u001b[0m i \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pd\u001b[38;5;241m.\u001b[39mDataFrame\u001b[38;5;241m.\u001b[39mfrom_dict(df, orient\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 11\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mgetDF\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmetadata_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m df\u001b[38;5;241m.\u001b[39mhead()\n", + "Cell \u001b[0;32mIn[19], line 5\u001b[0m, in \u001b[0;36mgetDF\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 3\u001b[0m df \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlines\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 6\u001b[0m df[i] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28meval\u001b[39m(line)\n\u001b[1;32m 
7\u001b[0m i \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "def getDF(path):\n", + " i = 0\n", + " df = {}\n", + " with open(path, 'r') as f:\n", + " for line in f.readlines():\n", + " df[i] = eval(line)\n", + " i += 1\n", + "\n", + " return pd.DataFrame.from_dict(df, orient=\"index\")\n", + "\n", + "df = getDF(metadata_path)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "971fa89c", + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(row: pd.Series):\n", + " row = row.fillna(\"None\")\n", + " return f\"Title: {row['title']}. Categories: {', '.join(row['categories'][0])}. Description: {row['description']}.\"\n", + "\n", + "\n", + "def get_data(metadata_df, item_ids_mapping_df):\n", + " filtered_df = metadata_df.join(\n", + " item_ids_mapping_df, \n", + " left_on=\"asin\", \n", + " right_on='old_item_id', \n", + " how=\"inner\"\n", + " ).select(pl.col('new_item_id'), pl.col('title'), pl.col('description'), pl.col('categories'))\n", + "\n", + " filtered_df = filtered_df.to_pandas()\n", + " filtered_df[\"combined_text\"] = filtered_df.apply(preprocess, axis=1)\n", + "\n", + " return filtered_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b0dd5d5", + "metadata": {}, + "outputs": [], + "source": [ + "data = get_data(pl.from_pandas(df), item_ids_mapping)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12e622ff", + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device('cuda:6')\n", + "\n", + "model_name = \"huggyllama/llama-7b\"\n", + "tokenizer = LlamaTokenizer.from_pretrained(model_name)\n", + "\n", + "if tokenizer.pad_token is None:\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + "\n", + "model = LlamaModel.from_pretrained(model_name)\n", + "model = model.to(device)\n", + "model = model.eval()\n", + "\n", + "\n", + "class MyDataset:\n", + " def __init__(self, data):\n", + " self._data = list(zip(data.to_dict()['new_item_id'].values(), data.to_dict()['combined_text'].values()))\n", + "\n", + " def __len__(self):\n", + " return len(self._data)\n", + "\n", + " def __getitem__(self, idx):\n", + " text = self._data[idx][1]\n", + " inputs = tokenizer(text, return_tensors=\"pt\", max_length=1024, truncation=True, padding=\"max_length\")\n", + " return {\n", + " 'item_id': self._data[idx][0],\n", + " 'input_ids': inputs['input_ids'][0],\n", + " 'attention_mask': inputs['attention_mask'][0]\n", + " }\n", + " \n", + "\n", + "dataset = MyDataset(data)\n", + "loader = DataLoader(dataset, batch_size=8, drop_last=False, shuffle=False, num_workers=10)\n", + "\n", + "\n", + "new_df = {\n", + " 'item_id': [],\n", + " 'embedding': []\n", + "}\n", + "\n", + "for batch in tqdm(loader):\n", + " with torch.inference_mode():\n", + " outputs = model(\n", + " input_ids=batch[\"input_ids\"].to(device), \n", + " attention_mask=batch[\"attention_mask\"].to(device)\n", + " )\n", + " embeddings = outputs.last_hidden_state\n", + " \n", + " embeddings = outputs.last_hidden_state # (bs, sl, ed)\n", + " embeddings[(~batch[\"attention_mask\"].bool())] = 0. 
# (bs, sl, ed)\n", + "\n", + " new_df['item_id'] += batch['item_id'].tolist()\n", + " new_df['embedding'] += embeddings.mean(dim=1).tolist() # (bs, ed)\n", + "\n", + "\n", + "with open(embeddings_output_path, 'wb') as f:\n", + " pickle.dump(new_df, f)\n" + ] + }, + { + "cell_type": "markdown", + "id": "a6fffc4a-85f1-424e-b460-29e526df3317", + "metadata": {}, + "source": [ + "# Test" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1f922431-e3c1-4587-86d1-04a58eb8ffee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[0_1291403520.0).json\n", + "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[1291403520.0_1329626880.0).json\n", + "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[1329626880.0_1367850240.0).json\n", + "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[1367850240.0_inf).json\n", + "Интервал 0: 3485 пользователей, 10350 взаимодействий\n", + "Интервал 1: 5751 пользователей, 15837 взаимодействий\n", + "Интервал 2: 13543 пользователей, 61954 взаимодействий\n", + "Интервал 3: 18811 пользователей, 110361 взаимодействий\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "import json\n", + "from typing import List, Dict\n", + "\n", + "def split_session_by_timestamps(\n", + " df: pl.DataFrame,\n", + " time_cutoffs: List[int],\n", + " output_dir: str = None,\n", + " return_dicts: bool = True\n", + ") -> List[Dict[int, List[int]]]:\n", + " \"\"\"\n", + " Разбивает датасет по временным интервалам и возвращает JSON-подобные словари.\n", + " \n", + " Args:\n", + " df: Polars DataFrame с колонками uid, item_ids (list), timestamps (list)\n", + " time_cutoffs: Лист временных точек для разбиения\n", + " output_dir: Директория для сохранения JSON файлов (опционально)\n", + " return_dicts: Возвращать ли словари (как json_data format)\n", + " \n", + " Returns:\n", + " Лист словарей в формате {user_id: [item_ids для интервала]}\n", + " \n", + " Example:\n", + " >>> df = pl.read_parquet(\"inter_new.parquet\")\n", + " >>> cutoffs = [1000000, 2000000, 3000000]\n", + " >>> dicts = split_session_by_timestamps(df, cutoffs, output_dir=\"./data\")\n", + " >>> # Получим 4 JSON файла за каждый интервал + последний\n", + " \"\"\"\n", + " \n", + " result_dicts = []\n", + " \n", + " def extract_interval(df_source, start, end=None):\n", + " \"\"\"Извлекает данные для одного временного интервала\"\"\"\n", + " q = df_source.lazy()\n", + " q = q.explode([\"item_ids\", \"timestamps\"])\n", + " \n", + " if end is not None:\n", + " q = q.filter(\n", + " (pl.col(\"timestamps\") >= start) & \n", + " (pl.col(\"timestamps\") < end)\n", + " )\n", + " else:\n", + " q = q.filter(\n", + " pl.col(\"timestamps\") >= start\n", + " )\n", + " \n", + " q = q.group_by(\"uid\").agg([\n", + " pl.col(\"item_ids\").alias(\"item_ids\")\n", + " ]).sort(\"uid\")\n", + " \n", + " return q.collect()\n", + " \n", + " # Генерируем интервалы\n", + " intervals = []\n", + " current_start = 0\n", + " for cutoff in time_cutoffs:\n", + " intervals.append((current_start, cutoff))\n", + " current_start = cutoff\n", + " # Последний интервал от последнего cutoff до бесконечности\n", + " intervals.append((current_start, None))\n", + " \n", + " # Обрабатываем каждый интервал\n", + " for start, end in intervals:\n", + " subset = extract_interval(df, start, end)\n", + " \n", + " # Конвертируем в JSON-подобный словарь\n", + " json_dict = {}\n", + " for user_id, item_ids in subset.iter_rows():\n", + " json_dict[user_id] = 
item_ids\n", + " \n", + " result_dicts.append(json_dict)\n", + " \n", + " # Опционально сохраняем файлы\n", + " if output_dir:\n", + " if end is not None:\n", + " filename = f\"inter_new_[{start}_{end}).json\"\n", + " else:\n", + " filename = f\"inter_new_[{start}_inf).json\"\n", + " \n", + " filepath = f\"{output_dir}/{filename}\"\n", + " with open(filepath, 'w') as f:\n", + " json.dump(json_dict, f, indent=2)\n", + " \n", + " print(f\"✓ Сохранено: {filepath}\")\n", + " \n", + " return result_dicts\n", + "\n", + "\n", + "# ==========================================\n", + "# Использование в ноутбуке\n", + "# ==========================================\n", + "\n", + "# Загружаем сохраненный Parquet файл\n", + "df = pl.read_parquet(interactions_output_parquet_path)\n", + "\n", + "# Определяем временные точки разбиения (можно использовать процентили или конкретные даты)\n", + "# Например: разбить на 70%, 80%, 90% от времени\n", + "df_timestamps = df.select(\n", + " pl.col(\"timestamps\").explode()\n", + ")\n", + "min_time = df_timestamps.select(pl.col(\"timestamps\").min()).item()\n", + "max_time = df_timestamps.select(pl.col(\"timestamps\").max()).item()\n", + "\n", + "# Разделяем на 4 части (train/val/test/test_final)\n", + "cutoffs = [\n", + " min_time + (max_time - min_time) * 0.7, # 70%\n", + " min_time + (max_time - min_time) * 0.8, # 80%\n", + " min_time + (max_time - min_time) * 0.9, # 90%\n", + "]\n", + "\n", + "# Применяем функцию\n", + "result_dicts = split_session_by_timestamps(\n", + " df, \n", + " cutoffs,\n", + " output_dir=\"../data/Beauty_new/splits\" # Опционально\n", + ")\n", + "\n", + "# Выводим статистику\n", + "for i, json_dict in enumerate(result_dicts):\n", + " total_interactions = sum(len(items) for items in json_dict.values())\n", + " print(f\"Интервал {i}: {len(json_dict)} пользователей, {total_interactions} взаимодействий\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "73b5ec51-4d94-4021-9a21-3f4345ecdd26", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[0_inf).json\n" + ] + } + ], + "source": [ + "split_session_by_timestamps(\n", + " df, \n", + " [],\n", + " output_dir=\"../data/Beauty_new/splits\"\n", + ")\n", + "None" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sigir/Beauty/exps_data.ipynb b/sigir/Beauty/exps_data.ipynb new file mode 100644 index 0000000..2625231 --- /dev/null +++ b/sigir/Beauty/exps_data.ipynb @@ -0,0 +1,921 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "e2462a97-6705-44e1-a232-4dd78a5dfc85", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "import json\n", + "from typing import List, Dict" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "fd38624d-5796-4aa5-929f-7e82c5544f6c", + "metadata": {}, + "outputs": [], + "source": [ + "interactions_output_parquet_path = '/home/jovyan/IRec/sigir/Beauty_new/inter_new.parquet'\n", + "# 1. 
+    "df = pl.read_parquet(interactions_output_parquet_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "ee127317-66b8-4f22-9109-94bcb8b1f1ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split_session_by_timestamps(\n",
+    "    df: pl.DataFrame,\n",
+    "    time_cutoffs: List[int],\n",
+    "    output_dir: Optional[str] = None\n",
+    ") -> List[Dict[int, List[int]]]:\n",
+    "    \"\"\"\n",
+    "    Splits the dataset by time intervals and returns JSON-like dicts.\n",
+    "    \n",
+    "    Args:\n",
+    "        df: Polars DataFrame with columns uid, item_ids (list), timestamps (list)\n",
+    "        time_cutoffs: List of time points to split at\n",
+    "        output_dir: Directory for saving the JSON files (optional)\n",
+    "    \n",
+    "    Returns:\n",
+    "        List of dicts in the format {user_id: [item_ids for the interval]}\n",
+    "    \n",
+    "    Example:\n",
+    "        >>> df = pl.read_parquet(\"inter_new.parquet\")\n",
+    "        >>> cutoffs = [1000000, 2000000, 3000000]\n",
+    "        >>> dicts = split_session_by_timestamps(df, cutoffs, output_dir=\"./data\")\n",
+    "        >>> # Produces 4 JSON files, one per interval including the final open-ended one\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    result_dicts = []\n",
+    "    \n",
+    "    def extract_interval(df_source, start, end=None):\n",
+    "        \"\"\"Extracts the data for a single time interval\"\"\"\n",
+    "        q = df_source.lazy()\n",
+    "        q = q.explode([\"item_ids\", \"timestamps\"])\n",
+    "        \n",
+    "        if end is not None:\n",
+    "            q = q.filter(\n",
+    "                (pl.col(\"timestamps\") >= start) & \n",
+    "                (pl.col(\"timestamps\") < end)\n",
+    "            )\n",
+    "        else:\n",
+    "            q = q.filter(\n",
+    "                pl.col(\"timestamps\") >= start\n",
+    "            )\n",
+    "        \n",
+    "        q = q.group_by(\"uid\").agg([\n",
+    "            pl.col(\"item_ids\").alias(\"item_ids\")\n",
+    "        ]).sort(\"uid\")\n",
+    "        \n",
+    "        return q.collect()\n",
+    "    \n",
+    "    # Generate the intervals\n",
+    "    intervals = []\n",
+    "    current_start = 0\n",
+    "    for cutoff in time_cutoffs:\n",
+    "        intervals.append((current_start, cutoff))\n",
+    "        current_start = cutoff\n",
+    "    # The last interval runs from the last cutoff to infinity\n",
+    "    intervals.append((current_start, None))\n",
+    "    \n",
+    "    # Process each interval\n",
+    "    for start, end in intervals:\n",
+    "        subset = extract_interval(df, start, end)\n",
+    "        \n",
+    "        # Convert to a JSON-like dict\n",
+    "        json_dict = {}\n",
+    "        for user_id, item_ids in subset.iter_rows():\n",
+    "            json_dict[user_id] = item_ids\n",
+    "        \n",
+    "        result_dicts.append(json_dict)\n",
+    "        \n",
+    "        # Optionally save the files\n",
+    "        if output_dir:\n",
+    "            if end is not None:\n",
+    "                filename = f\"inter_new_[{start}_{end}).json\"\n",
+    "            else:\n",
+    "                filename = f\"inter_new_[{start}_inf).json\"\n",
+    "            \n",
+    "            filepath = f\"{output_dir}/{filename}\"\n",
+    "            with open(filepath, 'w') as f:\n",
+    "                json.dump(json_dict, f, indent=2)\n",
+    "            \n",
+    "            print(f\"✓ Saved: {filepath}\")\n",
+    "    \n",
+    "    return result_dicts"
+   ]
+  },
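+  {
+   "cell_type": "markdown",
+   "id": "added-input-check-md",
+   "metadata": {},
+   "source": [
+    "Added sanity check (a sketch, not in the original notebook): the loaded frame is expected to have one row per user, with `item_ids` and `timestamps` lists of equal length. The cell below asserts exactly that before any splitting is done."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "added-input-check",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Added sketch: validate the input frame before splitting.\n",
+    "assert df.columns == ['uid', 'item_ids', 'timestamps']\n",
+    "assert df.select((pl.col('item_ids').list.len() == pl.col('timestamps').list.len()).all()).item()\n",
+    "df.height"
+   ]
+  },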
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "efc8b582-dd8a-4299-9c49-de906251de8a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cutoffs: [1402444800, 1403654400, 1404864000]\n",
+      "✓ Saved: /home/jovyan/IRec/sigir/Beauty_new/splits/test_splits/inter_new_[0_1402444800).json\n",
+      "✓ Saved: /home/jovyan/IRec/sigir/Beauty_new/splits/test_splits/inter_new_[1402444800_1403654400).json\n",
+      "✓ Saved: /home/jovyan/IRec/sigir/Beauty_new/splits/test_splits/inter_new_[1403654400_1404864000).json\n",
+      "✓ Saved: /home/jovyan/IRec/sigir/Beauty_new/splits/test_splits/inter_new_[1404864000_inf).json\n",
+      "Part 0 [Base]: 22029 users\n",
+      "Part 1 [Week -6]: 1854 users\n",
+      "Part 2 [Week -4]: 1945 users\n",
+      "Part 3 [Week -2]: 1381 users\n"
+     ]
+    }
+   ],
+   "source": [
+    "global_max_time = df.select(\n",
+    "    pl.col(\"timestamps\").explode().max()\n",
+    ").item()\n",
+    "\n",
+    "# 3. Window size (two weeks)\n",
+    "days_val = 14\n",
+    "window_sec = days_val * 24 * 3600 \n",
+    "\n",
+    "# 4. Three cutoffs counting back from the end\n",
+    "cutoff_test_start = global_max_time - window_sec  # T - 2w\n",
+    "cutoff_val_start = global_max_time - 2 * window_sec  # T - 4w\n",
+    "cutoff_gap_start = global_max_time - 3 * window_sec  # T - 6w\n",
+    "\n",
+    "cutoffs = [\n",
+    "    int(cutoff_gap_start),  # Boundary Part 0 | Part 1\n",
+    "    int(cutoff_val_start),  # Boundary Part 1 | Part 2\n",
+    "    int(cutoff_test_start)  # Boundary Part 2 | Part 3\n",
+    "]\n",
+    "\n",
+    "print(f\"Cutoffs: {cutoffs}\")\n",
+    "\n",
+    "# 5. Split into 4 files\n",
+    "# Part 0: Deep History\n",
+    "# Part 1: Pre-Validation (needed for s1, but dropped for the 'really short' s0?)\n",
+    "# *In this setup 4.2 is simply 'one window shorter', so Part 1 still goes into the Semantics train,\n",
+    "# and only Part 2 is dropped. To go even shorter, Part 1 can be dropped as well.*\n",
+    "# Part 2: Validation (present in 4.1, NOT in 4.2 for Semantics)\n",
+    "# Part 3: Test\n",
+    "\n",
+    "split_files = split_session_by_timestamps(\n",
+    "    df, \n",
+    "    cutoffs, \n",
+    "    output_dir=\"/home/jovyan/IRec/sigir/Beauty_new/splits/test_splits\"\n",
+    ")\n",
+    "\n",
+    "names = [\"Base\", \"Week -6\", \"Week -4\", \"Week -2\"]\n",
+    "for i, d in enumerate(split_files):\n",
+    "    print(f\"Part {i} [{names[i]}]: {len(d)} users\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "d5ba172e-b430-40a3-a4fa-64366d02a015",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def merge_and_save(parts_to_merge, dirr, output_name):\n",
+    "    merged = {}\n",
+    "    print(f\"Merging {len(parts_to_merge)} files into {output_name}...\")\n",
+    "    \n",
+    "    for part in parts_to_merge:\n",
+    "        for uid, items in part.items():\n",
+    "            if uid not in merged:\n",
+    "                merged[uid] = []\n",
+    "            merged[uid].extend(items)\n",
+    "    \n",
+    "    out_path = f\"{dirr}/{output_name}\"\n",
+    "    with open(out_path, 'w') as f:\n",
+    "        json.dump(merged, f)\n",
+    "    print(f\"✓ Done: {out_path} (Users: {len(merged)})\")\n",
+    "\n",
+    "\n",
+    "# The next cells rely on these aliases, so the assignment is kept active.\n",
+    "p0, p1, p2, p3 = split_files[0], split_files[1], split_files[2], split_files[3]"
+   ]
+  },
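+  {
+   "cell_type": "markdown",
+   "id": "added-exp-map-md",
+   "metadata": {},
+   "source": [
+    "For reference (an added sketch): the mapping from experiment files to the parts defined above, as used by the merge calls in the next cell."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "added-exp-map",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Added reference sketch: experiment file -> parts merged into it\n",
+    "# (mirrors the merge_and_save calls in the next cell).\n",
+    "exp_parts = {\n",
+    "    'exp_4_inter_tiger_train.json': ['p0', 'p1'],\n",
+    "    'exp_4.1_inter_semantics_train.json': ['p0', 'p1'],\n",
+    "    'exp_4.2_inter_semantics_train_short.json': ['p0'],\n",
+    "    'exp_4.3_inter_semantics_train_leak.json': ['p0', 'p1', 'p2'],\n",
+    "    'test_set.json': ['p3'],\n",
+    "    'valid_skip_set.json': ['p2'],\n",
+    "}\n",
+    "exp_parts"
+   ]
+  },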
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "d116b7e0-9bf9-4104-86a0-69788a70cc14",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Merging 2 files into exp_4_inter_tiger_train.json...\n",
+      "✓ Done: ../sigir/Beauty_new/splits/exp_data/exp_4_inter_tiger_train.json (Users: 22129)\n",
+      "Merging 2 files into exp_4.1_inter_semantics_train.json...\n",
+      "✓ Done: ../sigir/Beauty_new/splits/exp_data/exp_4.1_inter_semantics_train.json (Users: 22129)\n",
+      "Merging 1 files into exp_4.2_inter_semantics_train_short.json...\n",
+      "✓ Done: ../sigir/Beauty_new/splits/exp_data/exp_4.2_inter_semantics_train_short.json (Users: 22029)\n",
+      "Merging 3 files into exp_4.3_inter_semantics_train_leak.json...\n",
+      "✓ Done: ../sigir/Beauty_new/splits/exp_data/exp_4.3_inter_semantics_train_leak.json (Users: 22265)\n",
+      "Merging 1 files into test_set.json...\n",
+      "✓ Done: ../sigir/Beauty_new/splits/exp_data/test_set.json (Users: 1381)\n",
+      "Merging 1 files into valid_skip_set.json...\n",
+      "✓ Done: ../sigir/Beauty_new/splits/exp_data/valid_skip_set.json (Users: 1945)\n",
+      "\n",
+      "All done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "EXP_DIR = \"../sigir/Beauty_new/splits/exp_data\"\n",
+    "\n",
+    "# Tiger: P0+P1\n",
+    "merge_and_save([p0, p1], EXP_DIR, \"exp_4_inter_tiger_train.json\")\n",
+    "\n",
+    "# 1. Exp 4.1 (Standard)\n",
+    "# Semantics: P0+P1 (everything except the gap and the test)\n",
+    "merge_and_save([p0, p1], EXP_DIR, \"exp_4.1_inter_semantics_train.json\")\n",
+    "\n",
+    "# 2. Exp 4.2 (Short Semantics)\n",
+    "# Semantics: P0 (one window shorter, without P2)\n",
+    "merge_and_save([p0], EXP_DIR, \"exp_4.2_inter_semantics_train_short.json\")\n",
+    "\n",
+    "# 3. Exp 4.3 (Leak)\n",
+    "# Semantics: P0+P1+P2 (sees the validation data)\n",
+    "merge_and_save([p0, p1, p2], EXP_DIR, \"exp_4.3_inter_semantics_train_leak.json\")\n",
+    "\n",
+    "# 4. Test set (test for all models)\n",
+    "merge_and_save([p3], EXP_DIR, \"test_set.json\")\n",
+    "\n",
+    "# 5. Valid set (the gap; imitates the train/test shift for later fine-tuning)\n",
+    "merge_and_save([p2], EXP_DIR, \"valid_skip_set.json\")\n",
+    "\n",
+    "print(\"\\nAll done!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "9ae1d1e5-567d-471a-8f83-4039ecacc8d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Merging 4 files into all_set.json...\n",
+      "✓ Done: ../sigir/Beauty_new/splits/exp_data/all_set.json (Users: 22363)\n"
+     ]
+    }
+   ],
+   "source": [
+    "merge_and_save([p0, p1, p2, p3], EXP_DIR, \"all_set.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "328de16c-f61d-45be-8a72-5f0eaef612e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking train sets (must be prefixes):\n",
+      "✅ [PREFIXES] All 22129 arrays OK. Exact matches: 19410\n",
+      "✅ [PREFIXES] All 22029 arrays OK. Exact matches: 18191\n",
+      "✅ [PREFIXES] All 22265 arrays OK. Exact matches: 20982\n",
+      "✅ [PREFIXES] All 22129 arrays OK. Exact matches: 19410\n",
+      "\n",
+      "Checking the test set (must be a suffix):\n",
+      "✅ [SUFFIXES] All 1381 arrays OK. Exact matches: 98\n",
+      "\n",
+      "(Control) Checking the test set as a prefix (expected to fail):\n",
+      "❌ [PREFIXES] Found 1283 errors.\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(\"../data/Beauty/inter_new.json\", 'r') as f:\n",
+    "    old_inter_new = json.load(f)\n",
+    "\n",
+    "with open(\"../sigir/Beauty_new/splits/exp_data/exp_4.1_inter_semantics_train.json\", 'r') as ff:\n",
+    "    first_sem = json.load(ff)\n",
+    "    \n",
+    "with open(\"../sigir/Beauty_new/splits/exp_data/exp_4.2_inter_semantics_train_short.json\", 'r') as ff:\n",
+    "    second_sem = json.load(ff)\n",
+    "    \n",
+    "with open(\"../sigir/Beauty_new/splits/exp_data/exp_4.3_inter_semantics_train_leak.json\", 'r') as ff:\n",
+    "    third_sem = json.load(ff)\n",
+    "    \n",
+    "with open(\"../sigir/Beauty_new/splits/exp_data/exp_4_inter_tiger_train.json\", 'r') as ff:\n",
+    "    tiger_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"../sigir/Beauty_new/splits/exp_data/test_set.json\", 'r') as ff:\n",
+    "    test_sem = json.load(ff)\n",
+    "\n",
+    "def check_prefix_match(full_data, subset_data, check_suffix=False):\n",
+    "    \"\"\"\n",
+    "    check_suffix=True switches to suffix checking (used for the test set).\n",
+    "    \"\"\"\n",
+    "    mismatch_count = 0\n",
+    "    full_match_count = 0\n",
+    "    \n",
+    "    # Iterate over the subset keys, since full_data may contain more users\n",
+    "    for user, sub_items in subset_data.items():\n",
+    "        \n",
+    "        # Check that the user exists in the source data\n",
+    "        if user not in full_data:\n",
+    "            print(f\"⚠ User {user} not found in the source file!\")\n",
+    "            mismatch_count += 1\n",
+    "            continue\n",
+    "        \n",
+    "        full_items = full_data[user]\n",
+    "        \n",
+    "        # PREFIX check (the beginning must match)\n",
+    "        if not check_suffix:\n",
+    "            if len(sub_items) > len(full_items):\n",
+    "                mismatch_count += 1\n",
+    "                continue\n",
+    "            \n",
+    "            # Compare the beginning of full with sub\n",
+    "            if full_items[:len(sub_items)] == sub_items:\n",
+    "                if len(full_items) == len(sub_items):\n",
+    "                    full_match_count += 1\n",
+    "            else:\n",
+    "                mismatch_count += 1\n",
+    "        \n",
+    "        # SUFFIX check (the end must match - used for the test set)\n",
+    "        else:\n",
+    "            if len(sub_items) > len(full_items):\n",
+    "                mismatch_count += 1\n",
+    "                continue\n",
+    "            \n",
+    "            # Compare the end of full with sub\n",
+    "            # The slice [-len:] takes the last N elements\n",
+    "            if full_items[-len(sub_items):] == sub_items:\n",
+    "                if len(full_items) == len(sub_items):\n",
+    "                    full_match_count += 1\n",
+    "            else:\n",
+    "                mismatch_count += 1\n",
+    "\n",
+    "    mode = \"SUFFIXES\" if check_suffix else \"PREFIXES\"\n",
+    "    \n",
+    "    if mismatch_count == 0:\n",
+    "        print(f\"✅ [{mode}] All {len(subset_data)} arrays OK. Exact matches: {full_match_count}\")\n",
+    "    else:\n",
+    "        print(f\"❌ [{mode}] Found {mismatch_count} errors.\")\n",
+    "\n",
+    "# --- Run the checks ---\n",
+    "print(\"Checking train sets (must be prefixes):\")\n",
+    "check_prefix_match(old_inter_new, first_sem)\n",
+    "check_prefix_match(old_inter_new, second_sem)\n",
+    "check_prefix_match(old_inter_new, third_sem)\n",
+    "check_prefix_match(old_inter_new, tiger_sem)\n",
+    "\n",
+    "print(\"\\nChecking the test set (must be a suffix):\")\n",
+    "check_prefix_match(old_inter_new, test_sem, check_suffix=True)\n",
+    "\n",
+    "print(\"\\n(Control) Checking the test set as a prefix (expected to fail):\")\n",
+    "check_prefix_match(old_inter_new, test_sem, check_suffix=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "f2df507d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--- Statistics by time interval (Fixed Time Window) ---\n",
+      "Part 0 [Base]: 186516 events (Start -> 2014-06-11)\n",
+      "Part 1 [Gap (Week -6)]: 4073 events (2014-06-11 -> 2014-06-25)\n",
+      "Part 2 [Pre-Valid (Week -4)]: 4730 events (2014-06-25 -> 2014-07-09)\n",
+      "Part 3 [Test (Week -2)]: 3183 events (2014-07-09 -> Inf)\n"
+     ]
+    },
+    {
+     "data": {
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA/8AAAIjCAYAAABViau2AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAo2ZJREFUeJzs3XlcFPX/B/DXLLDcp9yIiKDigamYF14oCmreeZInqZlHpl8tK+/M1DTP1A5FE9PUNLNCUcH7NjzRkMATBEVuOXd+f/hjcl0EVheWpdfz8dhgP/OZz7xnPrvke+YznxFEURRBRERERERERFWWTNsBEBEREREREVH5YvJPREREREREVMUx+SciIiIiIiKq4pj8ExEREREREVVxTP6JiIiIiIiIqjgm/0RERERERERVHJN/IiIiIiIioiqOyT8RERERERFRFcfkn4iIiIiIiKiKY/JPRERE/3lXr17Fnj17pPdRUVH4/ffftRfQK7p37x5CQkKk9/Hx8QgNDdVeQEREVGkw+Sci0rKQkBAIgiC9jIyMUKdOHUyYMAEPHz7UdnhE/wkZGRkYO3YsTp8+jZiYGHzwwQe4cuWKtsNSmyAIGD9+PPbv34/4+HhMnz4dx44d03ZYRERUCehrOwAiInpm3rx5cHd3R05ODo4fP461a9fijz/+wNWrV2FiYqLt8IiqtFatWkkvAKhTpw5Gjx6t5ajU5+LigtGjRyMwMBAA4OTkhMjISO0GRURElYIgiqKo7SCIiP7LQkJCMHLkSJw7dw7NmjWTyqdOnYply5Zh69atGDx4sBYjJPrvuH79Op4+fQpvb2/I5XJth/PKYmNj8ejRIzRs2BCmpqbaDoeIiCoBDvsnIqqkOnbsCACIi4sDAKSkpOB///sfvL29YWZmBgsLC3Tt2hWXLl1SWTcnJwdz5sxBnTp1YGRkBCcnJ/Tt2xexsbEAnt0H/PytBi++OnToILUVGRkJQRCwfft2fPLJJ3B0dISpqSl69uyJu3fvqmz7zJkzCAwMhKWlJUxMTNC+fXucOHGi2H3s0KFDsdufM2eOSt0tW7bAx8cHxsbGsLGxwaBBg4rdfkn79jyFQoHly5ejQYMGMDIygoODA8aOHYsnT54o1atZsybeeustle1MmDBBpc3iYl+yZInKMQWA3NxczJ49G56enjA0NISrqyumT5+O3NzcYo/V8148bra2tujevTuuXr0q1Xn8+DG6du2K6tWrw9DQEE5OTggKCsLt27eV2vrqq6/QunVrVKtWDcbGxvDx8cHOnTtVtlnWfSv6vAiCgKioKKX69+/fh56eHgRBUNnG4cOH0bZtW5iamsLKygq9evVCdHR0mfb/dT83HTp0kPahfv368PHxwaVLl4r93BS12bx5c5iYmMDa2hrt2rXDgQMHADz7vJT03apZsyYA1c+pgYEBatasiWnTpiEvL0/aVtFtQfHx8VKZQqFAo0aNIAiC0v39I0aMkNr38PBAixYtkJKSAmNjY5U2ijNixAilmKytrdGhQ4dibxv45ptv0KBBAxgaGsLZ2Rnjx49Hampqie0DwJw5c5S2YW5ujubNmyvNtwA865OGDRviwoULaN26NYyNjeHu7o5169aptFnW71LRNpcvX67ShpeXFwRBwIQJE5TKU1NTMXnyZLi6usLQ0BCenp5YtGgRFAqFVKeoL7/66iuVdhs2bFjs9+PF0Rjdu3d/6WeYiEhTOOyfiKiSKkrUq1WrBgD4559/sGfPHvTv3x/u7u54+PAh1q9fj/bt2+P69etwdnYGABQWFuKtt97CoUOHMGjQIHzwwQfIyMhAeHg4rl69Cg8PD2kbgwcPRrdu3ZS2O2PGjGLjWbBgAQRBwEcffYSkpCQsX74c/v7+iIqKgrGxMYBnCVzXrl3h4+OD2bNnQyaTYePGjejYsSOOHTuG5s2bq7RbvXp1LFy4EACQmZmJcePGFbvtmTNnYsCAAXj33XeRnJyMVatWoV27dvjrr79gZWWlss6YMWPQtm1bAMAvv/yC3bt3Ky0fO3asNOpi0qRJiIuLw+rVq/HXX3/hxIkTMDAwKPY4qCM1NVXat+cpFAr07NkTx48fx5gxY1CvXj1cuXIFX3/9Nf7++2+VRKg4Xl5e+PTTTyGKImJjY7Fs2TJ069YNd+7cAQDk5eXB3NwcH3zwAapVq4bY2FisWrUKly9fVrqXfcWKFejZsyeCgoKQl5eHbdu2oX///ti3bx+6d++u9r4VMTIywsaNG7FixQqpbNOmTZDL5cjJyVGqe/DgQXTt2hW1atXCnDlz8PTpU6xatQq+vr64ePGilNA+r7w+N0U++uijYsvnzp2LOXPmoHXr1pg3bx7kcjnOnDmDw4cPo0uXLli+fDkyMzMBANHR0fjiiy/wySefoF69egAAMzMzpfaKPqe5ubnYv38/vvrqKxgZGWH+/Pkvje3HH38s83wEs2bNUjneJbG1tcXXX38N4NnkgStWrEC3bt1w9+5d6XjNmTMHc+fOhb+/P8aNG4ebN29i7dq1OHfuXJm/Oz/++CMA4NGjR/jmm2/Qv39/XL16FXXr1pXqPHnyBN26dcOAAQMwePBg/Pzzzxg3bhzkcjlGjRoFQP3vUtHncvLkyVLZyZMnVU6KAUB2djbat2+P+/fvY+zYsahRowZOnjyJGTNmICEhodiTCK/i6NGj+OOPPzTSFhFRiUQiItKqjRs3igDEgwcPisnJyeLdu3fFbdu2idWqVRONjY3Fe/fuiaIoijk5OWJhYaHSunFxcaKhoaE4b948qWzDhg0iAHHZsmUq21IoFNJ6AMQlS5ao1GnQoIHYvn176X1ERIQIQHRxcRHT09Ol8p9//lkEIK5YsUJqu3bt2mJAQIC0HVEUxezsbNHd3V3s3LmzyrZat24tNmzYUHqfnJwsAhBnz54tlcXHx4t6enriggULlNa9cuWKqK+vr1IeExMjAhA3bdoklc2ePVt8/n95x44dEwGIoaGhSuuGhYWplLu5uYndu3dXiX38+PHii/8bfTH26dOni/b29qKPj4/SMf3xxx9FmUwmHjt2TGn9devWiQDEEydOqGzvee3bt1dqTxRF8ZNPPhEBiElJSS9db/HixSIA8dGjR1JZdna2Up28vDyxYcOGYseOHV9p34o+L4MHDxarVasm5ubmSstq164tDhkyRAQg7tixQypv3LixaG9vLz5+/Fgqu3TpkiiTycRhw4ap7IemPzcvHs8//vhDBCAGBgYq9XFMTIwok8nEPn36qHwXn//Mv3gsIiIiVJYVfQc3btyoVO7s7Cx269ZNel/09yEuLk4UxWd/B2rUqCF27dpVZf3hw4eLbm5u0vurV6+KMplMqlvUxsu8uL4oiuK3334rAhDPnj0riqIoJiUliXK5XOzSpYvSMVi9erUIQNywYUOJ23jxuyiKonjgwAERgPjzzz9LZe3btxcBiEuXLpXKcnNzpc9KXl6eKIrqfZcAiG+//baor68vnj9/XioPDg6WPpfjx4+XyufPny+ampqKf//9t1LbH3/8sainpyfeuXNHFMVX+3
v6/GeiRYsWUh89/xkmItI0DvsnIqok/P39YWdnB1dXVwwaNAhmZmbYvXs3XFxcAACGhoaQyZ792S4sLMTjx49hZmaGunXr4uLFi1I7u3btgq2tLSZOnKiyjeKGMJfVsGHDYG5uLr1/++234eTkJF2xioqKQkxMDIYMGYLHjx/j0aNHePToEbKystCpUyccPXpUaags8Oz2BCMjoxK3+8svv0ChUGDAgAFSm48ePYKjoyNq166NiIgIpfpFQ6YNDQ1f2uaOHTtgaWmJzp07K7Xp4+MDMzMzlTbz8/OV6j169KjUq6n379/HqlWrMHPmTJWrvTt27EC9evXg5eWl1GbRrR4vbr84RTElJyfj1KlT2L17Nxo1agRbW1ulehkZGUhKSsKpU6fw008/oUGDBrCxsZGWF43aAJ5daU1LS0Pbtm2VPlPq7FuRHj16QBAE7N27FwBw7Ngx3Lt3DwMHDlSql5CQgKioKIwYMUIprkaNGqFz587FXhEtj89NEVEUMWPGDPTr1w8tWrRQWrZnzx4oFArMmjVL+i4WedXvVmZmJh49eoT79+/j22+/RWJiIjp16vTS+mvWrMHjx48xe/bsUtueMWMGmjZtiv79+5c5HoVCIR2rqKgobN68GU5OTtLIhYMHDyIvLw+TJ09WOgajR4+GhYVFmR+PWLSN6OhorFu3DqampmjZsqVSHX19fYwdO1Z6L5fLMXbsWCQlJeHChQsA1P8uOTg4oHv37ti4cSOAZ1f3f/75Z4wcOVIlxh07dqBt27awtrZWatvf3x+FhYU4evSoUv3s7GyVvxOFhYUlHodffvkF586dw5dfflmm40ZE9Do47J+IqJJYs2YN6tSpA319fTg4OKBu3bpK/7hWKBRYsWIFvvnmG8TFxSn9o7Lo1gDg2e0CdevWhb6+Zv/E165dW+m9IAjw9PSU7iOOiYkBAAwfPvylbaSlpcHa2lp6/+jRI5V2XxQTEwNRFF9a78UhxkX3Hb8sKS1qMy0tDfb29sUuT0pKUnp/4MAB2NnZlRjni2bPng1nZ2eMHTtW5f72mJgYREdHv7TNF7dfnJMnTyqtX7t2bezZs0clCR09ejS2b98OAHjzzTfxxx9/KNXZt28fPv/8c0RFRSndI11SMlvSvhUxMDDAO++8gw0bNuDtt9/Ghg0b0K9fP1hYWCjVKxpu/fxw7yL16tXD/v37kZWVpTRpXXl8boqEhobi2rVr+Pnnn7F161alZbGxsZDJZKhfv36J21bHxIkTlU7UjRw5Eh9++GGxddPS0vDFF19gypQpcHBwKLHd48eP47fffsOhQ4ekW0HK4u7du0qfKycnJ+zatUv6Pr2sv+RyOWrVqlXs8PniPL8NCwsLhIaGwtXVVamOs7OzymSFderUAfDsPvuWLVu+0ndp5MiRGDlyJJYuXYodO3bA2tpaOlnwvJiYGFy+fLnMbc+ePbvYkzIv66vCwkJ88sknCAoKQqNGjYqtQ0SkSUz+iYgqiebNmyvN9v+iL774AjNnzsSoUaMwf/582NjYQCaTYfLkySpX1LWhKIYlS5agcePGxdZ5PiHPy8tDQkICOnfuXGq7giDgzz//hJ6eXoltAkBiYiIAwNHRscQ27e3tERoaWuzyF/+x36JFC3z++edKZatXr8avv/5a7PrR0dEICQnBli1bik0yFQoFvL29sWzZsmLXfzEJKk6jRo2wdOlSAEBycjJWrlyJDh064OLFi0r7/tlnn2HkyJGIjY3F4sWLMWjQIBw8eBD6+vo4duwYevbsiXbt2uGbb76Bk5MTDAwMsHHjRpXEt6z79rxRo0ahSZMmuHnzJnbs2CGNAngd5fW5KWp75syZCA4OlpLM8jZt2jR06dIFhYWFuHbtGubNmwdRFKUr089btGgRZDIZpk2bhsePH5fY7kcffYSAgAB07NhRaVLA0jg4OGDLli0Anp1s2LBhAwIDA3H8+HF4e3urtW8lCQ8PBwBkZWVh165dGDBgAPbt21dqv77oVb5L3bt3h1wux549e7Bx40YMHz5cZSRHUdudO3fG9OnTi237xc/ImDFjVEZZlPS4yB9++AHx8fHYv3//S+sQEWkSk38iIh2xc+dO+Pn54YcfflAqT01NVRrq7eHhgTNnziA/P18jk9YVKbqyX0QURdy6dUu6YlU0kaCFhQX8/f1Lbe/SpUvIz88v8YRHUbuiKMLd3b1MCdn169chCEKxV5Kfb/PgwYPw9fVVGvb+Mra2tir7VNKkfDNmzEDjxo1Vhrg/v/1Lly6hU6dOrzxc3NraWimmDh06wNnZGRs3blSatLFhw4Zo2LAhAMDb2xvt2rVDeHg4unbtil27dsHIyAj79+9Xuk2iuMSzrPv2PG9vbzRp0gQDBgyAnZ0d/Pz8cOTIEaU6bm5uAICbN2+qrH/jxg3Y2toqXf0tr88N8GwG+6SkpJfOuO7h4QGFQoHr16+/9ASXuurXry/1Y0BAAHJzc/HJJ59gwYIF0iSeAPDgwQOsWLECCxcuhLm5eYnJ/549e3Dq1KkSb914GSMjI6XPVc+ePWFjY4PVq1dj/fr1Sv1Vq1YtqV5eXh7i4uLK9N0HoFSvV69eOHPmDL766iul5P/Bgwcqoz7+/vtvAFB6qoG63yV9fX0MHToUCxYswLVr17Bhw4Zi63l4eCAzM7PM+1S7dm2Vui97zGJ2djbmzp2L999/XzqmRETljff8ExHpCD09PYiiqFS2Y8cO3L9/X6msX79+ePToEVavXq3Sxovrq2Pz5s3IyMiQ3u/cuRMJCQno2rUrAMDHxwceHh746quvpNnOn5ecnKwSu56eXrGP0Xte3759oaenh7lz56rEL4qiUhJUUFCAXbt2oXnz5iUO+x8wYAAKCwuLnVG9oKCgTI8se5lTp07h119/xZdffvnSZGTAgAG4f/8+vvvuO5VlT58+RVZWltrbffr0KQCU+KjAR48eKdUpeuze87eQxMfHv/TERln27UWjRo3C5cuXpcfIvcjJyQmNGzfGpk2blI771atXceDAAZWnUZTH5wZ4NjfCggUL8OGHH7501Ejv3r0hk8kwb948ldE2r/Pdel5RPz7/uD/g2VMGHBwc8N5775W4ftFQ8iFDhmjkBEVeXh4KCgqkz4y/vz/kcjlWrlyptM8//PAD0tLSSnxCREkx5+XlqXx2CwoKsH79eqVY1q9fDzs7O/j4+AB49e/SqFGjcOXKFbRr107pJMbzBgwYgFOnThV7ZT41NRUFBQVl3scXrVixAllZWfj0009fuQ0iInXxyj8RkY546623MG/ePIwcORKtW7fGlStXEBoaqvIP12HDhmHz5s2YMmUKzp49i7Zt2yIrKwsHDx7E+++/j169er3S9m1sbNCmTRuMHDkSDx8+xPLly+Hp6SkNa5XJZPj+++/RtWtXNGjQACNHjoSLiwvu37+PiIgIWFhY4LfffkNWVhbWrFmDlStXok6dOkrPuy46aXD58mWcOnUKrVq1goeHBz7//HPMmDED8fHx6N27N8zNzREXF4fdu3djzJgx+N///oeDBw9i5syZuHz5M
n777bcS96V9+/YYO3YsFi5ciKioKHTp0gUGBgaIiYnBjh07sGLFCrz99tuvdJwOHDiAzp07l3i1cOjQofj555/x3nvvISIiAr6+vigsLMSNGzfw888/Y//+/aVe2X748KE0PPvRo0dYv3499PX1paT4u+++w9GjR9G0aVNYWFjg+vXr+O677+Dk5CRNKNe9e3csW7YMgYGBGDJkCJKSkrBmzRp4enri8uXLr7RvLxo9ejT69+8PS0vLl9ZZsmQJunbtilatWiE4OFh61J+lpaV0Fb68PjdFLl68CFtb25cO8QYAT09PfPrpp5g/fz7atm2Lvn37wtDQEOfOnYOzs3OJjz58mVOnTkFfX18a9r9q1So0adJE5fGGBw4cQGhoKORyeYnt3bt3D3K5/JUfHZeVlaU07P/HH39ETk4O+vTpA+DZLTEzZszA3LlzERgYiJ49e+LmzZv45ptv8Oabb+Kdd94p03aKtpGVlYU9e/YgPj5e6fF7wLN7/hctWoT4+HjUqVMH27dvR1RUFL799ltpVNOrfpfq1auHR48elTjyZ9q0adi7dy/eeustjBgxAj4+PsjKysKVK1ewc+dOxMfHq0ywWVYHDhzAggULlOZrISIqd1p4wgARET2n6FFe586dK7FeTk6OOHXqVNHJyUk0NjYWfX19xVOnThX72Lfs7Gzx008/Fd3d3UUDAwPR0dFRfPvtt8XY2FhRFF/t0VQ//fSTOGPGDNHe3l40NjYWu3fvLt6+fVtl/b/++kvs27evWK1aNdHQ0FB0c3MTBwwYIB46dEhp26W9hg8frtTurl27xDZt2oimpqaiqamp6OXlJY4fP168efOmKIqiOHHiRLFdu3ZiWFiYSkzFPV5MFJ89xszHx0c0NjYWzc3NRW9vb3H69OnigwcPpDrqPupPEATxwoULSuXF9VFeXp64aNEisUGDBqKhoaFobW0t+vj4iHPnzhXT0tJUtvdie88fKysrK9HX11f8448/pDpHjhwR27ZtK1pZWYmGhoZizZo1xdGjR6s87u2HH34Qa9euLRoaGopeXl7ixo0biz1eZd23os/L84/ye97Llh88eFD09fUVjY2NRQsLC7FHjx7i9evXpeXl9bl5/nh+/fXXSuu+7HOzYcMGsUmTJlK/tW/fXgwPD3/pvpb0qL+il0wmE6tXry4OHz5cerynKP7796Fx48ZKjxMs7lGBw4cPFwGIH3zwgdK2Xnxc4MsUrV/0MjMzE5s2bSr++OOPKnVXr14tenl5iQYGBqKDg4M4btw48cmTJyW2L4r/HtOil7GxsVi/fn3x66+/Vtq/9u3biw0aNBDPnz8vtmrVSjQyMhLd3NzE1atXq7RZ1u8SXniU34uKW56RkSHOmDFD9PT0FOVyuWhrayu2bt1a/Oqrr6THDb7K31MnJycxKytLZft81B8RlSdBFDU0To2IiKqkyMhI+Pn5YceOHa98Nfx58fHxcHd3R1xcnMrVzSJz5sxBfHy8WhOVUdXGz81/S4cOHfDo0SNcvXpV26EQEVUZvOefiIiIiIiIqIrjPf9ERFShzMzMEBQUVOKEfI0aNVKa6ZyInxsiIqLXw+SfiIgqlK2trTTZ18v07du3gqIhXcHPDRER0evhPf9EREREREREVZxW7/k/evQoevToAWdnZwiC8NLnCgPAe++9B0EQsHz5cqXylJQUBAUFwcLCAlZWVggODlZ5vvTly5fRtm1bGBkZwdXVFYsXL1Zpf8eOHfDy8oKRkRG8vb1f+RE5RERERERERJWNVpP/rKwsvPHGG1izZk2J9Xbv3o3Tp08Xex9fUFAQrl27hvDwcOzbtw9Hjx7FmDFjpOXp6eno0qUL3NzccOHCBSxZsgRz5szBt99+K9U5efIkBg8ejODgYPz111/o3bs3evfuzRlmiYiIiIiIqEqoNMP+BUHA7t270bt3b6Xy+/fvo0WLFti/fz+6d++OyZMnY/LkyQCA6Oho1K9fH+fOnUOzZs0AAGFhYejWrRvu3bsHZ2dnrF27Fp9++ikSExMhl8sBAB9//DH27NmDGzduAAAGDhyIrKws7Nu3T9puy5Yt0bhxY6xbt65M8SsUCjx48ADm5uYQBOE1jwYRERERERFRyURRREZGBpydnSGTlXxtv1JP+KdQKDB06FBMmzYNDRo0UFl+6tQpWFlZSYk/APj7+0Mmk+HMmTPo06cPTp06hXbt2kmJPwAEBARg0aJFePLkCaytrXHq1ClMmTJFqe2AgIASb0PIzc1Fbm6u9P7+/fuoX7/+a+wtERERERERkfru3r2L6tWrl1inUif/ixYtgr6+PiZNmlTs8sTERNjb2yuV6evrw8bGBomJiVIdd3d3pToODg7SMmtrayQmJkplz9cpaqM4CxcuxNy5c1XKL168WOJjiKhiKBQKpKenw8LCotQzYFS5sO90E/tNd7HvdBf7Tjex33QX+043vazfbNu2hd7Dhyh0cACWAHq5D+F1W0BCgQgHUwccG3hMi1GXXWZmJpo2bQpzc/NS61ba5P/ChQtYsWIFLl68WCmH0c+YMUNptEB6ejpcXV3h7u4OCwsLLUZGwLMveXJyMuzs7PjHWcew73QT+013se90F/tON7HfdBf7Tje9rN8EfX0IAER9fcAMEPQAmbEA5IvQN9GHh4eH9oJWQ3p6OgCUKWeutMn/sWPHkJSUhBo1akhlhYWFmDp1KpYvX474+Hg4OjoiKSlJab2CggKkpKTA0dERAODo6IiHDx8q1Sl6X1qdouXFMTQ0hKGhoUq5TCbjH4NKQhAE9oeOYt/pJvab7mLf6S72nW5iv+ku9p1uKqnfXpYy60ofqxNnpd2joUOH4vLly4iKipJezs7OmDZtGvbv3w8AaNWqFVJTU3HhwgVpvcOHD0OhUKBFixZSnaNHjyI/P1+qEx4ejrp168La2lqqc+jQIaXth4eHo1WrVuW9m0RERERERETlTqtX/jMzM3Hr1i3pfVxcHKKiomBjY4MaNWqgWrVqSvUNDAzg6OiIunXrAgDq1auHwMBAjB49GuvWrUN+fj4mTJiAQYMGSY8FHDJkCObOnYvg4GB89NFHuHr1KlasWIGvv/5aaveDDz5A+/btsXTpUnTv3h3btm3D+fPnlR4HSERERERERKSrtJr8nz9/Hn5+ftL7onvohw8fjpCQkDK1ERoaigkTJqBTp06QyWTo168fVq5cKS23tLTEgQMHMH78ePj4+MDW1hazZs3CmDFjpDqtW7fG1q1b8dlnn+GTTz5B7dq1sWfPHjRs2FAzO/r/RFFEQUEBCgsLNdouqVIoFMjPz0dOTo7ODNmhZyq67wwMDKCnp1fu2yEiIiKiSuTcOaCwENDTA6wAiIU4l/UIhYa20BOq5r8NBVEURW0HURWkp6fD0tISaWlpxU74l5eXh4SEBGRnZ2shuv8eURShUCggk8kq5YSR9HIV3XeCIKB69ep8SsdrUigUSEpKgr29PU+46Rj2ne5i3+km9pvuYt/ppqreb6Xloc+rtBP+VSUKhQJxcXHQ09ODs7Mz5HI5E9JyVjTKQl9f
n8dax1Rk34miiOTkZNy7dw+1a9fmCAAiIiIiqrKY/FeAvLw8KBQKuLq6wsTERNvh/Ccw+dddFd13dnZ2iI+PR35+PpN/IiIiIqqymPxXoKo4zIRI1/HkEBEREdF/0LffApmZgJkZ0BFAfia+jb+ITKumMJObYYzPmFKb0DVM/omIiIiIiOi/Zd484P59wMUFsAPw9D7mxctwPz8ULuYuVTL556VoIiIiIiIioiqOV/61bEdsWoVtq7+HZYVti4iIiIiIiCoPXvmnEo0YMQKCIKi8AgMDKyyGOXPmoHHjxhW2PSIiIiIioqqGV/6pVIGBgdi4caNSmaGhoZaiISIiIiIiInXxyj+VytDQEI6Ojkova2trDBkyBAMHDlSqm5+fD1tbW2zevBkAoFAosHDhQri7u8PY2BhvvPEGdu7cKdWPjIyEIAg4dOgQmjVrBhMTE7Ru3Ro3b94EAISEhGDu3Lm4dOmSNOogJCQEoihizpw5qFGjBgwNDeHs7IxJkyZV3EEhIiIiIiLSIbzyT68sKCgI/fv3R2ZmJszMzAAA+/fvR3Z2Nvr06QMAWLhwIbZs2YJ169ahdu3aOHr0KN555x3Y2dmhffv2Uluffvopli5dCjs7O7z33nsYNWoUTpw4gYEDB+Lq1asICwvDwYMHAQCWlpbYtWsXvv76a2zbtg0NGjRAYmIiLl26VPEHgYiIiIiISAcw+adS7du3T0rui3zyySeYPn06TE1NsXv3bgwdOhQAsHXrVvTs2RPm5ubIzc3FF198gYMHD6JVq1YAgFq1auH48eNYv369UvK/YMEC6f3HH3+M7t27IycnB8bGxjAzM4O+vj4cHR2l+nfu3IGjoyP8/f1hYGCAGjVqoHnz5uV9KIiIiIiIiHQSk38qlZ+fH9auXatUZmNjA319fQwYMAChoaEYOnQosrKy8Ouvv2Lbtm0AgFu3biE7OxudO3dWWjcvLw9NmjRRKmvUqJH0u5OTEwAgKSkJNWrUKDam/v37Y/ny5ahVqxYCAwPRrVs39OjRA/r6/EgTERERERG9iJkSlcrU1BSenp7FLgsKCkL79u2RlJSE8PBwGBsbS08CyMzMBAD8/vvvcHFxUVrvxQkDDQwMpN8FQQDwbL6Al3F1dcXNmzdx8OBBhIeH4/3338eSJUtw5MgRpbaIiIiIiIhU1KkDWFoCDg6ABQC5JeqYPICl3BkOpg7ajq5cMPmn19K6dWu4urpi+/bt+PPPP9G/f38p+a5fvz4MDQ1x584dpSH+6pLL5SgsLFQpNzY2Ro8ePdCjRw+MHz8eXl5euHLlCpo2bfrK2yIiIiIiov+Aw4dVi7QQRkVi8k+lys3NRWJiolKZvr4+bG1tAQBDhgzBunXr8PfffyMiIkKqY25ujv/973/48MMPoVAo0KZNG6SlpeHEiROwsLDA8OHDy7T9mjVrIi4uDlFRUahevTrMzc3x008/obCwEC1atICJiQm2bNkCY2NjuLm5aW7HiYiIiIioUtoRm1bi8v4elhUUie5g8q9luvChDAsLk+7DL1K3bl3cuHEDwLOh/wsWLICbmxt8fX2V6s2fPx92dnZYuHAh/vnnH1hZWaFp06b45JNPyrz9fv364ZdffoGfnx9SU1OxceNGWFlZ4csvv8SUKVNQWFgIb29v/Pbbb6hWrdrr7zAREREREVEVI4iiKGo7iKogPT0dlpaWSEtLg4WFhdKynJwcxMXFwd3dHUZGRlqK8L9FFEUUFBRAX19fmkOAdENF9x2/n5qhUCiQlJQEe3t7yGQybYdDamDf6S72nW5iv+ku9l3lUtYr/1W930rKQ1/EK/9ERERERET03xIUBDx6BNjaAu8DyH2EoJibeGRaF7YmtgjtG6rtCDWOyT8RERERERH9txw5Aty/D7i4AG8DeHofRx7LcD/xNlzMXUpdXRdVvXEPRERERERERKSEyT8RERERERFRFcfkn4iIiIiIiKiKY/JPREREREREVMUx+SciIiIiIiKq4pj8ExEREREREVVxTP6JiIiIiIiIqjgm/1TpnThxAt7e3jAwMEDv3r1fWqYrfvjhB3Tp0kXbYWDEiBEVcuzCwsLQuHFjKBSKct8WEREREREVj8k/lSoxMRETJ05ErVq1YGhoCFdXV/To0QOHDh0qcxuRkZEQBAGpqalqb3/KlClo3Lgx4uLiEBIS8tKyskhMTMQHH3wAT09PGBkZwcHBAb6+vli7di2ys7PVjk1dOTk5mDlzJmbPng3gWWIsCAISExOV6jk5OaFmzZpKZfHx8RAEQa3jXlFCQkLQqFEjGBkZwd7eHuPHj5eWBQYGwsDAAKGhoVqMkIiIiIh0xY7YtFJfr230aODDD5/99BwN1P0Qoz3b4cOWH2J009Gv334lpK/tAKhyi4+Ph6+vL6ysrLBkyRJ4e3sjPz8f+/fvx/jx43Hjxo1yjyE2NhbvvfceqlevXmJZaf755x9pX7744gt4e3vD0NAQV65cwbfffgsXFxf07NmzPHZBsnPnTlhYWMDX1xcA0KZNG+jr6yMyMhKDBg0CAERHR+Pp06fIzs5GfHy8dBIgIiIChoaG0rqVxbJly7B06VIsWbIELVq0QFZWFuLj45XqjBgxAitXrsTQoUO1EyQRERER0fP+/2KcUpGPFuKoQLzyTyV6//33IQgCzp49i379+qFOnTpo0KABpkyZgtOnTwP494p0VFSUtF5qaioEQUBkZCTi4+Ph5+cHALC2toYgCBgxYgQAIDc3F5MmTYK9vT2MjIzQpk0bnDt3Tqndx48fY9SoURAEASEhIcWWlXVf9PX1cf78eQwYMAD16tVDrVq10KtXL/z+++/o0aOHVHfZsmXw9vaGqakpXF1d8f777yMzM1NaHhISAisrK+zZswe1a9eGkZERAgICcPfu3RJj2LZtm9J2zMzM8OabbyIyMlIqi4yMRJs2beDr66tS3rJlSxgZGQEAvv/+e9SrVw9GRkbw8vLCN998o7Stu3fvYsCAAbCysoKNjQ169eqlkpQ/79y5c7Czs8OiRYtK3IfnPXnyBJ999hk2b96MIUOGwMPDA40aNVI5idKjRw+cP38esbGxZW6biIiIiIg0h8m/ti1bBlSvXvqruCvSPXuWbd1ly14ptJSUFISFhWH8+PEwNTVVWW5lZVWmdlxdXbFr1y4AwM2bN5GQkIAVK1YAAKZPn45du3Zh06ZNuHjxIjw9PREQEICUlBS4uroiISEBFhYWWL58ORISEtC/f3+VsoEDB0onBV7m8ePHOHDgwEv3BYDS+jKZDCtXrsS1a9ewadMmHD58GNOnT1eqn52djQULFmDz5s04ceIEUlNTpav3L3P8+HE0a9ZMqczPzw8RERHS+4iICHTo0AHt27dXKo+MjJROooSGhmLWrFlYsGABoqOj8cUXX2DmzJnYtGkTACA/Px8BAQEwNzfHsWPHcOLECZiZmSEwMBB5eXkqcR0+fBidO3fGggUL8NFHH5W4D88LDw+HQqHA/fv3Ua9ePVSvXh0DBgxQOQlSo0YNODg44NixY2Vum4iIiIiINIfD/rU
tPR24f7/0eq6uqmXJyWVbNz1d/bgA3Lp1C6IowsvL65XWL6KnpwcbGxsAgL29vXTSICsrC2vXrkVISAi6du0KAPjuu+8QHh6OH374AdOmTYOjoyMEQYClpSUcHR0BAKampipllpaWqFu3bqn78mIdW1tb5OTkAADGjx8vXfWePHmyVKdmzZr4/PPP8d577yldXc/Pz8fq1avRokULAMCmTZtQr149nD17Fs2bN1eJITU1FWlpaXB2dlYq9/PzwxdffIGEhAQ4OTnhyJEjmDZtGgoKCrB27VoAz25ZuHPnjpT8z549G0uXLkXfvn0BAO7u7rh+/TrWr1+P4cOHY/v27VAoFPj++++lkxobN26ElZUVIiMjlSYc3L17N4YNG4bvv/8eAwcOfOkxLM4///wDhUKBL774AitWrIClpSU+++wzdO7cGZcvX4ZcLpfqOjs74/bt22q1T0REREREmsHkX9ssLAAXl9Lr2dkVX1aWdS0s1I8LgCiKr7ReWcXGxiI/P1/pHnYDAwM0b94c0dHRarXVp08f9OnTR+0Yzp49C4VCgaCgIOTm5krlBw8exMKFC3Hjxg2kp6ejoKAAOTk5yM7OhomJCQBAX18fb775prSOl5cXrKysEB0dXWzy//TpUwCQhu0Xad26NeRyOSIjI/HGG2/g6dOnaNq0KRQKBZKTkxEXF4fIyEgYGxujZcuWyMrKQmxsLIKDgzF69L+TkRQUFMDS0hIAcOnSJdy6dQvm5uZK28rJyVEaen/mzBns27cPO3fuLHXm/65du0pX7t3c3HDt2jUoFArk5+dj5cqV0gmFn376CY6OjoiIiEBAQIC0vrGxcYVMqkhEREREVKrq1Z9dSHVxAVYBeHof1eNluJ+vgIu5C+5NuaftCDWOyb+2TZny7PUq9u7VbCwvqF27NgRBKHVSP5ns2d0jz58syM/PL9fY1OXp6QlBEHDz5k2l8lq1agF4lpgWiY+Px1tvvYVx48ZhwYIFsLGxwfHjxxEcHIy8vDwp+VdXtWrVIAgCnjx5olRuYmKC5s2bIyIiAikpKWjTpg309PSgp6eH1q1bIyIiAhEREfD19YVcLpfW/+6776RRB0X09PQAAJmZmfDx8Sl2hn27504keXh4oFq1atiwYQO6d+8OAwODl8b//fffSycwiuo5OTkBAOrXr6/Uvq2tLe7cuaO0fkpKitK2iYiIiIio4vCef3opGxsbBAQEYM2aNcjKylJZXvTYvqKELiEhQVr2/OR/AKTh34WFhVKZh4cH5HI5Tpw4IZXl5+fj3LlzSsmkJlSrVg2dO3fG6tWri92X5124cAEKhQJLly5Fy5YtUadOHTx48EClXkFBAc6fPy+9v3nzJlJTU1GvXr1i25XL5ahfvz6uX7+usszPzw+RkZGIjIxEhw4dpPJ27dohMjISR44ckYb8Ozg4wNnZGf/88w88PT2VXu7u7gCApk2bIiYmBvb29ip1ikYHAM9uezh8+DBu3bqFAQMGlHjSxsXFRWrDzc0NAKRRG8+fVElJScGjR4+kOsC/Iw6aNGny0vaJiIiIiKj8MPmnEq1ZswaFhYVo3rw5du3ahZiYGERHR2PlypVo1aoVAEjD0b/88ktER0fjyJEj+Oyzz5TacXNzgyAI2LdvH5KTk5GZmQlTU1OMGzcO06ZNQ1hYGK5fv47Ro0cjOzsbwcHBasW5e/fuUucm+Oabb1BQUIBmzZph+/btiI6Oxs2bN7FlyxbcuHFDumru6emJ/Px8rFq1Cv/88w9+/PFHrFu3TqU9AwMDTJw4EWfOnMGFCxcwYsQItGzZstgh/0UCAgJw/PhxlXI/Pz/ExMRg//79aN++vVTevn177NmzB3fv3pWSfwCYO3cuFi5ciJUrV+Lvv//GlStXsHHjRiz7/8kdg4KCYGtri169euHYsWPSrQOTJk3CvXvKQ5js7e1x+PBh3LhxA4MHD0ZBQUGJx/F5derUQa9evfDBBx/g5MmTuHr1KoYPHw4vLy+leE+fPg1DQ0PpM0NERERERBWLyT+VqFatWrh48SL8/PwwdepUNGzYEJ07d8ahQ4ekyegAYMOGDSgoKICPjw8mT56Mzz//XKkdFxcXzJ07Fx9//DEcHBwwYcIEAMCXX36Jfv36YejQoWjatClu3bqF/fv3w9raWq0409LSVIb0v8jDwwN//fUX/P39MWPGDLzxxhto1qwZVq1ahf/973+YP38+AOCNN97AsmXLsGjRIjRs2BChoaFYuHChSnsmJib46KOPMGTIEPj6+sLMzAzbt28vMYbg4GD88ccfSEtLUypv1aoVDA0NIYoifHz+fcBoixYtkJ+fLz0SsMi7776L77//Hhs3boS3tzfat2+PkJAQ6cq/iYkJjh49iho1aqBv376oV68egoODkZOTA4ti5oBwdHTE4cOHceXKFQQFBSmN0CjN5s2b0aJFC3Tv3h3t27eHgYEBwsLClG4h+OmnnxAUFPTKt0wQEREREdHrEcTyntXtPyI9PR2WlpZIS0tTSa5ycnIQFxcHd3d3lcneqHyIooiCggLo6+uX+AjAVxUSEoLJkydLtz6oo3///mjatClmzJih8bgqo0ePHqFu3bo4f/68dHKiJOXddy/i91MzFAoFkpKSYG9vL80DQrqBfae72He6if2mu9h3FWdHbFrplUrR3+PZra4v7bcqMuFfSXnoi/ipJapgS5YsgZmZmbbDqDDx8fH45ptvypT4ExERERFR+WDyT1TBatasiYkTJ2o7jArTrFkzDBw4UNthEBERERFJsgtE6WfR70Vj4p8WihoZfVDZMPknegUjRox4pSH/RERERERE2sDkn4iIiIiIiKiK09d2AEREREREREQV6ezS9ZDl5UEhlwOegEyRh2l2N5FsVBcGMrm2wysXTP6JiIiIiIjoPyW5ZVuVMherTnDRQiwVhcP+iYiIiIiIiKo4Jv9EREREREREVRyH/RMREREREVVypT16rr+HZQVFUjXYnT727z3/9Z/d838x5d97/hs4qt4WoOt45Z8qvRMnTsDb2xsGBgbo3bv3S8uqkjlz5qBx48bS+xEjRpS6nx06dMDkyZNLbbtdu3bYunXr6wWoAYIgYM+ePeW+nZYtW2LXrl3lvh0iIiIi0h3Np45Fu1H90HzqWDS/NRbtbvbDkrOf4YvD/bDq5Fhth1cumPxTqRITEzFx4kTUqlULhoaGcHV1RY8ePXDo0KEytxEZGQlBEJCamqr29qdMmYLGjRsjLi4OISEhLy0rzYgRIyAIAgRBgFwuh6enJ+bNm4eCggK1Y3qZpUuXwtraGjk5OSrLsrOzYWFhgZUrV6rd7ooVK8q8nyXZu3cvHj58iEGDBgEABg0ahMDAQKU6YWFhEAQBc+bMUSqfM2cOatSo8doxlJdt27ZBEASVkySfffYZPv74YygUCu0ERkRERERUCTD5pxLFx8fDx8cHhw8fxpIlS3DlyhWEhYXBz88P48ePr5AYYmNj0bFjR1SvXh1WVlYvLSuLwMBAJCQkICYmBlOnTsWcOXOwZMmSYuvm5eWpHevQoUORlZ
WFX375RWXZzp07kZeXh3feeUftdi0tLdXaz5dZuXIlRo4cCZns2Vffz88PJ06cUDoBEhERAVdXV0RGRiqtGxERAT8/v9eOoTzEx8fjf//7H9q2VR2e1bVrV2RkZODPP//UQmRERERERJUDk38q0fvvvw9BEHD27Fn069cPderUQYMGDTBlyhScPn0awLPESxAEREVFSeulpqZCEARERkYiPj5eShqtra0hCAJGjBgBAMjNzcWkSZNgb28PIyMjtGnTBufOnVNq9/Hjxxg1ahQEQUBISEixZWVlaGgIR0dHuLm5Ydy4cfD398fevXsB/Du0fsGCBXB2dkbdunUBAHfv3sWAAQNgZWUFGxsb9OrVC/Hx8cW2b29vjx49emDDhg0qyzZs2IDevXvDxsYGH330EerUqQMTExPUqlULM2fORH5+/kvjfnHYf1ZWFoYNGwYzMzM4OTlh6dKlpe57cnIyDh8+jB49ekhlfn5+yMzMxPnz56WyyMhIfPzxxzhz5ow0giEnJwdnzpyR+jE1NRXvvvsu7OzsYGFhgY4dO+LSpUtK2/v111/RtGlTGBkZoVatWpg7d26Joyxmz54NJycnXL58udR9eV5hYSGCgoIwd+5c1KpVS2W5np4eunXrhm3btqnVLhERERFRVcLkX9uilwG7q5f+OtJTdd0jPcu2bvSyVwotJSUFYWFhGD9+PExNTVWWl/VKtKurq3TP9c2bN5GQkIAVK1YAAKZPn45du3Zh06ZNuHjxIjw9PREQEICUlBS4uroiISEBFhYWWL58ORISEtC/f3+VsoEDB0onBdRlbGysdIX/0KFDuHnzJsLDw7Fv3z7k5+cjICAA5ubmOHbsGE6cOAEzMzMEBga+dGRAcHAwDh8+jNu3b0tl//zzD44ePYrg4GAAgLm5OUJCQnD9+nWsWLEC3333Hb7++usyxz1t2jQcOXIEv/76Kw4cOIDIyEhcvHixxHWOHz8OExMT1KtXTyqrU6cOnJ2dERERAQDIyMjAxYsX0b9/f9SsWROnTp0CAJw8eRK5ublS8t+/f38kJSXhzz//xIULF9C0aVN06tQJKSkpAIBjx45h2LBh+OCDD3D9+nWsX78eISEhWLBggUpcoihi4sSJ2Lx5M44dO4ZGjRqV+TgAwLx582Bvby8d2+I0b94cx44dU6tdIiIiIqKqhLP9a1t+OvD0fun1clyLKUsu27r56erHBeDWrVsQRRFeXl6vtH4RPT092NjYAHh2ZbzopEFWVhbWrl2LkJAQdO3aFQDw3XffITw8HD/88AOmTZsGR0dHCIIAS0tLODo6AgBMTU1VyiwtLaUr9WUhiiIOHTqE/fv3Y+LEiVK5qakpvv/+e8jlcgDAli1boFAo8P3330snFzZu3AgrKytERkaiS5cuKm0HBATA2dkZGzdulO6bDwkJgaurKzp16gTg2X3oRWrWrIn//e9/2LZtG6ZPn15q7JmZmfjhhx+wZcsWqb1NmzahevXqJa53+/ZtODg4SEP+i/j5+SEyMhIzZszAsWPHUKdOHdjZ2aFdu3aIjIyUlru7u8PNzQ3Hjx/H2bNnkZSUBENDQwDAV199hT179mDnzp0YM2YM5s6di48//hjDhw8HANSqVQvz58/H9OnTMXv2bGnbBQUFeOedd/DXX3/h+PHjcHFxgSiKpR6DIsePH8cPP/ygNOqkOM7Ozrh79y4UCoXK/hMRERFpW2kz+RNpApN/bTOwAIxdSq9nZFd8WVnWNbBQPy5ArSTsVcTGxiI/Px++vr5SmYGBAZo3b47o6Gi12urTpw/69OlTar19+/bBzMwM+fn5UCgUGDJkiNLEdt7e3lLiDwCXLl3CrVu3YG5urtROTk4OYmNjcezYMenEBQCsX78eQUFBGD58OEJCQjB79myIoohNmzYp3Wu/fft2rFy5ErGxscjMzERBQQEsLMrWT7GxscjLy0OLFi2kMhsbm1JPfjx9+hRGRkYq5UVPCcjPz0dkZCQ6dOgAAGjfvj3Wr18PANJJgKJjkpmZiWrVqqm0HxsbK9U5ceKE0pX+wsJC5OTkIDs7GyYmJgCADz/8EIaGhjh9+jRsbW1fGvudO3dQv3596f0nn3yCiRMnYujQofjuu+9KXBd4NsJDoVAgNzcXxsbGJdYlIiIiIqqKmPxrW70pz16vov1ezcbygtq1a0MQBNy4caPEekUJ7fMnC0q6f12b/Pz8sHbtWsjlcjg7O0NfX/kr8OLtDZmZmfDx8UFoaKhKW3Z2dpDL5UpXnR0cHAAAo0aNwsKFC3H48GEoFArcvXsXI0eOBACcOnVKukc9ICAAlpaW2LZtW5nu238dtra2ePLkiUq5n58fsrKycO7cOURERGDatGkAniX/o0aNQkpKCs6cOYOxY5898iQzMxNOTk4qEwIC/94KkpmZiblz56Jv374qdZ4/AdG5c2f89NNP2L9/P4KCgl4au7Ozs9JxtrGxQWxsLOLj45XmMCia0V9fXx83b96Eh4cHgGe3sJiamjLxJyIiIqL/LK2Ofz169Ch69OgBZ2dnlWd+5+fn46OPPoK3tzdMTU3h7OyMYcOG4cGDB0ptpKSkICgoCBYWFrCyskJwcDAyMzOV6ly+fBlt27aFkZERXF1dsXjxYpVYduzYAS8vLxgZGcHb2xt//PFHueyzLrGxsUFAQADWrFmDrKwsleVFj+2zs3s2KiEhIUFa9uIw7KKr6YWFhVKZh4cH5HI5Tpw4IZXl5+fj3LlzSld5NcnU1BSenp6oUaOGSuJfnKZNmyImJgb29vbw9PRUellaWsLY2FiprGiEgIeHB9q3b48NGzZg48aN8Pf3h5ubG4Bn98+7ubnh008/RbNmzVC7dm2l+QFK4+HhAQMDA5w5c0Yqe/LkCf7+++8S12vSpAkSExNVTgB4eHjA1dUVe/fuRVRUFNq3bw8AcHFxgYuLC5YuXYq8vDzpyn/Tpk2RmJgIfX19lWNSdAW+adOmuHnzpspyT09PpWH3PXv2xNatW/Huu++WOCHfi9uysbGBl5cXrly5gqioKOnVs2dP+Pn5ISoqCq6u/94qc/XqVTRp0qSMR5iIiIiIdN2O2DTsiE3Drn/ScDwhC7v+SZPK/qu3WWg1+c/KysIbb7yBNWvWqCzLzs7GxYsXMXPmTFy8eBG//PILbt68iZ49lSe+CwoKwrVr16QJ2o4ePYoxY8ZIy9PT09GlSxe4ubnhwoULWLJkCebMmYNvv/1WqnPy5EkMHjwYwcHB+Ouvv9C7d2/07t0bV69eLb+d1xFr1qxBYWEhmjdvjl27diEmJgbR0dFYuXIlWrVqBeDZkOqWLVviyy+/RHR0NI4cOaJ0TzsAuLm5QRAE7Nu3D8nJycjMzISpqSnGjRuHadOmISwsDNevX8fo0aORnZ1d4uRtxdm9e/drz01QnKCgINja2qJXr144duwY4uLiEBkZiUmTJuHevXslrhscHIxffvkFu3fvVtqf2rVr486dO9i2bRtiY2OxcuVK7N69u8wxmZmZITg4GNOmTcPhw4dx9epVjBgxotR72Zs0aQJbW1ulk
y1F/Pz88M0338DT01MavQA8u/q/atUqaWJAAPD390erVq3Qu3dvHDhwAPHx8Th58iQ+/fRT6akBs2bNwubNmzF37lxcu3YN0dHR2LZtm8rnAnh2y8aPP/6IkSNHYufOnWU+DkZGRmjYsKHSy8rKCubm5mjYsKHS7RvHjh0rdn4GIiIiIqL/Cq0O++/atavS/dLPs7S0RHh4uFLZ6tWr0bx5c9y5cwc1atRAdHQ0wsLCcO7cOTRr1gwAsGrVKnTr1g1fffUVnJ2dERoairy8PGzYsAFyuRwNGjRAVFQUli1bJp0kWLFiBQIDA6XhzvPnz0d4eDhWr16NdevWleMRqPxq1aqFixcvYsGCBZg6dSoSEhJgZ2cHHx8frF27Vqq3YcMGBAcHw8fHB3Xr1sXixYuVki0XFxdpEriRI0di2LBhCAkJwZdffgmFQoGhQ4ciIyMDzZo1w/79+2Ftba1WnGlpabh586bG9ruIiYkJjh49io8++gh9+/ZFRkYGXFxc0KlTp1Lv0e/Xrx8mTJgAPT09pcf09ezZEx9++CEmTJiA3NxcdO/eHTNnzlSae6A0S5YsQWZmJnr06AFzc3NMnToVaWkln8HU09PDyJEjERoairfeektpmZ+fHzZv3izd71+kffv22LhxI4YMGSKVCYKAP/74A59++ilGjhyJ5ORkODo6ol27dtKJg4CAAOzbtw/z5s3DokWLYGBgAC8vL7z77rvFxvb2229LnwNBEFRO8r2O+/fv4+TJk9iyZYvG2iQiIiIi3fb7iesqZctbFFOxChHE8p7VrYwEQcDu3buVkqQXHTx4EF26dEFqaiosLCywYcMGTJ06VWkYc0FBAYyMjLBjxw706dMHw4YNQ3p6utItBREREejYsSNSUlJgbW2NGjVqYMqUKZg8ebJUZ/bs2dizZ4/Ks8uL5ObmIjc3V3qfnp4OV1dXPHnyRCUpzMnJQXx8PNzd3YudcI3KR35+PgwMDLQdRqWSmJiIhg0b4sKFC9JtCJWRJvvuo48+wpMnT5RG+zwvJycHcXFxqFmzJr+fr0GhUCA5ORl2dnZ8ooKOYd/pLvadbmK/6a7y7Ltd/7z+MPR+tSw1EEnloInjIRFFCBkpEM1tADUfDa4LxzQ9PR3W1tZIS0sr9eKkzkz4l5OTg48++giDBw+WdioxMRH29vZK9fT19WFjY4PExESpjru7u1KdoquTiYmJsLa2RmJiotJQ56I6RW0UZ+HChZg7d65KeXJyMnJycpTKimaWLygoQEFBQRn3mF6HKIrS/AKCml/yqszW1hbr169HXFwcXFzK8KQILdB039na2mLSpEkv/e4VFBRAoVDg8ePHPFn0GhQKBdLS0iCKIv8xq2PYd7qLfaeb2G+6qzz7TshQnV9LXUlJuaVX0hGaOB7/EiE8zQAE4P//U2a6cEwzMjLKXFcnkv/8/HwMGDAAoigqDTXXphkzZmDKlH9n6S+68m9nZ1fslf+MjAzo6+uXaZI50hwmc6r69eun7RDKRFN9V3Q7z8vo6+tDJpOhWrVqvPL/GhQKBQRB4JUsHcS+013sO93EftNd5dl3YubrX+m2t6/8V6nLShPH49/GREAERDP1r/zrwjFV59+vlT4TLUr8b9++jcOHDysl1o6OjkhKSlKqX1BQgJSUFDg6Okp1Hj58qFSn6H1pdYqWF8fQ0BCGhoYq5TKZTOWPgUwmgyAI0ovKnyiK0rHmMdctFd13Rd/L4r67pB4eR93FvtNd7DvdxH7TXeXWd8Lrt1elPk8aOB7/UjxL+gVBqd36K7+EQUY68s0tgL6AQWE6Vty9jNtGjWAit0D/Rh/rxDFVJ8ZKvTdFiX9MTAwOHjyIatWqKS1v1aoVUlNTceHCBams6LnqLVq0kOocPXpU6bnz4eHhqFu3rjSpXKtWrXDo0CGltsPDw6XZ7ImIiIiIiKjqcN++GXU2fgP37ZvhnrQZdRK/wZ57J/H7jW9w6NZmbYdXLrSa/GdmZkrP5waAuLg4REVF4c6dO8jPz8fbb7+N8+fPIzQ0FIWFhUhMTERiYiLy8vIAAPXq1UNgYCBGjx6Ns2fP4sSJE5gwYQIGDRokPZZsyJAhkMvlCA4OxrVr17B9+3asWLFCacj+Bx98gLCwMCxduhQ3btzAnDlzcP78eUyYMKHCjwkRERERERGRpmk1+T9//jyaNGmCJk2aAACmTJmCJk2aYNasWbh//z727t2Le/fuoXHjxnBycpJeJ0+elNoIDQ2Fl5cXOnXqhG7duqFNmzZKs3pbWlriwIEDiIuLg4+PD6ZOnYpZs2ZJj/kDgNatW2Pr1q349ttv8cYbb2Dnzp3Ys2cPGjZsWHEHg4iIiIiIiKicaPWe/w4dOqCkJw2W5SmENjY22Lp1a4l1GjVqhGPHjpVYp3///ujfv3+p2yMiIiIiIiLSNZX6nn8iIiIiIiIien1M/omIiIiIiIiqOCb/VOmdOHEC3t7eMDAwQO/evV9apgt++OEHdOnSRdthYMSIERVy3NatW4cePXqU+3aIiIiIiKhkTP6pVImJiZg4cSJq1aoFQ0NDuLq6okePHiqPRyxJZGQkBEFAamqq2tufMmUKGjdujLi4OISEhLy0rCRFz3J/2WvOnDlqx/V823v27Cm1Xk5ODmbOnInZs2cDAMLCwiAIAhITE5XqOTk5oWbNmkpl8fHxEARBrWNe3i5duoTBgwfD1dUVxsbGqFevHlasWKFUZ9SoUbh48WKpc24QEREREVH50uqEf1T5xcfHw9fXF1ZWVliyZAm8vb2Rn5+P/fv3Y/z48bhx40a5xxAbG4v33nsP1atXL7GsJAkJCdLv27dvx6xZs3Dz5k2pzMzMTHMBv8TOnTthYWEBX19fAECbNm2gr6+PyMhIDBo0CAAQHR2Np0+fIjs7G/Hx8dJJgIiICBgaGkrrVgYXLlyAvb09tmzZAldXV5w8eRJjxoyBnp6e9JhMuVyOIUOGYOXKlWjbtq2WIyYiIiIi+u/ilX8q0fvvvw9BEHD27Fn069cPderUQYMGDTBlyhScPn0awL9XpaOioqT1UlNTIQgCIiMjER8fDz8/PwCAtbU1BEHAiBEjAAC5ubmYNGkS7O3tYWRkhDZt2uDcuXNK7T5+/BijRo2CIAgICQkptqw0jo6O0svS0hKCICiVbdu2DfXq1YORkRG8vLzwzTffSOvm5eVhwoQJcHJygpGREdzc3LBw4UIAkJLzPn36QBAElSv2z9u2bZvSEHgzMzO8+eabiIyMlMoiIyPRpk0b+Pr6qpS3bNkSRkZGAIDvv//+pfECwN27dzFgwABYWVnBxsYGvXr1Qnx8/EtjO3fuHOzs7LBo0aJSjuS/Ro0ahRUrVqB9+/aoVasW3nnnHYwcORK//PKLUr0ePXpg7969ePr0aZnbJiIiIiIqT8ktfJHYpiOSW/gi2cIXiZYd8aZldbzh1BH17SvPBTdN4pV/LVt2ahmWnVpWar2mTk2xd/BepbKeP/XExYSLpa47pdUUTGk1Re3YUlJSEBYWhgULFsDU1FRluZWVVZnacXV1xa5du9Cv
Xz/cvHkTFhYWMDY2BgBMnz4du3btwqZNm+Dm5obFixcjICAAt27dgqurKxISElC3bl3MmzcPAwcOhLm5OQIDA5XKLC0tERISgpEjR5bp8ZAvCg0NxaxZs7B69Wo0adIEf/31F0aPHg1TU1MMHz4cK1euxN69e/Hzzz+jRo0auHv3Lu7evQvgWdJsb2+PjRs3IjAwEHp6ei/dzvHjxzF06FClMj8/P+zcuVN6HxERgQ4dOqCwsBARERHSSZLIyEiMGjWqTPHm5+cjICAArVq1wrFjx6Cvr4/PP/8cgYGBuHz5MuRyuVIMhw8fRt++fbF48WKMGTNG7eP3vLS0NNjY2CiVNWvWDAUFBThz5gw6dOjwWu0TEREREWnC2WXfqZQFeWkhkArE5F/L0nPTcT/jfqn1XC1dVcqSs5PLtG56bvorxXbr1i2Ioggvr9f7Fujp6UkJob29vXTSICsrC2vXrkVISAi6du0KAPjuu+8QHh6OH374AdOmTYOjoyMEQYClpSUcHR0BAKampipllpaWqFu37ivFN3v2bCxduhR9+/YFALi7u+P69etYv349hg8fjjt37qB27dpo06YNBEGAm5ubtK6dnR2AZydCimIpTmpqKtLS0uDs7KxU7ufnhy+++AIJCQlwcnLCkSNHMG3aNBQUFGDt2rUAgH/++Qd37tyRRk+UFu/27duhUCjw/fffQxAEAMDGjRthZWWFyMhIpQkHd+/ejWHDhuH777/HwIEDX+n4FTl58iS2b9+O33//XancxMQElpaWuH379mu1T0REREREr47Jv5ZZGFrAxdyl1Hp2JnbFlpVlXQtDi1eK7VWuoqsjNjYW+fn5SvexGxgYoHnz5oiOjlarrT59+qBPnz5qx5CVlYXY2FgEBwdj9OjRUnlBQQEsLS0BPJsZv3Pnzqhbty4CAwPx1ltvqT1jf9GQ96Jh+0Vat24NuVyOyMhIvPHGG3j69CmaNm0KhUKB5ORkxMXFITIyEsbGxmjZsmWZ4r106RJu3boFc3NzpW3l5OQgNjZWen/mzBns27cPO3fuLHXm/65du0qT9rm5ueHatWtKy69evYpevXph9uzZxR4bY2NjZGdnl3KUiIiIiOi/YEdsmrZD+E9i8q9lrzokH4DKbQCaVrt2bQiCUOqkfjLZs6kjnj9ZkJ+fX66xaUpmZiaAZyMOWrRoobSsaAh/06ZNERcXhz///BMHDx7EgAED4O/vrzRcvzTVqlWDIAh48uSJUrmJiQmaN2+OiIgIpKSkoE2bNtDT04Oenh5at26NiIgIREREwNfXF3K5XFq/pHgzMzPh4+OD0NBQlTiKRioAgIeHB6pVq4YNGzage/fuMDAweGn833//vXQC48V6169fR6dOnTBmzBh89tlnxa6fkpKitG0iIiIiIqpYTP7ppWxsbBAQEIA1a9Zg0qRJKvf9p6amwsrKSkrqEhIS0KRJEwBQmvwPgHSfeWFhoVTm4eEBuVyOEydOSEPp8/Pzce7cOUyePLmc9kqZg4MDnJ2d8c8//yAoKOil9SwsLDBw4EAMHDgQb7/9NgIDA5GSkgIbGxsYGBgo7Vdx5HI56tevj+vXr6tcGffz88O2bdvw5MkTpXvi27Vrh8jISBw5cgTvvfdemeNt2rQptm/fDnt7e1hYvHzUh62tLX755Rd06NABAwYMwM8///zSEwAuLsWPMLl27Ro6duyI4cOHY8GCBcXWiY2NRU5OjvTZICIiIiLStvbv9IDho2Tk2toBnwKG+cnoHpeA2zInWBrZYXbn37QdosZxtn8q0Zo1a1BYWIjmzZtj165diImJQXR0NFauXIlWrVoBgDQk/csvv0R0dDSOHDmicgXYzc0NgiBg3759SE5ORmZmJkxNTTFu3DhMmzYNYWFhuH79OkaPHo3s7GwEBwerFefu3btfeW6CuXPnYuHChVi5ciX+/vtvXLlyBRs3bsSyZc8mYly2bBl++ukn3LhxA3///Td27NgBR0dHae6CmjVr4tChQ0hMTFS5sv+8gIAAHD9+XKXcz88PMTEx2L9/P9q3by+Vt2/fHnv27MHdu3el+/3LEm9QUBBsbW3Rq1cvHDt2TLp1YNKkSbh3757Stu3t7XH48GHcuHEDgwcPRkFBQZmP29WrV+Hn54cuXbpgypQpSExMRGJiIpKTk5XqHTt2DLVq1YKHh0eZ2yYiIiIiKk9mcbGwvHUDZnGxMHsaC8unNxD3NAP30m4gISO29AZ0EJN/KlGtWrVw8eJF+Pn5YerUqWjYsCE6d+6MQ4cOSRPSAcCGDRtQUFAAHx8fTJ48GZ9//rlSOy4uLpg7dy4+/vhjODg4SM+B//LLL9GvXz8MHToUTZs2xa1bt7B//35YW1urFWdaWhpu3rz5Svv47rvv4vvvv8fGjRvh7e2N9u3bIyQkBO7u7gAAc3NzLF68GM2aNcObb76J+Ph4/PHHH9LtDkuXLkV4eDhcXV1LvLodHByMP/74A2lpyvc4tWrVCoaGhhBFET4+PlJ5ixYtkJ+fLz0SsKzxmpiY4OjRo6hRowb69u2LevXqITg4GDk5OcWOBHB0dMThw4dx5coVBAUFlTqKocjOnTuRnJyMLVu2wMnJSXo9HysA/PTTT0rzExARERERUcUTxPKe1e0/Ij09HZaWlkhLS1NJsHJychAXFwd3d3eVCd+ofIiiiIKCAujr60sz3lcG/fv3R9OmTTFjxgxth1Ihim4L+Pvvv6UJCUtT0X3H76dmKBQKJCUlwd7eXjoxRrqBfae72He6if2mu8qz7zQxAV5/j7L9W6syqNAJ/0QFhIwUiOY2gPBvv3X3rQ+Thw+Q7eAMrAJM8h/AJU6GBwUK2Jg4Y13f6zpxTEvKQ1/EvzhEFWjJkiUwMzPTdhgVJiEhAZs3by5z4k9EREREROWDE/4RVaCaNWti4sSJ2g6jwvj7+2s7BCIiIiIiAq/8ExEREREREVV5TP6JiIiIiIiIqjgm/xWIcysSVT78XhIRERHRfwGT/wpgYGAAAMjOztZyJET0ory8PACAnp6eliMhIiIiIio/nPCvAujp6cHKygpJSUkAnj2HvTI9fq4qqqyP+qPSVWTfKRQKJCcnw8TEBPr6/HNIRERE9F9xfeJ06GdlosDUDKgO6BdmYrjBZdw3agQjg6r5dC7+a7eCODo6AoB0AoDKlyiKUCgUkMlkTP51TEX3nUwmQ40aNfg5ISIiIvoPiRs0QqWsiRPQpOJDqTBM/iuIIAhwcnKCvb098vPztR1OladQKPD48WNUq1YNMhnvbtElFd13crmcnxEiIiIiqvKY/FcwPT093ltcARQKBQwMDGBkZMTETsew74iIiIiINI/JPxEREREREf2nGCUlQigshKinB1gBgliIxzmP8VS/GmSCHqxNHLUdosYx+SciIiIiIqL/lE59OsLk4QNkOzgDqwCT/AdwiZPhQYECNibOWNf3urZD1DiOqSUiIiIiIiKq4pj8ExEREREREVVxTP6JiIiIiIiIqjgm/0RERERERERVHJN/IiIiIiIioiqOyT8RERERERFRFcfkn4i
IiIiIiKiKY/JPREREREREVMUx+SciIiIiIiKq4vS1HQARERERERFRRTry46+QFRRAoa8POAEysQBz3O8hQ14dekLVTJOr5l4RERERERERvURmrdoqZVYm9WBV8aFUGA77JyIiIiIiIqrimPwTERERERERVXEc9k9ERERERET/Ka57d0D/6VMUGBsDrQF9xVP8nngZCcaNYKhvjDbu/bUdosYx+SciIiIiIqL/lEaLZsPk4QNkOzgDqwCT/AfoFSfDgwIFbEycq2Tyz2H/RERERERERFUck38iIiIiIiKiKo7JPxEREREREVEVx+SfiIiIiIiIqIpj8k9ERERERERUxTH5JyIiIiIiIqrimPwTERERERERVXFM/omIiIiIiIiqOH1tB0BERERERERUkXLs7P/9KX9WZmfwBDlya1gZ2WsxsvLD5J+IiIiIiIj+Uw7tiVQp+7RpxcdRkZj8ExERERERkUbsiE3Tdgj0Erznn4iIiIiIiKiKY/JPREREREREVMVx2D8RERERERH9pzT9bDLkqU+QZ2UNBAPygieYHh+NeMN6MJNbY0zL5doOUeO0euX/6NGj6NGjB5ydnSEIAvbs2aO0XBRFzJo1C05OTjA2Noa/vz9iYmKU6qSkpCAoKAgWFhawsrJCcHAwMjMzlepcvnwZbdu2hZGREVxdXbF48WKVWHbs2AEvLy8YGRnB29sbf/zxh8b3l4iIiIiI/nt2xKaV+KKK5xRxAK5hv8Ip4gCcnhyAa8qvOPr4Fk7f+RUXHxzQdnjlQqvJf1ZWFt544w2sWbOm2OWLFy/GypUrsW7dOpw5cwampqYICAhATk6OVCcoKAjXrl1DeHg49u3bh6NHj2LMmDHS8vT0dHTp0gVubm64cOEClixZgjlz5uDbb7+V6pw8eRKDBw9GcHAw/vrrL/Tu3Ru9e/fG1atXy2/niYiIiIiIiCqIVof9d+3aFV27di12mSiKWL58OT777DP06tULALB582Y4ODhgz549GDRoEKKjoxEWFoZz586hWbNmAIBVq1ahW7du+Oqrr+Ds7IzQ0FDk5eVhw4YNkMvlaNCgAaKiorBs2TLpJMGKFSsQGBiIadOmAQDmz5+P8PBwrF69GuvWrauAI0FERERERERUfirtPf9xcXFITEyEv7+/VGZpaYkWLVrg1KlTGDRoEE6dOgUrKysp8QcAf39/yGQynDlzBn369MGpU6fQrl07yOVyqU5AQAAWLVqEJ0+ewNraGqdOncKUKVOUth8QEKByG8LzcnNzkZubK71PT08HACgUCigUitfdfXpNCoUCoiiyL3QQ+043sd90F/tOd7HvdBP7TXe9Vt+J5d/fleYzVQH7qhZR/PeF52MTX/j54jq6kdepE2OlTf4TExMBAA4ODkrlDg4O0rLExETY29srLdfX14eNjY1SHXd3d5U2ipZZW1sjMTGxxO0UZ+HChZg7d65KeXJystJtCaQdCoUCaWlpEEURMhkfaqFL2He6if2mu9h3uot9p5vYb7rrdfpOyMgqp6j+lZSUW3qlClAR+6oeEcLTDEAA/v8//1+s+Pfni/m/qICQkVJpjmlJMjIyyly30ib/ld2MGTOURgukp6fD1dUVdnZ2sLCw0GJkBDz74ywIAuzs7Pg/Vh3DvtNN7Dfdxb7TXew73cR+012v03diZvlP6mdvb1nu2yiLithXtYgiIAKimQ0gPJf8C7J/fwovrCPIIJrbVJpjWhIjI6My1620yb+joyMA4OHDh3BycpLKHz58iMaNG0t1kpKSlNYrKChASkqKtL6joyMePnyoVKfofWl1ipYXx9DQEIaGhirlMpmMf8grCUEQ2B86in2nm9hvuot9p7vYd7qJ/aa7XrnvhPLv60rzeaqAfVWP4lnSLwgvxCa88PP5Rc/qVppjWgJ1Yqy0e+Pu7g5HR0ccOnRIKktPT8eZM2fQqlUrAECrVq2QmpqKCxcuSHUOHz4MhUKBFi1aSHWOHj2K/Px8qU54eDjq1q0La2trqc7z2ymqU7QdIiIiIiIiIl2m1eQ/MzMTUVFRiIqKAvBskr+oqCjcuXMHgiBg8uTJ+Pzzz7F3715cuXIFw4YNg7OzM3r37g0AqFevHgIDAzF69GicPXsWJ06cwIQJEzBo0CA4OzsDAIYMGQK5XI7g4GBcu3YN27dvx4oVK5SG7H/wwQcICwvD0qVLcePGDcyZMwfnz5/HhAkTKvqQEBEREREREWmcVof9nz9/Hn5+ftL7ooR8+PDhCAkJwfTp05GVlYUxY8YgNTUVbdq0QVhYmNJ9DaGhoZgwYQI6deoEmUyGfv36YeXKldJyS0tLHDhwAOPHj4ePjw9sbW0xa9Ys6TF/ANC6dWts3boVn332GT755BPUrl0be/bsQcOGDSvgKBAREREREVFFutujHwzSUpFvaQXYAgYFqQh8ehXxhg1hKrfSdnjlQhBFsZhnG5C60tPTYWlpibS0NE74VwkoFAokJSXB3t5eJ+7VoX+x73QT+013se90F/tON7HfdNfr9N2O2PKfBK+/R+WYnK4i9lUt/z9zv2huo/Z8BJXlmJZEnTyUf3GIiIiIiIiIqrhKO9s/ERERERERaU5pV+V14Uo3vTpe+SciIiIiIiKq4njln4iIiIiIiP5TArq8CeOHiXjq4AgsAYzzEtEgPgf3FEawNnbE8p7ntB2ixjH5JyIiIiIiov8U/awsGGRlID/LHCgEDBQZyCqU4WlBBowNzLUdXrlg8k9ERERERKTjKt0s+1Tp8J5/IiIiIiIioiqOV/6JiIiIiIioTKMH+EQA3cUr/0RERERERERVHJN/IiIiIiIioiqOyT8RERERERFRFcfkn4iIiIiIiKiKY/JPREREREREVMVxtn8iIiIiIiL6T7k4fxn0cnJQaGQEuAN6ihxMsryGh8YNINcz0nZ45YLJPxEREREREf2nJHQMVCmrVa03amkhlorCYf9EREREREREVRyTfyIiIiIiIqIqTu1h/ytXrixx+aRJk145GCIiIiIiIqLyZnU1CrK8PCjkcsAdkCnycCMtDqmG7tCXyVGrWmNth6hxZUr+r1+/jvr16wMAJk+eDBMTE9jb20MURaV6giAw+SciIiIiIqJKzXfsEJg8fIBsB2dgFWCS/wDD4mR4UKCAjYkz1vW9ru0QNa5Mw/7fe+89vP322wCATz/9FDKZDP7+/jh9+jTi4uKk1z///FOuwRIRERERERGR+sqU/B84cAB79+7Fo0ePMH/+fERHRyMvLw9169bFggULkJubW95xEhEREREREdErKlPyf//+fejp6cHMzAwA4OLigpCQEBw+fBiHDh2Cp6cnNm/eXK6BEhEREREREdGrKdM9/8OGDcPGjRthZGSEy5cv/7uyvj6WL1+OX3/9FRMmTMCKFStw4cKFcguWiIiIiIiIiNRXpuT/xIkT0u+NGzeGIAjSZH/P/x4VFaX5CImIiIiIiIjotaj9qL+4uLjyiIOIiIiIiIiIyonayb+bm1t5xEFERERERERE5UTt5H/v3r0lLu/Zs+crB0NEREREREREmq
d28t+7d28IggAA0r3+RQRBQGFhoWYiIyIiIiIiIiKNUDv5DwoKwm+//Ybp06dj6tSpMDQ0LI+4iIiIiIiIiMrF/v1nAFEEBAEwBiCKWPJGFvJlptLF7qpGpu4KP/74Iw4dOoQDBw6gTp06CA0NLY+4iIiIiIiIiMpFgZk5Cswtnv3UM0eBvgUMjJxgIreAsYG5tsMrF2on/wDg4+ODyMhIrFixAvPmzUOzZs1w9OhRTcdGRERERERERBqgdvKfnp4uvTp27IgTJ06gV69eeOutt9C7d+9yCJGIiIiIiIiIXofa9/xbWVkVew+EKIr47bffNBIUERERERERUXmp/cNqGGRmIN/MHOgGGBRmYMP9y7hj1AgmBuZ4q/4EbYeocWon/xEREeURBxEREREREVGFqLPhG5g8fIBsB2egCWCS/wDb4mR4UPAnbEycmfwDgLu7O1xdXavsDIhEREREREREVY3a9/y7u7sjOTm5PGIhIiIiIiIionKgdvIvimJ5xEFERERERERE5UTtYf8AcO/ePeTk5BS7rEaNGq8VEBERERERERFp1isl/2+++aZKmSiKEAQBhYWFrx0UEREREREREWnOKyX/Z86cgZ2dnaZjISIiIiIiIqJyoHbyLwgCatSoAXt7+/KIh4iIiIiIiIg0TO3knxP+ERERERER/TftiE3Tdgj0itRO/uPi4jjkn4iIiIiIiHRWaoNGeOrkglybaoAp8LTABQ3M4mGqXxMWhtW0HV65UDv5v337Nm7fvv3S5e3atXutgIiIiIiIiIjK04lvt6mUjW6ghUAqkNrJf4cOHV66jLP9ExEREREREVU+aif/T548KY84iIiIiIiIiKicqJ38W1paSr9nZGTg008/RVRUFLy9vfHFF19oNDgiIiIiIiIien1qJ//Pmzp1KsLDw9G/f3/8+eefmDhxIjZv3qyp2IiIiIiIiIg0znfMIBimPH424d9UwLDgMQbHxSP+/yf8+8hPdU4AXfdayf/Bgwfxww8/oGPHjhg1ahTat2+vqbiIiIiIiIiIyoXVtcswefgA2Q7OQBZgkv8A1zJleFCQDBsTZ22HVy5kr7Pyo0ePULNmTQCAu7s7Hj16pImYiIiIiIiIiEiD1L7yn56ervQ+MzMT6enpyMnJ0VhQRERERERERKQ5aif/VlZWEAQBACCKIpo0aSL9XlRORERERERERJWH2sl/REREecRBREREREREROVE7eSfk/oRERERERER6ZYyJf8ZGRkwNzeX3j98+BBr1qzB9evXIQgCGjRogHHjxsHBwaHcAiUiIiIiIiKiV1PqbP8FBQVwdHREQkICAODEiRPw9PTE9u3bYW5uDjMzM/z000+oXbs2Tp8+rdHgCgsLMXPmTLi7u8PY2BgeHh6YP38+RFGU6oiiiFmzZsHJyQnGxsbw9/dHTEyMUjspKSkICgqChYUFrKysEBwcjMzMTKU6ly9fRtu2bWFkZARXV1csXrxYo/tCREREREREpC2lXvnX19eHoaEhnj59CgD43//+h3feeQfffPON0sR/48aNw9SpU3HixAmNBbdo0SKsXbsWmzZtQoMGDXD+/HmMHDkSlpaWmDRpEgBg8eLFWLlyJTZt2gR3d3fMnDkTAQEBuH79OoyMjAAAQUFBSEhIQHh4OPLz8zFy5EiMGTMGW7duBfDsCQZdunSBv78/1q1bhytXrmDUqFGwsrLCmDFjNLY/RERERERERNpQpmH/dnZ2yMjIAABERUUhJCREaWZ/QRAwefJkaeZ/TTl58iR69eqF7t27AwBq1qyJn376CWfPngXw7KTD8uXL8dlnn6FXr14AgM2bN8PBwQF79uzBoEGDEB0djbCwMJw7dw7NmjUDAKxatQrdunXDV199BWdnZ4SGhiIvLw8bNmyAXC5HgwYNEBUVhWXLljH5JyIiIiIiqmL+HvU+DDIzkG9mDjgBBoUZGITLuGPUCCYG5qU3oIPKlPy3atUK27dvxxtvvAEHBwfEx8ejbt26SnXi4uJgYWGh0eBat26Nb7/9Fn///Tfq1KmDS5cu4fjx41i2bJm0zcTERPj7+0vrWFpaokWLFjh16hQGDRqEU6dOwcrKSkr8AcDf3x8ymQxnzpxBnz59cOrUKbRr1w5yuVyqExAQgEWLFuHJkyewtrZWiS03Nxe5ubnS+/T0dACAQqGAQqHQ6HEg9SkUCoiiyL7QQew73cR+013sO93FvtNN7Dfd9Vp9J7K/tUYU/33h336IGfW+StWWLkBLaT3dyOvUibFMyf/kyZPRtm1b1K5dG4MHD8a7776Lr776Cq1btwbwbB6AadOmYcCAAa8W8Ut8/PHHSE9Ph5eXF/T09FBYWIgFCxYgKCgIAJCYmAgAKhMNOjg4SMsSExNhb2+vtFxfXx82NjZKddzd3VXaKFpWXPK/cOFCzJ07V6U8OTkZOTk5r7K7pEEKhQJpaWkQRREyWalTW1Alwr7TTew33cW+013sO93EftNdr9N3QkZWOUVFpRMhPM0ABOD//1NmSUm5pVfSsqIR+mVRpuS/cePG+O233zBmzBjcvn0b+fn5GDx4sDT0X09PD6NHj9b4JHk///wzQkNDsXXrVmko/uTJk+Hs7Izhw4drdFvqmjFjBqZMmSK9T09Ph6urK+zs7DQ+AoLUp1AoIAgC7Ozs+D9WHcO+003sN93FvtNd7DvdxH7TXa/Td2JmWjlFRaUSRUAERDMbQFAv+be3tyynoDSnaJ67sihT8g8AHTp0wN9//40bN24gOTlZGl5gbW0NDw8PmJqaqh9pKaZNm4aPP/4YgwYNAgB4e3vj9u3bWLhwIYYPHw5HR0cAzx496OTkJK338OFDNG7cGADg6OiIpKQkpXYLCgqQkpIire/o6IiHDx8q1Sl6X1TnRYaGhjA0NFQpl8lk/ENeSQiCwP7QUew73cR+013sO93FvtNN7Dfd9cp9J7CvtUfxLOkXBKV+0M/MeHZiQBAAYwCiiKcFWciXmUIQBBgbmOvEd1SdGMuc/Bfx8vKCl5cXAEiP3BPUPINSVtnZ2So7o6enJ514cHd3h6OjIw4dOiQl++np6Thz5gzGjRsH4Nl8Bampqbhw4QJ8fHwAAIcPH4ZCoUCLFi2kOp9++iny8/NhYGAAAAgPD0fdunWLHfJPREREREREuisgoAVMHj5AtoMzsAowyX8AlzgZHhQoYGPijHV9r2s7RI17pVMZmzdvhre3N4yNjWFsbIxGjRrhxx9/1HRs6NGjBxYsWIDff/8d8fHx2L17N5YtW4Y+ffoA+PcpA59//jn27t2LK1euYNiwYXB2dkbv3r0BAPXq1UNgYCBGjx6Ns2fP4sSJE5gwYQIGDRoEZ2dnAMCQIUMgl8sRHByMa9euYfv27VixYoXSsH4iIiIiIiIiXaX2lf9ly5Zh5syZmDBhAnx9fQEAx48fx3vvvYdHjx7hww8/1Fhwq1atwsyZM/H+++8jKSkJzs7OGDt2LGbNmiXVmT59OrKysjBmzBikpqaiTZs2CAsLU7r3ITQ0FBMmTECnTp0gk8nQr18/rFy5Ulpua
WmJAwcOYPz48fDx8YGtrS1mzZrFx/wRERERERFRlSCIRWP3y8jd3R1z587FsGHDlMo3bdqEOXPmIC4uTqMB6or09HRYWloiLS2NE/5VAgqFAklJSbC3t9eJe3XoX+w73cR+013sO93FvtNN7Dfd9Tp9tyOWE/5pjaiAkJEC0dxG6Z7/7r71Sx3239+j8k/4p04eqvZfnISEBOkRf89r3bo1EhIS1G2OiIiIiIiIiMqZ2sm/p6cnfv75Z5Xy7du3o3bt2hoJioiIiIiIiIg0R+17/ufOnYuBAwfi6NGj0j3/J06cwKFDh4o9KUBERERERERE2qX2lf9+/frhzJkzsLW1xZ49e7Bnzx7Y2tri7Nmz0iz8RERERERERFR5qH3lHwB8fHywZcsWTcdCREREREREROVA7Sv/f/zxB/bv369Svn//fvz5558aCYqIiIiIiIiINEftK/8ff/wxvvzyS5VyURTx8ccfo2vXrhoJjIiIiIiIiKg8nFi/FbK8PCjkcsAdkCny8IlLHFIN3aEvk2s7vHKhdvIfExOD+vXrq5R7eXnh1q1bGgmKiIiIiIiIqLykNmysUmZv3hz2FR9KhVF72L+lpSX++ecflfJbt27B1NRUI0ERERERERERkeaonfz36tULkydPRmxsrFR269YtTJ06FT179tRocERERERERET0+tQe9r948WIEBgbCy8sL1atXBwDcu3cPbdu2xVdffaXxAImIiIiIiIg0yelwGPRyclBoZAQ0AfQUOTiafA0PjRtArmcEn+qB2g5R49RO/i0tLXHy5EmEh4fj0qVLMDY2RqNGjdCuXbvyiI+IiIiIiIhIo5rOnAKThw+Q7eAMrAJM8h/g7TgZHhQoYGPizOS/iCAI6NKlC7p06aLpeIiIiIiIiIhIw9S+55+IiIiIiIiIdAuTfyIiIiIiIqIqjsk/ERERERERURXH5J+IiIiIiIioinulCf8KCwuxZ88eREdHAwAaNGiAnj17Qk9PT6PBEREREREREdHrUzv5v3XrFrp374579+6hbt26AICFCxfC1dUVv//+Ozw8PDQeJBERERERERG9OrWH/U+aNAm1atXC3bt3cfHiRVy8eBF37tyBu7s7Jk2aVB4xEhEREREREdFrUPvK/5EjR3D69GnY2NhIZdWqVcOXX34JX19fjQZHREREREREpGkFpqbINzVHgakpoAfkF5rDVC8HxoIpjPRNtR1euVA7+Tc0NERGRoZKeWZmJuRyuUaCIiIiIiIiIiov+w+cUylb8KYWAqlAag/7f+uttzBmzBicOXMGoihCFEWcPn0a7733Hnr27FkeMRIRERERERHRa1A7+V+5ciU8PDzQqlUrGBkZwcjICL6+vvD09MSKFSvKI0YiIiIiIiIieg1qD/u3srLCr7/+ipiYGNy4cQMAUK9ePXh6emo8OCIiIiIiIiJ6fWon/0Vq166N2rVrAwAKCws1FhARERERERFReWr05UwYpKUi39IKGAIYFKRiwZ2riDdsCFO5FYb6zNd2iBqn9rD/uLg4DB48GOPGjcOTJ0/Qs2dPGBoaom7durh8+XJ5xEhERERERESkMa6/7UKtHT/C9bddcH20C7WSf0RY4iUcjv0RJ27v0nZ45ULt5H/s2LGIjo7G1atX0bFjR+Tl5eHXX39F/fr1MXny5HIIkYiIiIiIiIheh9rD/s+cOYNjx47Bzc0NNjY2OHfuHJo2bQpPT0+0aNGiPGIkIiIiIiIioteg9pX/jIwMODk5wdLSEiYmJrCysgLwbCLAjIwMTcdHRERERERERK/plSb8CwsLg6WlJRQKBQ4dOoSrV68iNTVVw6ERERERERERkSa8UvI/fPhw6fexY8dKvwuC8PoREREREREREZFGqZ38KxSK8oiDiIiIiIiIiMqJ2vf8b968Gbm5ueURCxERERERERGVA7Wv/I8cORKBgYGwt7cvj3iIiIiIiIh0yo7YNG2HQFQqtZN/URTLIw4iIiIiIiKiCpHg1wXy1CfIs7IGrAF5wRO0S49GvGE9mMmttR1euXilCf9+/vlnWFhYFLts2LBhrxUQERERERERUXm6+PlylbK+tSs+jor0Ssn/4sWLoaenp1IuCAKTfyIiIiIiIqJK5pWS//Pnz/OefyIiIiIiIiIdofZs/0RERERERESkW9S+8u/m5lbskH8iIiIiIiIiXdCpdwcYJSchx84e+BwwyktC67gnuCtYw8rIHl92i9R2iBqndvIfFxdXHnEQERERERERVQij5CSYPHzw7E0eYJL/AMn5MqQUPNBuYOVI7WH/kyZNwsqVK1XKV69ejcmTJ2siJiIiIiIiIiLSILWT/127dsHX11elvHXr1ti5c6dGgiIiIiIiIiIizVE7+X/8+DEsLS1Vyi0sLPDo0SONBEVEREREREREmqN28u/p6YmwsDCV8j///BO1atXSSFBEREREREREpDlqT/g3ZcoUTJgwAcnJyejYsSMA4NChQ1i6dCmWL1+u6fiIiIiIiIiI6DWpnfyPGjUKubm5WLBgAebPnw8AqFmzJtauXYthw4ZpPEAiIiIiIiIiej1qJ/8AMG7cOIwbNw7JyckwNjaGmZmZpuMiIiIiIiIiIg1R+55/ACgoKMDBgwfxyy+/QBRFAMCDBw+QmZmp0eCIiIiIiIiI6PWpfeX/9u3bCAwMxJ07d5Cbm4vOnTvD3NwcixYtQm5uLtatW1cecRIRERERERFpxOWP5kL/6VMUGBsDNQB9xVOMMb6MBONGMNQ31nZ45ULt5P+DDz5As2bNcOnSJVSrVk0q79OnD0aPHq3R4IiIiIiIiIg07W7P/ipl9e2B+lqIpaKonfwfO3YMJ0+ehFwuVyqvWbMm7t+/r7HAiIiIiIiIiEgz1L7nX6FQoLCwUKX83r17MDc310hQRERERERERKQ5aif/Xbp0wfLly6X3giAgMzMTs2fPRrdu3TQZGwDg/v37eOedd1CtWjUYGxvD29sb58+fl5aLoohZs2bByckJxsbG8Pf3R0xMjFIbKSkpCAoKgoWFBaysrBAcHKwyOeHly5fRtm1bGBkZwdXVFYsXL9b4vhAREREREZH2mf0TA4u/o2H2TwzMnsbAIjsaqUnhuJsajQdpMaU3oIPUHva/dOlSBAQEoH79+sjJycGQIUMQExMDW1tb/PTTTxoN7smTJ/D19YWfnx/+/PNP2NnZISYmBtbW1lKdxYsXY+XKldi0aRPc3d0xc+ZMBAQE4Pr16zAyMgIABAUFISEhAeHh4cjPz8fIkSMxZswYbN26FQCQnp6OLl26wN/fH+vWrcOVK1cwatQoWFlZYcyYMRrdJyIiIiIiItKu9kN7weThA2Q7OAOrAJP8B3CJk+FBgQI2Js5Y1/e6tkPUOLWT/+rVq+PSpUvYtm0bLl++jMzMTAQHByMoKAjGxpqdFXHRokVwdXXFxo0bpTJ3d3fpd1EUsXz5cnz22Wfo1asXAGDz5s1wcHDAnj17MGjQIERHRyMsLAznzp1Ds2bNAACrVq1Ct27d8NVXX8HZ2RmhoaHIy8vDhg0bIJfL
0aBBA0RFRWHZsmVM/omIiIiIiEjnqZ38A4C+vj7eeecdTceiYu/evQgICED//v1x5MgRuLi44P3335eeKhAXF4fExET4+/tL61haWqJFixY4deoUBg0ahFOnTsHKykpK/AHA398fMpkMZ86cQZ8+fXDq1Cm0a9dOaRLDgIAALFq0CE+ePFEaaVAkNzcXubm50vv09HQAz+ZEUCgUGj8WpB6FQgFRFNkXOoh9p5vYb7qLfae72He6if2mu0rsO5H9WWmJ4r8vPN9P4gs/X1xHN/I6dWJUO/nfu3dvict79uypbpMv9c8//2Dt2rWYMmUKPvnkE5w7dw6TJk2CXC7H8OHDkZiYCABwcHBQWs/BwUFalpiYCHt7e6Xl+vr6sLGxUarz/IiC59tMTEwsNvlfuHAh5s6dq1KenJyMnJycV9xj0hSFQoG0tDSIogiZTO2pLUiL2He6if2mu9h3uot9p5vYb7qrpL4TMrK0FBWVToTwNAMQgP//z/8XK/79+WL+LyogZKQgKSkXlV1GRkaZ66qd/Pfu3VvpvSAIEEVR+r24JwG8KoVCgWbNmuGLL74AADRp0gRXr17FunXrMHz4cI1t51XMmDEDU6ZMkd6np6fD1dUVdnZ2sLCw0GJkBDz77AiCADs7O/6PVcew73QT+013se90F/tON7HfdFdJfSdmpmkpKiqVKAIiIJrZAMJzyb8g+/en8MI6ggyiuQ3s7S0rLMxXVTTPXVmonfy/OKzA3Nwcly5dQq1atdRtqlROTk6oX7++Ulm9evWwa9cuAICjoyMA4OHDh3BycpLqPHz4EI0bN5bqJCUlKbVRUFCAlJQUaX1HR0c8fPhQqU7R+6I6LzI0NIShoaFKuUwm4x/ySkIQBPaHjmLf6Sb2m+5i3+ku9p1uYr/prpf2ncC+rLwUz5J+QXihn4QXfj6/6FldXfiOqhPja++NIBRzsDTE19cXN2/eVCr7+++/4ebmBuDZ5H+Ojo44dOiQtDw9PR1nzpxBq1atAACtWrVCamoqLly4INU5fPgwFAoFWrRoIdU5evQo8vPzpTrh4eGoW7dusUP+iYiIiIiIiHTJayX/8fHxyMrKgrm5uabiUfLhhx/i9OnT+OKLL3Dr1i1s3boV3377LcaPHw/g2YmHyZMn4/PPP8fevXtx5coVDBs2DM7OztLtCfXq1UNgYCBGjx6Ns2fP4sSJE5gwYQIGDRoEZ2dnAMCQIUMgl8sRHByMa9euYfv27VixYoXSsH4iIiIiIiIiXaX2sP++ffsCAJ4+fYrTp0+jU6dOsLOz03hgAPDmm29i9+7dmDFjBubNmwd3d3csX74cQUFBUp3p06cjKysLY8aMQWpqKtq0aYOwsDClex9CQ0MxYcIEdOrUCTKZDP369cPKlSul5ZaWljhw4ADGjx8PHx8f2NraYtasWXzMHxEREREREVUJaif/lpbPJj1wdHREjx49MGrUKI0H9by33noLb7311kuXC4KAefPmYd68eS+tY2Njg61bt5a4nUaNGuHYsWOvHCcRERERERFRZaV28r9x48byiIOIiIiIiIioQhzafRhCYSFEPT3AChDEQnzh9RhP9atBJuhpO7xyoXbyn56eXuJyPuaOiIiIiIiIKrMce9WnupkYusBEC7FUFLWTfysrq2Jn+BdFEYIgoLCwUCOBEREREREREZFmqJ3816pVC0lJSfj444/h6+tbHjERERERERERkQapnfxHR0dj1apVWLBgAf766y8sXrwY7u7u5REbERERERERkca5bwuBflYmCkzNgI6AfmEmfk64jPtGjWBkYAb/2iO0HaLGqZ38GxgYYMqUKRgxYgTmzZuHRo0aYcyYMZg5cyasrKzKIUQiIiIiIiIizam/ajFMHj5AtoMzUAcwyX+A7nEyPCj4GTYmzlUy+Ze96oo2NjZYvnw5/vrrL8THx8PT0xPLly/XYGhEREREREREpAlqX/lv0qSJyoR/oigiNzcXU6dOxeTJkzUVGxERERERERFpgNrJf+/evcshDCIiIiIiIiIqL2on/7Nnzy6POIiIiIiIiIionKid/Kenp5e43MLC4pWDISIiIiIiIiLNUzv5t7KyUrnnH3h2378gCCgsLNRIYERERERERESkGWon/wCwc+dO2NjYaDoWIiIiIiIiIioHr5T8+/r6wt7eXtOxEBEREREREVE5eKXk//r163j8+DFMTU3h6OgIuVyu6biIiIiIiIiIykWmuwfyzS2Qa2sHGAP5+hZwN06ATOYESyM7bYdXLl4p+e/UqZN0j79MJoOXlxdGjRqFDz/8UNPxEREREREREWnUkS2/qZR90EgLgVQgtZP/uLg4iKKI/Px8pKen48GDBzh79ixmzpyJgoICTJs2rTziJCIiIiIiIqJXpHby7+bmpvTex8cHPXr0QJ06dTBv3jwm/0RERERERESVzCsN+y/OoEGD0KBBA001R0REREREREQa8srJ/4ULFxAdHQ0AqF+/Ppo2bYqmTZtqLDAiIiIiIiKi8tB8ymgYpjxGrk014H3AsOAx3ou7hXi5J8wNq2FSm++0HaLGqZ38JyUlYdCgQYiMjISVlRUAIDU1FX5+fti2bRvs7KrmzIhERERERERUNdidOQGThw+Q7eAMvAOY5D/AuTQZHhTcgY2Js7bDKxcydVeYOHEiMjIycO3aNaSkpCAlJQVXr15Feno6Jk2aVB4xEhEREREREdFrUPvKf1hYGA4ePIh69epJZfXr18eaNWvQpUsXjQZHRERERERERK9P7Sv/CoUCBgYGKuUGBgZQKBQaCYqIiIiIiIiINEft5L9jx4744IMP8ODBA6ns/v37+PDDD9GpUyeNBkdEREREREREr0/t5H/16tVIT09HzZo14eHhAQ8PD7i7uyM9PR2rVq0qjxiJiIiIiIiI6DWofc+/q6srLl68iIMHD+LGjRsAgHr16sHf31/jwRERERERERHR6ytz8p+RkQFzc3MAgCAI6Ny5Mzp37qxU59y5c3jzzTc1GyERERERERERvZYyD/vv0qULMjMzi11WUFCAzz77DL6+vhoLjIiIiIiIiIg0Q60r//7+/jhw4AAsLCyk8qtXr2Lo0KFITk7Gnj17yiNGIiIiIiIiIo2JGzgMBhnpyDe3AOwBg8J09M6/jNtGjWAityi9AR1U5uQ/IiICHTt2ROfOnREeHg5zc3MsXrwYs2fPRt++fXH48GFYW1uXZ6xEREREREREr+36pI9Vyjq4aSGQClTm5N/Ozg6HDx+Gv78/OnbsCENDQ8TExGDLli14++23yzNGIiIiIiIiInoNas32b2dnh0OHDsHf3x9Xr15FVFQUvLy8yis2IiIiIiIiItKAMk/4V8TW1haHDx9G/fr1MWTIEDx58qQ84iIiIiIiIiIiDSnzlf++ffsqvbewsMDRo0fRvHlzeHt7S+W//PKL5qIjIiIiIiIi0rDuvvVh8vABsh2cgVWASf4DuMTJ8KBAARsTZ6zre13bIWpcmZN/S0tLlffu7u4aD4iIiIiIiIiINKvMyf/GjRvLMw4iIiIiIiIiKid
q3/NPRERERERERLqFyT8RERERERFRFcfkn4iIiIiIiKiKY/JPREREREREVMUx+SciIiIiIiKq4pj8ExEREREREVVxTP6JiIiIiIiIqjh9bQdAREREREREVJHOLl0PWV4eFHI54AnIFHmYZncTyUZ1YSCTazu8csHkn4iIiIiIiP5Tklu2VSlzseoEFy3EUlE47J+IiIiIiIioimPyT0RERERERFTFcdg/ERERERFVSTti00qt09/DsgIiocrG7vSxf+/5r//snv+LKf/e89/AUfW2AF3H5J+IiIiIiIj+U5pPHQuThw+Q7eAMrAJM8h9gcJwMDwoUsDFxxrq+17UdosZx2D8RERERERFRFcfkn4iIiIiIiKiKY/JPREREREREVMUx+SciIiIiIiKq4pj8ExEREREREVVxOpX8f/nllxAEAZMnT5bKcnJyMH78eFSrVg1mZmbo168fHj58qLTenTt30L17d5iYmMDe3h7Tpk1DQUGBUp3IyEg0bdoUhoaG8PT0REhISAXsEREREREREVH505nk/9y5c1i/fj0aNWqkVP7hhx/it99+w44dO3DkyBE8ePAAffv2lZYXFhaie/fuyMvLw8mTJ7Fp0yaEhIRg1qxZUp24uDh0794dfn5+iIqKwuTJk/Huu+9i//79FbZ/REREREREROVFJ5L/zMxMBAUF4bvvvoO1tbVUnpaWhh9++AHLli1Dx44d4ePjg40bN+LkyZM4ffo0AODAgQO4fv06tmzZgsaNG6Nr166YP38+1qxZg7y8PADAunXr4O7ujqVLl6JevXqYMGEC3n77bXz99dda2V8iIiIiIiIiTdLXdgBlMX78eHTv3h3+/v74/PPPpfILFy4gPz8f/v7+UpmXlxdq1KiBU6dOoWXLljh16hS8vb3h4OAg1QkICMC4ceNw7do1NGnSBKdOnVJqo6jO87cXvCg3Nxe5ubnS+/T0dACAQqGAQqF43V2m16RQKCCKIvtCB7HvdBP7TXex73QX+043sd8qmFj6cS5rX5TYd2XYDmmJKP77wvP9JL7w88V1dCOvUyfGSp/8b9u2DRcvXsS5c+dUliUmJkIul8PKykqp3MHBAYmJiVKd5xP/ouVFy0qqk56ejqdPn8LY2Fhl2wsXLsTcuXNVypOTk5GTk1P2HaRyoVAokJaWBlEUIZPpxAAX+n/sO93EftNd7Dvdxb7TTey3iiVkZJVaJykpt9Q6QMl9V5btkLaIEJ5mAALw//8BAPzx5zGVmivqPfcmI6XMnw1tysjIKHPdSp383717Fx988AHCw8NhZGSk7XCUzJgxA1OmTJHep6enw9XVFXZ2drCwsNBiZAQ8++MsCALs7Oz4P1Ydw77TTew33cW+013sO93EfqtYYmZaqXXs7S3L1FZJfVeW7ZCWiCIgAqKZDSAIpdd/Tlk/G9qkTp5cqZP/CxcuICkpCU2bNpXKCgsLcfToUaxevRr79+//v/buPT6q8t73+HcmyeRCMsGEXIhCCGLkDkGEBLSgYqJGWgpbEVFQ0V1sQhtTi1ijEO3eHkWKCkF3vYAeK4F4RM4xitIoIBClIGm5WFrSCKUhAWxzBUIys84fyJQhXBJIMpk1n/frNcJ61jMrv8Xv9Tj5zfOstXTixAlVVVW5zf5XVlYqNjZWkhQbG6stW7a4HffU0wBO73PmEwIqKytlt9vPOusvSYGBgQoMDGzWbrVa+R95J2GxWMiHlyJ33om8eS9y573InXcibx3IcuF/49bk4Zy5a8HPgac4Txb9Fkur8+QNY7Q1MXbqs7npppu0Y8cOlZSUuF7Dhw/X1KlTXX8PCAhQUVGR6z179uzR/v37lZKSIklKSUnRjh07dOjQIVeftWvXym63q3///q4+px/jVJ9TxwAAAAAAwJt16pn/sLAwDRw40K2tS5cuioyMdLXPmDFD2dnZioiIkN1u16xZs5SSkqLk5GRJUmpqqvr37697771Xzz//vCoqKpSTk6OMjAzXzP3MmTO1ePFizZ49Ww888IA+++wzrVy5UoWFhR17wgAAAACAdtf/5f+lgNoaNYbZpYlSgKNGL/39T9oXNFghNrvuGDzH0yG2uU5d/LfEwoULZbVaNWnSJDU0NCgtLU1Llixx7ffz89OHH36ohx9+WCkpKerSpYumT5+up59+2tUnISFBhYWFeuSRR/TSSy/piiuu0Ouvv660tDRPnBIAAAAAoB0lrHhbIZXlOhoTJ10nhTSW64MDVpU3bVRESBzFf2ewbt06t+2goCDl5eUpLy/vnO+Jj4/XRx99dN7jjh07Vtu3b2+LEAEAAAAA6FQ69TX/AAAAAADg0nndzD8AAAAAtJWC0vM/pu+OKzv/496AlmDmHwAAAAAAk6P4BwAAAADA5Cj+AQAAAAAwOYp/AAAAAABMjuIfAAAAAACT427/AAAAAHAOrqcBGE5Zautl1FVLFuZQvd3hkaMV+M/v1BARKdmlwKbvdG34Xn1r66OwwEhPh9cuKP4BAAAAAD5ly29ea9Y2ta8HAulAfGUFAAAAAIDJUfwDAAAAAGByLPsHAAAA4JVc1+MDuCCKfwAAAACATxlzz3gFHjmshm5R0hNSYONhpZcd1D5rd4UHRWnuzf/P0yG2OYp/AAAAAIBPCS0rVUhluY7W1kjHpJDGcpUds6q8qVpHG2s8HV674Jp/AAAAAABMjuIfAAAAAACTo/gHAAAAAMDkKP4BAAAAADA5in8AAAAAAEyO4h8AAAAAAJOj+AcAAAAAwOQo/gEAAAAAMDl/TwcAAAAAAEBH2j1rtvzr69TUJVS6QvJ31Gl6wJ/0j6DBCgoI9XR47YLiHwAAAADgU8ruuq9ZW1J3KanjQ+kwLPsHAAAAAMDkKP4BAAAAADA5lv0DAAAAAHxK0KEKWRwOGX5+UlfJYjj03fHvdMw/UlaLny4LifV0iG2O4h8AAAAA4FNu+vGNCqks19GYOGmRFNJYrsvLrCpvcioiJE6vTtzt6RDbHMv+AQAAAAAwOYp/AAAAAABMjuIfAAAAAACTo/gHAAAAAMDkuOEfAAAAgE6noLTa0yEApsLMPwAAAAAAJkfxDwAAAACAyVH8AwAAAABgclzzDwAAAKBNteR6/TuuDO+ASACcQvEPAAAAAPAp6//3almbmuT095e6S1ajSfMSDqjWdoX8LOYsk815VgAAAAAAnENd76uatXUN6aeuHR9Kh+GafwAAAAAATI7iHwAAAAAAk2PZPwAAAADAp/T4vwXyP3ZMTcHB0ijJ33lMhRV/0sHgwQr0D9Z1CXd4OsQ2R/EPAAAAoMO15IkAQHsZ/NxchVSW62hMnLRICmks14/KrCpvcioiJM6UxT/L/gEAAAAAMDmKfwAAAAAATI7iHwAAAAAAk6P4BwAAAADA5Cj+AQAAAAAwOYp/AAAAAABMjkf9AQAAAGgVHtMHeB9m/gEAAAAAMDlm/gEAAAAAPuV4VPS//7SdbIsK+JeO2y5T16BoD0bWfij+AQAAAAA+peiDdc3anhjW8XF0JJb9AwAAAABgchT/AAAAAACYXKcu/p999llde+
21CgsLU3R0tCZMmKA9e/a49Tl+/LgyMjIUGRmp0NBQTZo0SZWVlW599u/fr/T0dIWEhCg6Olq//OUv1dTU5NZn3bp1GjZsmAIDA9WnTx8tW7asvU8PAAAAAIAO0amv+V+/fr0yMjJ07bXXqqmpSb/61a+Umpqq3bt3q0uXLpKkRx55RIWFhSooKFB4eLgyMzM1ceJEbdq0SZLkcDiUnp6u2NhYbd68WQcPHtS0adMUEBCg//7v/5YklZWVKT09XTNnztTvfvc7FRUV6cEHH1T37t2VlpbmsfMHAAAAALS9YTlZslX9Sye6XibNkGxN/9Lsb7/Rt4H9FGq7TP+Z/KKnQ2xznbr4X7Nmjdv2smXLFB0drW3btukHP/iBqqur9cYbb+jdd9/VjTfeKElaunSp+vXrpy+//FLJycn69NNPtXv3bv3+979XTEyMhg4dqmeeeUaPPfaY5s2bJ5vNpldffVUJCQlasGCBJKlfv37auHGjFi5cSPEPAAAAACbT/fNPFVJZrqMxcdJEKaSxXBu+s6q86S+KCInzdHjtolMX/2eqrq6WJEVEREiStm3bpsbGRo0bN87Vp2/fvurZs6eKi4uVnJys4uJiDRo0SDExMa4+aWlpevjhh7Vr1y4lJSWpuLjY7Rin+mRlZZ0zloaGBjU0NLi2a2pqJElOp1NOp/OSzxWXxul0yjAMcuGFyJ13Im/ei9x5L3LnnUyTN8PL478YhvHvl3zw/L3VOfNmnPHnme/xjrquNTF6TfHvdDqVlZWl0aNHa+DAgZKkiooK2Ww2de3a1a1vTEyMKioqXH1OL/xP7T+173x9ampqdOzYMQUHBzeL59lnn1Vubm6z9sOHD+v48eMXd5JoM06nU9XV1TIMQ1Zrp761Bc5A7rwTefNe5M57kTvvZJa8WWrrPR2CBxiyHKuVLNL3/4FXOEfeTn2BZTib1/+GU5baf+rQoQZ1drW1tS3u6zXFf0ZGhnbu3KmNGzd6OhRJ0uOPP67s7GzXdk1NjXr06KGoqCjZ7XYPRgbp5AerxWJRVFSUV3+w+iJy553Im/cid96L3Hkns+TNqKv2dAgdzzAkQzJCIyQLxb/XOFfeLNZ//3lmOi1WGWERio4O77AwL1ZQUFCL+3pF8Z+ZmakPP/xQGzZs0BVXXOFqj42N1YkTJ1RVVeU2+19ZWanY2FhXny1btrgd79TTAE7vc+YTAiorK2W328866y9JgYGBCgwMbNZutVq9+n/kZmKxWMiHlyJ33om8eS9y573InXcyRd4sXhz7RXOeLB4tFh89f291rrxZzvjz9F0n+3rDGG1NjJ36bAzDUGZmplatWqXPPvtMCQkJbvuvueYaBQQEqKioyNW2Z88e7d+/XykpKZKklJQU7dixQ4cOHXL1Wbt2rex2u/r37+/qc/oxTvU5dQwAAAAAALxZp575z8jI0LvvvqvVq1crLCzMdY1+eHi4goODFR4erhkzZig7O1sRERGy2+2aNWuWUlJSlJycLElKTU1V//79de+99+r5559XRUWFcnJylJGR4Zq5nzlzphYvXqzZs2frgQce0GeffaaVK1eqsLDQY+cOAAAAAEBb6dQz/6+88oqqq6s1duxYde/e3fVasWKFq8/ChQt1++23a9KkSfrBD36g2NhYvf/++679fn5++vDDD+Xn56eUlBTdc889mjZtmp5++mlXn4SEBBUWFmrt2rUaMmSIFixYoNdff53H/AEAAAAATKFTz/wbxlkeu3CGoKAg5eXlKS8v75x94uPj9dFHH533OGPHjtX27dtbHSMAAAAAAJ1dpy7+AQAAAHSsglIfvJM/fM7fx09SQHWVGsO7St2kgKYq3XJsp74NHKgutq6eDq9dUPwDAAAAAHzKn+Y806ztlt4eCKQDdepr/gEAAAAAwKWj+AcAAAAAwORY9g8AAAD4EK7pB3wTxT8AAAAAwKekpV6r4MoKHYuJleZLwScqNODb4zrgDNJlwbF68Yd/8HSIbY7iHwAAADARZvaBC/Ovr1dAfa0a68MkhxTgrFW9w6pjTbUKDgjzdHjtgmv+AQAAAAAwOYp/AAAAAABMjuIfAAAAAACTo/gHAAAAAMDkKP4BAAAAADA5in8AAAAAAEyO4h8AAAAAAJPz93QAAAAAAFqmoLTa0yEA8FIU/wAAAEAHKCitlgynLLX1MuqqJUvzRbh3XBnugcgA3/P1M7+R3/HjcgQFSQmSn/O4fha+S5XBA2TzC/J0eO2C4h8AAADoJJjZBzrGwRtvadbWO3KCensglo7CNf8AAAAAAJgcxT8AAAAAACbHsn8AAAAAgE/purNE1hMn5LTZpATJ6jyhP1eXqSowQf5Wm3pHDvV0iG2O4h8AAAAA4FNG/+RuhVSW62hMnLRICmks17Qyq8qbnIoIidOrE3d7OsQ2x7J/AAAAAABMjuIfAAAAAACTo/gHAAAAAMDkKP4BAAAAADA5in8AAAAAAEyO4h8AAAAAAJOj+AcAAAAAwOT8PR0AAAAAYAYFpdWeDgEAzomZfwAAAAAATI6ZfwAAAOACmNUHzOWTT76SDEOyWKRgSYah+UPq1WjtIovF4unw2gXFPwAAAADApzSFhjVrC/C3K8ADsXQUlv0DAAAAAGByFP8AAAAAAJgcy/4BAADg87imH/AtV72xWAF1tWoMDZNukwIctXrzH3/S/qDBCgkI0+39Mz0dYpuj+AcAAAAA+JTEN5copLJcR2PipCQppLFc+WVWlTd9rIiQOFMW/yz7BwAAAADA5Jj5BwAAQKfVkuX4d1wZ3gGRAIB3Y+YfAAAAAACTY+YfAAAAXo2b9QHAhTHzDwAAAACAyVH8AwAAAABgciz7BwAAgMewZB8AOgbFPwAAAC4Kd+IHAO9B8Q8AANCGLlQQ+1oxzMw+gM6oasBgHet+uRoiIqUu0rGmyzUg9Ft18e8le2Ckp8NrFxT/AAAA3+uIQpXZcgDwvE2/zW/W9tAADwTSgSj+AQAAfBBfQgCAb6H4BwAAXqEtltN7yxJ0Lh0AALQ1in8AAAAT8pYvOgAAHYPiHwAAmALFbtvj3xSAWY3+z7sU+M/vTt7w7xdSYNN3mlL2rb79/oZ/j93Q/J4A3o7iHwAAeBxFZusUlFZLhlOW2noZddWSxerpkADAq3Td9SeFVJbraEycVC+FNJZrV51V5U2HFRES5+nw2gWfFAAAAAAAmBwz/wAA+LD/87fqS5495uZzAAB0fhT/AADgkrBkHwCAzo9l/wAAAAAAmBwz/2fIy8vT/PnzVVFRoSFDhmjRokUaMWKEp8MCAKCZlsy4syQfAABIFP9uVqxYoezsbL366qsaOXKkXnzxRaWlpWnPnj2Kjo72dHgAALQaS/IBAIBE8e/mN7/5jR566CHdf//9kqRXX31VhYWFevPNNzVnzhwPRwcAaAmKXQAAgOYo/r934sQJbdu2TY8//rirzWq1aty4cSouLm7Wv6GhQQ0NDa7t6uqTv2xWVVXJ6XS2f8A4L6fTqZqaGtlsNlmt3nFri9Vl5y9YfpRw6Ut3L/QzWqIlcVzazzFkqf2Xb
r9A7triXNCWTubNOHhcksXTwaBVDFlqa2TIT+TO25A770TevBe5805nz1uN06EmSUedDumo1NQoOY8bUpNkWBw6WlOlqirDY1G3VE1NjSTJMC4cK8X/944cOSKHw6GYmBi39piYGP35z39u1v/ZZ59Vbm5us/b4+Ph2ixEAAAAA0IYOV0r3ndo4WUD/S5W6b24vDwV0cWpraxUefv5JOor/i/T4448rOzvbte10OvXPf/5TkZGRslj4JtDTampq1KNHD/3973+X3W73dDhoBXLnncib9yJ33ovceSfy5r3InXcye94Mw1Btba3i4uIu2Jfi/3vdunWTn5+fKisr3dorKysVGxvbrH9gYKACAwPd2rp27dqeIeIi2O12Uw5yX0DuvBN5817kznuRO+9E3rwXufNOZs7bhWb8T/GOi6E7gM1m0zXXXKOioiJXm9PpVFFRkVJSUjwYGQAAAAAAl4aZ/9NkZ2dr+vTpGj58uEaMGKEXX3xR9fX1rrv/AwAAAADgjSj+TzN58mQdPnxYTz31lCoqKjR06FCtWbOm2U0A0fkFBgZq7ty5zS7NQOdH7rwTefNe5M57kTvvRN68F7nzTuTt3yxGS54JAAAAAAAAvBbX/AMAAAAAYHIU/wAAAAAAmBzFPwAAAAAAJkfxDwAAAACAyVH8o9PbsGGDxo8fr7i4OFksFn3wwQcXfM+6des0bNgwBQYGqk+fPlq2bJnb/nnz5slisbi9+vbt2z4n4MNam7uDBw/q7rvvVmJioqxWq7Kyss7ar6CgQH379lVQUJAGDRqkjz76qO2D93Htkbtly5Y1G3dBQUHtcwI+qrV5e//993XzzTcrKipKdrtdKSkp+uSTT5r1y8vLU69evRQUFKSRI0dqy5Yt7XQGvqs9csdnXftrbd42btyo0aNHKzIyUsHBwerbt68WLlzYrB9jrv21R+4Ycx3jYmqDUzZt2iR/f38NHTq02T5fGHcU/+j06uvrNWTIEOXl5bWof1lZmdLT03XDDTeopKREWVlZevDBB5v9UjRgwAAdPHjQ9dq4cWN7hO/TWpu7hoYGRUVFKScnR0OGDDlrn82bN2vKlCmaMWOGtm/frgkTJmjChAnauXNnW4bu89ojd5Jkt9vdxt2+ffvaKmSo9XnbsGGDbr75Zn300Ufatm2bbrjhBo0fP17bt2939VmxYoWys7M1d+5cff311xoyZIjS0tJ06NCh9joNn9QeuZP4rGtvrc1bly5dlJmZqQ0bNuibb75RTk6OcnJy9Nvf/tbVhzHXMdojdxJjriO0NnenVFVVadq0abrpppua7fOZcWcAXkSSsWrVqvP2mT17tjFgwAC3tsmTJxtpaWmu7blz5xpDhgxphwhxLi3J3enGjBlj/PznP2/Wfueddxrp6elubSNHjjR+8pOfXGKEOJe2yt3SpUuN8PDwNosL59favJ3Sv39/Izc317U9YsQIIyMjw7XtcDiMuLg449lnn22LMHEWbZU7Pus61sXm7cc//rFxzz33uLYZcx2vrXLHmOt4rcnd5MmTjZycnLPmyVfGHTP/MJ3i4mKNGzfOrS0tLU3FxcVubX/9618VFxen3r17a+rUqdq/f39HhomL1NL8onOqq6tTfHy8evTooR/96EfatWuXp0PCaZxOp2praxURESFJOnHihLZt2+Y25qxWq8aNG8eY62TOzN0pfNZ1btu3b9fmzZs1ZswYSYw5b3Jm7k5hzHVOS5cu1d/+9jfNnTu32T5fGncU/zCdiooKxcTEuLXFxMSopqZGx44dkySNHDlSy5Yt05o1a/TKK6+orKxM119/vWpraz0RMlrhXPmtqKjwUERoqauvvlpvvvmmVq9erXfeeUdOp1OjRo3SgQMHPB0avvfCCy+orq5Od955pyTpyJEjcjgcjDkvcGbuJD7rOrMrrrhCgYGBGj58uDIyMvTggw9KYsx5g3PlTmLMdVZ//etfNWfOHL3zzjvy9/dvtt+Xxl3zswd8wK233ur6++DBgzVy5EjFx8dr5cqVmjFjhgcjA8wrJSVFKSkpru1Ro0apX79++p//+R8988wzHowMkvTuu+8qNzdXq1evVnR0tKfDQSucK3d81nVeX3zxherq6vTll19qzpw56tOnj6ZMmeLpsNAC58sdY67zcTgcuvvuu5Wbm6vExERPh+NxFP8wndjYWFVWVrq1VVZWym63Kzg4+Kzv6dq1qxITE7V3796OCBGX4Fz5jY2N9VBEuFgBAQFKSkpi3HUC+fn5evDBB1VQUOC27LFbt27y8/NjzHVi58rd2fBZ13kkJCRIkgYNGqTKykrNmzdPU6ZMYcx5gXPl7mwYc55XW1urrVu3avv27crMzJR08jIpwzDk7++vTz/9VNddd53PjDuW/cN0UlJSVFRU5Na2du1atxnHM9XV1am0tFTdu3dv7/BwiS4mv+icHA6HduzYwbjzsOXLl+v+++/X8uXLlZ6e7rbPZrPpmmuucRtzTqdTRUVFjLlO4Hy5Oxs+6zonp9OphoYGSYw5b3N67s6GMed5drtdO3bsUElJies1c+ZMXX311SopKdHIkSN9atwx849Or66uzu0b07KyMpWUlCgiIkI9e/bU448/rn/84x96++23JUkzZ87U4sWLNXv2bD3wwAP67LPPtHLlShUWFrqO8eijj2r8+PGKj49XeXm55s6dKz8/P5bctbHW5k6SSkpKXO89fPiwSkpKZLPZ1L9/f0nSz3/+c40ZM0YLFixQenq68vPztXXr1maP2sGlaY/cPf3000pOTlafPn1UVVWl+fPna9++fW7XS+LStDZv7777rqZPn66XXnpJI0eOdF3bGBwcrPDwcElSdna2pk+fruHDh2vEiBF68cUXVV9fr/vvv7/jT9DE2iN3fNa1v9bmLS8vTz179nQ9+33Dhg164YUX9LOf/cx1DMZcx2iP3DHmOkZrcme1WjVw4EC390dHRysoKMit3WfGnacfNwBcyOeff25IavaaPn26YRiGMX36dGPMmDHN3jN06FDDZrMZvXv3NpYuXeq2f/LkyUb37t0Nm81mXH755cbkyZONvXv3dswJ+ZCLyd3Z+sfHx7v1WblypZGYmGjYbDZjwIABRmFhYceckA9pj9xlZWUZPXv2NGw2mxETE2Pcdtttxtdff91xJ+UDWpu3MWPGnLf/KYsWLXLlbsSIEcaXX37ZcSflI9ojd3zWtb/W5u3ll182BgwYYISEhBh2u91ISkoylixZYjgcDrfjMubaX3vkjjHXMS7md5TTneuRjL4w7iyGYRht8zUCAAAAAADojLjmHwAAAAAAk6P4BwAAAADA5Cj+AQAAAAAwOYp/AAAAAABMjuIfAAAAAACTo/gHAAAAAMDkKP4BAAAAADA5in8AAAAAANrBhg0bNH78eMXFxcliseiDDz5o9TFWrlypoUOHKiQkRPHx8Zo/f/5FxULxDwAAXO677z5NmDDB02EAAGAK9fX1GjJkiPLy8i7q/R9//LGmTp2qmTNnaufOnVqyZIkWLlyoxYsXt/pYFsMwjIuKAgAAeBWLxXLe
/XPnztUjjzwiwzDUtWvXjgnqLO677z5VVVVd1OwIAACdlcVi0apVq9y+ZG9oaNATTzyh5cuXq6qqSgMHDtRzzz2nsWPHSpLuvvtuNTY2qqCgwPWeRYsW6fnnn9f+/fsv+Nl+Ov+2OhEAANC5HTx40PX3FStW6KmnntKePXtcbaGhoQoNDfVEaAAA+KTMzEzt3r1b+fn5iouL06pVq3TLLbdox44duuqqq9TQ0KCQkBC39wQHB+vAgQPat2+fevXq1eKfxbJ/AAB8RGxsrOsVHh4ui8Xi1hYaGtps2f/YsWM1a9YsZWVl6bLLLlNMTIxee+011dfX6/7771dYWJj69Omjjz/+2O1n7dy5U7feeqtCQ0MVExOje++9V0eOHHHtf++99zRo0CAFBwcrMjJS48aNU319vebNm6e33npLq1evlsVikcVi0bp16yRJjz32mBITExUSEqLevXvrySefVGNjo+uY8+bN09ChQ/Xmm2+qZ8+eCg0N1U9/+lM5HA49//zzio2NVXR0tP7rv/7LLVaLxaJXXnlFt956q4KDg9W7d2+99957bZ8AAABOs3//fi1dulQFBQW6/vrrdeWVV+rRRx/Vddddp6VLl0qS0tLS9P7776uoqEhOp1N/+ctftGDBAknuX+q3BMU/AAA4r7feekvdunXTli1bNGvWLD388MO64447NGrUKH399ddKTU3Vvffeq6NHj0qSqqqqdOONNyopKUlbt27VmjVrVFlZqTvvvFPSyV9WpkyZogceeEDffPON1q1bp4kTJ8owDD366KO68847dcstt+jgwYM6ePCgRo0aJUkKCwvTsmXLtHv3br300kt67bXXtHDhQrdYS0tL9fHHH2vNmjVavny53njjDaWnp+vAgQNav369nnvuOeXk5Oirr75ye9+TTz6pSZMm6Y9//KOmTp2qu+66S998800H/OsCAHzVjh075HA4lJiY6Fp9FxoaqvXr16u0tFSS9NBDDykzM1O33367bDabkpOTddddd0mSrNbWlfNc8w8AgA9atmyZsrKyVFVV5dZ+5vX2Y8eOlcPh0BdffCFJcjgcCg8P18SJE/X2229LkioqKtS9e3cVFxcrOTlZv/71r/XFF1/ok08+cR33wIED6tGjh/bs2aO6ujpdc801+vbbbxUfH98stpZe8//CCy8oPz9fW7dulXRy5n/+/PmqqKhQWFiYJOmWW27Rnj17VFpa6volqW/fvrrvvvs0Z84cSSdn/mfOnKlXXnnFdezk5GQNGzZMS5YsaeG/KAAA53fmNf8rVqzQ1KlTtWvXLvn5+bn1DQ0NVWxsrGvb4XCooqJCUVFRKioq0m233aZDhw4pKiqqxT+fa/4BAMB5DR482PV3Pz8/RUZGatCgQa62mJgYSdKhQ4ckSX/84x/1+eefn/X+AaWlpUpNTdVNN92kQYMGKS0tTampqfqP//gPXXbZZeeNY8WKFXr55ZdVWlqquro6NTU1yW63u/Xp1auXq/A/FZufn5/b7EhMTIwr1lNSUlKabZeUlJw3HgAALkVSUpIcDocOHTqk66+//rx9/fz8dPnll0uSli9frpSUlFYV/hLFPwAAuICAgAC3bYvF4tZ26k7DTqdTklRXV6fx48frueeea3as7t27y8/PT2vXrtXmzZv16aefatGiRXriiSf01VdfKSEh4awxFBcXa+rUqcrNzVVaWprCw8OVn5/vuu6xpbGeajsVKwAA7amurk579+51bZeVlamkpEQRERFKTEzU1KlTNW3aNC1YsEBJSUk6fPiwioqKNHjwYKWnp+vIkSN67733NHbsWB0/ftx1j4D169e3Ohau+QcAAG1q2LBh2rVrl3r16qU+ffq4vbp06SLpZAE+evRo5ebmavv27bLZbFq1apUkyWazyeFwuB1z8+bNio+P1xNPPKHhw4frqquu0r59+9os5i+//LLZdr9+/drs+AAA37R161YlJSUpKSlJkpSdna2kpCQ99dRTkqSlS5dq2rRp+sUvfqGrr75aEyZM0B/+8Af17NnTdYy33npLw4cP1+jRo7Vr1y6tW7dOI0aMaHUszPwDAIA2lZGRoddee01TpkzR7NmzFRERob179yo/P1+vv/66tm7dqqKiIqWmpio6OlpfffWVDh8+7Cq2e/XqpU8++UR79uxRZGSkwsPDddVVV2n//v3Kz8/Xtddeq8LCQteXBW2hoKBAw4cP13XXXaff/e532rJli9544402Oz4AwDeNHTtW57vNXkBAgHJzc5Wbm3vW/d26dVNxcXGbxMLMPwAAaFNxcXHatGmTHA6HUlNTNWjQIGVlZalr166yWq2y2+3asGGDbrvtNiUmJionJ0cLFizQrbfeKunknY2vvvpqDR8+XFFRUdq0aZN++MMf6pFHHlFmZqaGDh2qzZs368knn2yzmHNzc5Wfn6/Bgwfr7bff1vLly9W/f/82Oz4AAJ7G3f4BAIBPO/PuywAAmBEz/wAAAAAAmBzFPwAAAAAAJscN/wAAgE/jCkgAgC9g5h8AAAAAAJOj+AcAAAAAwOQo/gEAAAAAMDmKfwAAAAAATI7iHwAAAAAAk6P4BwAAAADA5Cj+AQAAAAAwOYp/AAAAAABM7v8Dgof10qsXKOkAAAAASUVORK5CYII=", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import polars as pl\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from datetime import datetime\n", + "\n", + "\n", + "# 2. Статистика по текущему временному разбиению\n", + "global_max_time = df.select(pl.col(\"timestamps\").explode().max()).item()\n", + "days_val = 14\n", + "window_sec = days_val * 24 * 3600 \n", + "\n", + "cutoffs = [\n", + " int(global_max_time - 3 * window_sec),\n", + " int(global_max_time - 2 * window_sec),\n", + " int(global_max_time - 1 * window_sec)\n", + "]\n", + "\n", + "print(\"--- Статистика по временным интервалам (Fixed Time Window) ---\")\n", + "intervals = [0] + cutoffs + [None]\n", + "labels = [\"Base\", \"Gap (Week -6)\", \"Pre-Valid (Week -4)\", \"Test (Week -2)\"]\n", + "\n", + "# Считаем события в каждом интервале\n", + "counts = []\n", + "for i in range(len(intervals)-1):\n", + " start, end = intervals[i], intervals[i+1]\n", + " \n", + " q = df.lazy().explode([\"timestamps\"])\n", + " if end is not None:\n", + " q = q.filter((pl.col(\"timestamps\") >= start) & (pl.col(\"timestamps\") < end))\n", + " else:\n", + " q = q.filter(pl.col(\"timestamps\") >= start)\n", + " \n", + " count = q.select(pl.len()).collect().item()\n", + " counts.append(count)\n", + " \n", + " end_str = datetime.fromtimestamp(end).strftime('%Y-%m-%d') if end else \"Inf\"\n", + " start_str = datetime.fromtimestamp(start).strftime('%Y-%m-%d') if start > 0 else \"Start\"\n", + " \n", + " print(f\"Part {i} [{labels[i]}]: {count} events ({start_str} -> {end_str})\")\n", + "\n", + "# 3. Гистограмма распределения событий во времени\n", + "all_timestamps = df.select(pl.col(\"timestamps\").explode()).to_series().to_numpy()\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "plt.hist(all_timestamps, bins=100, color='skyblue', alpha=0.7, label='Events')\n", + "\n", + "# Рисуем линии отсечек\n", + "colors = ['red', 'orange', 'green']\n", + "for cutoff, color, label in zip(cutoffs, colors, labels[1:]):\n", + " plt.axvline(x=cutoff, color=color, linestyle='--', linewidth=2, label=f'Cutoff: {label}')\n", + "\n", + "plt.title(\"Распределение взаимодействий во времени\")\n", + "plt.xlabel(\"Timestamp\")\n", + "plt.ylabel(\"Количество событий\")\n", + "plt.legend()\n", + "plt.grid(True, alpha=0.3)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "901e7400", + "metadata": {}, + "source": [ + "# QUANTILE CUTOFF" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8c691891", + "metadata": {}, + "outputs": [], + "source": [ + "def get_quantile_cutoffs(df, num_parts=4, base_ratio=None):\n", + " \"\"\"\n", + " Считает cutoffs так, чтобы разбить данные на части.\n", + " \n", + " Args:\n", + " num_parts: На сколько частей делить \"хвост\" истории.\n", + " base_ratio: Какую долю данных отдать в Base (самую первую часть). 
\n", + " Если None, делит всё поровну.\n", + " \"\"\"\n", + " # Достаем все таймстемпы в один плоский массив\n", + " # Это может занять память, если данных очень много (>100M), но для Beauty (2M) это ок\n", + " all_ts = df.select(pl.col(\"timestamps\").explode()).to_series().sort()\n", + " total_events = len(all_ts)\n", + " \n", + " print(f\"Всего событий: {total_events}\")\n", + " \n", + " cutoffs = []\n", + " \n", + " if base_ratio:\n", + " # Сценарий: Base занимает X% (например 80%), а остаток делим поровну на 3 части (Valid, Gap, Test)\n", + " # Остаток = 1 - base_ratio\n", + " # Каждая малая часть = (1 - base_ratio) / num_parts_tail\n", + " \n", + " base_idx = int(total_events * base_ratio)\n", + " cutoffs.append(all_ts[base_idx]) # Первый cutoff отделяет Base\n", + " \n", + " remaining_events = total_events - base_idx\n", + " part_size = remaining_events // num_parts # Делим остаток на 3 части (P1, P2, P3)\n", + " \n", + " current_idx = base_idx\n", + " for _ in range(num_parts-1): # Нам нужно еще 2 границы, чтобы получить 3 части\n", + " current_idx += part_size\n", + " cutoffs.append(all_ts[current_idx])\n", + " \n", + " else:\n", + " # Сценарий: Просто делим всё на N равных частей\n", + " step = total_events // num_parts\n", + " for i in range(1, num_parts):\n", + " idx = i * step\n", + " cutoffs.append(all_ts[idx])\n", + " \n", + " return cutoffs\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "13c1466f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Всего событий: 198502\n", + "\n", + "--- Новые Cutoffs (по количеству событий) ---\n", + "Cutoffs: [1394150400, 1397001600, 1399939200, 1403049600]\n", + "[0, 1394150400, 1397001600, 1399939200, 1403049600, None]\n", + "Проверка количества событий в новых частях:\n", + "Part 0: 158689 events\n", + "Part 1: 9965 events\n", + "Part 2: 9701 events\n", + "Part 3: 10137 events\n", + "Part 4: 10010 events\n" + ] + } + ], + "source": [ + "equal_event_cutoffs = get_quantile_cutoffs(df, num_parts=4, base_ratio=0.8)\n", + "\n", + "print(\"\\n--- Новые Cutoffs (по количеству событий) ---\")\n", + "print(f\"Cutoffs: {equal_event_cutoffs}\")\n", + "\n", + "# Проверка распределения\n", + "intervals_eq = [0] + equal_event_cutoffs + [None]\n", + "print(intervals_eq)\n", + "print(\"Проверка количества событий в новых частях:\")\n", + "for i in range(len(intervals_eq)-1):\n", + " start, end = intervals_eq[i], intervals_eq[i+1]\n", + " q = df.lazy().explode([\"timestamps\"])\n", + " if end:\n", + " q = q.filter((pl.col(\"timestamps\") >= start) & (pl.col(\"timestamps\") < end))\n", + " else:\n", + " q = q.filter(pl.col(\"timestamps\") >= start)\n", + " count = q.select(pl.len()).collect().item()\n", + " print(f\"Part {i}: {count} events\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4e7f7b46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw/inter_new_[0_1394150400).json\n", + "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw/inter_new_[1394150400_1399939200).json\n", + "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw/inter_new_[1399939200_1403049600).json\n", + "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw/inter_new_[1403049600_inf).json\n", + "0 Base 20825 158689 \n", + "1 Gap 6816 19666 \n", + "2 Valid 3817 10137 \n", + "3 Test 3626 10010 
\n" + ] + } + ], + "source": [ + "new_split_files = split_session_by_timestamps(\n", + " df, \n", + " [1394150400, 1399939200, 1403049600], \n", + " output_dir=\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw\"\n", + ")\n", + "\n", + "names = [\"Base\", \"Gap\", \"Valid\", \"Test\"]\n", + "for i, d in enumerate(new_split_files):\n", + " num_users = len(d)\n", + " \n", + " num_events = sum(len(items) for items in d.values())\n", + " \n", + " print(f\"{i:<10} {names[i]:<10} {num_users:<10} {num_events:<10}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "82fd2bca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merging 2 files into exp_4_0.9_inter_tiger_train.json...\n", + "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json (Users: 21760)\n", + "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json...\n", + "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json (Users: 21760)\n", + "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json...\n", + "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json (Users: 20825)\n", + "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json...\n", + "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json (Users: 22079)\n", + "Merging 1 files into test_set.json...\n", + "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json (Users: 3626)\n", + "Merging 1 files into valid_set.json...\n", + "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/valid_set.json (Users: 3817)\n", + "Merging 4 files into all_set.json...\n", + "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/all_set.json (Users: 22363)\n", + "All done!\n" + ] + } + ], + "source": [ + "EXP_DIR = \"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps\"\n", + "\n", + "base_p, gap_p, valid_p, test_p = new_split_files[0], new_split_files[1], new_split_files[2], new_split_files[3]\n", + "\n", + "# Tiger: base + gap\n", + "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4_0.9_inter_tiger_train.json\")\n", + "\n", + "# 1. Exp 4.1 (Standard)\n", + "# Semantics: base + gap (Всё кроме валидации и теста)\n", + "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4-1_0.9_inter_semantics_train.json\")\n", + "\n", + "# 2. Exp 4.2 (Short Semantics)\n", + "# Semantics: base (Короче на пропуск, без gap)\n", + "merge_and_save([base_p], EXP_DIR, \"exp_4-2_0.8_inter_semantics_train.json\")\n", + "\n", + "# 3. Exp 4.3 (Leak)\n", + "# Semantics: base + gap + valid (Видит валидацию)\n", + "merge_and_save([base_p, gap_p, valid_p], EXP_DIR, \"exp_4-3_0.95_inter_semantics_train.json\")\n", + "\n", + "# 4. Test Set (тест всех моделей)\n", + "merge_and_save([test_p], EXP_DIR, \"test_set.json\")\n", + "\n", + "# 4. Valid Set (валидационный набор)\n", + "merge_and_save([valid_p], EXP_DIR, \"valid_set.json\")\n", + "\n", + "# 4. 
All Set (все данные)\n", + "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR, \"all_set.json\")\n", + "\n", + "print(\"All done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d34b1c55", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Проверка Train сетов (должны быть префиксами):\n", + "доля событий всего 0.90:\n", + "✅ [ПРЕФИКСЫ] Все 21760 массивов ОК. Полных совпадений: 16175\n", + "доля событий всего 0.80:\n", + "✅ [ПРЕФИКСЫ] Все 20825 массивов ОК. Полных совпадений: 12129\n", + "доля событий всего 0.95:\n", + "✅ [ПРЕФИКСЫ] Все 22079 массивов ОК. Полных совпадений: 18737\n", + "доля событий всего 0.90:\n", + "✅ [ПРЕФИКСЫ] Все 21760 массивов ОК. Полных совпадений: 16175\n", + "\n", + "Проверка Test сета (должен быть суффиксом):\n", + "доля событий всего 0.05:\n", + "✅ [СУФФИКСЫ] Все 3626 массивов ОК. Полных совпадений: 284\n", + "\n", + "(Контроль) Проверка Test сета как префикса (должна упасть):\n", + "доля событий всего 0.05:\n", + "❌ [ПРЕФИКСЫ] Найдено 3342 ошибок.\n", + "доля событий всего 1.00:\n", + "✅ [ПРЕФИКСЫ] Все 22363 массивов ОК. Полных совпадений: 22363\n" + ] + } + ], + "source": [ + "with open(\"/home/jovyan/IRec/data/Beauty/inter_new.json\", 'r') as f:\n", + " old_inter_new = json.load(f)\n", + "\n", + "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n", + " first_sem = json.load(ff)\n", + " \n", + "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n", + " second_sem = json.load(ff)\n", + " \n", + "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n", + " third_sem = json.load(ff)\n", + " \n", + "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n", + " tiger_sem = json.load(ff)\n", + "\n", + "with open(\"//home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json\", 'r') as ff:\n", + " test_sem = json.load(ff)\n", + "\n", + "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as ff:\n", + " all_test_data = json.load(ff)\n", + "\n", + "def check_prefix_match(full_data, subset_data, check_suffix=False):\n", + " \"\"\"\n", + " check_suffix=True включит режим проверки суффиксов (для теста).\n", + " \"\"\"\n", + " mismatch_count = 0\n", + " full_match_count = 0\n", + "\n", + " num_events_full_data = sum(len(items) for items in full_data.values())\n", + " num_events_subset_data = sum(len(items) for items in subset_data.values())\n", + " print(f\"доля событий всего {(num_events_subset_data/num_events_full_data):.2f}:\")\n", + " \n", + " # Итерируемся по ключам сабсета, так как в full_data может быть больше юзеров\n", + " for user, sub_items in subset_data.items():\n", + " \n", + " # Проверяем есть ли такой юзер в исходнике\n", + " if user not in full_data:\n", + " print(f\"⚠ Юзер {user} не найден в исходном файле!\")\n", + " mismatch_count += 1\n", + " continue\n", + " \n", + " full_items = full_data[user]\n", + " \n", + " # Логика для проверки ПРЕФИКСА (начало совпадает)\n", + " if not check_suffix:\n", + " if len(sub_items) > len(full_items):\n", + " mismatch_count += 1\n", + " continue\n", + " \n", + " # Сравниваем начало full с sub\n", + " if 
full_items[:len(sub_items)] == sub_items:\n", + " if len(full_items) == len(sub_items):\n", + " full_match_count += 1\n", + " else:\n", + " mismatch_count += 1\n", + " \n", + " # Логика для проверки СУФФИКСА (конец совпадает - для теста)\n", + " else:\n", + " if len(sub_items) > len(full_items):\n", + " mismatch_count += 1\n", + " continue\n", + " \n", + " # Сравниваем конец full с sub\n", + " # Срез [-len:] берет последние N элементов\n", + " if full_items[-len(sub_items):] == sub_items:\n", + " if len(full_items) == len(sub_items):\n", + " full_match_count += 1\n", + " else:\n", + " mismatch_count += 1\n", + "\n", + " mode = \"СУФФИКСЫ\" if check_suffix else \"ПРЕФИКСЫ\"\n", + " \n", + " if mismatch_count == 0:\n", + " print(f\"✅ [{mode}] Все {len(subset_data)} массивов ОК. Полных совпадений: {full_match_count}\")\n", + " else:\n", + " print(f\"❌ [{mode}] Найдено {mismatch_count} ошибок.\")\n", + "\n", + "# --- Запуск проверок ---\n", + "print(\"Проверка Train сетов (должны быть префиксами):\")\n", + "check_prefix_match(old_inter_new, first_sem)\n", + "check_prefix_match(old_inter_new, second_sem)\n", + "check_prefix_match(old_inter_new, third_sem)\n", + "check_prefix_match(old_inter_new, tiger_sem)\n", + "\n", + "print(\"\\nПроверка Test сета (должен быть суффиксом):\")\n", + "check_prefix_match(old_inter_new, test_sem, check_suffix=True)\n", + "\n", + "print(\"\\n(Контроль) Проверка Test сета как префикса (должна упасть):\")\n", + "check_prefix_match(old_inter_new, test_sem, check_suffix=False)\n", + "\n", + "check_prefix_match(old_inter_new, all_test_data)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "501fae46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Part 0 [Base]: 19666 events (2014-03-07 -> 2014-05-13)\n", + "Part 1 [Gap]: 10137 events (2014-05-13 -> 2014-06-18)\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA/8AAAIjCAYAAABViau2AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAlp5JREFUeJzs3XmcjXX/x/H3NftiFsOsmhj7nqWslSVrIpJSKsVNiSTdlBahpFSyluoXWriTlBbdGCSSLRIiodGCmcGY3azn+v3hnpPjjJlzmDFzptfz8Zh7nO/3c33P5zrfc6b7c67r+l6GaZqmAAAAAABAheVW1gkAAAAAAIDSRfEPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAD+8fbt26cVK1ZYH+/evVsrV64su4Qu0V9//aVFixZZHx89elSLFy8uu4QAAOUGxT8AlLFFixbJMAzrj4+Pj+rWratRo0YpISGhrNMD/hHS0tL04IMPauvWrTp06JAeffRR7d27t6zTcpphGBo5cqRWr16to0ePavz48dq0aVNZpwUAKAc8yjoBAMA5U6ZMUUxMjLKysvTdd9/pzTff1Ndff619+/bJz8+vrNMDKrS2bdtafySpbt26GjZsWBln5bxq1app2LBh6tGjhyQpMjJSGzZsKNukAADlgmGaplnWSQDAP9miRYv0wAMPaMeOHbr22mut7Y8//rhmzJihJUuW6K677irDDIF/jv379+vs2bNq0qSJvLy8yjqdS3bkyBGdOnVKjRs3lr+/f1mnAwAoBzjtHwDKqc6dO0uS4uLiJElJSUn697//rSZNmqhSpUoKDAxUz5499dNPP9ltm5WVpUmTJqlu3bry8fFRZGSkbrvtNh05ckTSueuAz7/U4MKfjh07WsfasGGDDMPQ0qVL9dRTTykiIkL+/v7q06eP/vzzT7vn3rZtm3r06KGgoCD5+fmpQ4cO2rx5c6H72LFjx0Kff9KkSXaxH374oVq2bClfX1+FhIRo4MCBhT5/Uft2PovFopkzZ6pRo0by8fFReHi4HnzwQZ05c8YmrkaNGrrlllvsnmfUqFF2YxaW+yuvvGL3mkpSdna2nnvuOdWuXVve3t6Kjo7W+PHjlZ2dXehrdb4LX7eqVauqV69e2rdvnzXm9OnT6tmzp6666ip5e3srMjJSgwYN0u+//24z1quvvqp27dqpSpUq8vX1VcuWLfXJJ5/YPaej+1bwfjEMQ7t377aJP3bsmNzd3WUYht1zrF+/XjfccIP8/f0VHBysW2+9VQcOHHBo/y/3fdOxY0frPjRs2FAtW7bUTz/9VOj7pmDMVq1ayc/PT5UrV9aNN96oNWvWSDr3finqs1WjRg1J9u9TT09P1ahRQ+PGjVNOTo71uQouCzp69Ki1zWKxqGnTpjIMw+b6/vvvv986fq1atdS6dWslJSXJ19fXbozC3H///TY5Va5cWR07diz0soE33nhDjRo1kre3t6KiojRy5EglJycXOb4kTZo0yeY5AgIC1KpVK5v1FqRzc9K4cWPt3LlT7dq1k6+vr2JiYjR//ny7MR39LBU858yZM+3GqF+/vgzD0KhRo2zak5OTNWbMGEVHR8vb21u1a9fWyy+/LIvFYo0pmMtXX33VbtzGjRsX+vm48GyMXr16XfQ9DAAlhdP+AaCcKijUq1SpIkn67bfftGLFCg0YMEAxMTFKSEjQW2+9pQ4dOmj//v2KioqSJOXn5+uWW27RunXrNHDgQD366KNKS0tTbGys9u3bp1q1almf46677tLNN99s87wTJkwoNJ+pU6fKMAw98cQTSkxM1MyZM9WlSxft3r1bvr6+ks4VcD179lTLli313HPPyc3NTQsXLlTnzp21adMmtWrVym7cq666StOmTZMkpaena8SIEYU+97PPPqs77rhD//rXv3Ty5EnNmTNHN954o3788UcFBwfbbTN8+HDdcMMNkqRPP/1Un332mU3/gw8+aD3rYvTo0YqLi9PcuXP1448/avPmzfL09Cz0dXBGcnKydd/OZ7FY1KdPH3333XcaPny4GjRooL179+r111/Xr7/+alcIFaZ+/fp6+umnZZqmjhw5ohkzZujmm2/WH3/8IUnKyclRQECAHn30UVWpUkVHjhzRnDlztGfPHptr2WfNmqU+ffpo0KBBysnJ0UcffaQBAwboq6++Uq9evZzetwI+Pj5auHChZs2aZW1777335OXlpaysLJvYtWvXqmfPnqpZs6YmTZqks2fPas6cOWrfvr127dplLWjPV1rvmwJPPPFEoe2TJ0/WpEmT1K5dO02ZMkVeXl7atm2b1q9fr27dumnmzJlKT0+XJB04cEAvvviinnrqKTVo0ECSVKlSJZvxCt6n2dnZWr16tV599VX5+Pjo+eefv2huH3zwgcPrEUycONHu9S5K1apV9frrr0s6t3jgrFmzdPPNN+vPP/+0vl6TJk3S5MmT1aVLF40YMUIHDx7Um2++qR07djj82fnggw8kSadOndIbb7yhAQMGaN++fapXr5415syZM7r55pt1xx136K677tLHH3+sESNGyMvLS0OGDJHk/Gep4H05ZswYa9v3339v96WYJGVmZqpDhw46duyYHnzwQV199dX6/vvvNWHCBJ04caLQLxEuxcaNG/X111+XyFgAUCQTAFCmFi5caEoy165da548edL8888/zY8++sisUqWK6evra/7111+maZpmVlaWmZ+fb7NtXFyc6e3tbU6ZMsXatmDBAlOSOWPGDLvnslgs1u0kma+88opdTKNGjcwOHTpYH3/zzTemJLNatWpmamqqtf3jjz82JZmzZs2yjl2nTh2ze/fu1ucxTdPMzMw0Y2JizK5du9o9V7t27czGjRtbH588edKUZD733HPWtqNHj5ru7u7m1KlTbbbdu3ev6eHhYdd+6NAhU5L53nvvWduee+458/z/5G3atMmUZC5evNhm21WrVtm1V69e3ezVq5dd7iNHjjQv/M/ohbmPHz/eDAsLM1u2bGnzmn7wwQemm5ubuWnTJpvt58+fb0oyN2/ebPd85+vQoYPNeKZpmk899ZQpyUxMTLzodtOnTzclmadOnbK2ZWZm2sTk5OSYjRs3Njt37nxJ+1bwfrnrrrvMKlWqmNnZ2da+OnXqmHfffbcpyVy2bJm1vVmzZmZYWJh5+vRpa9tPP/1kurm5mffdd5/dfpT0++bC1/Prr782JZk9evSwmeNDhw6Zbm5uZr9+/ew+i+e/5y98Lb755hu7voLP4MKFC23ao6KizJtvvtn6uODvQ1xcnGma5/4OXH311WbPnj3tth88eLBZvXp16+N9+/aZbm5u1tiCMS7mwu1N0zTffvttU5K5fft20zRNMzEx0fTy8jK7detm8xrMnTvXlGQuWLCgyOe48LNomqa5Zs0aU5L58ccfW9s6dOhgSjJfe+01a1t2drb1vZKTk2OapnOfJUnm7bffbnp4eJg//PCDtX3o0K
HW9+XIkSOt7c8//7zp7+9v/vrrrzZjP/nkk6a7u7v5xx9/mKZ5aX9Pz39PtG7d2jpH57+HAaCkcdo/AJQTXbp0UWhoqKKjozVw4EBVqlRJn332mapVqyZJ8vb2lpvbuT/b+fn5On36tCpVqqR69epp165d1nGWL1+uqlWr6pFHHrF7jsJOYXbUfffdp4CAAOvj22+/XZGRkdYjVrt379ahQ4d099136/Tp0zp16pROnTqljIwM3XTTTdq4caPNqbLSucsTfHx8inzeTz/9VBaLRXfccYd1zFOnTikiIkJ16tTRN998YxNfcMq0t7f3RcdctmyZgoKC1LVrV5sxW7ZsqUqVKtmNmZubaxN36tSpYo+mHjt2THPmzNGzzz5rd7R32bJlatCggerXr28zZsGlHhc+f2EKcjp58qS2bNmizz77TE2bNlXVqlVt4tLS0pSYmKgtW7boP//5jxo1aqSQkBBrf8FZG9K5I60pKSm64YYbbN5Tzuxbgd69e8swDH3xxReSpE2bNumvv/7SnXfeaRN34sQJ7d69W/fff79NXk2bNlXXrl0LPSJaGu+bAqZpasKECerfv79at25t07dixQpZLBZNnDjR+lkscKmfrfT0dJ06dUrHjh3T22+/rfj4eN10000XjZ83b55Onz6t5557rtixJ0yYoBYtWmjAgAEO52OxWKyv1e7du/X+++8rMjLSeubC2rVrlZOTozFjxti8BsOGDVNgYKDDt0cseI4DBw5o/vz58vf3V5s2bWxiPDw89OCDD1ofe3l56cEHH1RiYqJ27twpyfnPUnh4uHr16qWFCxdKOnd0/+OPP9YDDzxgl+OyZct0ww03qHLlyjZjd+nSRfn5+dq4caNNfGZmpt3fifz8/CJfh08//VQ7duzQSy+95NDrBgCXg9P+AaCcmDdvnurWrSsPDw+Fh4erXr16Nv/n2mKxaNasWXrjjTcUFxdn838qCy4NkM5dLlCvXj15eJTsn/g6derYPDYMQ7Vr17ZeR3zo0CFJ0uDBgy86RkpKiipXrmx9fOrUKbtxL3To0CGZpnnRuAtPMS647vhiRWnBmCkpKQoLCyu0PzEx0ebxmjVrFBoaWmSeF3ruuecUFRWlBx980O769kOHDunAgQMXHfPC5y/M999/b7N9nTp1tGLFCrsidNiwYVq6dKkk6brrrtPXX39tE/PVV1/phRde0O7du22ukS6qmC1q3wp4enrqnnvu0YIFC3T77bdrwYIF6t+/vwIDA23iCk63Pv907wINGjTQ6tWrlZGRYbNoXWm8bwosXrxYP//8sz7++GMtWbLEpu/IkSNyc3NTw4YNi3xuZzzyyCM2X9Q98MADeuyxxwqNTUlJ0YsvvqixY8cqPDy8yHG/++47ffnll1q3bp31UhBH/Pnnnzbvq8jISC1fvtz6ebrYfHl5ealmzZqFnj5fmPOfIzAwUIsXL1Z0dLRNTFRUlN1ihXXr1pV07jr7Nm3aXNJn6YEHHtADDzyg1157TcuWLVPlypWtXxac79ChQ9qzZ4/DYz/33HOFfilzsbnKz8/XU089pUGDBqlp06aFxgBASaL4B4ByolWrVjar/V/oxRdf1LPPPqshQ4bo+eefV0hIiNzc3DRmzBi7I+ploSCHV155Rc2aNSs05vyCPCcnRydOnFDXrl2LHdcwDP33v/+Vu7t7kWNKUnx8vCQpIiKiyDHDwsK0ePHiQvsv/D/7rVu31gsvvGDTNnfuXH3++eeFbn/gwAEtWrRIH374YaFFpsViUZMmTTRjxoxCt7+wCCpM06ZN9dprr0mSTp48qdmzZ6tjx47atWuXzb4/88wzeuCBB3TkyBFNnz5dAwcO1Nq1a+Xh4aFNmzapT58+uvHGG/XGG28oMjJSnp6eWrhwoV3h6+i+nW/IkCFq3ry5Dh48qGXLllnPArgcpfW+KRj72Wef1dChQ61FZmkbN26cunXrpvz8fP3888+aMmWKTNO0Hpk+38svvyw3NzeNGzdOp0+fLnLcJ554Qt27d1fnzp1tFgUsTnh4uD788ENJ575sWLBggXr06KHvvvtOTZo0cWrfihIbGytJysjI0PLly3XHHXfoq6++KnZeL3Qpn6VevXrJy8tLK1as0MKFCzV48GC7MzkKxu7atavGjx9f6NgXvkeGDx9ud5ZFUbeLfPfdd3X06FGtXr36ojEAUJIo/gHARXzyySfq1KmT3n33XZv25ORkm1O9a9WqpW3btik3N7dEFq0rUHBkv4Bpmjp8+LD1iFXBQoKBgYHq0qVLseP99NNPys3NLfILj4JxTdNUTEyMQwXZ/v37ZRhGoUeSzx9z7dq1at++vc1p7xdTtWpVu30qalG+CRMmqFmzZnanuJ///D/99JNuuummSz5dvHLlyjY5dezYUVFRUVq4cKHNoo2NGzdW48aNJUlNmjTRjTfeqNjYWPXs2VPLly+Xj4+PVq9ebXOZRGGFp6P7dr4mTZqoefPmuuOOOxQaGqpOnTrp22+/tYmpXr26JOngwYN22//yyy+qWrWqzdHf0nrfSOdWsE9MTLzoiuu1atWSxWLR/v37L/oFl7MaNmxoncfu3bsrOztbTz31lKZOnWpdxFOSjh8/rlmzZmnatGkKCAgosvhfsWKFtmzZUuSlGxfj4+Nj877q06ePQkJCNHfuXL311ls281WzZk1rXE5OjuLi4hz67Euyibv11lu1bds2vfrqqzbF//Hjx+3O+vj1118lyeauBs5+ljw8PHTvvfdq6tSp+vnnn7VgwYJC42rVqqX09HSH96lOnTp2sRe7zWJmZqYmT56shx9+2PqaAkBp45p/AHAR7u7uMk3Tpm3ZsmU6duyYTVv//v116tQpzZ07126MC7d3xvvvv6+0tDTr408++UQnTpxQz549JUktW7ZUrVq19Oqrr1pXOz/fyZMn7XJ3d3cv9DZ657vtttvk7u6uyZMn2+VvmqZNEZSXl6fly5erVatWRZ72f8cddyg/P7/QFdXz8vIcumXZxWzZskWff/65XnrppYsWI3fccYeOHTumd955x67v7NmzysjIcPp5z549K0lF3irw1KlTNjEFt907/xKSo0ePXvSLDUf27UJDhgzRnj17rLeRu1BkZKSaNWum9957z+Z137dvn9asWWN3N4rSeN9I59ZGmDp1qh577LGLnjXSt29fubm5acqUKXZn21zOZ+t8BfN4/u3+pHN3GQgPD9dDDz1U5PYFp5LffffdJfIFRU5OjvLy8qzvmS5dusjLy0uzZ8+22ed3331XKSkpRd4hoqicc3Jy7N67eXl5euutt2xyeeuttxQaGqqWLVtKuvTP0pAhQ7R3717deOONNl9inO+OO+7Qli1bCj0yn5ycrLy8PIf38UKzZs1SRkaGnn766UseAwCcxZF/AHARt9xyi6ZMmaIHHnhA7dq10969e7V48WK7/+N633336f3339fYsWO1fft23XDDDcrIyNDatWv18MMP69Zbb72k5w8JCdH111+vBx54QAkJCZo5c6Zq165tPa3Vzc1N//d//6eePXuqUaNGeuCBB
1StWjUdO3ZM33zzjQIDA/Xll18qIyND8+bN0+zZs1W3bl2b+10XfGmwZ88ebdmyRW3btlWtWrX0wgsvaMKECTp69Kj69u2rgIAAxcXF6bPPPtPw4cP173//W2vXrtWzzz6rPXv26MsvvyxyXzp06KAHH3xQ06ZN0+7du9WtWzd5enrq0KFDWrZsmWbNmqXbb7/9kl6nNWvWqGvXrkUeLbz33nv18ccf66GHHtI333yj9u3bKz8/X7/88os+/vhjrV69utgj2wkJCdbTs0+dOqW33npLHh4e1qL4nXfe0caNG9WiRQsFBgZq//79eueddxQZGWldUK5Xr16aMWOGevToobvvvluJiYmaN2+eateurT179lzSvl1o2LBhGjBggIKCgi4a88orr6hnz55q27athg4dar3VX1BQkPUofGm9bwrs2rVLVatWvegp3pJUu3ZtPf3003r++ed1ww036LbbbpO3t7d27NihqKioIm99eDFbtmyRh4eH9bT/OXPmqHnz5na3N1yzZo0WL14sLy+vIsf766+/5OXldcm3jsvIyLA57f+DDz5QVlaW+vXrJ+ncJTETJkzQ5MmT1aNHD/Xp00cHDx7UG2+8oeuuu0733HOPQ89T8BwZGRlasWKFjh49anP7PencNf8vv/yyjh49qrp162rp0qXavXu33n77betZTZf6WWrQoIFOnTpV5Jk/48aN0xdffKFbbrlF999/v1q2bKmMjAzt3btXn3zyiY4ePWq3wKaj1qxZo6lTp9qs1wIApa4M7jAAADhPwa28duzYUWRcVlaW+fjjj5uRkZGmr6+v2b59e3PLli2F3vYtMzPTfPrpp82YmBjT09PTjIiIMG+//XbzyJEjpmle2q2p/vOf/5gTJkwww8LCTF9fX7NXr17m77//brf9jz/+aN52221mlSpVTG9vb7N69ermHXfcYa5bt87muYv7GTx4sM24y5cvN6+//nrT39/f9Pf3N+vXr2+OHDnSPHjwoGmapvnII4+YN954o7lq1Sq7nAq7vZhpnruNWcuWLU1fX18zICDAbNKkiTl+/Hjz+PHj1hhnb/VnGIa5c+dOm/bC5ignJ8d8+eWXzUaNGpne3t5m5cqVzZYtW5qTJ082U1JS7J7vwvHOf62Cg4PN9u3bm19//bU15ttvvzVvuOEGMzg42PT29jZr1KhhDhs2zO52b++++65Zp04d09vb26xfv765cOHCQl8vR/et4P1y/q38znex/rVr15rt27c3fX19zcDAQLN3797m/v37rf2l9b45//V8/fXXbba92PtmwYIFZvPmza3z1qFDBzM2Nvai+1rUrf4Kftzc3MyrrrrKHDx4sPX2nqb599+HZs2a2dxOsLBbBQ4ePNiUZD766KM2z3Xh7QIvpmD7gp9KlSqZLVq0MD/44AO72Llz55r169c3PT09zfDwcHPEiBHmmTNnihzfNP9+TQt+fH19zYYNG5qvv/66zf516NDBbNSokfnDDz+Ybdu2NX18fMzq1aubc+fOtRvT0c+SLriV34UK609LSzMnTJhg1q5d2/Ty8jKrVq1qtmvXznz11Vettxu8lL+nkZGRZkZGht3zc6s/AKXJMM0SOk8NAFAhbdiwQZ06ddKyZcsu+Wj4+Y4ePaqYmBjFxcXZHd0sMGnSJB09etSphcpQsfG++Wfp2LGjTp06pX379pV1KgBQYXDNPwAAAAAAFRzX/AMArqhKlSpp0KBBRS7I17RpU5uVzgHeNwAAXB6KfwDAFVW1alXrYl8Xc9ttt12hbOAqeN8AAHB5uOYfAAAAAIAKrkyv+d+4caN69+6tqKgoGYZx0fsKS9JDDz0kwzA0c+ZMm/akpCQNGjRIgYGBCg4O1tChQ+3uL71nzx7dcMMN8vHxUXR0tKZPn243/rJly1S/fn35+PioSZMml3yLHAAAAAAAypsyLf4zMjJ0zTXXaN68eUXGffbZZ9q6dWuh1/ENGjRIP//8s2JjY/XVV19p48aNGj58uLU/NTVV3bp1U/Xq1bVz50698sormjRpkt5++21rzPfff6+77rpLQ4cO1Y8//qi+ffuqb9++rDALAAAAAKgQys1p/4Zh6LPPPlPfvn1t2o8dO6bWrVtr9erV6tWrl8aMGaMxY8ZIkg4cOKCGDRtqx44duvbaayVJq1at0s0336y//vpLUVFRevPNN/X0008rPj5eXl5ekqQnn3xSK1as0C+//CJJuvPOO5WRkaGvvvrK+rxt2rRRs2bNNH/+fIfyt1gsOn78uAICAmQYxmW+GgAAAAAAFM00TaWlpSkqKkpubkUf2y/XC/5ZLBbde++9GjdunBo1amTXv2XLFgUHB1sLf0nq0qWL3NzctG3bNvXr109btmzRjTfeaC38Jal79+56+eWXdebMGVWuXFlbtmzR2LFjbcbu3r17kZchZGdnKzs72/r42LFjatiw4WXsLQAAAAAAzvvzzz911VVXFRlTrov/l19+WR4eHho9enSh/fHx8QoLC7Np8/DwUEhIiOLj460xMTExNjHh4eHWvsqVKys+Pt7adn5MwRiFmTZtmiZPnmzXvmvXriJvQ4Qrw2KxKDU1VYGBgcV+A4byhblzTcyb62LuXBdz55qYN9fF3JUPVW+4Qe4JCcoPD9epTZuKj99yvdxzEpXvFaZTbb+z6bth6Q1KyEhQuH+4Nt1Z/FjlUXp6ulq0aKGAgIBiY8tt8b9z507NmjVLu3btKpen0U+YMMHmbIHU1FRFR0crJiZGgYGBZZgZpHN/nE+ePKnQ0FD+OLsY5s41MW+ui7lzXcyda2LeXBdzVz4YHh4yJJkeHgqqVav4+H2eMs5Kpq+nXbyHn4eUf+53LQfGKo9SU1MlyaGaudwW/5s2bVJiYqKuvvpqa1t+fr4ef/xxzZw5U0ePHlVERIQSExNttsvLy1NSUpIiIiIkSREREUpISLCJKXhcXExBf2G8vb3l7e1t1+7m5sYfg3LCMAzmw0Uxd66JeXNdzJ3rYu5cE/Pmupi78sOQZDgwD+cvcFfUvLnqnDqTd7ndw3vvvVd79uzR7t27rT9RUVEaN26cVq9eLUlq27atkpOTtXPnTut269evl8ViUevWra0xGzduVG5urjUmNjZW9erVU+XKla0x69ats3n+2NhYtW3btrR3EwAAAACAUlemR/7T09N1+PBh6+O4uDjt3r1bISEhuvrqq1WlShWbeE9PT0VERKhevXqSpAYNGqhHjx4aNmyY5s+fr9zcXI0aNUoDBw603hbw7rvv1uTJkzV06FA98cQT2rdvn2bNmqXXX3/dOu6jjz6qDh066LXXXlOvXr300Ucf6YcffrC5HSAAAAAAAK6qTIv/H374QZ06dbI+LriGfvDgwVq0aJFDYyxevFijRo3STTfdJDc3N/Xv31+zZ8+29gcFBWnNmjUaOXKkWrZsqapVq2rixIkaPny4NaZdu3ZasmSJnnnmGT311FOqU6eOVqxYocaNG5fMjv6PaZrKy8tTfn5+iY4LexaLRbm5ucrKyiq3p/C4u7vLw8OjXK5pAQAAAJRLO3ZI
+fmSu7tD4Wa3bTp5MkFVQ8N14f/r3jFsh/LNfLkbjo3l6gzTNM3iw1Cc1NRUBQUFKSUlpdAF/3JycnTixAllZmaWQXb/PKZpymKxyM3NrVwX135+foqMjLS5FeU/ncViUWJiosLCwsrtFzewx7y5LubOdTF3rol5c13MnWuq6PNWXB16vnK74F9FYrFYFBcXJ3d3d0VFRcnLy6tcF6QVQcFZFuX1yLppmsrJydHJkycVFxenOnXqVMg/RgAAAADKB4r/KyAnJ0cWi0XR0dHy8/Mr63T+Ecp78S9Jvr6+8vT01O+//66cnBz5+PiUdUoAAAAAKiiK/yuII7u4EO8JAAAAwAlvvy2lp0uVKknnreN2UYffll9yvJQaIdV9yHaonW8rPSddlbwqaXhLB8ZycRT/AAAAAADXMGWKdOyYVK2aQ8W/8fMLCjx7TKZvNbvif8q3U3Qs7ZiqBVT7RxT/HHYEAAAAAKCC48h/GVt2JOWKPdeAWkFX7LkAAAAAAOUHR/5RpPvvv1+GYdj99OjR44rlMGnSJDVr1uyKPR8AAAAAVDQc+UexevTooYULF9q0eXt7l1E2AAAAAABnceQfxfL29lZERITNT+XKlXX33XfrzjvvtInNzc1V1apV9f7770uSLBaLpk2bppiYGPn6+uqaa67RJ598Yo3fsGGDDMPQunXrdO2118rPz0/t2rXTwYMHJUmLFi3S5MmT9dNPP1nPOli0aJFM09SkSZN09dVXy9vbW1FRURo9evSVe1EAAAAAwIVw5B+XbNCgQRowYIDS09NVqVIlSdLq1auVmZmpfv36SZKmTZumDz/8UPPnz1edOnW0ceNG3XPPPQoNDVWHDh2sYz399NN67bXXFBoaqoceekhDhgzR5s2bdeedd2rfvn1atWqV1q5dK0kKCgrS8uXL9frrr+ujjz5So0aNFB8fr59++unKvwgAAAAA4AIo/lGsr776ylrcF3jqqac0fvx4+fv767PPPtO9994rSVqyZIn69OmjgIAAZWdn68UXX9TatWvVtm1bSVLNmjX13Xff6a233rIp/qdOnWp9/OSTT6pXr17KysqSr6+vKlWqJA8PD0VERFjj//jjD0VERKhLly7y9PTU1VdfrVatWpX2SwEAAAAALoniH8Xq1KmT3nzzTZu2kJAQeXh46I477tDixYt17733KiMjQ59//rk++ugjSdLhw4eVmZmprl272mybk5Oj5s2b27Q1bdrU+u/IyEhJUmJioq6++upCcxowYIBmzpypmjVrqkePHrr55pvVu3dveXjwlgYAAACAC1EpoVj+/v6qXbt2oX2DBg1Shw4dlJiYqNjYWPn6+lrvBJCeni5JWrlypapVq2az3YULBnp6elr/bRiGpHPrBVxMdHS0Dh48qLVr1yo2NlYPP/ywXnnlFX377bc2YwEAAACoQOrWlYKCpPBwx+ID6irXzV8e/tXsuupWqasgnyCF+zs4louj+MdladeunaKjo7V06VL997//1YABA6zFd8OGDeXt7a0//vjD5hR/Z3l5eSk/P9+u3dfXV71791bv3r01cuRI1a9fX3v37lWLFi0u+bkAAAAAlGPr1zsVbnZeq9OJiQoLC5Nx4VCDnRvL1VH8o1jZ2dmKj4+3afPw8FDVqlUlSXfffbfmz5+vX3/9Vd988401JiAgQP/+97/12GOPyWKx6Prrr1dKSoo2b96swMBADR482KHnr1GjhuLi4rR7925dddVVCggI0H/+8x/l5+erdevW8vPz04cffihfX19Vr1695HYcAAAAQLm07EhKkf0DagVdoUxcB8V/GXOFN+WqVaus1+EXqFevnn755RdJ5079nzp1qqpXr6727dvbxD3//PMKDQ3VtGnT9Ntvvyk4OFgtWrTQU0895fDz9+/fX59++qk6deqk5ORkLVy4UMHBwXrppZc0duxY5efnq0mTJvryyy9VpUqVy99hAAAAAKhgDNM0zbJOoiJITU1VUFCQUlJSFBgYaNOXlZWluLg4xcTEyMfHp4wy/GcxTVN5eXny8PCwriFQHvHesGexWJT4v1Oz3NzcyjodOIh5c13Mneti7lwT8+a6mLvyxdEj/xV93oqqQy/EkX8AAAAAgGsYNEg6dUqtfIK0fcY7xYYbW+5R5bTjMgKipPZLbIf6dJBOZZ5SVb+qWnzb4tLKuNyg+AcAAAAAuIZvv5WOHVNoeJRj8Ykb5X32mMyz9qv9f3v0Wx1LO6ZqAfZ9FVHFO+8BAAAAAADYoPgHAAAAAKCCo/gHAAAAAKCCo/gHAAAAAKCCo/gHAAAAAKCCo/gHAAAAAKCCo/gHAAAAAKCCo/hHubd582Y1adJEnp6e6tu370XbAAAAAACF8yjrBFD+xcfHa+rUqVq5cqWOHTumsLAwNWvWTGPGjNFNN93k0BgbNmxQp06ddObMGQUHBzv1/GPHjlWzZs303//+V5UqVbpom6P7Mm3aNK1cuVJ//fWXgoKCVLt2bd1zzz0aPHiw/Pz8nMoNAAAAQMladiTlon0N+98rz7RU5QYEOjSWWetfykw5Id+gSBkX9A1rMUwp2SkK8g66jGxdB8U/inT06FG1b99ewcHBeuWVV9SkSRPl5uZq9erVGjlypH755ZdSz+HIkSN66KGHdNVVVxXZVpzffvvNui8vvviimjRpIm9vb+3du1dvv/22qlWrpj59+pTGLgAAAAAoAftHP+ncBo0nKi0xUb5hYXZdz3V8roSycg2c9o8iPfzwwzIMQ9u3b1f//v1Vt25dNWrUSGPHjtXWrVslnfuCwDAM7d6927pdcnKyDMPQhg0bdPToUXXq1EmSVLlyZRmGofvvv1+SlJ2drdGjRyssLEw+Pj66/vrrtWPHDptxT58+rSFDhsgwDC1atKjQNkf3xcPDQz/88IPuuOMONWjQQDVr1tStt96qlStXqnfv3tbYGTNmqEmTJvL391d0dLQefvhhpaenW/sXLVqk4OBgrVixQnXq1JGPj4+6d++uP//88zJebQAAAAAoHRT/ZW3GDOmqq4r/KeyIdJ8+jm07Y8YlpZaUlKRVq1Zp5MiR8vf3t+t39PT96OhoLV++XJJ08OBBnThxQrNmzZIkjR8/XsuXL9d7772nXbt2qXbt2urevbuSkpIUHR2tEydOKDAwUDNnztSJEyc0YMAAu7Y777zT+qXAxZw+fVpr1qy56L5Istnezc1Ns2fP1s8//6z33ntP69ev1/jx423iMzMzNXXqVL3//vvavHmzkpOTNXDgQIdeEwAAAAC4kjjtv6ylpkrHjhUfFx1t33bypGPbpqY6n5ekw4cPyzRN1a9f/5K2L+Du7q6QkBBJUlhYmPVLg4yMDL355ptatGiRevbsKUl65513FBsbq3fffVfjxo1TRESEDMNQUFCQIiIiJEn+/v52bUFBQapXr16x+3JhTNWqVZWVlSVJGjlypF5++WVJ0pgxY6wxNWrU0AsvvKCHHnpIb7zxhrU9NzdXc+fOVevWrSVJ7733nho0aKDt27erVatWl/pyAQAAAECJo/gva4GBUrVqxceFhhb
e5si2gY4thnEh0zQvaTtHHTlyRLm5uWrfvr21zdPTU61atdKBAwecGqtfv37q16+f0zls375dFotFgwYNUnZ2trV97dq1mjZtmn755RelpqYqLy9PWVlZyszMtC4K6OHhoeuuu866Tf369RUcHKwDBw5Q/AMAAACloFf7hvJLOK7M8Cit3Ly/2Hjj86sVcfaYTN9qUr+/bPqumnGVjqUdU7WAavpr7F8XGaHioPgva2PHnvu5FF98UbK5XKBOnToyDKPYRf3c3M5dPXL+lwW5ubmlmpuzateuLcMwdPDgQZv2mjVrSpJ8fX2tbUePHtUtt9yiESNGaOrUqQoJCdF3332noUOHKicnhzsCAAAAAHA5XPOPiwoJCVH37t01b948ZWRk2PUnJydLkkL/d1bCiRMnrH3nL/4nSV5eXpKk/Px8a1utWrXk5eWlzZs3W9tyc3O1Y8cONWzYsKR2Q5JUpUoVde3aVXPnzi10X863c+dOWSwWvfbaa2rTpo3q1q2r48eP28Xl5eXphx9+sD4+ePCgkpOT1aBBgxLNHQAAAAAuF8U/ijRv3jzl5+erVatWWr58uQ4dOqQDBw5o9uzZatu2raRzR83btGmjl156SQcOHNC3336rZ555xmac6tWryzAMffXVVzp58qTS09Pl7++vESNGaNy4cVq1apX279+vYcOGKTMzU0OHDnUqz88++6zYtQneeOMN5eXl6dprr9XSpUt14MABHTx4UB9++KF++eUXubu7Szp3lkBubq7mzJmj3377TR988IHmz59vN56np6ceeeQRbdu2TTt37tT999+vNm3acMo/AAAAgHKH4h9Fqlmzpnbt2qVOnTrp8ccfV+PGjdW1a1etW7dOb775pjVuwYIFysvLU8uWLTVmzBi98MILNuNUq1ZNkydP1pNPPqnw8HCNGjVKkvTSSy+pf//+uvfee9WiRQsdPnxYq1evVuXKlZ3KMyUlxe6U/gvVqlVLP/74o7p06aIJEybommuu0bXXXqs5c+bo3//+t55//nlJ0jXXXKMZM2bo5ZdfVuPGjbV48WJNmzbNbjw/Pz898cQTuvvuu9W+fXtVqlRJS5cudSpvAAAAALgSDLO0V3X7h0hNTVVQUJBSUlIUeMECe1lZWYqLi1NMTIx8fHzKKMN/FtM0lZeXJw8PjyJvAXipFi1apDFjxlgvfbhUvDfsWSwWJSYmKiwszLqeBMo/5s11MXeui7lzTcyb62LurpxlR1Iu2ufogn8DagVJkszPrpLxvwX/jAq44F9RdeiFeNcCAAAAAFDBsdo/AAAAAOAfZdmRFJ3NP3cS/Nl8s9CzDQrOHqgoOPIPXIL777//sk/5BwAAAIArheIfAAAAAIAKjtP+AQAAAAAuYftrb8ktJ0cWLy+H4s027+tMUoKCQ8J14TLgj7R7S7mWHHm6OTaWq6P4BwAAAAC4hJNtbnBug/COyjESpbAwu65GEU6O5eI47R8AAAAAgAqO4h8AAAAAgAqO0/4BAAAAoJwr7FZ056tot6W7mNCtm6zX/Dt0CUDCBnklJUhmuBTZ2abr5/hN1mv+/wmXAHDkH+Xe5s2b1aRJE3l6eqpv374XbbsSOnbsqDFjxlgf16hRQzNnzixyG8MwtGLFilLNCwAAAPgnaPX4g7pxSH+1evxBh+KNrfcp5Ke7ZWy9z65vzvcP6sX1/TXne8fGcnUU/yhWfHy8HnnkEdWsWVPe3t6Kjo5W7969tW7dOofH2LBhgwzDUHJystPPP3bsWDVr1kxxcXFatGjRRduK0rt3b/Xo0aPQvk2bNskwDO3Zs8fp3Hbs2KHhw4c7vR0AAAAAXEkU/yjS0aNH1bJlS61fv16vvPKK9u7dq1WrVqlTp04aOXLkFcnhyJEj6ty5s6666ioFBwdftK0oQ4cOVWxsrP766y+7voULF+raa69V06ZNnc4tNDRUfn5+Tm8HAAAAAFcSxT+K9PDDD8swDG3fvl39+/dX3bp11ahRI40dO1Zbt26VdO4LAsMwtHv3but2ycnJMgxDGzZs0NGjR9WpUydJUuXKlWUYhu6//35JUnZ2tkaPHq2wsDD5+Pjo+uuv144dO2zGPX36tIYMGSLDMLRo0aJC24pzyy23KDQ01C42PT1dy5Yt09ChQ3X69Gndddddqlatmvz8/NSkSRP95z//KXLcC0/7P3TokG688Ub5+PioYcOGio2NLTY3AAAAAChtLPhX1g7MkH6ZUXxcSAupwxe2bd/2kZJ2Fb9t/bFSg7FOp5aUlKRVq1Zp6tSp8vf3t+t35Ii7JEVHR2v58uXq37+/Dh48qMDAQPn6+kqSxo8fr+XLl+u9995T9erVNX36dHXv3l2HDx9WdHS0Tpw4oXr16mnKlCm68847FRAQoB49eti0BQUFadGiRXrggQdkmmahOXh4eOi+++7TokWL9PTTT8swDEnSsmXLlJ+fr7vuukvp6elq2bKlnnjiCQUGBmrlypW69957VatWLbVq1arY/bRYLLrtttsUHh6ubdu2KSUlxWZ9AAAAAAAoKxT/ZS03VTp7rPi4rOhC2k46tm1uqvN5STp8+LBM01T9+vUvafsC7u7uCgkJkSSFhYVZvzTIyMjQm2++qUWLFqlnz56SpHfeeUexsbF69913NW7cOEVERMgwDAUFBSkiIkKS5O/vb9cWFBSkevXqFZnHkCFD9Morr+jbb79Vx44dJZ075b9///4KCgpSUFCQ/v3vf1vjH3nkEa1evVoff/yxQ8X/2rVr9csvv2j16tWKioqSJL344ovWfQMAAAAKU9xK/kBJoPgva56Bkm+14uN8Qgtvc2Rbz0Dn85IuehS9pBw5ckS5ublq3769tc3T01OtWrXSgQMHnBqrX79+6tevX5Ex9evXV7t27bRgwQJ17NhRhw8f1qZNmzRlyhRJUn5+vl588UV9/PHHOnbsmHJycpSdne3wNf0HDhxQdHS0tfCXpLZt2zq1HwAAAABQGij+y1qDSzslX5L9ZQAlrE6dOjIMQ7/88kuRcW5u55aOOP/Lgtzc3FLN7VINHTpUjzzyiObNm6eFCxeqVq1a6tChgyTplVde0axZszRz5kw1adJE/v7+GjNmjHJycso4awAAAAC4PGW64N/GjRvVu3dvRUVF2d0LPTc3V0888YS1CIuKitJ9992n48eP24yRlJSkQYMGKTAwUMHBwRo6dKjS09NtYvbs2aMbbrhBPj4+io6O1vTp0+1yWbZsmerXry8fHx81adJEX3/9danssysJCQlR9+7dNW/ePGVkZNj1F9y2LzT03FkJJ06csPadv/ifJHl5eUk6d3S9QK1ateTl5aXNmzdb23Jzc7Vjxw41bNiwpHbDxh133CE3NzctWbJE77//vnXRQEnavHmzbr31Vt1zzz265pprVLNmTf36668Oj92gQQP9+eefNq9DwaKIAAAAAK6cZUdStOxIis7mWSRJZ/Ms1rZ/6mUWZVr8Z2Rk6JprrtG8efPs+jIzM7Vr1y49++yz2rVrlz799FMdPHhQffr0sYkbNGiQfv75Z8XGxuqrr77Sxo
…[base64-encoded PNG data elided; the figure is the plot produced by this cell: a histogram of interaction timestamps with the Base/Gap/Valid/Test cutoff lines marked]",
+      "text/plain": [
+       "<Figure size 1200x600 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import polars as pl\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from datetime import datetime\n", + "labels = [\"Base\", \"Gap\", \"Valid\", \"Test\"]\n", + "\n", + "# Считаем события в каждом интервале\n", + "counts = []\n", + "intervals = [1394150400, 1399939200, 1403049600]\n", + "for i in range(len(intervals)-1):\n", + " start, end = intervals[i], intervals[i+1]\n", + " \n", + " q = df.lazy().explode([\"timestamps\"])\n", + " if end is not None:\n", + " q = q.filter((pl.col(\"timestamps\") >= start) & (pl.col(\"timestamps\") < end))\n", + " else:\n", + " q = q.filter(pl.col(\"timestamps\") >= start)\n", + " \n", + " count = q.select(pl.len()).collect().item()\n", + " counts.append(count)\n", + " \n", + " end_str = datetime.fromtimestamp(end).strftime('%Y-%m-%d') if end else \"Inf\"\n", + " start_str = datetime.fromtimestamp(start).strftime('%Y-%m-%d') if start > 0 else \"Start\"\n", + " \n", + " print(f\"Part {i} [{labels[i]}]: {count} events ({start_str} -> {end_str})\")\n", + "\n", + "# 3. Гистограмма распределения событий во времени\n", + "all_timestamps = df.select(pl.col(\"timestamps\").explode()).to_series().to_numpy()\n", + "\n", + "plt.figure(figsize=(12, 6))\n", + "plt.hist(all_timestamps, bins=100, color='skyblue', alpha=0.7, label='Events')\n", + "\n", + "# Рисуем линии отсечек\n", + "colors = ['red', 'orange', 'green']\n", + "for cutoff, color, label in zip(intervals, colors, labels[1:]):\n", + " plt.axvline(x=cutoff, color=color, linestyle='--', linewidth=2, label=f'Cutoff: {label}')\n", + "\n", + "plt.title(\"Распределение взаимодействий во времени\")\n", + "plt.xlabel(\"Timestamp\")\n", + "plt.ylabel(\"Количество событий\")\n", + "plt.legend()\n", + "plt.grid(True, alpha=0.3)\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/sigir/yambda_processing/YambdaDatasetProcessing.ipynb b/sigir/yambda_processing/YambdaDatasetProcessing.ipynb new file mode 100644 index 0000000..c36af65 --- /dev/null +++ b/sigir/yambda_processing/YambdaDatasetProcessing.ipynb @@ -0,0 +1,640 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "SbkKok0dfjjS" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.12/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      " from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import defaultdict, Counter\n",
+    "from typing import Any, Dict, List, Optional, Tuple\n",
+    "\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "import polars as pl\n",
+    "\n",
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "from torch.utils.data import Dataset, DataLoader"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gwwdsnwBfjjT"
+   },
+   "source": [
+    "## 🛠️ Data preparation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "viKiSaEKfjjT",
+    "outputId": "6229cbba-dc3b-4d15-a8e4-ac08e4e187d6"
+   },
+   "outputs": [],
+   "source": [
+    "format = 'sequential'\n",
+    "size = '50m'\n",
+    "events = 'listens'\n",
+    "# listens_data = load_dataset('yandex/yambda', data_dir=f'{format}/{size}', data_files=f'{events}.parquet')\n",
+    "# yambda_df = pl.from_arrow(listens_data['train'].data.table)\n",
+    "yambda_df = pl.read_parquet(\"/home/jovyan/yambda_sequential_50m/sequential/50m/listens.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "VNanksDRfjjT",
+    "outputId": "e118e2b4-0076-475d-9104-5e1565dab7d9"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ test_yambda_data_loading: OK\n"
+     ]
+    }
+   ],
+   "source": [
+    "def test_yambda_data_loading():\n",
+    "    assert isinstance(yambda_df, pl.DataFrame), 'yambda_df must be a Polars DataFrame'\n",
+    "    assert yambda_df.shape == (9238, 6), f'Wrong shape: {yambda_df.shape}'\n",
+    "\n",
+    "    expected_cols = {'uid', 'timestamp', 'item_id', 'is_organic', 'played_ratio_pct', 'track_length_seconds'}\n",
+    "    assert set(yambda_df.columns) == expected_cols, f'Wrong columns: {yambda_df.columns}'\n",
+    "\n",
+    "    assert yambda_df['item_id'].dtype == pl.List(pl.UInt32), 'item_id must be List[UInt32]'\n",
+    "    assert yambda_df['timestamp'].dtype == pl.List(pl.UInt32), 'timestamp must be List[UInt32]'\n",
+    "\n",
+    "    assert yambda_df['item_id'].list.len().min() > 0, 'There are empty histories'\n",
+    "\n",
+    "    print('✅ test_yambda_data_loading: OK')\n",
+    "\n",
+    "test_yambda_data_loading()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 527
+    },
+    "id": "q33EG4wlc8ev",
+    "outputId": "01d03740-713e-46e8-d8c5-81ebf6b71546"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(shape: (5, 6)\n",
+       " ┌─────┬─────────────────────┬────────────┬─────────────┬─────────────────────┬─────────────────────┐\n",
+       " │ uid ┆ timestamp ┆ item_id ┆ is_organic ┆ played_ratio_pct ┆ track_length_second │\n",
+       " │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ s │\n",
+       " │ u32 ┆ list[u32] ┆ list[u32] ┆ list[u8] ┆ list[u16] ┆ --- │\n",
+       " │ ┆ ┆ ┆ ┆ ┆ list[u32] │\n",
+       " ╞═════╪═════════════════════╪════════════╪═════════════╪═════════════════════╪═════════════════════╡\n",
+       " │ 100 ┆ [39420, 39420, … ┆ [8326270, ┆ [0, 0, … 0] ┆ [100, 100, … 100] ┆ [170, 105, … 165] │\n",
+       " │ ┆ 25966140] ┆ 1441281, … ┆ ┆ ┆ │\n",
+       " │ ┆ ┆ 4734787] ┆ ┆ ┆ │\n",
+       " │ 200 ┆ [14329075, ┆ [3285270, ┆ [1, 1, … 1] ┆ [9, 28, … 100] ┆ [170, 170, … 145] │\n",
+       " │ ┆ 14329075, … ┆ 5253582, … ┆ ┆ ┆ │\n",
+       " │ ┆ 2545672… ┆ 3778807] ┆ ┆ ┆ │\n",
+       " │ 300 ┆ [54090, 
54100, … ┆ [618910, ┆ [1, 1, … 1] ┆ [2, 4, … 15] ┆ [270, 130, … 210] │\n", + " │ ┆ 25907225] ┆ 8793425, … ┆ ┆ ┆ │\n", + " │ ┆ ┆ 9286415] ┆ ┆ ┆ │\n", + " │ 500 ┆ [22695440, ┆ [6417502, ┆ [0, 0, … 1] ┆ [100, 37, … 13] ┆ [225, 210, … 230] │\n", + " │ ┆ 22695690, … ┆ 6896222, … ┆ ┆ ┆ │\n", + " │ ┆ 2486145… ┆ 4077285] ┆ ┆ ┆ │\n", + " │ 600 ┆ [1329190, 1329405, ┆ [8077497, ┆ [0, 0, … 0] ┆ [100, 100, … 100] ┆ [245, 215, … 205] │\n", + " │ ┆ … 25997540] ┆ 1865247, … ┆ ┆ ┆ │\n", + " │ ┆ ┆ 6481452] ┆ ┆ ┆ │\n", + " └─────┴─────────────────────┴────────────┴─────────────┴─────────────────────┴─────────────────────┘,\n", + " shape: (0, 6)\n", + " ┌─────┬───────────┬───────────┬────────────┬──────────────────┬──────────────────────┐\n", + " │ uid ┆ timestamp ┆ item_id ┆ is_organic ┆ played_ratio_pct ┆ track_length_seconds │\n", + " │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + " │ u32 ┆ list[u32] ┆ list[u32] ┆ list[u8] ┆ list[u16] ┆ list[u32] │\n", + " ╞═════╪═══════════╪═══════════╪════════════╪══════════════════╪══════════════════════╡\n", + " └─────┴───────────┴───────────┴────────────┴──────────────────┴──────────────────────┘)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(yambda_df.head(), yambda_df.filter(yambda_df['timestamp'].list.len() != yambda_df['item_id'].list.len()))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "-9ou8IARfjjT" + }, + "outputs": [ + { + "ename": "ColumnNotFoundError", + "evalue": "'explode' on column: 'is_organic' is invalid\n\nSchema at this point: Schema:\nname: _idx, field: UInt32\nname: uid, field: UInt32\nname: timestamp, field: List(UInt32)\nname: item_id, field: List(UInt32)\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mColumnNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m yambda_df \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 2\u001b[0m \u001b[43myambda_df\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcol\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muid\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m%\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m200\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# надо убрать\u001b[39;49;00m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_row_index\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m_idx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m----> 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexplode\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimestamp\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mitem_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mis_organic\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mplayed_ratio_pct\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtrack_length_seconds\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;241m.\u001b[39mfilter(\n\u001b[1;32m 13\u001b[0m (pl\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mis_organic\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;241m&\u001b[39m\n\u001b[1;32m 14\u001b[0m (pl\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mplayed_ratio_pct\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m50\u001b[39m)\n\u001b[1;32m 15\u001b[0m )\n\u001b[1;32m 16\u001b[0m \u001b[38;5;241m.\u001b[39mgroup_by([\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_idx\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124muid\u001b[39m\u001b[38;5;124m'\u001b[39m], maintain_order\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;241m.\u001b[39magg([\n\u001b[1;32m 18\u001b[0m pl\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 19\u001b[0m pl\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mitem_id\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 20\u001b[0m ])\n\u001b[1;32m 21\u001b[0m \u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_idx\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 22\u001b[0m )\n", + "File \u001b[0;32m/usr/local/lib/python3.12/dist-packages/polars/dataframe/frame.py:8072\u001b[0m, in \u001b[0;36mDataFrame.explode\u001b[0;34m(self, columns, *more_columns)\u001b[0m\n\u001b[1;32m 8015\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexplode\u001b[39m(\n\u001b[1;32m 8016\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 8017\u001b[0m columns: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m|\u001b[39m Expr \u001b[38;5;241m|\u001b[39m Sequence[\u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m|\u001b[39m Expr],\n\u001b[1;32m 8018\u001b[0m \u001b[38;5;241m*\u001b[39mmore_columns: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m|\u001b[39m Expr,\n\u001b[1;32m 8019\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 8020\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 8021\u001b[0m \u001b[38;5;124;03m Explode the dataframe to long format by exploding the given columns.\u001b[39;00m\n\u001b[1;32m 8022\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8070\u001b[0m \u001b[38;5;124;03m └─────────┴─────────┘\u001b[39;00m\n\u001b[1;32m 8071\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 8072\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlazy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexplode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmore_columns\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_eager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.12/dist-packages/polars/lazyframe/frame.py:2053\u001b[0m, in \u001b[0;36mLazyFrame.collect\u001b[0;34m(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, streaming, engine, background, _eager, **_kwargs)\u001b[0m\n\u001b[1;32m 2051\u001b[0m \u001b[38;5;66;03m# Only for testing purposes\u001b[39;00m\n\u001b[1;32m 2052\u001b[0m callback \u001b[38;5;241m=\u001b[39m _kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpost_opt_callback\u001b[39m\u001b[38;5;124m\"\u001b[39m, callback)\n\u001b[0;32m-> 2053\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m wrap_df(\u001b[43mldf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcallback\u001b[49m\u001b[43m)\u001b[49m)\n", + "\u001b[0;31mColumnNotFoundError\u001b[0m: 'explode' on column: 'is_organic' is invalid\n\nSchema at this point: Schema:\nname: _idx, field: UInt32\nname: uid, field: UInt32\nname: timestamp, field: List(UInt32)\nname: item_id, field: List(UInt32)\n" + ] + } + ], + "source": [ + "yambda_df = (\n", + " yambda_df\n", + " .filter(pl.col('uid') % 200 == 0) # надо убрать\n", + " .with_row_index('_idx')\n", + " .explode([\n", + " 'timestamp',\n", + " 'item_id',\n", + " 'is_organic',\n", + " 'played_ratio_pct',\n", + " 'track_length_seconds',\n", + " ])\n", + " .filter(\n", + " (pl.col('is_organic') == 0) &\n", + " (pl.col('played_ratio_pct') >= 50)\n", + " )\n", + " .group_by(['_idx', 'uid'], maintain_order=True)\n", + " .agg([\n", + " pl.col('timestamp'),\n", + " pl.col('item_id'),\n", + " ])\n", + " .drop('_idx')\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4HTturbHfjjU", + "outputId": "ea8b1d93-4997-441a-c6e3-2628acd11d7f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ test_yambda_filtering: OK\n" + ] + } + ], + "source": [ + "def test_yambda_filtering():\n", + " assert yambda_df.shape[0] == 4289, \\\n", + " f'Неправильное количество пользователей: {yambda_df.shape[0]}'\n", + "\n", + " expected_columns = {'uid', 'timestamp', 'item_id'}\n", + " actual_columns = set(yambda_df.columns)\n", + " assert actual_columns == expected_columns, \\\n", + " f'Неправильные колонки. 
Ожидалось: {expected_columns}, получено: {actual_columns}'\n",
+    "\n",
+    "    assert yambda_df['timestamp'].dtype == pl.List(pl.UInt32), \\\n",
+    "        f\"timestamp must be List[UInt32], got: {yambda_df['timestamp'].dtype}\"\n",
+    "    assert yambda_df['item_id'].dtype == pl.List(pl.UInt32), \\\n",
+    "        f\"item_id must be List[UInt32], got: {yambda_df['item_id'].dtype}\"\n",
+    "\n",
+    "    seq_lengths = yambda_df['item_id'].list.len()\n",
+    "    assert seq_lengths.min() >= 1, \\\n",
+    "        f'Minimum sequence length must be >= 1, got: {seq_lengths.min()}'\n",
+    "    assert seq_lengths.sum() == 7587469, \\\n",
+    "        f'Wrong total number of events. Expected: 7587469, got: {seq_lengths.sum()}'\n",
+    "\n",
+    "    unique_items = yambda_df.select('item_id').explode('item_id').unique().shape[0]\n",
+    "    assert unique_items == 304787, \\\n",
+    "        f'Wrong number of unique items. Expected: 304787, got: {unique_items}'\n",
+    "\n",
+    "    print('✅ test_yambda_filtering: OK')\n",
+    "\n",
+    "test_yambda_filtering()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Original embeddings: (7721749, 3)\n",
+      "Columns: ['item_id', 'embed', 'normalized_embed']\n",
+      "shape: (5, 3)\n",
+      "┌─────────┬─────────────────────────────────┬─────────────────────────────────┐\n",
+      "│ item_id ┆ embed ┆ normalized_embed │\n",
+      "│ --- ┆ --- ┆ --- │\n",
+      "│ u32 ┆ list[f64] ┆ list[f64] │\n",
+      "╞═════════╪═════════════════════════════════╪═════════════════════════════════╡\n",
+      "│ 2 ┆ [-1.534035, -0.366767, … 0.999… ┆ [-0.064638, -0.015454, … 0.042… │\n",
+      "│ 3 ┆ [-3.761467, -1.068254, … -2.66… ┆ [-0.163937, -0.046558, … -0.11… │\n",
+      "│ 4 ┆ [2.445533, -2.523603, … -0.536… ┆ [0.076272, -0.078707, … -0.016… │\n",
+      "│ 5 ┆ [0.832846, 0.116125, … -1.4857… ┆ [0.03149, 0.004391, … -0.05617… │\n",
+      "│ 6 ┆ [-2.431483, -0.56872, … 0.0946… ┆ [-0.10345, -0.024197, … 0.0040… │\n",
+      "└─────────┴─────────────────────────────────┴─────────────────────────────────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "import polars as pl\n",
+    "import pandas as pd\n",
+    "import pickle\n",
+    "\n",
+    "# === 1. Load the original embeddings ===\n",
+    "embeddings_path = \"/home/jovyan/yambda_embeddings/embeddings.parquet\"\n",
+    "emb_df = pl.read_parquet(embeddings_path)\n",
+    "\n",
+    "print(f\"Original embeddings: {emb_df.shape}\")\n",
+    "print(f\"Columns: {emb_df.columns}\")\n",
+    "print(emb_df.head())"
+   ]
+  },
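+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next cell keeps only the history positions whose items have an embedding. A minimal sketch of the pattern on toy data (hypothetical values, assuming the same Polars version used above):\n",
+    "\n",
+    "```python\n",
+    "import polars as pl\n",
+    "\n",
+    "df = pl.DataFrame({'item_id': [[7, 100, 8]], 'timestamp': [[10, 20, 30]]})\n",
+    "valid = pl.Series([100, 8])  # toy set of items that have embeddings\n",
+    "out = (\n",
+    "    df.with_columns(\n",
+    "        # per list: index of every element found in `valid`, nulls dropped\n",
+    "        pl.col('item_id').list.eval(\n",
+    "            pl.when(pl.element().is_in(valid)).then(pl.int_range(pl.len())).otherwise(None)\n",
+    "        ).list.drop_nulls().alias('keep')\n",
+    "    )\n",
+    "    .with_columns(\n",
+    "        pl.col('item_id').list.gather(pl.col('keep')),\n",
+    "        pl.col('timestamp').list.gather(pl.col('keep')),\n",
+    "    )\n",
+    "    .drop('keep')\n",
+    ")\n",
+    "# item_id -> [[100, 8]], timestamp -> [[20, 30]]; the two list columns stay aligned\n",
+    "```"
+   ]
+  },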
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Valid item_id count: 7721749\n",
+      "Rows before: 4289\n",
+      "Rows after: 4138\n"
+     ]
+    }
+   ],
+   "source": [
+    "valid_item_ids = set(emb_df['item_id'].to_list())\n",
+    "print(f\"\\nValid item_id count: {len(valid_item_ids)}\")\n",
+    "valid_ids_pl = pl.Series(list(valid_item_ids))\n",
+    "\n",
+    "yambda_df_filtered = (\n",
+    "    yambda_df\n",
+    "    .with_columns(\n",
+    "        pl.col(\"item_id\").list.eval(\n",
+    "            pl.when(pl.element().is_in(valid_ids_pl))\n",
+    "            .then(pl.int_range(pl.len()))\n",
+    "            .otherwise(None)\n",
+    "        ).list.drop_nulls().alias(\"valid_indices\")\n",
+    "    )\n",
+    "    .with_columns([\n",
+    "        pl.col(\"item_id\").list.gather(pl.col(\"valid_indices\")),\n",
+    "        pl.col(\"timestamp\").list.gather(pl.col(\"valid_indices\"))\n",
+    "    ])\n",
+    "    .drop(\"valid_indices\")\n",
+    "    .filter(pl.col(\"item_id\").list.len() > 0)\n",
+    "    .rename({\"item_id\": \"item_ids\", \"timestamp\": \"timestamps\"})\n",
+    ")\n",
+    "yambda_df_filtered = yambda_df_filtered.filter(yambda_df_filtered['item_ids'].list.len() >= 5)\n",
+    "\n",
+    "print(f\"Rows before: {yambda_df.shape[0]}\")\n",
+    "print(f\"Rows after: {yambda_df_filtered.shape[0]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "3️⃣ Collect all unique track IDs in the dataset and build a mapping old_id -> new_id, where new_id lies in the range 0 to N - 1.\n",
+    "\n",
+    "Deep learning models require categorical features (track IDs in our case) to be represented as integers in the range 0 to N-1, where N is the number of unique tracks. The Yambda dataset ships with original track IDs that can be sparse (e.g., [100, 5000, 7, 12000, ...]), which is inefficient for embedding tables.\n",
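+    "\n",
+    "A minimal sketch of this remapping on the toy IDs above (hypothetical data, not the real Yambda values):\n",
+    "\n",
+    "```python\n",
+    "import polars as pl\n",
+    "\n",
+    "items = pl.DataFrame({'item_ids': [100, 5000, 7, 12000]})\n",
+    "dense = items.unique().sort('item_ids').with_row_index('new_item_ids')\n",
+    "mapping = dict(zip(dense['item_ids'], dense['new_item_ids']))\n",
+    "# mapping == {7: 0, 100: 1, 5000: 2, 12000: 3}: contiguous ids for an embedding table\n",
+    "```"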
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unique_items = (\n",
+    "    yambda_df_filtered\n",
+    "    .select('item_ids')\n",
+    "    .explode('item_ids')\n",
+    "    .unique()\n",
+    "    .sort('item_ids')\n",
+    ").with_row_index('new_item_ids')\n",
+    "\n",
+    "item_mapping = dict(zip(unique_items['item_ids'], unique_items['new_item_ids']))\n",
+    "\n",
+    "yambda_df_filtered = yambda_df_filtered.with_columns([\n",
+    "    pl.col('item_ids')\n",
+    "    .map_elements(\n",
+    "        lambda items: [item_mapping[item] for item in items],\n",
+    "        return_dtype=pl.List(pl.UInt32)\n",
+    "    )\n",
+    "    .alias('item_ids')\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ test_item_mapping: OK\n"
+     ]
+    }
+   ],
+   "source": [
+    "def test_item_mapping():\n",
+    "    assert unique_items.shape == (292865, 2), f'Wrong unique_items shape: {unique_items.shape}'\n",
+    "    assert set(unique_items.columns) == {'new_item_ids', 'item_ids'}, 'Wrong unique_items columns'\n",
+    "\n",
+    "    assert len(item_mapping) == 292865, f'Wrong item_mapping size: {len(item_mapping)}'\n",
+    "    assert item_mapping[50] == 0 and item_mapping[175] == 1 and item_mapping[195] == 2, \\\n",
+    "        'First mappings are incorrect'\n",
+    "\n",
+    "    new_ids = unique_items['new_item_ids']\n",
+    "    assert new_ids.min() == 0 and new_ids.max() == 292864, 'new_item_id must be in [0, 292864]'\n",
+    "\n",
+    "    all_ids = yambda_df_filtered.select('item_ids').explode('item_ids')['item_ids']\n",
+    "    assert all_ids.min() == 0 and all_ids.max() == 292864, 'item_id in yambda_df was not updated'\n",
+    "    assert all_ids.n_unique() == 292865, 'The number of unique item_id values changed'\n",
+    "\n",
+    "    print('✅ test_item_mapping: OK')\n",
+    "\n",
+    "test_item_mapping()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Mapping covers: 292865 unique item_id values\n",
+      "Reindexed embeddings: (292865, 2)\n",
+      "shape: (5, 2)\n",
+      "┌─────────────────────────────────┬─────────┐\n",
+      "│ embedding ┆ item_id │\n",
+      "│ --- ┆ --- │\n",
+      "│ list[f64] ┆ u32 │\n",
+      "╞═════════════════════════════════╪═════════╡\n",
+      "│ [-0.0526, 0.048672, … -0.04217… ┆ 0 │\n",
+      "│ [0.090222, -0.00718, … -0.0862… ┆ 1 │\n",
+      "│ [-0.00822, -0.057882, … 0.2188… ┆ 2 │\n",
+      "│ [-0.107289, -0.034719, … -0.02… ┆ 3 │\n",
+      "│ [0.012762, -0.043315, … 0.1494… ┆ 4 │\n",
+      "└─────────────────────────────────┴─────────┘\n"
+     ]
+    }
+   ],
+   "source": [
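+    "# Reindex the embedding table onto the dense 0..N-1 ids: items that never occur\n",
+    "# in the filtered interactions get no mapping (None) and are dropped just below.\n",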
+    "print(f\"\\nMapping covers: {len(item_mapping)} unique item_id values\")\n",
+    "\n",
+    "# === 3. Reindex the embeddings using the existing mapping ===\n",
+    "emb_df_reindexed = emb_df.with_columns(\n",
+    "    pl.col('item_id')\n",
+    "    .map_elements(\n",
+    "        lambda x: item_mapping.get(x, None),\n",
+    "        return_dtype=pl.UInt32\n",
+    "    )\n",
+    "    .alias('new_item_id')\n",
+    ").filter(pl.col('new_item_id').is_not_null()).drop('item_id', 'embed').rename({'new_item_id': 'item_id', 'normalized_embed': 'embedding'})\n",
+    "\n",
+    "print(f\"Reindexed embeddings: {emb_df_reindexed.shape}\")\n",
+    "print(emb_df_reindexed.head())\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "✅ test_emb_item_mapping: OK\n"
+     ]
+    }
+   ],
+   "source": [
+    "def test_emb_item_mapping():\n",
+    "    all_ids = emb_df_reindexed['item_id']\n",
+    "    assert all_ids.min() == 0 and all_ids.max() == 292864, 'item_id values were not updated'\n",
+    "    assert all_ids.n_unique() == 292865, 'The number of unique item_id values changed'\n",
+    "\n",
+    "    print('✅ test_emb_item_mapping: OK')\n",
+    "\n",
+    "test_emb_item_mapping()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "✓ Saved embeddings: /home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet\n"
+     ]
+    }
+   ],
+   "source": [
+    "embeddings_output_parquet_path = \"/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet\"\n",
+    "emb_df_reindexed.write_parquet(embeddings_output_parquet_path)\n",
+    "print(f\"\\n✓ Saved embeddings: {embeddings_output_parquet_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test passed: all 4138 rows are in sync\n"
+     ]
+    }
+   ],
+   "source": [
+    "def test_integrity(df):\n",
+    "    bad_rows = df.filter(\n",
+    "        (pl.col(\"item_ids\").list.len() != pl.col(\"timestamps\").list.len()) | (pl.col(\"timestamps\").list.len() < 5)\n",
+    "    )\n",
+    "    \n",
+    "    if bad_rows.height > 0:\n",
+    "        print(f\"ERROR: {bad_rows.height} rows are out of sync!\")\n",
+    "        raise ValueError(\"List columns are out of sync!\")\n",
+    "    \n",
+    "    print(f\"Test passed: all {df.height} rows are in sync\")\n",
+    "\n",
+    "test_integrity(yambda_df_filtered)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved filtered yambda_df: /home/jovyan/IRec/sigir/yambda_data/yambda_sequential_50m_filtered_reindexed.parquet\n"
+     ]
+    }
+   ],
+   "source": [
+    "yambda_output_parquet_path = \"/home/jovyan/IRec/sigir/yambda_data/yambda_sequential_50m_filtered_reindexed.parquet\"\n",
+    "yambda_df_filtered.write_parquet(yambda_output_parquet_path)\n",
+    "print(f\"Saved filtered yambda_df: {yambda_output_parquet_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved mapping: /home/jovyan/IRec/sigir/yambda_data/old_to_new_item_id_mapping.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "mapping_output_path = \"/home/jovyan/IRec/sigir/yambda_data/old_to_new_item_id_mapping.json\"\n",
+    "\n",
+    "with open(mapping_output_path, 'w') as f:\n",
+    "    json.dump({str(k): v for k, v in item_mapping.items()}, f, indent=2)\n",
+    "\n",
+    "print(f\"Saved mapping: 
{mapping_output_path}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (10, 3)
uidtimestampsitem_ids
u32list[u32]list[u32]
600[1329190, 1329405, … 25997540][252026, 58171, … 201909]
800[121100, 121290, … 25977310][20844, 198210, … 60455]
1000[11335730, 11335925, … 25972225][46643, 57592, … 95670]
1400[280570, 280735, … 25993315][4634, 213798, … 104891]
1600[899275, 930305, … 25941890][223933, 154424, … 104876]
2000[18814620, 18828965, … 25225145][137828, 138498, … 19072]
2200[10053900, 10054120, … 25948025][4923, 231643, … 28122]
2400[14246260, 14246390, … 25999860][157350, 217652, … 75038]
2600[6089640, 6089915, … 25951140][9426, 202953, … 140393]
2800[19744285, 19744475, … 25894825][123607, 291065, … 272888]
" + ], + "text/plain": [ + "shape: (10, 3)\n", + "┌──────┬─────────────────────────────────┬────────────────────────────┐\n", + "│ uid ┆ timestamps ┆ item_ids │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ list[u32] ┆ list[u32] │\n", + "╞══════╪═════════════════════════════════╪════════════════════════════╡\n", + "│ 600 ┆ [1329190, 1329405, … 25997540] ┆ [252026, 58171, … 201909] │\n", + "│ 800 ┆ [121100, 121290, … 25977310] ┆ [20844, 198210, … 60455] │\n", + "│ 1000 ┆ [11335730, 11335925, … 2597222… ┆ [46643, 57592, … 95670] │\n", + "│ 1400 ┆ [280570, 280735, … 25993315] ┆ [4634, 213798, … 104891] │\n", + "│ 1600 ┆ [899275, 930305, … 25941890] ┆ [223933, 154424, … 104876] │\n", + "│ 2000 ┆ [18814620, 18828965, … 2522514… ┆ [137828, 138498, … 19072] │\n", + "│ 2200 ┆ [10053900, 10054120, … 2594802… ┆ [4923, 231643, … 28122] │\n", + "│ 2400 ┆ [14246260, 14246390, … 2599986… ┆ [157350, 217652, … 75038] │\n", + "│ 2600 ┆ [6089640, 6089915, … 25951140] ┆ [9426, 202953, … 140393] │\n", + "│ 2800 ┆ [19744285, 19744475, … 2589482… ┆ [123607, 291065, … 272888] │\n", + "└──────┴─────────────────────────────────┴────────────────────────────┘" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "yambda_df_filtered.head(10)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/sigir/yambda_processing/yambda_exps_data.ipynb b/sigir/yambda_processing/yambda_exps_data.ipynb new file mode 100644 index 0000000..e8f8d8b --- /dev/null +++ b/sigir/yambda_processing/yambda_exps_data.ipynb @@ -0,0 +1,1168 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "e2462a97-6705-44e1-a232-4dd78a5dfc85", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "import json\n", + "from typing import List, Dict" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fd38624d-5796-4aa5-929f-7e82c5544f6c", + "metadata": {}, + "outputs": [], + "source": [ + "interactions_output_parquet_path = '/home/jovyan/IRec/sigir/yambda_data/yambda_sequential_50m_filtered_reindexed.parquet'\n", + "df = pl.read_parquet(interactions_output_parquet_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "69066941", + "metadata": {}, + "outputs": [], + "source": [ + "def merge_and_save(parts_to_merge, dirr, output_name):\n", + " merged = {}\n", + " print(f\"Merging {len(parts_to_merge)} files into {output_name}...\")\n", + " \n", + " for part in parts_to_merge:\n", + " # with open(fp, 'r') as f:\n", + " # part = json.load(f)\n", + " for uid, items in part.items():\n", + " if uid not in merged:\n", + " merged[uid] = []\n", + " merged[uid].extend(items)\n", + " \n", + " out_path = f\"{dirr}/{output_name}\"\n", + " with open(out_path, 'w') as f:\n", + " json.dump(merged, f)\n", + " print(f\"✓ Done: {out_path} (Users: {len(merged)})\")\n", + "\n", + "\n", + "def merge_and_save_with_filter(parts_to_merge, dirr, output_name, min_history_len=5):\n", + " merged = {}\n", + " print(f\"Merging {len(parts_to_merge)} files into {output_name} (min len={min_history_len})...\")\n", + " 
\n", + " for part in parts_to_merge:\n", + " for uid, items in part.items():\n", + " if uid not in merged:\n", + " merged[uid] = []\n", + " merged[uid].extend(items)\n", + "\n", + " filtered_merged = {}\n", + " filtered_count = 0\n", + " \n", + " for uid, items in merged.items():\n", + " if len(items) >= min_history_len:\n", + " filtered_merged[uid] = items\n", + " else:\n", + " filtered_count += 1\n", + " \n", + " print(f\"Filtered {filtered_count} users with history < {min_history_len}\")\n", + " print(f\"Remaining: {len(filtered_merged)} users\")\n", + " \n", + " out_path = f\"{dirr}/{output_name}\"\n", + " with open(out_path, 'w') as f:\n", + " json.dump(filtered_merged, f)\n", + " print(f\"Done: {out_path} (Users: {len(filtered_merged)})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ee127317-66b8-4f22-9109-94bcb8b1f1ae", + "metadata": {}, + "outputs": [], + "source": [ + "def split_session_by_timestamps(\n", + " df: pl.DataFrame,\n", + " time_cutoffs: List[int],\n", + " output_dir: str = None,\n", + " return_dicts: bool = True\n", + ") -> List[Dict[int, List[int]]]:\n", + " \"\"\"\n", + " Args:\n", + " df: Polars DataFrame с колонками uid, item_ids (list), timestamps (list)\n", + " time_cutoffs: Лист временных точек для разбиения\n", + " output_dir: Директория для сохранения JSON файлов (опционально)\n", + " return_dicts: Возвращать ли словари (как json_data format)\n", + " \n", + " Возвращает лист словарей в формате {user_id: [item_ids для интервала]}\n", + " \"\"\"\n", + " \n", + " result_dicts = []\n", + " \n", + " def extract_interval(df_source, start, end=None):\n", + " q = df_source.lazy()\n", + " q = q.explode([\"item_ids\", \"timestamps\"])\n", + " \n", + " if end is not None:\n", + " q = q.filter(\n", + " (pl.col(\"timestamps\") >= start) & \n", + " (pl.col(\"timestamps\") < end)\n", + " )\n", + " else:\n", + " q = q.filter(\n", + " pl.col(\"timestamps\") >= start\n", + " )\n", + " \n", + " q = q.group_by(\"uid\").agg([\n", + " pl.col(\"item_ids\").alias(\"item_ids\")\n", + " ]).sort(\"uid\")\n", + " \n", + " return q.collect()\n", + " \n", + " intervals = []\n", + " current_start = 0\n", + " for cutoff in time_cutoffs:\n", + " intervals.append((current_start, cutoff))\n", + " current_start = cutoff\n", + "\n", + " intervals.append((current_start, None))\n", + "\n", + " for start, end in intervals:\n", + " subset = extract_interval(df, start, end)\n", + "\n", + " json_dict = {}\n", + " for user_id, item_ids in subset.iter_rows():\n", + " json_dict[user_id] = item_ids\n", + " \n", + " result_dicts.append(json_dict)\n", + "\n", + " if output_dir:\n", + " if end is not None:\n", + " filename = f\"inter_new_[{start}_{end}).json\"\n", + " else:\n", + " filename = f\"inter_new_[{start}_inf).json\"\n", + " \n", + " filepath = f\"{output_dir}/{filename}\"\n", + " with open(filepath, 'w') as f:\n", + " json.dump(json_dict, f, indent=2)\n", + " \n", + " print(f\"✓ Сохранено: {filepath}\")\n", + " \n", + " return result_dicts" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6cff8e7b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (5, 3)
uidtimestampsitem_ids
u32list[u32]list[u32]
600[1329190, 1329405, … 25997540][252026, 58171, … 201909]
800[121100, 121290, … 25977310][20844, 198210, … 60455]
1000[11335730, 11335925, … 25972225][46643, 57592, … 95670]
1400[280570, 280735, … 25993315][4634, 213798, … 104891]
1600[899275, 930305, … 25941890][223933, 154424, … 104876]
" + ], + "text/plain": [ + "shape: (5, 3)\n", + "┌──────┬─────────────────────────────────┬────────────────────────────┐\n", + "│ uid ┆ timestamps ┆ item_ids │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ u32 ┆ list[u32] ┆ list[u32] │\n", + "╞══════╪═════════════════════════════════╪════════════════════════════╡\n", + "│ 600 ┆ [1329190, 1329405, … 25997540] ┆ [252026, 58171, … 201909] │\n", + "│ 800 ┆ [121100, 121290, … 25977310] ┆ [20844, 198210, … 60455] │\n", + "│ 1000 ┆ [11335730, 11335925, … 2597222… ┆ [46643, 57592, … 95670] │\n", + "│ 1400 ┆ [280570, 280735, … 25993315] ┆ [4634, 213798, … 104891] │\n", + "│ 1600 ┆ [899275, 930305, … 25941890] ┆ [223933, 154424, … 104876] │\n", + "└──────┴─────────────────────────────────┴────────────────────────────┘" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "901e7400", + "metadata": {}, + "source": [ + "# QUANTILE CUTOFF" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8c691891", + "metadata": {}, + "outputs": [], + "source": [ + "def get_quantile_cutoffs(df, num_parts=4, base_ratio=None):\n", + " \"\"\"\n", + " Считает cutoffs так, чтобы разбить данные на части.\n", + " \n", + " Args:\n", + " num_parts: На сколько частей делить \"хвост\" истории.\n", + " base_ratio: Какую долю данных отдать в Base (самую первую часть). \n", + " Если None, делит всё поровну.\n", + " \"\"\"\n", + " # Достаем все таймстемпы в один плоский массив\n", + " # Это может занять память, если данных очень много (>100M), но для Beauty (2M) это ок\n", + " all_ts = df.select(pl.col(\"timestamps\").explode()).to_series().sort()\n", + " total_events = len(all_ts)\n", + " \n", + " print(f\"Всего событий: {total_events}\")\n", + " \n", + " cutoffs = []\n", + " \n", + " if base_ratio:\n", + " # Base занимает X% (например 80%), а остаток делим поровну на 3 части (Valid, Gap, Test)\n", + " # Остаток = 1 - base_ratio\n", + " # Каждая малая часть = (1 - base_ratio) / num_parts_tail\n", + " \n", + " base_idx = int(total_events * base_ratio)\n", + " cutoffs.append(all_ts[base_idx]) # Первый cutoff отделяет Base\n", + " \n", + " remaining_events = total_events - base_idx\n", + " part_size = remaining_events // num_parts # Делим остаток на 3 части (P1, P2, P3)\n", + " \n", + " current_idx = base_idx\n", + " for _ in range(num_parts-1): # Нам нужно еще 2 границы, чтобы получить 3 части\n", + " current_idx += part_size\n", + " cutoffs.append(all_ts[current_idx])\n", + " \n", + " else:\n", + " # Сценарий: Просто делим всё на N равных частей\n", + " step = total_events // num_parts\n", + " for i in range(1, num_parts):\n", + " idx = i * step\n", + " cutoffs.append(all_ts[idx])\n", + " \n", + " return cutoffs\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "13c1466f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Всего событий: 7371990\n", + "\n", + "--- Новые Cutoffs (по количеству событий) ---\n", + "Cutoffs: [22138015, 23136375, 24137410, 25093085]\n", + "[0, 22138015, 23136375, 24137410, 25093085, None]\n" + ] + } + ], + "source": [ + "equal_event_cutoffs = get_quantile_cutoffs(df, num_parts=4, base_ratio=0.8)\n", + "\n", + "print(\"\\n--- Новые Cutoffs (по количеству событий) ---\")\n", + "print(f\"Cutoffs: {equal_event_cutoffs}\")\n", + "\n", + "# Проверка распределения\n", + "intervals_eq = [0] + equal_event_cutoffs + [None]\n", + "print(intervals_eq)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 9, + "id": "4e7f7b46", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✓ Сохранено: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw/inter_new_[0_22138015).json\n", + "✓ Сохранено: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw/inter_new_[22138015_24137410).json\n", + "✓ Сохранено: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw/inter_new_[24137410_25093085).json\n", + "✓ Сохранено: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw/inter_new_[25093085_inf).json\n", + "0 Base 3813 5897592 \n", + "1 Gap 3315 737198 \n", + "2 Valid 3120 368599 \n", + "3 Test 3154 368601 \n" + ] + } + ], + "source": [ + "new_split_files = split_session_by_timestamps(\n", + " df, \n", + " [22138015, 24137410, 25093085], \n", + " output_dir=\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw\"\n", + ")\n", + "\n", + "names = [\"Base\", \"Gap\", \"Valid\", \"Test\"]\n", + "for i, d in enumerate(new_split_files):\n", + " num_users = len(d)\n", + " \n", + " num_events = sum(len(items) for items in d.values())\n", + " \n", + " print(f\"{i:<10} {names[i]:<10} {num_users:<10} {num_events:<10}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "82fd2bca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merging 2 files into exp_4_0.9_inter_tiger_train.json...\n", + "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json (Users: 4016)\n", + "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json...\n", + "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json (Users: 4016)\n", + "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json...\n", + "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json (Users: 3813)\n", + "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json...\n", + "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json (Users: 4118)\n", + "Merging 1 files into test_set.json...\n", + "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/test_set.json (Users: 3154)\n", + "Merging 1 files into valid_set.json...\n", + "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/valid_set.json (Users: 3120)\n", + "Merging 4 files into all_set.json...\n", + "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json (Users: 4138)\n", + "All done!\n" + ] + } + ], + "source": [ + "EXP_DIR = \"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps\"\n", + "\n", + "base_p, gap_p, valid_p, test_p = new_split_files[0], new_split_files[1], new_split_files[2], new_split_files[3]\n", + "\n", + "# Tiger: base + gap\n", + "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4_0.9_inter_tiger_train.json\")\n", + "\n", + "# 1. Exp 4.1 (Standard)\n", + "# Semantics: base + gap (Всё кроме валидации и теста)\n", + "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4-1_0.9_inter_semantics_train.json\")\n", + "\n", + "# 2. Exp 4.2 (Short Semantics)\n", + "# Semantics: base (Короче на пропуск, без gap)\n", + "merge_and_save([base_p], EXP_DIR, \"exp_4-2_0.8_inter_semantics_train.json\")\n", + "\n", + "# 3. 
Exp 4.3 (Leak)\n", + "# Semantics: base + gap + valid (Видит валидацию)\n", + "merge_and_save([base_p, gap_p, valid_p], EXP_DIR, \"exp_4-3_0.95_inter_semantics_train.json\")\n", + "\n", + "# 4. Test Set (тест всех моделей)\n", + "merge_and_save([test_p], EXP_DIR, \"test_set.json\")\n", + "\n", + "# 4. Valid Set (валидационный набор)\n", + "merge_and_save([valid_p], EXP_DIR, \"valid_set.json\")\n", + "\n", + "# 4. All Set (все данные)\n", + "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR, \"all_set.json\")\n", + "\n", + "print(\"All done!\")" + ] + }, + { + "cell_type": "code", + "id": "d34b1c55", + "metadata": { + "ExecuteTime": { + "end_time": "2025-12-11T08:56:58.546300Z", + "start_time": "2025-12-11T08:56:58.343394Z" + } + }, + "source": [ + "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as f:\n", + " old_inter_new = json.load(f)\n", + "\n", + "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n", + " first_sem = json.load(ff)\n", + " \n", + "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n", + " second_sem = json.load(ff)\n", + " \n", + "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n", + " third_sem = json.load(ff)\n", + " \n", + "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n", + " tiger_sem = json.load(ff)\n", + "\n", + "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/test_set.json\", 'r') as ff:\n", + " test_sem = json.load(ff)\n", + "\n", + "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as ff:\n", + " all_test_data = json.load(ff)\n", + "\n", + "def check_prefix_match(full_data, subset_data, check_suffix=False):\n", + " \"\"\"\n", + " check_suffix=True включит режим проверки суффиксов (для теста).\n", + " \"\"\"\n", + " mismatch_count = 0\n", + " full_match_count = 0\n", + "\n", + " num_events_full_data = sum(len(items) for items in full_data.values())\n", + " num_events_subset_data = sum(len(items) for items in subset_data.values())\n", + " print(f\"доля событий всего {(num_events_subset_data/num_events_full_data):.2f}:\")\n", + " \n", + " for user, sub_items in subset_data.items():\n", + " \n", + " if user not in full_data:\n", + " print(f\"⚠ Юзер {user} не найден в исходном файле!\")\n", + " mismatch_count += 1\n", + " continue\n", + " \n", + " full_items = full_data[user]\n", + " \n", + " if not check_suffix:\n", + " if len(sub_items) > len(full_items):\n", + " mismatch_count += 1\n", + " continue\n", + " \n", + " if full_items[:len(sub_items)] == sub_items:\n", + " if len(full_items) == len(sub_items):\n", + " full_match_count += 1\n", + " else:\n", + " mismatch_count += 1\n", + "\n", + " else:\n", + " if len(sub_items) > len(full_items):\n", + " mismatch_count += 1\n", + " continue\n", + "\n", + " if full_items[-len(sub_items):] == sub_items:\n", + " if len(full_items) == len(sub_items):\n", + " full_match_count += 1\n", + " else:\n", + " mismatch_count += 1\n", + "\n", + " mode = \"СУФФИКСЫ\" if check_suffix else \"ПРЕФИКСЫ\"\n", + " \n", + " if mismatch_count == 0:\n", + " print(f\"OK [{mode}] Все {len(subset_data)} массивов ОК. 
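With `base_ratio=0.8` and `num_parts=4`, the boundaries computed above are simply the 0.80/0.85/0.90/0.95 event-count quantiles of the flattened timestamps; the split call then keeps only 0.80/0.90/0.95, folding the first two tail parts into one gap. A minimal sketch of the same computation via polars quantiles (not part of the notebook; it may differ from the index-based version by a single event depending on interpolation):

```python
import polars as pl

# Sketch: read the split boundaries off as event-count quantiles.
# Assumes df has a "timestamps" list column, as in the notebook above.
all_ts = df.select(pl.col("timestamps").explode()).to_series()
cutoffs = [all_ts.quantile(q, interpolation="lower") for q in (0.80, 0.85, 0.90, 0.95)]
print(cutoffs)  # expected to be close to get_quantile_cutoffs(df, num_parts=4, base_ratio=0.8)
```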
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "82fd2bca",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Merging 2 files into exp_4_0.9_inter_tiger_train.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json (Users: 4016)\n",
+      "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json (Users: 4016)\n",
+      "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json (Users: 3813)\n",
+      "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json (Users: 4118)\n",
+      "Merging 1 files into test_set.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/test_set.json (Users: 3154)\n",
+      "Merging 1 files into valid_set.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/valid_set.json (Users: 3120)\n",
+      "Merging 4 files into all_set.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json (Users: 4138)\n",
+      "All done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "EXP_DIR = \"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps\"\n",
+    "\n",
+    "base_p, gap_p, valid_p, test_p = new_split_files[0], new_split_files[1], new_split_files[2], new_split_files[3]\n",
+    "\n",
+    "# Tiger: base + gap\n",
+    "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4_0.9_inter_tiger_train.json\")\n",
+    "\n",
+    "# 1. Exp 4.1 (Standard)\n",
+    "# Semantics: base + gap (everything except validation and test)\n",
+    "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4-1_0.9_inter_semantics_train.json\")\n",
+    "\n",
+    "# 2. Exp 4.2 (Short Semantics)\n",
+    "# Semantics: base only (shorter by the gap)\n",
+    "merge_and_save([base_p], EXP_DIR, \"exp_4-2_0.8_inter_semantics_train.json\")\n",
+    "\n",
+    "# 3. Exp 4.3 (Leak)\n",
+    "# Semantics: base + gap + valid (sees the validation data)\n",
+    "merge_and_save([base_p, gap_p, valid_p], EXP_DIR, \"exp_4-3_0.95_inter_semantics_train.json\")\n",
+    "\n",
+    "# 4. Test set (used to evaluate all models)\n",
+    "merge_and_save([test_p], EXP_DIR, \"test_set.json\")\n",
+    "\n",
+    "# 5. Validation set\n",
+    "merge_and_save([valid_p], EXP_DIR, \"valid_set.json\")\n",
+    "\n",
+    "# 6. All set (all data)\n",
+    "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR, \"all_set.json\")\n",
+    "\n",
+    "print(\"All done!\")"
+   ]
+  },
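The suffixes in the file names (`_0.8`, `_0.9`, `_0.95`) encode the cumulative share of events each train set sees. A quick sketch to recompute those shares directly from `new_split_files` (illustrative only, not in the original notebook):

```python
# Sanity check (sketch): cumulative event share per experiment.
# Assumes new_split_files = [base, gap, valid, test] dicts of {uid: [item_ids]}.
def n_events(d):
    return sum(len(v) for v in d.values())

total = sum(n_events(d) for d in new_split_files)
base, gap, valid, test = (n_events(d) for d in new_split_files)

print(f"exp_4-2 share: {base / total:.2f}")                  # ~0.80 (base only)
print(f"exp_4-1 share: {(base + gap) / total:.2f}")          # ~0.90 (base + gap)
print(f"exp_4-3 share: {(base + gap + valid) / total:.2f}")  # ~0.95 (base + gap + valid)
```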
+  {
+   "cell_type": "code",
+   "id": "d34b1c55",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-11T08:56:58.546300Z",
+     "start_time": "2025-12-11T08:56:58.343394Z"
+    }
+   },
+   "source": [
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as f:\n",
+    "    old_inter_new = json.load(f)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n",
+    "    first_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n",
+    "    second_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n",
+    "    third_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n",
+    "    tiger_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/test_set.json\", 'r') as ff:\n",
+    "    test_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as ff:\n",
+    "    all_test_data = json.load(ff)\n",
+    "\n",
+    "def check_prefix_match(full_data, subset_data, check_suffix=False):\n",
+    "    \"\"\"\n",
+    "    check_suffix=True switches to suffix checking (for the test set).\n",
+    "    \"\"\"\n",
+    "    mismatch_count = 0\n",
+    "    full_match_count = 0\n",
+    "\n",
+    "    num_events_full_data = sum(len(items) for items in full_data.values())\n",
+    "    num_events_subset_data = sum(len(items) for items in subset_data.values())\n",
+    "    print(f\"overall event share {(num_events_subset_data/num_events_full_data):.2f}:\")\n",
+    "\n",
+    "    for user, sub_items in subset_data.items():\n",
+    "\n",
+    "        if user not in full_data:\n",
+    "            print(f\"⚠ User {user} not found in the source file!\")\n",
+    "            mismatch_count += 1\n",
+    "            continue\n",
+    "\n",
+    "        full_items = full_data[user]\n",
+    "\n",
+    "        if not check_suffix:\n",
+    "            if len(sub_items) > len(full_items):\n",
+    "                mismatch_count += 1\n",
+    "                continue\n",
+    "\n",
+    "            if full_items[:len(sub_items)] == sub_items:\n",
+    "                if len(full_items) == len(sub_items):\n",
+    "                    full_match_count += 1\n",
+    "            else:\n",
+    "                mismatch_count += 1\n",
+    "\n",
+    "        else:\n",
+    "            if len(sub_items) > len(full_items):\n",
+    "                mismatch_count += 1\n",
+    "                continue\n",
+    "\n",
+    "            if full_items[-len(sub_items):] == sub_items:\n",
+    "                if len(full_items) == len(sub_items):\n",
+    "                    full_match_count += 1\n",
+    "            else:\n",
+    "                mismatch_count += 1\n",
+    "\n",
+    "    mode = \"SUFFIXES\" if check_suffix else \"PREFIXES\"\n",
+    "\n",
+    "    if mismatch_count == 0:\n",
+    "        print(f\"OK [{mode}] All {len(subset_data)} arrays OK. Full matches: {full_match_count}\")\n",
+    "    else:\n",
+    "        print(f\"NOT OK [{mode}] Found {mismatch_count} mismatches.\")\n",
+    "\n",
+    "# --- Run the checks ---\n",
+    "print(\"Checking train sets (must be prefixes):\")\n",
+    "check_prefix_match(old_inter_new, first_sem)\n",
+    "check_prefix_match(old_inter_new, second_sem)\n",
+    "check_prefix_match(old_inter_new, third_sem)\n",
+    "check_prefix_match(old_inter_new, tiger_sem)\n",
+    "\n",
+    "print(\"\\nChecking the test set (must be a suffix):\")\n",
+    "check_prefix_match(old_inter_new, test_sem, check_suffix=True)\n",
+    "\n",
+    "print(\"\\n(Control) Checking the test set as a prefix (should fail):\")\n",
+    "check_prefix_match(old_inter_new, test_sem, check_suffix=False)\n",
+    "\n",
+    "check_prefix_match(old_inter_new, all_test_data)\n"
+   ],
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: '/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json'",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mFileNotFoundError\u001B[0m                         Traceback (most recent call last)",
+      "Cell In[1], line 1: with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as f: ...",
+      "FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json'"
+     ]
+    }
+   ],
+   "execution_count": 1
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "c3a0adf2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "================================================================================\n",
+      "CHECK FOR EMPTY HISTORY PARTS\n",
+      "================================================================================\n",
+      "\n",
+      "[exp_4-1_0.9] Analyzing...\n",
+      "  Users in split: 4,016 / 4,138\n",
+      "  EMPTY sessions: 15\n",
+      "  TOTAL issues: 15\n",
+      "\n",
+      "[exp_4-2_0.8] Analyzing...\n",
+      "  Users in split: 3,813 / 4,138\n",
+      "  EMPTY sessions: 22\n",
+      "  TOTAL issues: 22\n",
+      "\n",
+      "[exp_4-3_0.95] Analyzing...\n",
+      "  Users in split: 4,118 / 4,138\n",
+      "  EMPTY sessions: 7\n",
+      "  TOTAL issues: 7\n",
+      "\n",
+      "[exp_4_0.9_tiger] Analyzing...\n",
+      "  Users in split: 4,016 / 4,138\n",
+      "  EMPTY sessions: 15\n",
+      "  TOTAL issues: 15\n",
+      "\n",
+      "[test_set] Analyzing...\n",
+      "  Users in split: 3,154 / 4,138\n",
+      "  EMPTY sessions: 105\n",
+      "  TOTAL issues: 105\n"
+     ]
+    }
+   ],
+   "source": [
+    "def check_non_empty_splits(full_data, splits_data, split_names, min_history_len=2):\n",
+    "    \"\"\"\n",
+    "    Checks that no part of a user's history is empty in any of the splits.\n",
+    "    \"\"\"\n",
+    "    print(\"\\n\" + \"=\"*80)\n",
+    "    print(\"CHECK FOR EMPTY HISTORY PARTS\")\n",
+    "    print(\"=\"*80)\n",
+    "\n",
+    "    all_users = set(full_data.keys())\n",
+    "    total_issues = 0\n",
+    "\n",
+    "    for i in range(len(split_names)):\n",
+    "        split_name = split_names[i]\n",
+    "        split_data = splits_data[i]\n",
+    "        print(f\"\\n[{split_name}] Analyzing...\")\n",
+    "\n",
+    "        split_users = set(split_data.keys())\n",
+    "        empty_sessions = []\n",
+    "\n",
+    "        for user, items in split_data.items():\n",
+    "            if not items or len(items) < min_history_len:\n",
+    "                empty_sessions.append(user)\n",
+    "\n",
+    "        issues_count = len(empty_sessions)\n",
+    "        total_issues += issues_count\n",
+    "\n",
+    "        print(f\"  Users in split: {len(split_users):,} / {len(all_users):,}\")\n",
+    "        print(f\"  EMPTY sessions: {len(empty_sessions)}\")\n",
+    "        print(f\"  TOTAL issues: {issues_count}\")\n",
+    "\n",
+    "    if total_issues == 0:\n",
+    "        print(\"\\nALL SPLITS HAVE NO EMPTY SESSIONS\")\n",
+    "\n",
+    "split_names = ['exp_4-1_0.9', 'exp_4-2_0.8', 'exp_4-3_0.95', 'exp_4_0.9_tiger', 'test_set']\n",
+    "splits_list = [first_sem, second_sem, third_sem, tiger_sem, test_sem]\n",
+    "\n",
+    "check_non_empty_splits(old_inter_new, splits_list, split_names)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "43aa0142",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Merging 2 files into exp_4_0.9_inter_tiger_train.json (min len=2)...\n",
+      "Filtered 15 users with history < 2\n",
+      "Remaining: 4001 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4_0.9_inter_tiger_train.json (Users: 4001)\n",
+      "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json (min len=2)...\n",
+      "Filtered 15 users with history < 2\n",
+      "Remaining: 4001 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-1_0.9_inter_semantics_train.json (Users: 4001)\n",
+      "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json (min len=2)...\n",
+      "Filtered 22 users with history < 2\n",
+      "Remaining: 3791 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-2_0.8_inter_semantics_train.json (Users: 3791)\n",
+      "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json (min len=2)...\n",
+      "Filtered 7 users with history < 2\n",
+      "Remaining: 4111 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-3_0.95_inter_semantics_train.json (Users: 4111)\n",
+      "Merging 1 files into test_set.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/test_set.json (Users: 3154)\n",
+      "Merging 1 files into valid_set.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/valid_set.json (Users: 3120)\n",
+      "Merging 4 files into all_set.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/all_set.json (Users: 4138)\n",
+      "All done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "EXP_DIR_FILTERED = \"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered\"\n",
+    "\n",
+    "base_p, gap_p, valid_p, test_p = new_split_files[0], new_split_files[1], new_split_files[2], new_split_files[3]\n",
+    "\n",
+    "# Tiger: base + gap\n",
+    "merge_and_save_with_filter([base_p, gap_p], EXP_DIR_FILTERED, \"exp_4_0.9_inter_tiger_train.json\", min_history_len=2)\n",
+    "\n",
+    "# 1. Exp 4.1 (Standard)\n",
+    "# Semantics: base + gap (everything except validation and test)\n",
+    "merge_and_save_with_filter([base_p, gap_p], EXP_DIR_FILTERED, \"exp_4-1_0.9_inter_semantics_train.json\", min_history_len=2)\n",
+    "\n",
+    "# 2. Exp 4.2 (Short Semantics)\n",
+    "# Semantics: base only (shorter by the gap)\n",
+    "merge_and_save_with_filter([base_p], EXP_DIR_FILTERED, \"exp_4-2_0.8_inter_semantics_train.json\", min_history_len=2)\n",
+    "\n",
+    "# 3. Exp 4.3 (Leak)\n",
+    "# Semantics: base + gap + valid (sees the validation data)\n",
+    "merge_and_save_with_filter([base_p, gap_p, valid_p], EXP_DIR_FILTERED, \"exp_4-3_0.95_inter_semantics_train.json\", min_history_len=2)\n",
+    "\n",
+    "# 4. Test set (used to evaluate all models)\n",
+    "merge_and_save([test_p], EXP_DIR_FILTERED, \"test_set.json\")\n",
+    "\n",
+    "# 5. Validation set\n",
+    "merge_and_save([valid_p], EXP_DIR_FILTERED, \"valid_set.json\")\n",
+    "\n",
+    "# 6. All set (all data)\n",
+    "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR_FILTERED, \"all_set.json\")\n",
+    "\n",
+    "print(\"All done!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9060beaa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking train sets (must be prefixes):\n",
+      "overall event share 0.90:\n",
+      "✅ [PREFIXES] All 4001 arrays OK. Full matches: 564\n",
+      "overall event share 0.80:\n",
+      "✅ [PREFIXES] All 3791 arrays OK. Full matches: 343\n",
+      "overall event share 0.95:\n",
+      "✅ [PREFIXES] All 4111 arrays OK. Full matches: 984\n",
+      "overall event share 0.90:\n",
+      "✅ [PREFIXES] All 4001 arrays OK. Full matches: 564\n",
+      "\n",
+      "Checking the test set (must be a suffix):\n",
+      "overall event share 0.05:\n",
+      "✅ [SUFFIXES] All 3154 arrays OK. Full matches: 20\n",
+      "\n",
+      "(Control) Checking the test set as a prefix (should fail):\n",
+      "overall event share 0.05:\n",
+      "❌ [PREFIXES] Found 3134 mismatches.\n",
+      "overall event share 1.00:\n",
+      "✅ [PREFIXES] All 4138 arrays OK. Full matches: 4138\n",
+      "\n",
+      "================================================================================\n",
+      "CHECK FOR EMPTY HISTORY PARTS\n",
+      "================================================================================\n",
+      "\n",
+      "[exp_4-1_0.9] Analyzing...\n",
+      "  Users in split: 4,001 / 4,138\n",
+      "  EMPTY sessions: 0\n",
+      "  TOTAL issues: 0\n",
+      "\n",
+      "[exp_4-2_0.8] Analyzing...\n",
+      "  Users in split: 3,791 / 4,138\n",
+      "  EMPTY sessions: 0\n",
+      "  TOTAL issues: 0\n",
+      "\n",
+      "[exp_4-3_0.95] Analyzing...\n",
+      "  Users in split: 4,111 / 4,138\n",
+      "  EMPTY sessions: 0\n",
+      "  TOTAL issues: 0\n",
+      "\n",
+      "[exp_4_0.9_tiger] Analyzing...\n",
+      "  Users in split: 4,001 / 4,138\n",
+      "  EMPTY sessions: 0\n",
+      "  TOTAL issues: 0\n",
+      "\n",
+      "ALL SPLITS HAVE NO EMPTY SESSIONS\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n",
+    "    filtered_first_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n",
+    "    filtered_second_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n",
+    "    filtered_third_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n",
+    "    filtered_tiger_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/valid_set.json\", 'r') as ff:\n",
+    "    filtered_valid_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/test_set.json\", 'r') as ff:\n",
+    "    filtered_test_sem = json.load(ff)\n",
+    "\n",
+    "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/all_set.json\", 'r') as ff:\n",
+    "    filtered_all_test_data = json.load(ff)\n",
+    "\n",
+    "# --- Run the checks ---\n",
+    "print(\"Checking train sets (must be prefixes):\")\n",
+    "check_prefix_match(filtered_all_test_data, filtered_first_sem)\n",
+    "check_prefix_match(filtered_all_test_data, filtered_second_sem)\n",
+    "check_prefix_match(filtered_all_test_data, filtered_third_sem)\n",
+    "check_prefix_match(filtered_all_test_data, filtered_tiger_sem)\n",
+    "\n",
+    "print(\"\\nChecking the test set (must be a suffix):\")\n",
+    "check_prefix_match(filtered_all_test_data, test_sem, check_suffix=True)\n",
+    "\n",
+    "print(\"\\n(Control) Checking the test set as a prefix (should fail):\")\n",
+    "check_prefix_match(filtered_all_test_data, test_sem, check_suffix=False)\n",
+    "\n",
+    "check_prefix_match(filtered_all_test_data, all_test_data)\n",
+    "\n",
+    "split_names = ['exp_4-1_0.9', 'exp_4-2_0.8', 'exp_4-3_0.95', 'exp_4_0.9_tiger']\n",
+    "splits_list_filtered = [filtered_first_sem, filtered_second_sem, filtered_third_sem, filtered_tiger_sem]\n",
+    "\n",
+    "check_non_empty_splits(filtered_all_test_data, splits_list_filtered, split_names, min_history_len=2)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "c540c8d5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "for test and validation (may fail, and most likely will)\n",
+      "\n",
+      "================================================================================\n",
+      "CHECK FOR EMPTY HISTORY PARTS\n",
+      "================================================================================\n",
+      "\n",
+      "[valid] Analyzing...\n",
+      "  Users in split: 3,120 / 4,138\n",
+      "  EMPTY sessions: 88\n",
+      "  TOTAL issues: 88\n",
+      "\n",
+      "[test] Analyzing...\n",
+      "  Users in split: 3,154 / 4,138\n",
+      "  EMPTY sessions: 105\n",
+      "  TOTAL issues: 105\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"for test and validation (may fail, and most likely will)\")\n",
+    "vt_split_names = ['valid', 'test']\n",
+    "vt_splits_list_filtered = [filtered_valid_sem, test_sem]\n",
+    "\n",
+    "check_non_empty_splits(filtered_all_test_data, vt_splits_list_filtered, vt_split_names, min_history_len=2)"
+   ]
+  },
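The invariant behind all of these checks: each train history must be a prefix of the user's full history, and the test history must be a suffix of it. A self-contained toy illustration with hypothetical histories (values are made up, not from the dataset):

```python
# Toy illustration of the prefix/suffix split invariant (hypothetical data).
full = {"u1": [10, 20, 30, 40]}
train = {"u1": [10, 20, 30]}  # prefix of the full history -> valid train split
test = {"u1": [40]}           # suffix of the full history -> valid test split

for name, sub, tail in [("train", train, False), ("test", test, True)]:
    for uid, items in sub.items():
        ref = full[uid]
        ok = ref[-len(items):] == items if tail else ref[:len(items)] == items
        print(name, uid, "OK" if ok else "MISMATCH")
```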
+  {
+   "cell_type": "markdown",
+   "id": "89efa96e",
+   "metadata": {},
+   "source": [
+    "# Splitting YAMBDA by days"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "28e4ddc8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cutoffs: [25740785, 25827185, 25913585]\n",
+      "✓ Saved: /home/jovyan/IRec/data/Yambda/day-splits/raw/inter_new_[0_25740785).json\n",
+      "✓ Saved: /home/jovyan/IRec/data/Yambda/day-splits/raw/inter_new_[25740785_25827185).json\n",
+      "✓ Saved: /home/jovyan/IRec/data/Yambda/day-splits/raw/inter_new_[25827185_25913585).json\n",
+      "✓ Saved: /home/jovyan/IRec/data/Yambda/day-splits/raw/inter_new_[25913585_inf).json\n",
+      "Part 0 [Base]: 4133 users\n",
+      "Part 1 [day -3]: 1381 users\n",
+      "Part 2 [day -2]: 1350 users\n",
+      "Part 3 [day -1]: 1403 users\n"
+     ]
+    }
+   ],
+   "source": [
+    "global_max_time = df.select(\n",
+    "    pl.col(\"timestamps\").explode().max()\n",
+    ").item()\n",
+    "\n",
+    "# Window size (one day)\n",
+    "days_val = 1\n",
+    "window_sec = days_val * 24 * 3600\n",
+    "\n",
+    "# Three cutoffs counted back from the end\n",
+    "cutoff_test_start = global_max_time - window_sec      # T - 1 window\n",
+    "cutoff_val_start = global_max_time - 2 * window_sec   # T - 2 windows\n",
+    "cutoff_gap_start = global_max_time - 3 * window_sec   # T - 3 windows\n",
+    "\n",
+    "cutoffs = [\n",
+    "    int(cutoff_gap_start),   # Boundary Part 0 | Part 1\n",
+    "    int(cutoff_val_start),   # Boundary Part 1 | Part 2\n",
+    "    int(cutoff_test_start)   # Boundary Part 2 | Part 3\n",
+    "]\n",
+    "\n",
+    "print(f\"Cutoffs: {cutoffs}\")\n",
+    "\n",
+    "split_files = split_session_by_timestamps(\n",
+    "    df,\n",
+    "    cutoffs,\n",
+    "    output_dir=\"/home/jovyan/IRec/data/Yambda/day-splits/raw\"\n",
+    ")\n",
+    "\n",
+    "names = [\"Base\", \"day -3\", \"day -2\", \"day -1\"]\n",
+    "for i, d in enumerate(split_files):\n",
+    "    print(f\"Part {i} [{names[i]}]: {len(d)} users\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "8d5b0c22",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Merging 2 files into exp_4_0.9_inter_tiger_train.json (min len=2)...\n",
+      "Filtered 3 users with history < 2\n",
+      "Remaining: 4133 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/exp_4_0.9_inter_tiger_train.json (Users: 4133)\n",
+      "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json (min len=2)...\n",
+      "Filtered 3 users with history < 2\n",
+      "Remaining: 4133 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/exp_4-1_0.9_inter_semantics_train.json (Users: 4133)\n",
+      "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json (min len=2)...\n",
+      "Filtered 3 users with history < 2\n",
+      "Remaining: 4130 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/exp_4-2_0.8_inter_semantics_train.json (Users: 4130)\n",
+      "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json (min len=2)...\n",
+      "Filtered 3 users with history < 2\n",
+      "Remaining: 4133 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/exp_4-3_0.95_inter_semantics_train.json (Users: 4133)\n",
+      "Merging 1 files into test_set.json (min len=1)...\n",
+      "Filtered 0 users with history < 1\n",
+      "Remaining: 1403 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/test_set.json (Users: 1403)\n",
+      "Merging 1 files into valid_set.json (min len=1)...\n",
+      "Filtered 0 users with history < 1\n",
+      "Remaining: 1350 users\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/valid_set.json (Users: 1350)\n",
+      "Merging 4 files into all_set.json...\n",
+      "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/all_set.json (Users: 4138)\n",
+      "All done!\n"
+     ]
+    }
+   ],
+   "source": [
+    "EXP_DIR_FILTERED = \"/home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered\"\n",
+    "\n",
+    "base_p, gap_p, valid_p, test_p = split_files[0], split_files[1], split_files[2], split_files[3]\n",
+    "\n",
+    "# Tiger: base + gap\n",
+    "merge_and_save_with_filter([base_p, gap_p], EXP_DIR_FILTERED, \"exp_4_0.9_inter_tiger_train.json\", min_history_len=2)\n",
+    "\n",
+    "# 1. Exp 4.1 (Standard)\n",
+    "# Semantics: base + gap (everything except validation and test)\n",
+    "merge_and_save_with_filter([base_p, gap_p], EXP_DIR_FILTERED, \"exp_4-1_0.9_inter_semantics_train.json\", min_history_len=2)\n",
+    "\n",
+    "# 2. Exp 4.2 (Short Semantics)\n",
+    "# Semantics: base only (shorter by the gap)\n",
+    "merge_and_save_with_filter([base_p], EXP_DIR_FILTERED, \"exp_4-2_0.8_inter_semantics_train.json\", min_history_len=2)\n",
+    "\n",
+    "# 3. Exp 4.3 (Leak)\n",
+    "# Semantics: base + gap + valid (sees the validation data)\n",
+    "merge_and_save_with_filter([base_p, gap_p, valid_p], EXP_DIR_FILTERED, \"exp_4-3_0.95_inter_semantics_train.json\", min_history_len=2)\n",
+    "\n",
+    "# 4. Test set (used to evaluate all models)\n",
+    "merge_and_save_with_filter([test_p], EXP_DIR_FILTERED, \"test_set.json\", min_history_len=1)\n",
+    "\n",
+    "# 5. Validation set\n",
+    "merge_and_save_with_filter([valid_p], EXP_DIR_FILTERED, \"valid_set.json\", min_history_len=1)\n",
+    "\n",
+    "# 6. All set (all data)\n",
+    "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR_FILTERED, \"all_set.json\")\n",
+    "\n",
+    "print(\"All done!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "c0b9b767",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def check_non_empty_splits(full_data, splits_data, split_names, min_history_len=2):\n",
+    "    \"\"\"\n",
+    "    Checks that no part of a user's history is empty in any of the splits.\n",
+    "    \"\"\"\n",
+    "    print(\"\\n\" + \"=\"*80)\n",
+    "    print(\"CHECK FOR EMPTY HISTORY PARTS\")\n",
+    "    print(\"=\"*80)\n",
+    "\n",
+    "    all_users = set(full_data.keys())\n",
+    "    total_issues = 0\n",
+    "\n",
+    "    for i in range(len(split_names)):\n",
+    "        split_name = split_names[i]\n",
+    "        split_data = splits_data[i]\n",
+    "        print(f\"\\n[{split_name}] Analyzing...\")\n",
+    "\n",
+    "        split_users = set(split_data.keys())\n",
+    "        empty_sessions = []\n",
+    "\n",
+    "        for user, items in split_data.items():\n",
+    "            if not items or len(items) < min_history_len:\n",
+    "                empty_sessions.append(user)\n",
+    "\n",
+    "        issues_count = len(empty_sessions)\n",
+    "        total_issues += issues_count\n",
+    "\n",
+    "        print(f\"  Users in split: {len(split_users):,} / {len(all_users):,}\")\n",
+    "        print(f\"  EMPTY sessions: {len(empty_sessions)}\")\n",
+    "        print(f\"  TOTAL issues: {issues_count}\")\n",
+    "\n",
+    "    if total_issues == 0:\n",
+    "        print(\"\\nALL SPLITS HAVE NO EMPTY SESSIONS\")\n",
+    "\n",
+    "def check_prefix_match(full_data, subset_data, check_suffix=False):\n",
+    "    \"\"\"\n",
+    "    check_suffix=True switches to suffix checking (for the test set).\n",
+    "    \"\"\"\n",
+    "    mismatch_count = 0\n",
+    "    full_match_count = 0\n",
+    "\n",
+    "    # Iterate over the subset's keys, since full_data may contain more users\n",
+    "    for user, sub_items in subset_data.items():\n",
+    "\n",
+    "        # Check that the user exists in the source data\n",
+    "        if user not in full_data:\n",
+    "            print(f\"⚠ User {user} not found in the source file!\")\n",
+    "            mismatch_count += 1\n",
+    "            continue\n",
+    "\n",
+    "        full_items = full_data[user]\n",
+    "\n",
+    "        # PREFIX check (the beginning must match)\n",
+    "        if not check_suffix:\n",
+    "            if len(sub_items) > len(full_items):\n",
+    "                mismatch_count += 1\n",
+    "                continue\n",
+    "\n",
+    "            # Compare the beginning of full with sub\n",
+    "            if full_items[:len(sub_items)] == sub_items:\n",
+    "                if len(full_items) == len(sub_items):\n",
+    "                    full_match_count += 1\n",
+    "            else:\n",
+    "                mismatch_count += 1\n",
+    "\n",
+    "        # SUFFIX check (the end must match, used for the test set)\n",
+    "        else:\n",
+    "            if len(sub_items) > len(full_items):\n",
+    "                mismatch_count += 1\n",
+    "                continue\n",
+    "\n",
+    "            # Compare the end of full with sub\n",
+    "            # The slice [-len:] takes the last N elements\n",
+    "            if full_items[-len(sub_items):] == sub_items:\n",
+    "                if len(full_items) == len(sub_items):\n",
+    "                    full_match_count += 1\n",
+    "            else:\n",
+    "                mismatch_count += 1\n",
+    "\n",
+    "    mode = \"SUFFIXES\" if check_suffix else \"PREFIXES\"\n",
+    "\n",
+    "    if mismatch_count == 0:\n",
+    "        print(f\"✅ [{mode}] All {len(subset_data)} arrays OK. Full matches: {full_match_count}\")\n",
+    "    else:\n",
+    "        print(f\"❌ [{mode}] Found {mismatch_count} mismatches.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "36ac0115",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking train sets (must be prefixes):\n",
+      "✅ [PREFIXES] All 4133 arrays OK. Full matches: 2272\n",
+      "✅ [PREFIXES] All 4130 arrays OK. Full matches: 1969\n",
+      "✅ [PREFIXES] All 4133 arrays OK. Full matches: 2735\n",
+      "✅ [PREFIXES] All 4133 arrays OK. Full matches: 2272\n",
+      "\n",
+      "Checking the test set (must be a suffix):\n",
+      "✅ [SUFFIXES] All 1403 arrays OK. Full matches: 2\n",
+      "\n",
+      "(Control) Checking the test set as a prefix (should fail):\n",
+      "❌ [PREFIXES] Found 1401 mismatches.\n",
+      "✅ [PREFIXES] All 4138 arrays OK. Full matches: 4138\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(f\"{EXP_DIR_FILTERED}/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n",
+    "    filtered_first_sem = json.load(ff)\n",
+    "\n",
+    "with open(f\"{EXP_DIR_FILTERED}/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n",
+    "    filtered_second_sem = json.load(ff)\n",
+    "\n",
+    "with open(f\"{EXP_DIR_FILTERED}/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n",
+    "    filtered_third_sem = json.load(ff)\n",
+    "\n",
+    "with open(f\"{EXP_DIR_FILTERED}/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n",
+    "    filtered_tiger_sem = json.load(ff)\n",
+    "\n",
+    "with open(f\"{EXP_DIR_FILTERED}/valid_set.json\", 'r') as ff:\n",
+    "    filtered_valid_sem = json.load(ff)\n",
+    "\n",
+    "with open(f\"{EXP_DIR_FILTERED}/test_set.json\", 'r') as ff:\n",
+    "    filtered_test_sem = json.load(ff)\n",
+    "\n",
+    "with open(f\"{EXP_DIR_FILTERED}/all_set.json\", 'r') as ff:\n",
+    "    filtered_all_test_data = json.load(ff)\n",
+    "\n",
+    "# --- Run the checks ---\n",
+    "print(\"Checking train sets (must be prefixes):\")\n",
+    "check_prefix_match(filtered_all_test_data, filtered_first_sem)\n",
+    "check_prefix_match(filtered_all_test_data, filtered_second_sem)\n",
+    "check_prefix_match(filtered_all_test_data, filtered_third_sem)\n",
+    "check_prefix_match(filtered_all_test_data, filtered_tiger_sem)\n",
+    "\n",
+    "print(\"\\nChecking the test set (must be a suffix):\")\n",
+    "check_prefix_match(filtered_all_test_data, filtered_test_sem, check_suffix=True)\n",
+    "\n",
+    "print(\"\\n(Control) Checking the test set as a prefix (should fail):\")\n",
+    "check_prefix_match(filtered_all_test_data, filtered_test_sem, check_suffix=False)\n",
+    "\n",
+    "check_prefix_match(filtered_all_test_data, filtered_all_test_data)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "2c65331b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "================================================================================\n",
+      "CHECK FOR EMPTY HISTORY PARTS\n",
+      "================================================================================\n",
+      "\n",
+      "[exp_4-1_0.9] Analyzing...\n",
+      "  Users in split: 4,133 / 4,138\n",
+      "  EMPTY sessions: 0\n",
+      "  TOTAL issues: 0\n",
+      "\n",
+      "[exp_4-2_0.8] Analyzing...\n",
+      "  Users in split: 4,130 / 4,138\n",
+      "  EMPTY sessions: 0\n",
+      "  TOTAL issues: 0\n",
+      "\n",
+      "[exp_4-3_0.95] Analyzing...\n",
+      "  Users in split: 4,133 / 4,138\n",
+      "  EMPTY sessions: 0\n",
+      "  TOTAL issues: 0\n",
+      "\n",
+      "[exp_4_0.9_tiger] Analyzing...\n",
+      "  Users in split: 4,133 / 4,138\n",
+      "  EMPTY sessions: 0\n",
+      "  TOTAL issues: 0\n",
+      "\n",
+      "ALL SPLITS HAVE NO EMPTY SESSIONS\n"
+     ]
+    }
+   ],
+   "source": [
+    "split_names = ['exp_4-1_0.9', 'exp_4-2_0.8', 'exp_4-3_0.95', 'exp_4_0.9_tiger']\n",
+    "splits_list_filtered = [filtered_first_sem, filtered_second_sem, filtered_third_sem, filtered_tiger_sem]\n",
+    "\n",
+    "check_non_empty_splits(filtered_all_test_data, splits_list_filtered, split_names, min_history_len=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "f596db64",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0          Base       4133       7264231   \n",
+      "1          day -3     1381       36676     \n",
+      "2          day -2     1350       35128     \n",
+      "3          day -1     1403       35955     \n"
+     ]
+    }
+   ],
+   "source": [
+    "for i, d in enumerate(split_files):\n",
+    "    num_users = len(d)\n",
+    "\n",
+    "    num_events = sum(len(items) for items in d.values())\n",
+    "\n",
+    "    print(f\"{i:<10} {names[i]:<10} {num_users:<10} {num_events:<10}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/irec/callbacks/stopping.py b/src/irec/callbacks/stopping.py
index 3d1405f..bbe091f 100644
--- a/src/irec/callbacks/stopping.py
+++ b/src/irec/callbacks/stopping.py
@@ -44,14 +44,18 @@ def after_step(self, runner: Runner, context: RunnerContext):
         metric = context.metrics[self._metric]
         if self._best_metric is None:
             self._best_metric = metric
-            torch.save(runner.model.state_dict(), f'{self._model_path}_best_{round(self._best_metric, 4)}.pth')
+            save_path = f'{self._model_path}_best_{round(self._best_metric, 4)}.pth'
+            os.makedirs(os.path.dirname(save_path), exist_ok=True)
+            torch.save(runner.model.state_dict(), save_path)
         else:
             if (self._minimize and metric < self._best_metric) or (not self._minimize and metric > self._best_metric):
                 self._wait = 0
                 old_metric = self._best_metric
                 self._best_metric = metric
                 # Saving new model
-                torch.save(runner.model.state_dict(), f'{self._model_path}_best_{round(self._best_metric, 4)}.pth')
+                save_path = f'{self._model_path}_best_{round(self._best_metric, 4)}.pth'
+                os.makedirs(os.path.dirname(save_path), exist_ok=True)
+                torch.save(runner.model.state_dict(), save_path)
                 # Deleting old model
                 if str(round(self._best_metric, 4)) != str(round(old_metric, 4)):
                     os.remove(f'{self._model_path}_best_{round(old_metric, 4)}.pth')