diff --git a/notebooks/AmazonBeautyDatasetStatistics.ipynb b/notebooks/AmazonBeautyDatasetStatistics.ipynb
index 6d34ff2..379239d 100644
--- a/notebooks/AmazonBeautyDatasetStatistics.ipynb
+++ b/notebooks/AmazonBeautyDatasetStatistics.ipynb
@@ -405,7 +405,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": ".venv",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -419,7 +419,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.6"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/notebooks/LsvdDownload.ipynb b/notebooks/LsvdDownload.ipynb
new file mode 100644
index 0000000..c57e1af
--- /dev/null
+++ b/notebooks/LsvdDownload.ipynb
@@ -0,0 +1,574 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "SbkKok0dfjjS"
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import polars as pl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'1.8.2'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pl.__version__"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "HF_ENDPOINT=\"http://huggingface.proxy\" hf download deepvk/VK-LSVD --repo-type dataset --include \"metadata/*\" --local-dir /home/jovyan/IRec/sigir/lsvd_data/raw\n",
+ "\n",
+ "HF_ENDPOINT=\"http://huggingface.proxy\" hf download deepvk/VK-LSVD --repo-type dataset --include \"subsamples/ur0.01_ir0.01/*\" --local-dir /home/jovyan/IRec/sigir/lsvd_data/raw\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Разбиение сабсэмплов на базовую, гэп, вал и тест части\n",
+ "\n",
+ "Добавляется колонка original_order чтобы сохранять порядок внутри каждой из частей"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(8, 1, 1, 1, 11, 9)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "subsample_name = 'ur0.01_ir0.01'\n",
+ "content_embedding_size = 256\n",
+ "DATASET_PATH = \"/home/jovyan/IRec/sigir/lsvd_data/raw\"\n",
+ "\n",
+ "metadata_files = ['metadata/users_metadata.parquet',\n",
+ " 'metadata/items_metadata.parquet',\n",
+ " 'metadata/item_embeddings.npz']\n",
+ "\n",
+ "BASE_WEEKS = (15, 23)\n",
+ "GAP_WEEKS = (23, 24) #увеличить гэп\n",
+ "VAL_WEEKS = (24, 25)\n",
+ "\n",
+ "base_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'\n",
+ " for i in range(BASE_WEEKS[0], BASE_WEEKS[1])]\n",
+ "\n",
+ "gap_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'\n",
+ " for i in range(GAP_WEEKS[0], GAP_WEEKS[1])]\n",
+ "\n",
+ "val_interactions_files = [f'subsamples/{subsample_name}/train/week_{i:02}.parquet'\n",
+ " for i in range(VAL_WEEKS[0], VAL_WEEKS[1])]\n",
+ "\n",
+ "test_interactions_files = [f'subsamples/{subsample_name}/validation/week_25.parquet']\n",
+ "\n",
+ "all_interactions_files = base_interactions_files + gap_interactions_files + val_interactions_files + test_interactions_files\n",
+ "\n",
+ "base_with_gap_interactions_files = base_interactions_files + gap_interactions_files\n",
+ "\n",
+ "len(base_interactions_files), len(gap_interactions_files), len(val_interactions_files), len(test_interactions_files), len(all_interactions_files), len(base_with_gap_interactions_files)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_parquet_interactions(data_files):\n",
+ " data_interactions = pl.concat([pl.scan_parquet(f'{DATASET_PATH}/{file}')\n",
+ " for file in data_files])\n",
+ " data_interactions = data_interactions.collect(streaming=True)\n",
+ " data_interactions = data_interactions.with_row_index(\"original_order\")\n",
+ " return data_interactions\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "base_interactions = get_parquet_interactions(base_interactions_files)\n",
+ "gap_interactions = get_parquet_interactions(gap_interactions_files)\n",
+ "val_interactions = get_parquet_interactions(val_interactions_files)\n",
+ "test_interactions = get_parquet_interactions(test_interactions_files)\n",
+ "all_data_interactions = get_parquet_interactions(all_interactions_files)\n",
+ "base_with_gap_interactions = get_parquet_interactions(base_with_gap_interactions_files)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Загрузка и фильтрация эмбеддингов"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "all_data_users = all_data_interactions.select('user_id').unique()\n",
+ "all_data_items = all_data_interactions.select('item_id').unique()\n",
+ "\n",
+ "item_ids = np.load(f\"{DATASET_PATH}/metadata/item_embeddings.npz\")['item_id']\n",
+ "item_embeddings = np.load(f\"{DATASET_PATH}/metadata/item_embeddings.npz\")['embedding']\n",
+ "\n",
+ "mask = np.isin(item_ids, all_data_items.to_numpy())\n",
+ "item_ids = item_ids[mask]\n",
+ "item_embeddings = item_embeddings[mask]\n",
+ "item_embeddings = item_embeddings[:, :content_embedding_size]\n",
+ "\n",
+ "users_metadata = pl.read_parquet(f\"{DATASET_PATH}/metadata/users_metadata.parquet\")\n",
+ "items_metadata = pl.read_parquet(f\"{DATASET_PATH}/metadata/items_metadata.parquet\")\n",
+ "\n",
+ "users_metadata = users_metadata.join(all_data_users, on='user_id')\n",
+ "items_metadata = items_metadata.join(all_data_items, on='item_id')\n",
+ "items_metadata = items_metadata.join(pl.DataFrame({'item_id': item_ids, \n",
+ " 'embedding': item_embeddings}), on='item_id')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Сжатие айтем айди и ремапинг"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total users: 79074, Total items: 62758\n"
+ ]
+ }
+ ],
+ "source": [
+ "all_data_items = all_data_interactions.select('item_id').unique()\n",
+ "all_data_users = all_data_interactions.select('user_id').unique()\n",
+ "\n",
+ "unique_items_sorted = all_data_items.sort('item_id').with_row_index('new_item_id')\n",
+ "global_item_mapping = dict(zip(unique_items_sorted['item_id'], unique_items_sorted['new_item_id']))\n",
+ "\n",
+ "print(f\"Total users: {all_data_users.shape[0]}, Total items: {len(global_item_mapping)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remap_interactions(df, mapping):\n",
+ " return df.with_columns(\n",
+ " pl.col('item_id')\n",
+ " .map_elements(lambda x: mapping.get(x, None), return_dtype=pl.UInt32)\n",
+ " )\n",
+ "\n",
+ "base_interactions_remapped = remap_interactions(base_interactions, global_item_mapping)\n",
+ "gap_interactions_remapped = remap_interactions(gap_interactions, global_item_mapping)\n",
+ "test_interactions_remapped = remap_interactions(test_interactions, global_item_mapping)\n",
+ "val_interactions_remapped = remap_interactions(val_interactions, global_item_mapping)\n",
+ "all_data_interactions_remapped = remap_interactions(all_data_interactions, global_item_mapping)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "del base_interactions, gap_interactions, test_interactions, val_interactions, all_data_interactions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "base_with_gap_interactions_remapped = remap_interactions(base_with_gap_interactions, global_item_mapping)\n",
+ "del base_with_gap_interactions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "items_metadata_remapped = remap_interactions(items_metadata, global_item_mapping)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Группировка по юзер айди"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "interactions count: (1244323, 13)\n",
+ "users count: (74862, 3)\n",
+ "interactions count: (176791, 13)\n",
+ "users count: (44444, 3)\n",
+ "interactions count: (170111, 13)\n",
+ "users count: (43370, 3)\n",
+ "interactions count: (164151, 13)\n",
+ "users count: (43233, 3)\n",
+ "interactions count: (1755376, 13)\n",
+ "users count: (79074, 3)\n"
+ ]
+ }
+ ],
+ "source": [
+ "def get_grouped_interactions(data_interactions):\n",
+ " print(f\"interactions count: {data_interactions.shape}\")\n",
+ " data_res = (\n",
+ " data_interactions\n",
+ " .select(['original_order', 'user_id', 'item_id'])\n",
+ " .group_by('user_id')\n",
+ " .agg(\n",
+ " pl.col('item_id')\n",
+ " .sort_by(pl.col('original_order'))\n",
+ " .alias('item_ids'),\n",
+ " pl.col('original_order').alias('timestamps')\n",
+ " )\n",
+ " .rename({'user_id': 'uid'})\n",
+ " )\n",
+ " print(f\"users count: {data_res.shape}\")\n",
+ " return data_res\n",
+ "base_interactions_grouped = get_grouped_interactions(base_interactions_remapped)\n",
+ "gap_interactions_grouped = get_grouped_interactions(gap_interactions_remapped)\n",
+ "test_interactions_grouped = get_grouped_interactions(test_interactions_remapped)\n",
+ "val_interactions_grouped = get_grouped_interactions(val_interactions_remapped)\n",
+ "all_data_interactions_grouped = get_grouped_interactions(all_data_interactions_remapped)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
shape: (1, 3)| uid | item_ids | timestamps |
|---|
| u32 | list[u32] | list[u32] |
| 2655558 | [16621, 42990, … 51285] | [46109, 59132, … 1209536] |
"
+ ],
+ "text/plain": [
+ "shape: (1, 3)\n",
+ "┌─────────┬─────────────────────────┬───────────────────────────┐\n",
+ "│ uid ┆ item_ids ┆ timestamps │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ u32 ┆ list[u32] ┆ list[u32] │\n",
+ "╞═════════╪═════════════════════════╪═══════════════════════════╡\n",
+ "│ 2655558 ┆ [16621, 42990, … 51285] ┆ [46109, 59132, … 1209536] │\n",
+ "└─────────┴─────────────────────────┴───────────────────────────┘"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "base_interactions_grouped.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "del base_interactions_remapped, gap_interactions_remapped, test_interactions_remapped, val_interactions_remapped"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "interactions count: (1421114, 13)\n",
+ "users count: (76483, 3)\n"
+ ]
+ }
+ ],
+ "source": [
+ "base_with_gap_interactions_grouped = get_grouped_interactions(base_with_gap_interactions_remapped)\n",
+ "del base_with_gap_interactions_remapped"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Сохранение"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Сохранён маппинг: /home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/global_item_mapping.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "OUTPUT_DIR = \"/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows\"\n",
+ "\n",
+ "mapping_output_path = f\"{OUTPUT_DIR}/global_item_mapping.json\"\n",
+ "\n",
+ "with open(mapping_output_path, 'w') as f:\n",
+ " json.dump({str(k): v for k, v in global_item_mapping.items()}, f, indent=2)\n",
+ "\n",
+ "print(f\"Сохранён маппинг: {mapping_output_path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Сохранен файл: items_metadata_remapped\n",
+ "Сохранен файл: items_metadata_remapped_old\n",
+ "Сохранен файл: base_interactions_grouped\n",
+ "Сохранен файл: gap_interactions_grouped\n",
+ "Сохранен файл: test_interactions_grouped\n",
+ "Сохранен файл: val_interactions_grouped\n",
+ "Сохранен файл: base_with_gap_interactions_grouped\n",
+ "Сохранен файл: all_data_interactions_grouped\n",
+ "Сохранен файл: all_data_interactions_remapped\n"
+ ]
+ }
+ ],
+ "source": [
+ "def write_parquet(output_dir, data, file_name):\n",
+ " output_parquet_path = f\"{output_dir}/{file_name}.parquet\"\n",
+ " data.write_parquet(output_parquet_path)\n",
+ " print(f\"Сохранен файл: {file_name}\")\n",
+ "\n",
+ "write_parquet(OUTPUT_DIR, items_metadata_remapped, \"items_metadata_remapped\")\n",
+ "write_parquet(OUTPUT_DIR, items_metadata, \"items_metadata_remapped_old\")\n",
+ "\n",
+ "write_parquet(OUTPUT_DIR, base_interactions_grouped, \"base_interactions_grouped\")\n",
+ "write_parquet(OUTPUT_DIR, gap_interactions_grouped, \"gap_interactions_grouped\")\n",
+ "write_parquet(OUTPUT_DIR, test_interactions_grouped, \"test_interactions_grouped\")\n",
+ "write_parquet(OUTPUT_DIR, val_interactions_grouped, \"val_interactions_grouped\")\n",
+ "write_parquet(OUTPUT_DIR, base_with_gap_interactions_grouped, \"base_with_gap_interactions_grouped\")\n",
+ "\n",
+ "write_parquet(OUTPUT_DIR, all_data_interactions_grouped, \"all_data_interactions_grouped\")\n",
+ "\n",
+ "write_parquet(OUTPUT_DIR, all_data_interactions_remapped, \"all_data_interactions_remapped\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(64, 64)"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(list(items_metadata_remapped.head(1)['embedding'].item())), len(list(items_metadata.head(1)['embedding'].item()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(62758, 5)"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "items_metadata_remapped.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (1, 5)| item_id | author_id | duration | train_interactions_rank | embedding |
|---|
| u32 | u32 | u8 | u32 | array[f32, 64] |
| 0 | 1249424 | 9 | 771612 | [-0.503418, 0.201538, … 0.007988] |
"
+ ],
+ "text/plain": [
+ "shape: (1, 5)\n",
+ "┌─────────┬───────────┬──────────┬─────────────────────────┬─────────────────────────────────┐\n",
+ "│ item_id ┆ author_id ┆ duration ┆ train_interactions_rank ┆ embedding │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ u32 ┆ u32 ┆ u8 ┆ u32 ┆ array[f32, 64] │\n",
+ "╞═════════╪═══════════╪══════════╪═════════════════════════╪═════════════════════════════════╡\n",
+ "│ 0 ┆ 1249424 ┆ 9 ┆ 771612 ┆ [-0.503418, 0.201538, … 0.0079… │\n",
+ "└─────────┴───────────┴──────────┴─────────────────────────┴─────────────────────────────────┘"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "items_metadata_remapped.head(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| uid | item_ids | timestamps |
|---|
| u32 | list[u32] | list[u32] |
| 4465123 | [28298, 3829, … 28995] | [257260, 272293, … 1390041] |
| 3043171 | [8638, 23487, … 15086] | [6628, 11364, … 1370935] |
| 2757146 | [56345, 56828, … 37056] | [194522, 217739, … 1390752] |
| 1148408 | [40326, 42152] | [427153, 1367211] |
| 2537065 | [27766, 39966, … 19887] | [9428, 35459, … 1214991] |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌─────────┬─────────────────────────┬─────────────────────────────┐\n",
+ "│ uid ┆ item_ids ┆ timestamps │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ u32 ┆ list[u32] ┆ list[u32] │\n",
+ "╞═════════╪═════════════════════════╪═════════════════════════════╡\n",
+ "│ 4465123 ┆ [28298, 3829, … 28995] ┆ [257260, 272293, … 1390041] │\n",
+ "│ 3043171 ┆ [8638, 23487, … 15086] ┆ [6628, 11364, … 1370935] │\n",
+ "│ 2757146 ┆ [56345, 56828, … 37056] ┆ [194522, 217739, … 1390752] │\n",
+ "│ 1148408 ┆ [40326, 42152] ┆ [427153, 1367211] │\n",
+ "│ 2537065 ┆ [27766, 39966, … 19887] ┆ [9428, 35459, … 1214991] │\n",
+ "└─────────┴─────────────────────────┴─────────────────────────────┘"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "base_with_gap_interactions_grouped.head()"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/scripts/plum-lsvd/callbacks.py b/scripts/plum-lsvd/callbacks.py
new file mode 100644
index 0000000..43ec460
--- /dev/null
+++ b/scripts/plum-lsvd/callbacks.py
@@ -0,0 +1,64 @@
+import torch
+
+import irec.callbacks as cb
+from irec.runners import TrainingRunner, TrainingRunnerContext
+
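+# Data-dependent codebook init: codebook i is seeded with encoder residuals of
+# a random batch left over after quantizing with codebooks 0..i-1, so training
+# starts from centroids that lie in the actual residual distribution.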
+class InitCodebooks(cb.TrainingCallback):
+ def __init__(self, dataloader):
+ super().__init__()
+ self._dataloader = dataloader
+
+ @torch.no_grad()
+ def before_run(self, runner: TrainingRunner):
+ for i in range(len(runner.model.codebooks)):
+ X = next(iter(self._dataloader))['embedding']
+ idx = torch.randperm(X.shape[0], device=X.device)[:len(runner.model.codebooks[i])]
+ remainder = runner.model.encoder(X[idx])
+
+ for j in range(i):
+ codebook_indices = runner.model.get_codebook_indices(remainder, runner.model.codebooks[j])
+ codebook_vectors = runner.model.codebooks[j][codebook_indices]
+ remainder = remainder - codebook_vectors
+
+ runner.model.codebooks[i].data = remainder.detach()
+
+
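+# After every optimizer step, count how many samples each centroid receives
+# across the dataloader and re-seed centroids with zero assignments ("dead"
+# centroids) from random encoded residuals.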
+class FixDeadCentroids(cb.TrainingCallback):
+ def __init__(self, dataloader):
+ super().__init__()
+ self._dataloader = dataloader
+
+ def after_step(self, runner: TrainingRunner, context: TrainingRunnerContext):
+ for i, num_fixed in enumerate(self.fix_dead_codebooks(runner)):
+ context.metrics[f'num_dead/{i}'] = num_fixed
+
+ @torch.no_grad()
+ def fix_dead_codebooks(self, runner: TrainingRunner):
+ num_fixed = []
+ for codebook_idx, codebook in enumerate(runner.model.codebooks):
+ centroid_counts = torch.zeros(codebook.shape[0], dtype=torch.long, device=codebook.device)
+ random_batch = next(iter(self._dataloader))['embedding']
+
+ for batch in self._dataloader:
+ remainder = runner.model.encoder(batch['embedding'])
+ for l in range(codebook_idx):
+ ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l])
+ remainder = remainder - runner.model.codebooks[l][ind]
+
+ indices = runner.model.get_codebook_indices(remainder, codebook)
+ centroid_counts.scatter_add_(0, indices, torch.ones_like(indices))
+
+ dead_mask = (centroid_counts == 0)
+ num_dead = int(dead_mask.sum().item())
+ num_fixed.append(num_dead)
+ if num_dead == 0:
+ continue
+
+ remainder = runner.model.encoder(random_batch)
+ for l in range(codebook_idx):
+ ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l])
+ remainder = remainder - runner.model.codebooks[l][ind]
+ remainder = remainder[torch.randperm(remainder.shape[0], device=codebook.device)][:num_dead]
+ codebook[dead_mask] = remainder.detach()
+
+ return num_fixed
diff --git a/scripts/plum-lsvd/cooc_data.py b/scripts/plum-lsvd/cooc_data.py
new file mode 100644
index 0000000..7cea906
--- /dev/null
+++ b/scripts/plum-lsvd/cooc_data.py
@@ -0,0 +1,117 @@
+import json
+from collections import defaultdict, Counter
+from data import InteractionsDatasetParquet
+
+
+class CoocMappingDataset:
+ def __init__(
+ self,
+ train_sampler,
+ num_items,
+ cooccur_counter_mapping=None
+ ):
+ self._train_sampler = train_sampler
+ self._num_items = num_items
+ self._cooccur_counter_mapping = cooccur_counter_mapping
+
+ @classmethod
+ def create(cls, inter_json_path, window_size):
+ max_item_id = 0
+ train_dataset = []
+
+ with open(inter_json_path, 'r') as f:
+ user_interactions = json.load(f)
+
+ for user_id_str, item_ids in user_interactions.items():
+ user_id = int(user_id_str)
+ if item_ids:
+ max_item_id = max(max_item_id, max(item_ids))
+ if len(item_ids) < 5:
+ print(f'Expected a core-5 dataset, but user {user_id} has only {len(item_ids)} items')
+ train_dataset.append({
+ 'user_ids': [user_id],
+ 'item_ids': item_ids[:-2],
+ })
+
+
+ cooccur_counter_mapping = cls.build_cooccur_counter_mapping(train_dataset, window_size=window_size)
+ print(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items but max_item_id is {max_item_id}')
+
+
+ train_sampler = train_dataset
+
+
+ return cls(
+ train_sampler=train_sampler,
+ num_items=max_item_id + 1,
+ cooccur_counter_mapping=cooccur_counter_mapping
+ )
+
+
+ @classmethod
+ def create_from_split_part(
+ cls,
+ train_inter_parquet_path,
+ window_size,
+ ):
+
+ max_item_id = 0
+ train_dataset = []
+
+
+ train_interactions = InteractionsDatasetParquet(train_inter_parquet_path)
+
+ actions_num = 0
+ for session in train_interactions:
+ user_id, item_ids = int(session['user_id']), session['item_ids']
+ if item_ids.size:  # .any() would skip a session whose only item id is 0
+ max_item_id = max(max_item_id, max(item_ids))
+ actions_num += len(item_ids)
+ train_dataset.append({
+ 'user_ids': [user_id],
+ 'item_ids': item_ids,
+ })
+
+
+ print(f'Train: {len(train_dataset)} users')
+ print(f'Max item ID: {max_item_id}')
+ print(f"Actions num: {actions_num}")
+
+
+ cooccur_counter_mapping = cls.build_cooccur_counter_mapping(
+ train_dataset,
+ window_size=window_size
+ )
+
+
+ print(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items')
+
+
+ return cls(
+ train_sampler=train_dataset,
+ num_items=max_item_id + 1,
+ cooccur_counter_mapping=cooccur_counter_mapping
+ )
+
+
+
+ @staticmethod
+ def build_cooccur_counter_mapping(train_dataset, window_size):
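+ # For every item, count how often each other item appears within window_size
+ # positions of it in the same session. Toy example (hypothetical ids) with
+ # window_size=1: the session [10, 11, 12] yields
+ # {10: {11: 1}, 11: {10: 1, 12: 1}, 12: {11: 1}}.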
+ cooccur_counts = defaultdict(Counter)
+ for session in train_dataset:
+ items = session['item_ids']
+ for i in range(len(items)):
+ item_i = items[i]
+ for j in range(max(0, i - window_size), min(len(items), i + window_size + 1)):
+ if i != j:
+ cooccur_counts[item_i][items[j]] += 1
+ max_hist_len = max(len(counter) for counter in cooccur_counts.values()) if cooccur_counts else 0
+ print(f"Max cooccurrence history length is {max_hist_len}")
+ return cooccur_counts
+
+
+
+ @property
+ def cooccur_counter_mapping(self):
+ return self._cooccur_counter_mapping
\ No newline at end of file
diff --git a/scripts/plum-lsvd/data.py b/scripts/plum-lsvd/data.py
new file mode 100644
index 0000000..5a780fb
--- /dev/null
+++ b/scripts/plum-lsvd/data.py
@@ -0,0 +1,87 @@
+import numpy as np
+import pickle
+
+from irec.data.base import BaseDataset
+from irec.data.transforms import Transform
+
+
+import polars as pl
+
+class InteractionsDatasetParquet(BaseDataset):
+ def __init__(self, data_path, max_items=None):
+ self.df = pl.read_parquet(data_path)
+ assert 'uid' in self.df.columns, "Missing 'uid' column"
+ assert 'item_ids' in self.df.columns, "Missing 'item_ids' column"
+ print(f"Dataset loaded: {len(self.df)} users")
+
+ if max_items is not None:
+ self.df = self.df.with_columns(
+ pl.col("item_ids").list.slice(-max_items).alias("item_ids")
+ )
+
+ def __getitem__(self, idx):
+ row = self.df.row(idx, named=True)
+ return {
+ 'user_id': row['uid'],
+ 'item_ids': np.array(row['item_ids'], dtype=np.uint32),
+ }
+
+ def __len__(self):
+ return len(self.df)
+
+ def __iter__(self):
+ for idx in range(len(self)):
+ yield self[idx]
+
+
+class EmbeddingDatasetParquet(BaseDataset):
+ def __init__(self, data_path):
+ self.df = pl.read_parquet(data_path)
+ self.item_ids = np.array(self.df['item_id'], dtype=np.int64)
+ self.embeddings = np.array(self.df['embedding'].to_list(), dtype=np.float32)
+ print(f"embedding dim: {self.embeddings[0].shape}")
+
+ def __getitem__(self, idx):
+ index = self.item_ids[idx]
+ tensor_emb = self.embeddings[idx]
+ return {
+ 'item_id': index,
+ 'embedding': tensor_emb,
+ 'embedding_dim': len(tensor_emb)
+ }
+
+ def __len__(self):
+ return len(self.embeddings)
+
+
+class EmbeddingDataset(BaseDataset):
+ def __init__(self, data_path):
+ self.data_path = data_path
+ with open(data_path, 'rb') as f:
+ self.data = pickle.load(f)
+
+ self.item_ids = np.array(self.data['item_id'], dtype=np.int64)
+ self.embeddings = np.array(self.data['embedding'], dtype=np.float32)
+
+ def __getitem__(self, idx):
+ index = self.item_ids[idx]
+ tensor_emb = self.embeddings[idx]
+ return {
+ 'item_id': index,
+ 'embedding': tensor_emb,
+ 'embedding_dim': len(tensor_emb)
+ }
+
+ def __len__(self):
+ return len(self.embeddings)
+
+
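+# Restores the (batch, embedding_dim) shape for the listed keys; in the
+# training and inference scripts it runs right after Collate/ToTorch in the
+# DataLoader .map() chain.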
+class ProcessEmbeddings(Transform):
+ def __init__(self, embedding_dim, keys):
+ self.embedding_dim = embedding_dim
+ self.keys = keys
+
+ def __call__(self, batch):
+ for key in self.keys:
+ batch[key] = batch[key].reshape(-1, self.embedding_dim)
+ return batch
\ No newline at end of file
diff --git a/scripts/plum-lsvd/infer_default.py b/scripts/plum-lsvd/infer_default.py
new file mode 100644
index 0000000..b15fb6d
--- /dev/null
+++ b/scripts/plum-lsvd/infer_default.py
@@ -0,0 +1,146 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_plum_rqvae_beauty_ws_2_best_0.0051.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# REMAINING SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDataset(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+ logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
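+ # Resolve semantic-id collisions: items that share the same codeword tuple
+ # each get one extra distinguishing token, and every level-i token is then
+ # offset by i * CODEBOOK_SIZE so that all levels use disjoint id ranges.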
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum-lsvd/infer_plum_4.1.py b/scripts/plum-lsvd/infer_plum_4.1.py
new file mode 100644
index 0000000..bb70a9d
--- /dev/null
+++ b/scripts/plum-lsvd/infer_plum_4.1.py
@@ -0,0 +1,146 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+
+# EXPERIMENT WITH FULL HISTORY
+IREC_PATH = '../../'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e35_best_0.0096.pth'
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/items_metadata_remapped.parquet"
+
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+CODEBOOK_SIZE = 512
+K = 2000
+EXPERIMENT_NAME = f'4-1_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_8w_e_35'
+# REMAINING SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+NUM_CODEBOOKS = 3
+BETA = 0.25
+
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+ logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum-lsvd/infer_plum_4.2.py b/scripts/plum-lsvd/infer_plum_4.2.py
new file mode 100644
index 0000000..977c0b5
--- /dev/null
+++ b/scripts/plum-lsvd/infer_plum_4.2.py
@@ -0,0 +1,146 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+
+# EXPERIMENT WITH TRUNCATED HISTORY
+IREC_PATH = '../../'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-2_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e35_best_0.0096.pth'
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/items_metadata_remapped.parquet"
+
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+CODEBOOK_SIZE = 512
+K = 2000
+EXPERIMENT_NAME = f'4-2_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_8w_e_35'
+# REMAINING SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+NUM_CODEBOOKS = 3
+BETA = 0.25
+
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+ logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum-lsvd/infer_rqvae.py b/scripts/plum-lsvd/infer_rqvae.py
new file mode 100644
index 0000000..53a587c
--- /dev/null
+++ b/scripts/plum-lsvd/infer_rqvae.py
@@ -0,0 +1,161 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from collections import Counter
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/items_metadata_remapped.parquet'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/rqvae_vk_lsvd_cz_512_8-weeks_best_0.009.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = 'rqvae_vk_lsvd_cz_512_8-weeks'
+
+# REMAINING SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+ logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ collision_stats = []
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ collision_stats.append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ if collision_stats:
+ max_col_tok = max(collision_stats)
+ avg_col_tok = np.mean(collision_stats)
+ collision_distribution = Counter(collision_stats)
+
+ print(f"Max collision token: {max_col_tok}")
+ print(f"Avg collision token: {avg_col_tok:.2f}")
+ print(f"Total items with collisions: {len(collision_stats)}")
+ print(f"Collision solver distribution: {dict(collision_distribution)}")
+ else:
+ print("No collisions detected")
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum-lsvd/models.py b/scripts/plum-lsvd/models.py
new file mode 100644
index 0000000..d475712
--- /dev/null
+++ b/scripts/plum-lsvd/models.py
@@ -0,0 +1,131 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PlumRQVAE(nn.Module):
+ def __init__(
+ self,
+ input_dim,
+ num_codebooks,
+ codebook_size,
+ embedding_dim,
+ beta=0.25,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0,  # divides the similarity logits in contrastive_loss; must be non-zero
+ ):
+ super().__init__()
+ self.register_buffer('beta', torch.tensor(beta))
+ self.temperature = temperature
+
+ self.input_dim = input_dim
+ self.num_codebooks = num_codebooks
+ self.codebook_size = codebook_size
+ self.embedding_dim = embedding_dim
+ self.quant_loss_weight = quant_loss_weight
+
+ self.contrastive_loss_weight = contrastive_loss_weight
+
+ self.encoder = self.make_encoding_tower(input_dim, embedding_dim)
+ self.decoder = self.make_encoding_tower(embedding_dim, input_dim)
+
+ self.codebooks = torch.nn.ParameterList()
+ for _ in range(num_codebooks):
+ # wrapped in nn.Parameter so the optimizer updates the codewords; the
+ # InitCodebooks callback overwrites these values before training starts
+ cb = nn.Parameter(torch.empty(codebook_size, embedding_dim))
+ self.codebooks.append(cb)
+
+ @staticmethod
+ def make_encoding_tower(d1, d2, bias=False):
+ return torch.nn.Sequential(
+ nn.Linear(d1, d1),
+ nn.ReLU(),
+ nn.Linear(d1, d2),
+ nn.ReLU(),
+ nn.Linear(d2, d2, bias=bias)
+ )
+
+ @staticmethod
+ def get_codebook_indices(remainder, codebook):
+ dist = torch.cdist(remainder, codebook)
+ return dist.argmin(dim=-1)
+
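+ # Residual quantization: each codebook quantizes whatever the previous
+ # codebooks left unexplained, and the reconstruction is the sum of the
+ # selected codewords.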
+ def _quantize_representation(self, latent_vector):
+ latent_restored = 0
+ remainder = latent_vector
+
+ for codebook in self.codebooks:
+ codebook_indices = self.get_codebook_indices(remainder, codebook)
+ quantized = codebook[codebook_indices]
+ codebook_vectors = remainder + (quantized - remainder).detach()
+ latent_restored += codebook_vectors
+ remainder = remainder - codebook_vectors
+
+ return latent_restored
+
+ def contrastive_loss(self, p_i, p_i_star):
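+ # InfoNCE over the batch: each quantized item representation p_i should be
+ # most similar to the quantized embedding of its own co-occurrence mixture
+ # p_i_star, with the other rows in the batch acting as negatives.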
+ N_b = p_i.size(0)
+
+ p_i = F.normalize(p_i, p=2, dim=-1)  # TODO: try without normalization
+ p_i_star = F.normalize(p_i_star, p=2, dim=-1)
+
+ similarities = torch.matmul(p_i, p_i_star.T) / self.temperature
+
+ labels = torch.arange(N_b, dtype=torch.long, device=p_i.device)
+
+ loss = F.cross_entropy(similarities, labels)
+
+ return loss
+
+ def forward(self, inputs):
+ latent_vector = self.encoder(inputs['embedding'])
+ item_ids = inputs['item_id']
+
+ latent_restored = 0
+ rqvae_loss = 0
+ clusters = []
+ remainder = latent_vector
+
+ for codebook in self.codebooks:
+ codebook_indices = self.get_codebook_indices(remainder, codebook)
+ clusters.append(codebook_indices)
+
+ quantized = codebook[codebook_indices]
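+ # straight-through estimator: the forward pass uses the quantized vector,
+ # while gradients flow back to the encoder through `remainder`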
+ codebook_vectors = remainder + (quantized - remainder).detach()
+
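+ # commitment term (encoder output pulled toward the frozen codeword, scaled
+ # by beta) plus codebook term (codeword pulled toward the frozen residual)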
+ rqvae_loss += self.beta * torch.nn.functional.mse_loss(remainder, quantized.detach())
+ rqvae_loss += torch.nn.functional.mse_loss(quantized, remainder.detach())
+
+ latent_restored += codebook_vectors
+ remainder = remainder - codebook_vectors
+
+ embeddings_restored = self.decoder(latent_restored)
+ recon_loss = torch.nn.functional.mse_loss(embeddings_restored, inputs['embedding'])
+
+ if 'cooccurrence_embedding' in inputs:
+ cooccurrence_latent = self.encoder(inputs['cooccurrence_embedding'].to(latent_restored.device))
+ cooccurrence_restored = self._quantize_representation(cooccurrence_latent)
+ con_loss = self.contrastive_loss(latent_restored, cooccurrence_restored)
+ else:
+ con_loss = torch.as_tensor(0.0, device=latent_vector.device)
+
+ loss = (
+ recon_loss
+ + self.quant_loss_weight * rqvae_loss
+ + self.contrastive_loss_weight * con_loss
+ ).mean()
+
+ clusters_counts = []
+ for cluster in clusters:
+ clusters_counts.append(torch.bincount(cluster, minlength=self.codebook_size))
+
+ return loss, {
+ 'loss': loss.item(),
+ 'recon_loss': recon_loss.mean().item(),
+ 'rqvae_loss': rqvae_loss.mean().item(),
+ 'con_loss': con_loss.item(),
+
+ 'clusters_counts': clusters_counts,
+ 'clusters': torch.stack(clusters).T,
+ 'embedding_hat': embeddings_restored,
+ }
\ No newline at end of file
diff --git a/scripts/plum-lsvd/train_plum_4.1.py b/scripts/plum-lsvd/train_plum_4.1.py
new file mode 100644
index 0000000..85027b3
--- /dev/null
+++ b/scripts/plum-lsvd/train_plum_4.1.py
@@ -0,0 +1,180 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddingsVectorized
+from cooc_data import CoocMappingDataset
+
+# EXPERIMENT WITH FULL HISTORY
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 35
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+K = 2000
+
+EXPERIMENT_NAME = f'4-1_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_8w_e{NUM_EPOCHS}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/base_with_gap_interactions_grouped.parquet"
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/items_metadata_remapped.parquet"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH,
+ )
+
+ data = CoocMappingDataset.create_from_split_part(
+ train_inter_parquet_path=INTER_TRAIN_PATH,
+ window_size=WINDOW_SIZE
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'], device=DEVICE)
+ all_item_ids.append(item_id)
+
+ # add_cooc_transform = AddWeightedCooccurrenceEmbeddings(data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids, K)
+ add_cooc_transform = AddWeightedCooccurrenceEmbeddingsVectorized(
+ cooccur_counts=data.cooccur_counter_mapping,
+ item_id_to_embedding=item_id_to_embedding,
+ all_item_ids=all_item_ids,
+ device=DEVICE,
+ max_neighbors=K,
+ seed=42
+ )
+
+ train_dataloader = DataLoader(  # FIXME: the transform chain runs in the main process; should be moved to workers
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform
+ ).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform)
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'train/con_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+ logger.debug('Everything is ready for the training run!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum-lsvd/train_plum_4.2.py b/scripts/plum-lsvd/train_plum_4.2.py
new file mode 100644
index 0000000..de1864d
--- /dev/null
+++ b/scripts/plum-lsvd/train_plum_4.2.py
@@ -0,0 +1,180 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddingsVectorized
+from cooc_data import CoocMappingDataset
+
+# EXPERIMENT WITH TRUNCATED HISTORY
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 35
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+K = 2000
+
+EXPERIMENT_NAME = f'4-2_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_8w_e{NUM_EPOCHS}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/base_interactions_grouped.parquet"
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/items_metadata_remapped.parquet"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH,
+ )
+
+ data = CoocMappingDataset.create_from_split_part(
+ train_inter_parquet_path=INTER_TRAIN_PATH,
+ window_size=WINDOW_SIZE
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'], device=DEVICE)
+ all_item_ids.append(item_id)
+
+ # add_cooc_transform = AddWeightedCooccurrenceEmbeddings(data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids, K)
+ add_cooc_transform = AddWeightedCooccurrenceEmbeddingsVectorized(
+ cooccur_counts=data.cooccur_counter_mapping,
+ item_id_to_embedding=item_id_to_embedding,
+ all_item_ids=all_item_ids,
+ device=DEVICE,
+ max_neighbors=K,
+ seed=42
+ )
+
+ train_dataloader = DataLoader(  # FIXME: the transform chain runs in the main process; should be moved to workers
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform
+ ).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform)
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'train/con_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+    logger.debug('Everything is ready for the training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum-lsvd/train_rqvae.py b/scripts/plum-lsvd/train_rqvae.py
new file mode 100644
index 0000000..ea41b74
--- /dev/null
+++ b/scripts/plum-lsvd/train_rqvae.py
@@ -0,0 +1,174 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddings
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 15
+BATCH_SIZE = 1024
+
+INPUT_DIM = 64
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 3
+MAX_LEN = 500
+K = 100
+
+# EXPERIMENT_NAME = f'4-1_vk_lsvd_ods_base_with_gap_cb_{CODEBOOK_SIZE}_ws_{WINDOW_SIZE}_k_{K}_ml_{MAX_LEN}'
+EXPERIMENT_NAME = 'rqvae_vk_lsvd_cz_512_8-weeks'
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/items_metadata_remapped.parquet"
+IREC_PATH = '../../'
+
+# print(INTER_TRAIN_PATH)
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ # data = CoocMappingDataset.create_from_split_part(
+ # train_inter_parquet_path=INTER_TRAIN_PATH,
+ # window_size=WINDOW_SIZE,
+ # max_items=MAX_LEN
+ # )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'], device=DEVICE)
+ all_item_ids.append(item_id)
+
+ # add_cooc_transform = AddWeightedCooccurrenceEmbeddings(data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
+
+    train_dataloader = DataLoader(  # FIXME: the transform call runs in the main thread, needs fixing
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ # ).map(add_cooc_transform
+ ).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ # ).map(add_cooc_transform)
+ )
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'train/con_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+    logger.debug('Everything is ready for the training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum-lsvd/transforms.py b/scripts/plum-lsvd/transforms.py
new file mode 100644
index 0000000..143002b
--- /dev/null
+++ b/scripts/plum-lsvd/transforms.py
@@ -0,0 +1,287 @@
+import numpy as np
+import pickle
+import torch
+from typing import Dict, List
+import time
+from collections import defaultdict, Counter
+
+class AddWeightedCooccurrenceEmbeddings:
+ def __init__(self, cooccur_counts, item_id_to_embedding, all_item_ids, top_k):
+ self.cooccur_counts = cooccur_counts
+ self.item_id_to_embedding = item_id_to_embedding
+ self.all_item_ids = all_item_ids
+ self.call_count = 0
+ self.top_k = top_k
+
+        # Precompute the top_k neighbors for each item_id
+ self._top_k_cache = {}
+ self._build_top_k_cache()
+
+ def _build_top_k_cache(self):
+ """Предвычисляет top-k соседей для каждого item_id"""
+ for item_id, counter in self.cooccur_counts.items():
+ if counter and len(counter) > 0:
+                # Sort by frequency and keep the top_k
+ top_items = counter.most_common(self.top_k)
+ cooc_ids, freqs = zip(*top_items)
+ freqs_array = np.array(freqs, dtype=np.float32)
+ probs = freqs_array / freqs_array.sum()
+
+ self._top_k_cache[item_id] = {
+ 'cooc_ids': cooc_ids,
+ 'probs': probs
+ }
+
+ def __call__(self, batch):
+ self.call_count += 1
+ item_ids = batch['item_id']
+ cooccurrence_embeddings = []
+
+ for idx, item_id in enumerate(item_ids):
+ item_id_val = int(item_id.item()) if torch.is_tensor(item_id) else int(item_id)
+
+            # Use the precomputed top-k cache
+ if item_id_val in self._top_k_cache:
+ cache_entry = self._top_k_cache[item_id_val]
+ cooc_id = np.random.choice(
+ cache_entry['cooc_ids'],
+ p=cache_entry['probs']
+ )
+ else:
+ cooc_id = np.random.choice(self.all_item_ids)
+ if self.call_count % 500 == 0 and idx < 5:
+ print(f" idx={idx}: item_id={item_id_val} fallback random")
+ if self.call_count % 500 == 0 and idx < 5:
+ print(f" idx={idx}: item_id={item_id_val} cooc_id={cooc_id}")
+ cooc_emb = self.item_id_to_embedding.get(cooc_id, batch['embedding'][0])
+ cooccurrence_embeddings.append(cooc_emb)
+
+ batch['cooccurrence_embedding'] = torch.stack(cooccurrence_embeddings)
+ return batch
+
+# TODO: run SASRec and LETTER; expected quality ordering: sasrec << tiger < letter < plum
+
+
+class AddWeightedCooccurrenceEmbeddingsVectorized:
+
+ def __init__(
+ self,
+ cooccur_counts: Dict[int, Dict[int, int]],
+ item_id_to_embedding: Dict[int, torch.Tensor],
+ all_item_ids: List[int],
+ device: torch.device,
+ limit_neighbors: bool = True,
+ max_neighbors: int = 256,
+ seed: int = 42,
+ verbose: bool = True
+ ):
+ self.device = device
+ self.call_count = 0
+ self.limit_neighbors = limit_neighbors
+ self.max_neighbors = max_neighbors
+ self.seed = seed
+ self.verbose = verbose
+
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+
+ if self.verbose:
+ print(f"\n{'='*80}")
+ print(f"Initializing AddWeightedCooccurrenceEmbeddingsVectorized")
+ print(f"{'='*80}")
+ init_start = time.time()
+
+ all_item_ids_sorted = sorted(all_item_ids)
+ self.item_id_to_idx = {item_id: idx for idx, item_id in enumerate(all_item_ids_sorted)}
+ self.idx_to_item_id = torch.tensor(all_item_ids_sorted, device=device, dtype=torch.long)
+
+ if self.verbose:
+ print(f"[INIT] Sorted {len(all_item_ids)} item IDs and created mappings")
+
+ num_items = len(all_item_ids_sorted)
+ embedding_dim = next(iter(item_id_to_embedding.values())).shape[0]
+
+ if self.verbose:
+ print(f"[INIT] Num items: {num_items}, Embedding dim: {embedding_dim}")
+
+ self.embedding_matrix = torch.zeros(
+ size=(num_items, embedding_dim),
+ device=device,
+ dtype=torch.float32,
+ requires_grad=False
+ )
+
+ emb_load_start = time.time()
+ for item_id, emb in item_id_to_embedding.items():
+ idx = self.item_id_to_idx[item_id]
+ if isinstance(emb, torch.Tensor):
+ self.embedding_matrix[idx] = emb.to(device).detach()
+ else:
+ self.embedding_matrix[idx] = torch.tensor(emb, device=device, dtype=torch.float32)
+
+ if self.verbose:
+ emb_load_time = time.time() - emb_load_start
+ print(f"[INIT] Loaded {len(item_id_to_embedding)} embeddings in {emb_load_time*1000:.2f}ms")
+
+ self._build_cooccurrence_tables(cooccur_counts, num_items)
+
+ if self.verbose:
+ init_time = time.time() - init_start
+ print(f"[INIT] Total initialization time: {init_time*1000:.2f}ms")
+ print(f"{'='*80}\n")
+
+ def _build_cooccurrence_tables(self, cooccur_counts: Dict, num_items: int):
+ if self.verbose:
+ build_start = time.time()
+ print(f"\n[BUILD] Building cooccurrence tables...")
+
+ indexed_cooccur_counts = {}
+ for item_id, neighbors in cooccur_counts.items():
+ if item_id in self.item_id_to_idx:
+ idx = self.item_id_to_idx[item_id]
+ indexed_neighbors = {}
+ for neighbor_id, count in neighbors.items():
+ if neighbor_id in self.item_id_to_idx:
+ neighbor_idx = self.item_id_to_idx[neighbor_id]
+ indexed_neighbors[neighbor_idx] = count
+ if indexed_neighbors:
+ indexed_cooccur_counts[idx] = indexed_neighbors
+
+ if self.verbose:
+ items_with_cooc = len(indexed_cooccur_counts)
+ print(f"[BUILD] Items with cooccurrences: {items_with_cooc}/{num_items}")
+ total_pairs = sum(len(neighbors) for neighbors in indexed_cooccur_counts.values())
+ print(f"[BUILD] Total cooccurrence pairs: {total_pairs}")
+
+ max_actual_neighbors = 0
+ for idx in range(num_items):
+ counter = indexed_cooccur_counts.get(idx)
+ if counter and len(counter) > 0:
+ num_neighbors = len(counter)
+ if self.limit_neighbors:
+ num_neighbors = min(num_neighbors, self.max_neighbors)
+ else:
+ num_neighbors = num_items
+ max_actual_neighbors = max(max_actual_neighbors, num_neighbors)
+
+ if self.limit_neighbors:
+ max_actual_neighbors = min(max_actual_neighbors, self.max_neighbors)
+
+ if self.verbose:
+ print(f"[BUILD] Max neighbors per item: {max_actual_neighbors}")
+
+ neighbors_matrix = torch.zeros(
+ (num_items, max_actual_neighbors),
+ dtype=torch.long,
+ device=self.device,
+ requires_grad=False
+ )
+
+ probs_matrix = torch.zeros(
+ (num_items, max_actual_neighbors),
+ dtype=torch.float32,
+ device=self.device,
+ requires_grad=False
+ )
+
+ valid_mask = torch.zeros(
+ (num_items, max_actual_neighbors),
+ dtype=torch.bool,
+ device=self.device,
+ requires_grad=False
+ )
+
+ matrix_fill_start = time.time()
+
+ for idx in range(num_items):
+ counter = indexed_cooccur_counts.get(idx)
+
+ if counter and len(counter) > 0:
+                cooc_items = sorted(counter.items(), key=lambda x: x[1], reverse=True)  # sort neighbors by count
+ cooc_ids, freqs = zip(*cooc_items)
+ cooc_ids = list(cooc_ids)
+ freqs = np.array(freqs, dtype=np.float32)
+
+ num_neighbors = min(len(cooc_ids), max_actual_neighbors)
+ cooc_ids = cooc_ids[:num_neighbors]
+ freqs = freqs[:num_neighbors]
+
+ probs = freqs / freqs.sum()
+
+ neighbors_matrix[idx, :num_neighbors] = torch.tensor(
+ cooc_ids, dtype=torch.long, device=self.device
+ )
+ probs_matrix[idx, :num_neighbors] = torch.tensor(
+ probs, dtype=torch.float32, device=self.device
+ )
+ valid_mask[idx, :num_neighbors] = True
+
+ else:
+ if max_actual_neighbors >= num_items:
+ neighbors_matrix[idx, :num_items] = torch.arange(num_items, device=self.device)
+ probs_matrix[idx, :num_items] = 1.0 / num_items
+ valid_mask[idx, :num_items] = True
+ else:
+ perm = torch.randperm(num_items, device=self.device)[:max_actual_neighbors]
+ neighbors_matrix[idx] = perm
+ probs_matrix[idx] = 1.0 / max_actual_neighbors
+ valid_mask[idx] = True
+
+ if self.verbose:
+ matrix_fill_time = time.time() - matrix_fill_start
+ print(f"[BUILD] Filled matrices in {matrix_fill_time*1000:.2f}ms")
+
+ self.neighbors_matrix = neighbors_matrix
+ self.probs_matrix = probs_matrix
+ self.valid_mask = valid_mask
+
+ if self.verbose:
+ print(f"[BUILD] neighbors_matrix shape: {neighbors_matrix.shape}")
+ print(f"[BUILD] probs_matrix shape: {probs_matrix.shape}")
+ print(f"[BUILD] valid_mask shape: {valid_mask.shape}")
+ build_time = time.time() - build_start
+ print(f"[BUILD] Total build time: {build_time*1000:.2f}ms")
+
+ def __call__(self, batch):
+ self.call_count += 1
+
+ call_start = time.time()
+
+ item_ids = batch['item_id']
+
+ if not isinstance(item_ids, torch.Tensor):
+ item_ids = torch.tensor(item_ids, device=self.device, dtype=torch.long)
+ else:
+ item_ids = item_ids.to(device=self.device, dtype=torch.long)
+
+        batch_size = item_ids.shape[0]
+
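+        # map raw item ids to dense row indices; ids missing from the mapping fall back to row 0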
+ indexed_item_ids = torch.tensor(
+ [self.item_id_to_idx.get(int(iid.item()), 0) for iid in item_ids],
+ device=self.device,
+ dtype=torch.long
+ )
+
+ probs = self.probs_matrix[indexed_item_ids]
+ mask = self.valid_mask[indexed_item_ids]
+
+ masked_probs = probs.clone()
+ masked_probs[~mask] = 0.0
+
+ row_sums = masked_probs.sum(dim=1, keepdim=True)
+ row_sums[row_sums == 0] = 1.0
+ masked_probs = masked_probs / row_sums
+
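+        # sample one neighbor per row from the renormalized co-occurrence distribution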
+ neighbor_indices = torch.multinomial(masked_probs, num_samples=1, replacement=True)
+ neighbor_indices = neighbor_indices.squeeze(1)
+
+ cooc_indexed_ids = self.neighbors_matrix[indexed_item_ids, neighbor_indices]
+ cooccurrence_embeddings = self.embedding_matrix[cooc_indexed_ids]
+
+ batch['cooccurrence_embedding'] = cooccurrence_embeddings
+
+ call_time = time.time() - call_start
+ if self.verbose and self.call_count % 1000 == 0:
+ print(f"Call #{self.call_count}: batch_size={batch_size}, {call_time*1000:.2f}ms")
+
+ return batch
\ No newline at end of file
diff --git a/scripts/plum-yambda/callbacks.py b/scripts/plum-yambda/callbacks.py
new file mode 100644
index 0000000..43ec460
--- /dev/null
+++ b/scripts/plum-yambda/callbacks.py
@@ -0,0 +1,64 @@
+import torch
+
+import irec.callbacks as cb
+from irec.runners import TrainingRunner, TrainingRunnerContext
+
+class InitCodebooks(cb.TrainingCallback):
+ def __init__(self, dataloader):
+ super().__init__()
+ self._dataloader = dataloader
+
+ @torch.no_grad()
+ def before_run(self, runner: TrainingRunner):
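+        # data-dependent init: codebook i is filled with encoder residuals of a
+        # random batch after subtracting the assignments of codebooks 0..i-1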
+ for i in range(len(runner.model.codebooks)):
+ X = next(iter(self._dataloader))['embedding']
+ idx = torch.randperm(X.shape[0], device=X.device)[:len(runner.model.codebooks[i])]
+ remainder = runner.model.encoder(X[idx])
+
+ for j in range(i):
+ codebook_indices = runner.model.get_codebook_indices(remainder, runner.model.codebooks[j])
+ codebook_vectors = runner.model.codebooks[j][codebook_indices]
+ remainder = remainder - codebook_vectors
+
+ runner.model.codebooks[i].data = remainder.detach()
+
+
+class FixDeadCentroids(cb.TrainingCallback):
+ def __init__(self, dataloader):
+ super().__init__()
+ self._dataloader = dataloader
+
+ def after_step(self, runner: TrainingRunner, context: TrainingRunnerContext):
+ for i, num_fixed in enumerate(self.fix_dead_codebooks(runner)):
+ context.metrics[f'num_dead/{i}'] = num_fixed
+
+ @torch.no_grad()
+ def fix_dead_codebooks(self, runner: TrainingRunner):
+ num_fixed = []
+ for codebook_idx, codebook in enumerate(runner.model.codebooks):
+ centroid_counts = torch.zeros(codebook.shape[0], dtype=torch.long, device=codebook.device)
+ random_batch = next(iter(self._dataloader))['embedding']
+
+ for batch in self._dataloader:
+ remainder = runner.model.encoder(batch['embedding'])
+ for l in range(codebook_idx):
+ ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l])
+ remainder = remainder - runner.model.codebooks[l][ind]
+
+ indices = runner.model.get_codebook_indices(remainder, codebook)
+ centroid_counts.scatter_add_(0, indices, torch.ones_like(indices))
+
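+            # centroids never selected over a full pass are "dead"; re-seed them
+            # below with random residuals so they can compete again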
+ dead_mask = (centroid_counts == 0)
+ num_dead = int(dead_mask.sum().item())
+ num_fixed.append(num_dead)
+ if num_dead == 0:
+ continue
+
+ remainder = runner.model.encoder(random_batch)
+ for l in range(codebook_idx):
+ ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l])
+ remainder = remainder - runner.model.codebooks[l][ind]
+ remainder = remainder[torch.randperm(remainder.shape[0], device=codebook.device)][:num_dead]
+ codebook[dead_mask] = remainder.detach()
+
+ return num_fixed
diff --git a/scripts/plum-yambda/cooc_data.py b/scripts/plum-yambda/cooc_data.py
new file mode 100644
index 0000000..50f2bdd
--- /dev/null
+++ b/scripts/plum-yambda/cooc_data.py
@@ -0,0 +1,108 @@
+import json
+from collections import defaultdict, Counter
+
+from loguru import logger
+
+
+class CoocMappingDataset:
+ def __init__(
+ self,
+ train_sampler,
+ num_items,
+ cooccur_counter_mapping=None
+ ):
+ self._train_sampler = train_sampler
+ self._num_items = num_items
+ self._cooccur_counter_mapping = cooccur_counter_mapping
+
+ @classmethod
+ def create(cls, inter_json_path, window_size):
+ max_item_id = 0
+        train_dataset = []
+
+ with open(inter_json_path, 'r') as f:
+ user_interactions = json.load(f)
+
+ for user_id_str, item_ids in user_interactions.items():
+ user_id = int(user_id_str)
+ if item_ids:
+ max_item_id = max(max_item_id, max(item_ids))
+ assert len(item_ids) >= 5, f'Core-5 dataset is used, user {user_id} has only {len(item_ids)} items'
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids[:-2],
+ })
+
+ cooccur_counter_mapping = cls.build_cooccur_counter_mapping(train_dataset, window_size=window_size)
+ logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items but max_item_id is {max_item_id}')
+
+ train_sampler = train_dataset
+
+ return cls(
+ train_sampler=train_sampler,
+ num_items=max_item_id + 1,
+ cooccur_counter_mapping=cooccur_counter_mapping
+ )
+
+ @classmethod
+ def create_from_split_part(
+ cls,
+ train_inter_json_path,
+ window_size
+ ):
+
+ max_item_id = 0
+ train_dataset = []
+
+ with open(train_inter_json_path, 'r') as f:
+ train_interactions = json.load(f)
+
+        # Process the TRAIN split
+ for user_id_str, item_ids in train_interactions.items():
+ user_id = int(user_id_str)
+ if item_ids:
+ max_item_id = max(max_item_id, max(item_ids))
+
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids,
+ })
+
+ logger.debug(f'Train: {len(train_dataset)} users')
+ logger.debug(f'Max item ID: {max_item_id}')
+
+ cooccur_counter_mapping = cls.build_cooccur_counter_mapping(
+ train_dataset,
+ window_size=window_size
+ )
+
+ logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items')
+
+ return cls(
+ train_sampler=train_dataset,
+ num_items=max_item_id + 1,
+ cooccur_counter_mapping=cooccur_counter_mapping
+ )
+
+
+ @staticmethod
+    def build_cooccur_counter_mapping(train_dataset, window_size):  # TODO: pass timestamps and build the window from them
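+        # Example with window_size=1: the session [a, b, c] yields
+        # {a: {b: 1}, b: {a: 1, c: 1}, c: {b: 1}} - each item counts the items
+        # within window_size positions on either side of it.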
+ cooccur_counts = defaultdict(Counter)
+ for session in train_dataset:
+ items = session['item.ids']
+ for i in range(len(items)):
+ item_i = items[i]
+ for j in range(max(0, i - window_size), min(len(items), i + window_size + 1)):
+ if i != j:
+ cooccur_counts[item_i][items[j]] += 1
+ return cooccur_counts
+
+
+ @property
+ def cooccur_counter_mapping(self):
+ return self._cooccur_counter_mapping
diff --git a/scripts/plum-yambda/data.py b/scripts/plum-yambda/data.py
new file mode 100644
index 0000000..842adb5
--- /dev/null
+++ b/scripts/plum-yambda/data.py
@@ -0,0 +1,62 @@
+import pickle
+
+import numpy as np
+import polars as pl
+
+from irec.data.base import BaseDataset
+from irec.data.transforms import Transform
+
+
+class EmbeddingDatasetParquet(BaseDataset):
+ def __init__(self, data_path):
+ self.df = pl.read_parquet(data_path)
+ self.item_ids = np.array(self.df['item_id'], dtype=np.int64)
+ self.embeddings = np.array(self.df['embedding'].to_list(), dtype=np.float32)
+ print(f"embedding dim: {self.embeddings[0].shape}")
+
+ def __getitem__(self, idx):
+ index = self.item_ids[idx]
+ tensor_emb = self.embeddings[idx]
+ return {
+ 'item_id': index,
+ 'embedding': tensor_emb,
+ 'embedding_dim': len(tensor_emb)
+ }
+
+ def __len__(self):
+ return len(self.embeddings)
+
+
+class EmbeddingDataset(BaseDataset):
+ def __init__(self, data_path):
+ self.data_path = data_path
+ with open(data_path, 'rb') as f:
+ self.data = pickle.load(f)
+
+ self.item_ids = np.array(self.data['item_id'], dtype=np.int64)
+ self.embeddings = np.array(self.data['embedding'], dtype=np.float32)
+
+ def __getitem__(self, idx):
+ index = self.item_ids[idx]
+ tensor_emb = self.embeddings[idx]
+ return {
+ 'item_id': index,
+ 'embedding': tensor_emb,
+ 'embedding_dim': len(tensor_emb)
+ }
+
+ def __len__(self):
+ return len(self.embeddings)
+
+
+class ProcessEmbeddings(Transform):
+ def __init__(self, embedding_dim, keys):
+ self.embedding_dim = embedding_dim
+ self.keys = keys
+
+ def __call__(self, batch):
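+        # collation can flatten per-sample vectors; restore the [batch, embedding_dim] shape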
+ for key in self.keys:
+ batch[key] = batch[key].reshape(-1, self.embedding_dim)
+ return batch
\ No newline at end of file
diff --git a/scripts/plum-yambda/models.py b/scripts/plum-yambda/models.py
new file mode 100644
index 0000000..a411519
--- /dev/null
+++ b/scripts/plum-yambda/models.py
@@ -0,0 +1,135 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PlumRQVAE(nn.Module):
+ def __init__(
+ self,
+ input_dim,
+ num_codebooks,
+ codebook_size,
+ embedding_dim,
+ beta=0.25,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+        temperature=1.0,  # must be positive: similarities are divided by it in contrastive_loss
+ ):
+ super().__init__()
+ self.register_buffer('beta', torch.tensor(beta))
+ self.temperature = temperature
+
+ self.input_dim = input_dim
+ self.num_codebooks = num_codebooks
+ self.codebook_size = codebook_size
+ self.embedding_dim = embedding_dim
+ self.quant_loss_weight = quant_loss_weight
+
+ self.contrastive_loss_weight = contrastive_loss_weight
+
+ self.encoder = self.make_encoding_tower(input_dim, embedding_dim)
+ self.decoder = self.make_encoding_tower(embedding_dim, input_dim)
+
+ self.codebooks = torch.nn.ParameterList()
+ for _ in range(num_codebooks):
+            # left uninitialized here: the InitCodebooks callback overwrites each
+            # codebook with data-dependent residuals before training (see callbacks.py)
+            cb = torch.FloatTensor(codebook_size, embedding_dim)
+            # nn.init.normal_(cb)
+            self.codebooks.append(cb)
+
+ @staticmethod
+ def make_encoding_tower(d1, d2, bias=False):
+ return torch.nn.Sequential(
+ nn.Linear(d1, d1),
+ nn.ReLU(),
+ nn.Linear(d1, d2),
+ nn.ReLU(),
+ nn.Linear(d2, d2, bias=bias)
+ )
+
+ @staticmethod
+ def get_codebook_indices(remainder, codebook):
+ dist = torch.cdist(remainder, codebook)
+ return dist.argmin(dim=-1)
+
+ def _quantize_representation(self, latent_vector):
+ latent_restored = 0
+ remainder = latent_vector
+
+ for codebook in self.codebooks:
+ codebook_indices = self.get_codebook_indices(remainder, codebook)
+ quantized = codebook[codebook_indices]
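+            # straight-through estimator: the forward value is the quantized vector,
+            # while gradients flow to `remainder` as if quantization were the identity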
+ codebook_vectors = remainder + (quantized - remainder).detach()
+ latent_restored += codebook_vectors
+ remainder = remainder - codebook_vectors
+
+ return latent_restored
+
+ def contrastive_loss(self, p_i, p_i_star):
+ N_b = p_i.size(0)
+
+        p_i = F.normalize(p_i, p=2, dim=-1)  # TODO: try without normalization
+ p_i_star = F.normalize(p_i_star, p=2, dim=-1)
+
+ similarities = torch.matmul(p_i, p_i_star.T) / self.temperature
+
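+        # in-batch InfoNCE: row i's positive is its own co-occurrence partner on the
+        # diagonal; the other rows of the batch act as negatives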
+ labels = torch.arange(N_b, dtype=torch.long, device=p_i.device)
+
+ loss = F.cross_entropy(similarities, labels)
+
+        return loss  # only over the last dimension
+
+ def forward(self, inputs):
+ latent_vector = self.encoder(inputs['embedding'])
+ # print(f"latent vector shape: {latent_vector.shape}")
+ # print(f"inputs embedding shape: {inputs['embedding']}")
+ item_ids = inputs['item_id']
+
+ latent_restored = 0
+ rqvae_loss = 0
+ clusters = []
+ remainder = latent_vector
+
+ for codebook in self.codebooks:
+ codebook_indices = self.get_codebook_indices(remainder, codebook)
+ clusters.append(codebook_indices)
+
+ quantized = codebook[codebook_indices]
+ codebook_vectors = remainder + (quantized - remainder).detach()
+
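+            # commitment term (scaled by beta) pulls encoder residuals toward their
+            # centroids; the second term moves the centroids toward the residuals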
+ rqvae_loss += self.beta * torch.nn.functional.mse_loss(remainder, quantized.detach())
+ rqvae_loss += torch.nn.functional.mse_loss(quantized, remainder.detach())
+
+ latent_restored += codebook_vectors
+ remainder = remainder - codebook_vectors
+
+ embeddings_restored = self.decoder(latent_restored)
+ recon_loss = torch.nn.functional.mse_loss(embeddings_restored, inputs['embedding'])
+
+ if 'cooccurrence_embedding' in inputs:
+ # print(f"cooccurrence_embedding shape: {inputs['cooccurrence_embedding'].shape} device {inputs['cooccurrence_embedding'].device}" )
+ # print(f"latent_restored shape {latent_restored.shape} device {latent_restored.device}")
+ cooccurrence_latent = self.encoder(inputs['cooccurrence_embedding'].to(latent_restored.device))
+ cooccurrence_restored = self._quantize_representation(cooccurrence_latent)
+ con_loss = self.contrastive_loss(latent_restored, cooccurrence_restored)
+ else:
+ con_loss = torch.as_tensor(0.0, device=latent_vector.device)
+
+ loss = (
+ recon_loss
+ + self.quant_loss_weight * rqvae_loss
+ + self.contrastive_loss_weight * con_loss
+ ).mean()
+
+ clusters_counts = []
+ for cluster in clusters:
+ clusters_counts.append(torch.bincount(cluster, minlength=self.codebook_size))
+
+ return loss, {
+ 'loss': loss.item(),
+ 'recon_loss': recon_loss.mean().item(),
+ 'rqvae_loss': rqvae_loss.mean().item(),
+ 'con_loss': con_loss.item(),
+
+ 'clusters_counts': clusters_counts,
+ 'clusters': torch.stack(clusters).T,
+ 'embedding_hat': embeddings_restored,
+ }
\ No newline at end of file
diff --git a/scripts/plum-yambda/transforms.py b/scripts/plum-yambda/transforms.py
new file mode 100644
index 0000000..bdbfffa
--- /dev/null
+++ b/scripts/plum-yambda/transforms.py
@@ -0,0 +1,247 @@
+import numpy as np
+import pickle
+import torch
+import torch.nn.functional as F
+from typing import Dict, List
+from irec.data.base import BaseDataset
+from irec.data.transforms import Transform
+
+from cooc_data import CoocMappingDataset
+
+
+class AddWeightedCooccurrenceEmbeddings:
+ def __init__(self, cooccur_counts, item_id_to_embedding, all_item_ids):
+ self.cooccur_counts = cooccur_counts
+ self.item_id_to_embedding = item_id_to_embedding
+ self.all_item_ids = all_item_ids
+ self.call_count = 0
+
+ def __call__(self, batch):
+ self.call_count += 1
+ item_ids = batch['item_id']
+ cooccurrence_embeddings = []
+
+ for idx, item_id in enumerate(item_ids):
+ item_id_val = int(item_id.item()) if torch.is_tensor(item_id) else int(item_id)
+
+ counter = self.cooccur_counts.get(item_id_val)
+ if counter and len(counter) > 0:
+ cooc_ids, freqs = zip(*counter.items())
+ freqs_array = np.array(freqs, dtype=np.float32)
+ probs = freqs_array / freqs_array.sum()
+ cooc_id = np.random.choice(cooc_ids, p=probs)
+
+ else:
+ cooc_id = np.random.choice(self.all_item_ids)
+ if self.call_count % 500 == 0 and idx < 5:
+ print(f" idx={idx}: item_id={item_id_val} fallback random")
+
+ cooc_emb = self.item_id_to_embedding.get(cooc_id, batch['embedding'][0])
+ cooccurrence_embeddings.append(cooc_emb)
+
+ batch['cooccurrence_embedding'] = torch.stack(cooccurrence_embeddings)
+ return batch
+
+
+
+class AddWeightedCooccurrenceEmbeddingsCached:
+ def __init__(self, cooccur_counts, item_id_to_embedding, all_item_ids):
+ self.cooccur_counts = cooccur_counts
+ self.item_id_to_embedding = item_id_to_embedding
+ self.all_item_ids = all_item_ids
+ self.call_count = 0
+
+ self.cooc_probs_cache = {}
+ self._precompute_probabilities()
+
+ def _precompute_probabilities(self):
+ for item_id, counter in self.cooccur_counts.items():
+ if counter and len(counter) > 0:
+ cooc_ids, freqs = zip(*counter.items())
+ freqs_array = np.array(freqs, dtype=np.float32)
+ probs = freqs_array / freqs_array.sum()
+ self.cooc_probs_cache[item_id] = (cooc_ids, probs)
+
+ def __call__(self, batch):
+ self.call_count += 1
+ item_ids = batch['item_id']
+ cooccurrence_embeddings = []
+
+ for idx, item_id in enumerate(item_ids):
+ item_id_val = int(item_id.item()) if torch.is_tensor(item_id) else int(item_id)
+
+ if item_id_val in self.cooc_probs_cache:
+ cooc_ids, probs = self.cooc_probs_cache[item_id_val]
+ cooc_id = np.random.choice(cooc_ids, p=probs)
+ else:
+ cooc_id = np.random.choice(self.all_item_ids)
+ if self.call_count % 10 == 0 and idx < 5:
+ print(f" idx={idx}: item_id={item_id_val} fallback random")
+
+ cooc_emb = self.item_id_to_embedding.get(cooc_id, batch['embedding'][0])
+ cooccurrence_embeddings.append(cooc_emb)
+
+ batch['cooccurrence_embedding'] = torch.stack(cooccurrence_embeddings)
+ return batch
+
+class AddWeightedCooccurrenceEmbeddingsVectorized:
+
+ def __init__(
+ self,
+ cooccur_counts: Dict[int, Dict[int, int]],
+ item_id_to_embedding: Dict[int, torch.Tensor],
+ all_item_ids: List[int],
+ device: torch.device,
+ limit_neighbors: bool = True,
+ max_neighbors: int = 256
+ ):
+ """
+        limit_neighbors: if True, cap the neighbor list at max_neighbors (saves memory)
+        max_neighbors: maximum number of neighbors (used only when limit_neighbors=True)
+ """
+ self.device = device
+ self.call_count = 0
+ self.limit_neighbors = limit_neighbors
+ self.max_neighbors = max_neighbors
+
+ max_item_id = max(item_id_to_embedding.keys())
+ embedding_dim = next(iter(item_id_to_embedding.values())).shape[0]
+
+ self.embedding_matrix = torch.zeros(
+ (max_item_id + 1, embedding_dim),
+ device=device,
+ dtype=torch.float32,
+ requires_grad=False
+ )
+
+ print("Building embedding matrix")
+ for item_id, emb in item_id_to_embedding.items():
+ if isinstance(emb, torch.Tensor):
+ self.embedding_matrix[item_id] = emb.detach()
+ else:
+ self.embedding_matrix[item_id] = torch.tensor(emb, device=device, dtype=torch.float32)
+
+ self.all_item_ids_tensor = torch.tensor(
+ all_item_ids,
+ device=device,
+ dtype=torch.long,
+ requires_grad=False
+ )
+
+ print("Building cooccurrence tables")
+ self._build_cooccurrence_tables(cooccur_counts, max_item_id, len(all_item_ids))
+
+ def _build_cooccurrence_tables(self, cooccur_counts: Dict, max_item_id: int, num_all_items: int):
+ """
+ - neighbors_matrix: [max_item_id+1, num_neighbors]
+ - probs_matrix: [max_item_id+1, num_neighbors]
+        If an item_id has no neighbors, its neighbors and probs are filled uniformly from all_items
+ """
+ neighbor_counts = {}
+ for item_id in range(max_item_id + 1):
+ counter = cooccur_counts.get(item_id)
+ if counter and len(counter) > 0:
+ num_neighbors = len(counter)
+ if self.limit_neighbors:
+ num_neighbors = min(num_neighbors, self.max_neighbors)
+ else:
+ num_neighbors = num_all_items
+
+ neighbor_counts[item_id] = num_neighbors
+
+ max_num_neighbors = max(neighbor_counts.values())
+ actual_max_neighbors = min(max_num_neighbors, self.max_neighbors) if self.limit_neighbors else max_num_neighbors
+
+ print(f"Max neighbors per item: {actual_max_neighbors}")
+
+ neighbors_matrix = torch.zeros(
+ (max_item_id + 1, actual_max_neighbors),
+ dtype=torch.long,
+ device=self.device,
+ requires_grad=False
+ )
+
+ probs_matrix = torch.zeros(
+ (max_item_id + 1, actual_max_neighbors),
+ dtype=torch.float32,
+ device=self.device,
+ requires_grad=False
+ )
+
+ num_items_with_cooc = 0
+
+        # Fill the matrices
+ for item_id in range(max_item_id + 1):
+ counter = cooccur_counts.get(item_id)
+
+ if counter and len(counter) > 0:
+                # === Has real neighbors: use empirical co-occurrence probabilities ===
+ num_items_with_cooc += 1
+
+                # Extract the neighbors and their counts, sorted by frequency
+ cooc_ids, freqs = zip(*sorted(counter.items(), key=lambda x: x[1], reverse=True))
+ cooc_ids = list(cooc_ids)
+ freqs = np.array(freqs, dtype=np.float32)
+
+                # Keep only the top entries
+ num_neighbors = min(len(cooc_ids), actual_max_neighbors)
+ cooc_ids = cooc_ids[:num_neighbors]
+ freqs = freqs[:num_neighbors]
+
+                # Normalize to a probability distribution
+ probs = freqs / freqs.sum()
+
+ neighbors_matrix[item_id, :num_neighbors] = torch.tensor(
+ cooc_ids, dtype=torch.long, device=self.device
+ )
+ probs_matrix[item_id, :num_neighbors] = torch.tensor(
+ probs, dtype=torch.float32, device=self.device
+ )
+
+ else:
+                # No neighbors: uniform distribution over all_items
+ if actual_max_neighbors >= num_all_items:
+                    # All items fit into the row
+ neighbors_matrix[item_id, :num_all_items] = self.all_item_ids_tensor
+ probs_matrix[item_id, :num_all_items] = 1.0 / num_all_items
+ else:
+                    # Pick a random subset
+ indices = torch.randperm(num_all_items, device=self.device)[:actual_max_neighbors]
+ neighbors_matrix[item_id] = self.all_item_ids_tensor[indices]
+ probs_matrix[item_id] = 1.0 / actual_max_neighbors
+
+ self.neighbors_matrix = neighbors_matrix
+ self.probs_matrix = probs_matrix
+
+ print(f"Cooccurrence tables built: {num_items_with_cooc}/{max_item_id + 1} items have real neighbors")
+
+ def __call__(self, batch):
+ self.call_count += 1
+
+ item_ids = batch['item_id'] # [batch_size]
+ batch_size = item_ids.shape[0]
+
+        # Gather the probability rows for the items in the batch
+ probs = self.probs_matrix[item_ids] # [batch_size, max_neighbors]
+
+        # Sample a neighbor index for each item
+        # torch.multinomial draws from max_neighbors categories according to the given probabilities
+        # Result: [batch_size, 1] - indices in the range [0, max_neighbors)
+ neighbor_indices = torch.multinomial(probs, num_samples=1, replacement=True)
+ neighbor_indices = neighbor_indices.squeeze(1) # [batch_size]
+
+ # neighbors_matrix[item_ids, neighbor_indices] -> [batch_size]
+ cooc_ids = self.neighbors_matrix[item_ids, neighbor_indices]
+
+        # Look up the embeddings
+ cooccurrence_embeddings = self.embedding_matrix[cooc_ids] # [batch_size, embedding_dim]
+
+ batch['cooccurrence_embedding'] = cooccurrence_embeddings
+
+ # if self.call_count % 500 == 0:
+ # print(
+ # f"Call #{self.call_count}: {batch_size} samples, "
+ # f"cooc_embeddings shape: {cooccurrence_embeddings.shape}"
+ # )
+
+ return batch
\ No newline at end of file
diff --git a/scripts/plum-yambda/yambda_4_1_train_plum.py b/scripts/plum-yambda/yambda_4_1_train_plum.py
new file mode 100644
index 0000000..86e0c2b
--- /dev/null
+++ b/scripts/plum-yambda/yambda_4_1_train_plum.py
@@ -0,0 +1,186 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddingsVectorized
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 35
+BATCH_SIZE = 1024
+
+MAX_NEIGHBOURS_COUNT = 1000
+
+INPUT_DIM = 128
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'4-1_filtered_yambda_gpu_week_ws_{WINDOW_SIZE}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/data/Yambda/week-splits/merged_for_exps_filtered/exp_4-1_0.9_inter_semantics_train.json" #отсекать старое (может и нет)
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ data = CoocMappingDataset.create_from_split_part(
+ train_inter_json_path=INTER_TRAIN_PATH,
+ window_size=WINDOW_SIZE
+ )
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'], device=DEVICE)
+ all_item_ids.append(item_id)
+
+ add_cooc_transform = AddWeightedCooccurrenceEmbeddingsVectorized(
+ cooccur_counts=data.cooccur_counter_mapping,
+ item_id_to_embedding=item_id_to_embedding,
+ all_item_ids=all_item_ids,
+ device=DEVICE,
+ limit_neighbors=True,
+        max_neighbors=MAX_NEIGHBOURS_COUNT
+ )
+
+    train_dataloader = DataLoader(  # FIXME: the transform call runs in the main thread, needs fixing
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform
+ ).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform)
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'train/con_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.Profiler(
+ wait=10,
+ warmup=10,
+ active=10,
+ logdir=os.path.join(IREC_PATH, 'tensorboard_logs')
+ ),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+    logger.debug('Everything is ready for the training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum-yambda/yambda_infer_4.1_default.py b/scripts/plum-yambda/yambda_infer_4.1_default.py
new file mode 100644
index 0000000..1485fde
--- /dev/null
+++ b/scripts/plum-yambda/yambda_infer_4.1_default.py
@@ -0,0 +1,145 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet"
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_filtered_yambda_gpu_quantile_ws_2_best_0.0026.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results_sigir_yambda')
+
+WINDOW_SIZE = 2
+EXPERIMENT_NAME = f'4-1_filtered_yambda_gpu_quantile_ws_{WINDOW_SIZE}'
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 128
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+    logger.debug('Everything is ready for the inference process!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
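+    # collision resolution: items sharing a semantic-ID triple each get a distinct
+    # fourth code (drawn without replacement), then code level i is offset by
+    # CODEBOOK_SIZE * i so all levels live in one flat vocabulary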
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/beauty-exps/4_1_train_plum.py b/scripts/plum/beauty-exps/4_1_train_plum.py
new file mode 100644
index 0000000..357fc19
--- /dev/null
+++ b/scripts/plum/beauty-exps/4_1_train_plum.py
@@ -0,0 +1,169 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddings
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 500
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'4-1_yambda_quantile_ws_{WINDOW_SIZE}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json"
+EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ data = CoocMappingDataset.create_from_split_part(
+ train_inter_json_path=INTER_TRAIN_PATH,
+ window_size=WINDOW_SIZE
+ )
+
+ dataset = EmbeddingDataset(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
+ data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
+
+ train_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'train/con_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+    logger.debug('Everything is ready for the training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/beauty-exps/4_2_train_plum.py b/scripts/plum/beauty-exps/4_2_train_plum.py
new file mode 100644
index 0000000..96cfda9
--- /dev/null
+++ b/scripts/plum/beauty-exps/4_2_train_plum.py
@@ -0,0 +1,169 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddings
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 500
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'4-2_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json"
+EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ data = CoocMappingDataset.create_from_split_part(
+ train_inter_json_path=INTER_TRAIN_PATH,
+ window_size=WINDOW_SIZE
+ )
+
+ dataset = EmbeddingDataset(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
+ data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
+
+ train_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'train/con_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+    logger.debug('Everything is ready for the training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/beauty-exps/4_3_train_plum.py b/scripts/plum/beauty-exps/4_3_train_plum.py
new file mode 100644
index 0000000..ac6cfb6
--- /dev/null
+++ b/scripts/plum/beauty-exps/4_3_train_plum.py
@@ -0,0 +1,169 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddings
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 500
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'4-3_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json"
+EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl"
+IREC_PATH = '../../'
+
+print(INTER_TRAIN_PATH)
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ data = CoocMappingDataset.create_from_split_part(
+ train_inter_json_path=INTER_TRAIN_PATH,
+ window_size=WINDOW_SIZE
+ )
+
+ dataset = EmbeddingDataset(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
+ data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
+
+ train_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
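+    # Note: train_dataloader.repeat(NUM_EPOCHS) makes len() count steps across all
+    # repeats, so this logs and validates about once per epoch.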
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'train/con_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+    logger.debug('Everything is ready for training!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/beauty-exps/infer_4.1_default.py b/scripts/plum/beauty-exps/infer_4.1_default.py
new file mode 100644
index 0000000..fff61d3
--- /dev/null
+++ b/scripts/plum/beauty-exps/infer_4.1_default.py
@@ -0,0 +1,145 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_updated_quantile_plum_rqvae_beauty_ws_2_best_0.0052.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results_sigir')
+
+WINDOW_SIZE = 2
+EXPERIMENT_NAME = f'4-1_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDataset(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+    logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
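+    # Resolve semantic-ID collisions: items sharing the same code tuple each get a
+    # distinct extra code, and level i is then offset by i * CODEBOOK_SIZE so every
+    # codebook level occupies its own disjoint ID range.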
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/beauty-exps/infer_4.2_default.py b/scripts/plum/beauty-exps/infer_4.2_default.py
new file mode 100644
index 0000000..c5c7c02
--- /dev/null
+++ b/scripts/plum/beauty-exps/infer_4.2_default.py
@@ -0,0 +1,145 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-2_updated_quantile_plum_rqvae_beauty_ws_2_best_0.0051.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results_sigir')
+
+WINDOW_SIZE = 2
+EXPERIMENT_NAME = f'4-2_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDataset(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+    logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/beauty-exps/infer_4.3_default.py b/scripts/plum/beauty-exps/infer_4.3_default.py
new file mode 100644
index 0000000..c7fca80
--- /dev/null
+++ b/scripts/plum/beauty-exps/infer_4.3_default.py
@@ -0,0 +1,145 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-3_updated_quantile_plum_rqvae_beauty_ws_2_best_0.005.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results_sigir')
+
+WINDOW_SIZE = 2
+EXPERIMENT_NAME = f'4-3_updated_quantile_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDataset(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+    logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/beauty-exps/infer_default.py b/scripts/plum/beauty-exps/infer_default.py
new file mode 100644
index 0000000..af8df34
--- /dev/null
+++ b/scripts/plum/beauty-exps/infer_default.py
@@ -0,0 +1,152 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddings
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/test_plum_rqvae_beauty_ws_2_best_0.0054.pth'
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+IREC_PATH = '/home/jovyan/IRec/'
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ data = CoocMappingDataset.create(
+ inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter_new.json'),
+ max_sequence_length=20,
+ sampler_type='sasrec',
+ window_size=WINDOW_SIZE
+ )
+
+ dataset = EmbeddingDataset(
+ data_path='/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
+ data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json',
+ format='json'
+ )
+ ]
+
+    logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json', 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(IREC_PATH, 'results', f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/cooc_data.py b/scripts/plum/cooc_data.py
index b11e6f0..50f2bdd 100644
--- a/scripts/plum/cooc_data.py
+++ b/scripts/plum/cooc_data.py
@@ -13,21 +13,15 @@ class CoocMappingDataset:
def __init__(
self,
train_sampler,
- validation_sampler,
- test_sampler,
num_items,
- max_sequence_length,
cooccur_counter_mapping=None
):
self._train_sampler = train_sampler
- self._validation_sampler = validation_sampler
- self._test_sampler = test_sampler
self._num_items = num_items
- self._max_sequence_length = max_sequence_length
self._cooccur_counter_mapping = cooccur_counter_mapping
@classmethod
- def create(cls, inter_json_path, max_sequence_length, sampler_type, window_size):
+ def create(cls, inter_json_path, window_size):
max_item_id = 0
train_dataset, validation_dataset, test_dataset = [], [], []
@@ -43,31 +37,59 @@ def create(cls, inter_json_path, max_sequence_length, sampler_type, window_size)
'user.ids': [user_id],
'item.ids': item_ids[:-2],
})
- validation_dataset.append({
- 'user.ids': [user_id],
- 'item.ids': item_ids[:-1],
- })
- test_dataset.append({
- 'user.ids': [user_id],
- 'item.ids': item_ids,
- })
cooccur_counter_mapping = cls.build_cooccur_counter_mapping(train_dataset, window_size=window_size)
logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items but max_item_id is {max_item_id}')
train_sampler = train_dataset
- validation_sampler = validation_dataset
- test_sampler = test_dataset
return cls(
train_sampler=train_sampler,
- validation_sampler=validation_sampler,
- test_sampler=test_sampler,
num_items=max_item_id + 1,
- max_sequence_length=max_sequence_length,
cooccur_counter_mapping=cooccur_counter_mapping
)
+ @classmethod
+ def create_from_split_part(
+ cls,
+ train_inter_json_path,
+ window_size
+ ):
+
+ max_item_id = 0
+ train_dataset = []
+
+ with open(train_inter_json_path, 'r') as f:
+ train_interactions = json.load(f)
+
+        # Process the TRAIN split
+ for user_id_str, item_ids in train_interactions.items():
+ user_id = int(user_id_str)
+ if item_ids:
+ max_item_id = max(max_item_id, max(item_ids))
+
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids,
+ })
+
+ logger.debug(f'Train: {len(train_dataset)} users')
+ logger.debug(f'Max item ID: {max_item_id}')
+
+ cooccur_counter_mapping = cls.build_cooccur_counter_mapping(
+ train_dataset,
+ window_size=window_size
+ )
+
+ logger.debug(f'Computed window-based co-occurrence mapping for {len(cooccur_counter_mapping)} items')
+
+ return cls(
+ train_sampler=train_dataset,
+ num_items=max_item_id + 1,
+ cooccur_counter_mapping=cooccur_counter_mapping
+ )
+
+
@staticmethod
    def build_cooccur_counter_mapping(train_dataset, window_size):  # TODO: pass timestamps and build the window from them
cooccur_counts = defaultdict(Counter)
@@ -80,16 +102,6 @@ def build_cooccur_counter_mapping(train_dataset, window_size): #TODO перед
cooccur_counts[item_i][items[j]] += 1
return cooccur_counts
- def get_datasets(self):
- return self._train_sampler, self._validation_sampler, self._test_sampler
-
- @property
- def num_items(self):
- return self._num_items
-
- @property
- def max_sequence_length(self):
- return self._max_sequence_length
@property
def cooccur_counter_mapping(self):
diff --git a/scripts/plum/data.py b/scripts/plum/data.py
index 0ffef82..9c15b70 100644
--- a/scripts/plum/data.py
+++ b/scripts/plum/data.py
@@ -5,6 +5,29 @@
from irec.data.transforms import Transform
+import polars as pl
+import torch
+
+class EmbeddingDatasetParquet(BaseDataset):
+ def __init__(self, data_path):
+ self.df = pl.read_parquet(data_path)
+ self.item_ids = np.array(self.df['item_id'], dtype=np.int64)
+ self.embeddings = np.array(self.df['embedding'].to_list(), dtype=np.float32)
+ print(f"embedding dim: {self.embeddings[0].shape}")
+
+ def __getitem__(self, idx):
+ index = self.item_ids[idx]
+ tensor_emb = self.embeddings[idx]
+ return {
+ 'item_id': index,
+ 'embedding': tensor_emb,
+ 'embedding_dim': len(tensor_emb)
+ }
+
+ def __len__(self):
+ return len(self.embeddings)
+
+
class EmbeddingDataset(BaseDataset):
def __init__(self, data_path):
self.data_path = data_path
diff --git a/scripts/plum/infer_default.py b/scripts/plum/infer_default.py
index af8df34..b15fb6d 100644
--- a/scripts/plum/infer_default.py
+++ b/scripts/plum/infer_default.py
@@ -12,8 +12,18 @@
from data import EmbeddingDataset, ProcessEmbeddings
from models import PlumRQVAE
-from transforms import AddWeightedCooccurrenceEmbeddings
-from cooc_data import CoocMappingDataset
+
+# PATHS
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = '/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/4-1_plum_rqvae_beauty_ws_2_best_0.0051.pth'
+RESULTS_PATH = os.path.join(IREC_PATH, 'results')
+
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+
+# OTHER SETTINGS
SEED_VALUE = 42
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
@@ -26,29 +36,16 @@
NUM_CODEBOOKS = 3
BETA = 0.25
-MODEL_PATH = '/home/jovyan/IRec/checkpoints/test_plum_rqvae_beauty_ws_2_best_0.0054.pth'
-WINDOW_SIZE = 2
-
-EXPERIMENT_NAME = f'test_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
-
-IREC_PATH = '/home/jovyan/IRec/'
def main():
fix_random_seed(SEED_VALUE)
- data = CoocMappingDataset.create(
- inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter_new.json'),
- max_sequence_length=20,
- sampler_type='sasrec',
- window_size=WINDOW_SIZE
- )
-
dataset = EmbeddingDataset(
- data_path='/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl'
+ data_path=EMBEDDINGS_PATH
)
-
+
item_id_to_embedding = {}
all_item_ids = []
for idx in range(len(dataset)):
@@ -57,15 +54,12 @@ def main():
item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
all_item_ids.append(item_id)
- add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
- data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
-
dataloader = DataLoader(
dataset,
batch_size=BATCH_SIZE,
shuffle=False,
drop_last=False,
- ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
model = PlumRQVAE(
input_dim=INPUT_DIM,
@@ -106,8 +100,8 @@ def main():
cb.Logger().every_num_steps(len(dataloader)),
cb.InferenceSaver(
- metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
- save_path=f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json',
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
format='json'
)
]
@@ -125,9 +119,9 @@ def main():
from collections import defaultdict
import numpy as np
- with open(f'/home/jovyan/IRec/results/{EXPERIMENT_NAME}_clusters.json', 'r') as f:
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
mappings = json.load(f)
-
+
inter = {}
sem_2_ids = defaultdict(list)
for mapping in mappings:
@@ -143,8 +137,8 @@ def main():
inter[item_id].append(collision_solver)
for i in range(len(inter[item_id])):
inter[item_id][i] += CODEBOOK_SIZE * i
-
- with open(os.path.join(IREC_PATH, 'results', f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
json.dump(inter, f, indent=2)
diff --git a/scripts/plum/train_plum.py b/scripts/plum/train_plum.py
index 5a00bc3..ffa9e43 100644
--- a/scripts/plum/train_plum.py
+++ b/scripts/plum/train_plum.py
@@ -41,8 +41,6 @@ def main():
data = CoocMappingDataset.create(
inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter_new.json'),
- max_sequence_length=20,
- sampler_type='sasrec',
window_size=WINDOW_SIZE
)
diff --git a/scripts/plum/train_plum_timestamp_based.py b/scripts/plum/train_plum_timestamp_based.py
new file mode 100644
index 0000000..e755d95
--- /dev/null
+++ b/scripts/plum/train_plum_timestamp_based.py
@@ -0,0 +1,168 @@
+from loguru import logger
+import os
+
+import torch
+
+import pickle
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDataset, ProcessEmbeddings
+from models import PlumRQVAE
+from transforms import AddWeightedCooccurrenceEmbeddings
+from cooc_data import CoocMappingDataset
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 500
+BATCH_SIZE = 1024
+
+INPUT_DIM = 4096
+HIDDEN_DIM = 32
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+WINDOW_SIZE = 2
+
+EXPERIMENT_NAME = f'4-1_plum_rqvae_beauty_ws_{WINDOW_SIZE}'
+INTER_TRAIN_PATH = "/home/jovyan/IRec/sigir/Beauty_new/splits/exp_data/exp_4.1_inter_semantics_train.json"
+EMBEDDINGS_PATH = "/home/jovyan/tiger/data/Beauty/default_content_embeddings.pkl"
+IREC_PATH = '../../'
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ data = CoocMappingDataset.create_from_split_part(
+ train_inter_json_path=INTER_TRAIN_PATH,
+ window_size=WINDOW_SIZE
+ )
+
+ dataset = EmbeddingDataset(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ item_id_to_embedding = {}
+ all_item_ids = []
+ for idx in range(len(dataset)):
+ sample = dataset[idx]
+ item_id = int(sample['item_id'])
+ item_id_to_embedding[item_id] = torch.tensor(sample['embedding'])
+ all_item_ids.append(item_id)
+
+ add_cooc_transform = AddWeightedCooccurrenceEmbeddings(
+ data.cooccur_counter_mapping, item_id_to_embedding, all_item_ids)
+
+ train_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).map(add_cooc_transform).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])).map(add_cooc_transform)
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+ model = PlumRQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0,
+ contrastive_loss_weight=1.0,
+ temperature=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'train/con_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ 'con_loss': model_outputs['con_loss']
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ 'valid/con_loss': cb.MeanAccumulator()
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+    logger.debug('Everything is ready for training!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/plum/transforms.py b/scripts/plum/transforms.py
index 0af1dda..8887115 100644
--- a/scripts/plum/transforms.py
+++ b/scripts/plum/transforms.py
@@ -2,12 +2,6 @@
import pickle
import torch
-from irec.data.base import BaseDataset
-from irec.data.transforms import Transform
-
-from cooc_data import CoocMappingDataset
-
-
class AddWeightedCooccurrenceEmbeddings:
def __init__(self, cooccur_counts, item_id_to_embedding, all_item_ids):
self.cooccur_counts = cooccur_counts
@@ -32,7 +26,7 @@ def __call__(self, batch):
else:
cooc_id = np.random.choice(self.all_item_ids)
- if self.call_count % 10 == 0 and idx < 5:
+ if self.call_count % 500 == 0 and idx < 5:
print(f" idx={idx}: item_id={item_id_val} fallback random")
cooc_emb = self.item_id_to_embedding.get(cooc_id, batch['embedding'][0])
diff --git a/scripts/rqvae-yambda/callbacks.py b/scripts/rqvae-yambda/callbacks.py
new file mode 100644
index 0000000..43ec460
--- /dev/null
+++ b/scripts/rqvae-yambda/callbacks.py
@@ -0,0 +1,64 @@
+import torch
+
+import irec.callbacks as cb
+from irec.runners import TrainingRunner, TrainingRunnerContext
+
+class InitCodebooks(cb.TrainingCallback):
+ def __init__(self, dataloader):
+ super().__init__()
+ self._dataloader = dataloader
+
+ @torch.no_grad()
+ def before_run(self, runner: TrainingRunner):
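+        # Initialize codebook i from real encoder outputs: encode a random batch,
+        # subtract the contributions of the already-initialized levels, and use the
+        # remaining residuals as initial centroids.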
+ for i in range(len(runner.model.codebooks)):
+ X = next(iter(self._dataloader))['embedding']
+ idx = torch.randperm(X.shape[0], device=X.device)[:len(runner.model.codebooks[i])]
+ remainder = runner.model.encoder(X[idx])
+
+ for j in range(i):
+ codebook_indices = runner.model.get_codebook_indices(remainder, runner.model.codebooks[j])
+ codebook_vectors = runner.model.codebooks[j][codebook_indices]
+ remainder = remainder - codebook_vectors
+
+ runner.model.codebooks[i].data = remainder.detach()
+
+
+class FixDeadCentroids(cb.TrainingCallback):
+ def __init__(self, dataloader):
+ super().__init__()
+ self._dataloader = dataloader
+
+ def after_step(self, runner: TrainingRunner, context: TrainingRunnerContext):
+ for i, num_fixed in enumerate(self.fix_dead_codebooks(runner)):
+ context.metrics[f'num_dead/{i}'] = num_fixed
+
+ @torch.no_grad()
+ def fix_dead_codebooks(self, runner: TrainingRunner):
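+        # Count how often each centroid is selected over the full dataloader; centroids
+        # with zero assignments ("dead") are re-seeded from random encoder residuals so
+        # they can start receiving assignments again.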
+ num_fixed = []
+ for codebook_idx, codebook in enumerate(runner.model.codebooks):
+ centroid_counts = torch.zeros(codebook.shape[0], dtype=torch.long, device=codebook.device)
+ random_batch = next(iter(self._dataloader))['embedding']
+
+ for batch in self._dataloader:
+ remainder = runner.model.encoder(batch['embedding'])
+ for l in range(codebook_idx):
+ ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l])
+ remainder = remainder - runner.model.codebooks[l][ind]
+
+ indices = runner.model.get_codebook_indices(remainder, codebook)
+ centroid_counts.scatter_add_(0, indices, torch.ones_like(indices))
+
+ dead_mask = (centroid_counts == 0)
+ num_dead = int(dead_mask.sum().item())
+ num_fixed.append(num_dead)
+ if num_dead == 0:
+ continue
+
+ remainder = runner.model.encoder(random_batch)
+ for l in range(codebook_idx):
+ ind = runner.model.get_codebook_indices(remainder, runner.model.codebooks[l])
+ remainder = remainder - runner.model.codebooks[l][ind]
+ remainder = remainder[torch.randperm(remainder.shape[0], device=codebook.device)][:num_dead]
+ codebook[dead_mask] = remainder.detach()
+
+ return num_fixed
diff --git a/scripts/rqvae-yambda/data.py b/scripts/rqvae-yambda/data.py
new file mode 100644
index 0000000..6c213ee
--- /dev/null
+++ b/scripts/rqvae-yambda/data.py
@@ -0,0 +1,35 @@
+import numpy as np
+import polars as pl
+
+from irec.data.base import BaseDataset
+from irec.data.transforms import Transform
+
+
+class EmbeddingDatasetParquet(BaseDataset):
+ def __init__(self, data_path):
+ self.df = pl.read_parquet(data_path)
+ self.item_ids = np.array(self.df['item_id'], dtype=np.int64)
+ self.embeddings = np.array(self.df['embedding'].to_list(), dtype=np.float32)
+ print(f"embedding dim: {self.embeddings[0].shape}")
+
+ def __getitem__(self, idx):
+ index = self.item_ids[idx]
+ tensor_emb = self.embeddings[idx]
+ return {
+ 'item_id': index,
+ 'embedding': tensor_emb,
+ 'embedding_dim': len(tensor_emb)
+ }
+
+ def __len__(self):
+ return len(self.embeddings)
+
+class ProcessEmbeddings(Transform):
+ def __init__(self, embedding_dim, keys):
+ self.embedding_dim = embedding_dim
+ self.keys = keys
+
+ def __call__(self, batch):
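+        # Collapse any collated nesting back into a (batch, embedding_dim) matrix.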
+ for key in self.keys:
+ batch[key] = batch[key].reshape(-1, self.embedding_dim)
+ return batch
\ No newline at end of file
diff --git a/scripts/rqvae-yambda/infer_yambda.py b/scripts/rqvae-yambda/infer_yambda.py
new file mode 100644
index 0000000..7daf42f
--- /dev/null
+++ b/scripts/rqvae-yambda/infer_yambda.py
@@ -0,0 +1,128 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import InferenceRunner
+
+from irec.utils import fix_random_seed
+
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import RQVAE
+
+
+IREC_PATH = '/home/jovyan/IRec/'
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet"
+MODEL_PATH = '/home/jovyan/IRec/checkpoints/rqvae_yambda_hd_128_cz_512_best_0.0014.pth'
+RESULTS_PATH = '/home/jovyan/IRec/rqvae-yambda-sem-ids'
+EXPERIMENT_NAME = 'rqvae_yambda_hd_128_cz_512'
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+BATCH_SIZE = 1024
+
+INPUT_DIM = 128
+HIDDEN_DIM = 128
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+
+BETA = 0.25
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ model = RQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ callbacks = [
+ cb.LoadModel(MODEL_PATH),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ }, name='valid'),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ },
+ ),
+
+ cb.Logger().every_num_steps(len(dataloader)),
+
+ cb.InferenceSaver(
+ metrics=lambda batch, model_outputs, _: {'item_id': batch['item_id'], 'clusters': model_outputs['clusters']},
+ save_path=os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'),
+ format='json'
+ )
+ ]
+
+    logger.debug('Everything is ready for inference!')
+
+ runner = InferenceRunner(
+ model=model,
+ dataset=dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+ import json
+ from collections import defaultdict
+ import numpy as np
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters.json'), 'r') as f:
+ mappings = json.load(f)
+
+ inter = {}
+ sem_2_ids = defaultdict(list)
+ for mapping in mappings:
+ item_id = mapping['item_id']
+ clusters = mapping['clusters']
+ inter[int(item_id)] = clusters
+ sem_2_ids[tuple(clusters)].append(int(item_id))
+
+ for semantics, items in sem_2_ids.items():
+ assert len(items) <= CODEBOOK_SIZE, str(len(items))
+ collision_solvers = np.random.permutation(CODEBOOK_SIZE)[:len(items)].tolist()
+ for item_id, collision_solver in zip(items, collision_solvers):
+ inter[item_id].append(collision_solver)
+ for i in range(len(inter[item_id])):
+ inter[item_id][i] += CODEBOOK_SIZE * i
+
+ with open(os.path.join(RESULTS_PATH, f'{EXPERIMENT_NAME}_clusters_colisionless.json'), 'w') as f:
+ json.dump(inter, f, indent=2)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/rqvae-yambda/models.py b/scripts/rqvae-yambda/models.py
new file mode 100644
index 0000000..87c2241
--- /dev/null
+++ b/scripts/rqvae-yambda/models.py
@@ -0,0 +1,91 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class RQVAE(nn.Module):
+ def __init__(
+ self,
+ input_dim,
+ num_codebooks,
+ codebook_size,
+ embedding_dim,
+ beta=0.25,
+ quant_loss_weight=1.0,
+ ):
+ super().__init__()
+ self.register_buffer('beta', torch.tensor(beta))
+
+ self.input_dim = input_dim
+ self.num_codebooks = num_codebooks
+ self.codebook_size = codebook_size
+ self.embedding_dim = embedding_dim
+ self.quant_loss_weight = quant_loss_weight
+
+
+ self.encoder = self.make_encoding_tower(input_dim, embedding_dim)
+ self.decoder = self.make_encoding_tower(embedding_dim, input_dim)
+
+ self.codebooks = torch.nn.ParameterList()
+ for _ in range(num_codebooks):
+            # Registered as nn.Parameter so the codebooks move with .to(DEVICE) and are
+            # updated by the optimizer; values are overwritten by InitCodebooks before training.
+            cb = nn.Parameter(torch.empty(codebook_size, embedding_dim))
+            self.codebooks.append(cb)
+
+ @staticmethod
+ def make_encoding_tower(d1, d2, bias=False):
+ return torch.nn.Sequential(
+ nn.Linear(d1, d1),
+ nn.ReLU(),
+ nn.Linear(d1, d2),
+ nn.ReLU(),
+ nn.Linear(d2, d2, bias=bias)
+ )
+
+ @staticmethod
+ def get_codebook_indices(remainder, codebook):
+ dist = torch.cdist(remainder, codebook)
+ return dist.argmin(dim=-1)
+
+ def forward(self, inputs):
+ latent_vector = self.encoder(inputs['embedding'])
+ item_ids = inputs['item_id']
+
+ latent_restored = 0
+ rqvae_loss = 0
+ clusters = []
+ remainder = latent_vector
+
+ for codebook in self.codebooks:
+ codebook_indices = self.get_codebook_indices(remainder, codebook)
+ clusters.append(codebook_indices)
+
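+            # Straight-through estimator: the forward value is the quantized vector,
+            # while the gradient flows to `remainder` as if quantization were identity.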
+ quantized = codebook[codebook_indices]
+ codebook_vectors = remainder + (quantized - remainder).detach()
+
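+            # Commitment term (beta-weighted) pulls the encoder output toward its chosen
+            # centroid; the second term moves the centroid toward the encoder output.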
+ rqvae_loss += self.beta * torch.nn.functional.mse_loss(remainder, quantized.detach())
+ rqvae_loss += torch.nn.functional.mse_loss(quantized, remainder.detach())
+
+ latent_restored += codebook_vectors
+ remainder = remainder - codebook_vectors
+
+ embeddings_restored = self.decoder(latent_restored)
+ recon_loss = torch.nn.functional.mse_loss(embeddings_restored, inputs['embedding'])
+
+ loss = (
+ recon_loss
+ + self.quant_loss_weight * rqvae_loss
+ ).mean()
+
+ clusters_counts = []
+ for cluster in clusters:
+ clusters_counts.append(torch.bincount(cluster, minlength=self.codebook_size))
+
+ return loss, {
+ 'loss': loss.item(),
+ 'recon_loss': recon_loss.mean().item(),
+ 'rqvae_loss': rqvae_loss.mean().item(),
+
+ 'clusters_counts': clusters_counts,
+ 'clusters': torch.stack(clusters).T,
+ 'embedding_hat': embeddings_restored,
+ }
\ No newline at end of file
diff --git a/scripts/rqvae-yambda/train_yambda.py b/scripts/rqvae-yambda/train_yambda.py
new file mode 100644
index 0000000..71582ae
--- /dev/null
+++ b/scripts/rqvae-yambda/train_yambda.py
@@ -0,0 +1,151 @@
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.dataloader import DataLoader
+from irec.data.transforms import Collate, ToTorch, ToDevice
+from irec.runners import TrainingRunner
+
+from irec.utils import fix_random_seed
+
+from callbacks import InitCodebooks, FixDeadCentroids
+from data import EmbeddingDatasetParquet, ProcessEmbeddings
+from models import RQVAE
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+NUM_EPOCHS = 100
+BATCH_SIZE = 1024
+
+INPUT_DIM = 128
+HIDDEN_DIM = 128
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 3
+BETA = 0.25
+LR = 1e-4
+
+EXPERIMENT_NAME = 'rqvae_yambda_hd_128_cz_512'
+EMBEDDINGS_PATH = "/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet"
+IREC_PATH = '../../'
+
+print(EXPERIMENT_NAME, EMBEDDINGS_PATH)
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ dataset = EmbeddingDatasetParquet(
+ data_path=EMBEDDINGS_PATH
+ )
+
+ train_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ num_workers=8,
+ shuffle=True,
+ drop_last=True,
+ persistent_workers=True,
+ pin_memory=True
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(
+ ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding'])
+ ).repeat(NUM_EPOCHS)
+
+ valid_dataloader = DataLoader(
+ dataset,
+ batch_size=BATCH_SIZE,
+ shuffle=False,
+ drop_last=False,
+ ).map(Collate()).map(ToTorch()).map(ToDevice(DEVICE)).map(ProcessEmbeddings(embedding_dim=INPUT_DIM, keys=['embedding']))
+
+ LOG_EVERY_NUM_STEPS = int(len(train_dataloader) // NUM_EPOCHS)
+
+ model = RQVAE(
+ input_dim=INPUT_DIM,
+ num_codebooks=NUM_CODEBOOKS,
+ codebook_size=CODEBOOK_SIZE,
+ embedding_dim=HIDDEN_DIM,
+ beta=BETA,
+ quant_loss_weight=1.0
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR, fused=True)
+
+ callbacks = [
+ InitCodebooks(valid_dataloader),
+
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss']
+ }, name='train'),
+
+ FixDeadCentroids(valid_dataloader),
+
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ 'train/recon_loss': cb.MeanAccumulator(),
+ 'train/rqvae_loss': cb.MeanAccumulator(),
+ 'num_dead/0': cb.MeanAccumulator(),
+ 'num_dead/1': cb.MeanAccumulator(),
+ 'num_dead/2': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=LOG_EVERY_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, batch: {
+ 'loss': model_outputs['loss'],
+ 'recon_loss': model_outputs['recon_loss'],
+ 'rqvae_loss': model_outputs['rqvae_loss'],
+ }, name='valid'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'valid/loss': cb.MeanAccumulator(),
+ 'valid/recon_loss': cb.MeanAccumulator(),
+ 'valid/rqvae_loss': cb.MeanAccumulator(),
+ }
+ ),
+ ],
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+
+ cb.Logger().every_num_steps(LOG_EVERY_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+
+ cb.Profiler(
+ wait=10,
+ warmup=10,
+ active=10,
+ logdir=os.path.join(IREC_PATH, 'tensorboard_logs')
+ ),
+
+ cb.EarlyStopping(
+ metric='valid/recon_loss',
+ patience=40,
+ minimize=True,
+ model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ ).every_num_steps(LOG_EVERY_NUM_STEPS),
+ ]
+
+    logger.debug('Everything is ready for training!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger-lsvd/data.py b/scripts/tiger-lsvd/data.py
new file mode 100644
index 0000000..26123db
--- /dev/null
+++ b/scripts/tiger-lsvd/data.py
@@ -0,0 +1,428 @@
+from collections import defaultdict
+import json
+from loguru import logger
+import numpy as np
+from pathlib import Path
+
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+import polars as pl
+from irec.data.base import BaseDataset
+
+
+class InteractionsDatasetParquet(BaseDataset):
+ def __init__(self, data_path, max_items=None):
+ self.df = pl.read_parquet(data_path)
+ assert 'uid' in self.df.columns, "Missing 'uid' column"
+ assert 'item_ids' in self.df.columns, "Missing 'item_ids' column"
+ print(f"Dataset loaded: {len(self.df)} users")
+
+ if max_items is not None:
+ self.df = self.df.with_columns(
+ pl.col("item_ids").list.slice(-max_items).alias("item_ids")
+ )
+
+ def __getitem__(self, idx):
+ row = self.df.row(idx, named=True)
+ return {
+ 'user_id': row['uid'],
+ 'item_ids': np.array(row['item_ids'], dtype=np.uint32),
+ }
+
+ def __len__(self):
+ return len(self.df)
+
+ def __iter__(self):
+ for idx in range(len(self)):
+ yield self[idx]
+
+
+
+class Dataset:
+ def __init__(
+ self,
+ train_sampler,
+ validation_sampler,
+ test_sampler,
+ num_items,
+ max_sequence_length
+ ):
+ self._train_sampler = train_sampler
+ self._validation_sampler = validation_sampler
+ self._test_sampler = test_sampler
+ self._num_items = num_items
+ self._max_sequence_length = max_sequence_length
+
+ @classmethod
+ def create_timestamp_based_parquet(
+ cls,
+ train_parquet_path,
+ validation_parquet_path,
+ test_parquet_path,
+ max_sequence_length,
+ sampler_type,
+ min_sample_len=2,
+ is_extended=False,
+ max_train_events=50
+ ):
+ """
+        Loads data from parquet files with a timestamp-based split.
+
+        Expected parquet structure:
+        - uid: int (user id)
+        - item_ids: list[int] (list of item ids)
+
+        Same as create_timestamp_based, but for the parquet format.
+ """
+ max_item_id = 0
+ train_dataset, validation_dataset, test_dataset = [], [], []
+
+ print(f"started to load datasets from parquet with max train length {max_train_events}")
+
+        # Load the parquet files
+ train_df = pl.read_parquet(train_parquet_path)
+ validation_df = pl.read_parquet(validation_parquet_path)
+ test_df = pl.read_parquet(test_parquet_path)
+
+        # Check that the required columns are present
+ for df, name in [(train_df, "train"), (validation_df, "validation"), (test_df, "test")]:
+ assert 'uid' in df.columns, f"Missing 'uid' column in {name}"
+ assert 'item_ids' in df.columns, f"Missing 'item_ids' column in {name}"
+
+        # Build dictionaries for fast lookup
+ train_data = {str(row['uid']): row['item_ids'] for row in train_df.iter_rows(named=True)}
+ validation_data = {str(row['uid']): row['item_ids'] for row in validation_df.iter_rows(named=True)}
+ test_data = {str(row['uid']): row['item_ids'] for row in test_df.iter_rows(named=True)}
+
+ all_users = set(train_data.keys()) | set(validation_data.keys()) | set(test_data.keys())
+ print(f"all users count: {len(all_users)}")
+
+ us_count = 0
+ for user_id_str in all_users:
+ if us_count % 100 == 0:
+ print(f"user id {us_count}/{len(all_users)}: {user_id_str}")
+
+ user_id = int(user_id_str)
+
+            # Get the sequence for each split
+ train_items = list(train_data.get(user_id_str, []))
+ validation_items = list(validation_data.get(user_id_str, []))
+ test_items = list(test_data.get(user_id_str, []))
+
+            # Truncate train to the last max_train_events events
+ train_items = train_items[-max_train_events:] if len(train_items) > max_train_events else train_items
+
+ full_sequence = train_items + validation_items + test_items
+ if full_sequence:
+ max_item_id = max(max_item_id, max(full_sequence))
+
+ if us_count % 100 == 0:
+ print(f"full sequence len: {len(full_sequence)}")
+
+ us_count += 1
+ if len(full_sequence) < 4:
+ print(f'Core-4 dataset is used, user {user_id} has only {len(full_sequence)} items')
+ continue
+
+ if is_extended:
+ # sample = [1, 2]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ # sample = [1, 2, 3, 4, 5, 6, 7]
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ for prefix_length in range(min_sample_len, len(train_items) + 1):
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': train_items[:prefix_length],
+ })
+ else:
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': train_items,
+ })
+
+            # Validation
+
+            # Expand each validation item into a separate sample.
+            # Example: Train=[1,2], Valid=[3,4]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+
+ current_history = train_items.copy()
+ valid_small_history = 0
+ for item in validation_items:
+                # the eval dataset trims off the target itself later
+ sample_sequence = current_history + [item]
+
+ if len(sample_sequence) >= min_sample_len:
+ validation_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': sample_sequence,
+ })
+ else:
+ valid_small_history += 1
+ current_history.append(item)
+
+            # Expand each test item into a separate sample.
+            # Example: Train=[1,2], Valid=[3,4], Test=[5, 6]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ current_history = train_items + validation_items
+ test_small_history = 0
+ for item in test_items:
+ sample_sequence = current_history + [item]
+ if len(sample_sequence) >= min_sample_len:
+ test_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': sample_sequence,
+ })
+ else:
+ test_small_history += 1
+ current_history.append(item)
+
+ print(f"Train dataset size: {len(train_dataset)}")
+ print(f"Validation dataset size: {len(validation_dataset)} with skipped {valid_small_history}")
+ print(f"Test dataset size: {len(test_dataset)} with skipped {test_small_history}")
+
+ logger.debug(f'Train dataset size: {len(train_dataset)}')
+ logger.debug(f'Validation dataset size: {len(validation_dataset)}')
+ logger.debug(f'Test dataset size: {len(test_dataset)}')
+
+ train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length)
+ validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length)
+ test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length)
+
+ return cls(
+ train_sampler=train_sampler,
+ validation_sampler=validation_sampler,
+ test_sampler=test_sampler,
+ num_items=max_item_id + 1, # +1 added because our ids are 0-indexed
+ max_sequence_length=max_sequence_length
+ )
+
+ @classmethod
+ def create(cls, inter_json_path, max_sequence_length, sampler_type, is_extended=False):
+ max_item_id = 0
+ train_dataset, validation_dataset, test_dataset = [], [], []
+
+ with open(inter_json_path, 'r') as f:
+ user_interactions = json.load(f)
+
+ for user_id_str, item_ids in user_interactions.items():
+ user_id = int(user_id_str)
+
+ if item_ids:
+ max_item_id = max(max_item_id, max(item_ids))
+
+ assert len(item_ids) >= 5, f'Core-5 dataset is used, user {user_id} has only {len(item_ids)} items'
+
+ # sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (leave one out scheme, 8 - train, 9 - valid, 10 - test)
+ if is_extended:
+ # sample = [1, 2]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ # sample = [1, 2, 3, 4, 5, 6, 7]
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ for prefix_length in range(2, len(item_ids) - 2 + 1):
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids[:prefix_length],
+ })
+ else:
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids[:-2],
+ })
+
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+ validation_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids[:-1],
+ })
+
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ test_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids,
+ })
+
+ logger.debug(f'Train dataset size: {len(train_dataset)}')
+ logger.debug(f'Validation dataset size: {len(validation_dataset)}')
+ logger.debug(f'Test dataset size: {len(test_dataset)}')
+ logger.debug(f'Max item id: {max_item_id}')
+
+ train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length)
+ validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length)
+ test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length)
+
+ return cls(
+ train_sampler=train_sampler,
+ validation_sampler=validation_sampler,
+ test_sampler=test_sampler,
+ num_items=max_item_id + 1, # +1 added because our ids are 0-indexed
+ max_sequence_length=max_sequence_length
+ )
+
+ def get_datasets(self):
+ return self._train_sampler, self._validation_sampler, self._test_sampler
+
+ @property
+ def num_items(self):
+ return self._num_items
+
+ @property
+ def max_sequence_length(self):
+ return self._max_sequence_length
+
+
+class TrainDataset(BaseDataset):
+ def __init__(self, dataset, prediction_type, max_sequence_length):
+ self._dataset = dataset
+ self._prediction_type = prediction_type
+ self._max_sequence_length = max_sequence_length
+
+ self._transforms = {
+ 'sasrec': self._all_items_transform,
+ 'tiger': self._last_item_transform
+ }
+
+ def _all_items_transform(self, sample):
+ item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1]
+ next_item_sequence = sample['item.ids'][-self._max_sequence_length:][1:]
+ return {
+ 'user.ids': np.array(sample['user.ids'], dtype=np.int64),
+ 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64),
+ 'item.ids': np.array(item_sequence, dtype=np.int64),
+ 'item.length': np.array([len(item_sequence)], dtype=np.int64),
+ 'labels.ids': np.array(next_item_sequence, dtype=np.int64),
+ 'labels.length': np.array([len(next_item_sequence)], dtype=np.int64)
+ }
+
+ def _last_item_transform(self, sample):
+ item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1]
+ last_item = sample['item.ids'][-self._max_sequence_length:][-1]
+ return {
+ 'user.ids': np.array(sample['user.ids'], dtype=np.int64),
+ 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64),
+ 'item.ids': np.array(item_sequence, dtype=np.int64),
+ 'item.length': np.array([len(item_sequence)], dtype=np.int64),
+ 'labels.ids': np.array([last_item], dtype=np.int64),
+ 'labels.length': np.array([1], dtype=np.int64),
+ }
+
+ def __getitem__(self, index):
+ return self._transforms[self._prediction_type](self._dataset[index])
+
+ def __len__(self):
+ return len(self._dataset)
+
+
+class EvalDataset(BaseDataset):
+ def __init__(self, dataset, max_sequence_length):
+ self._dataset = dataset
+ self._max_sequence_length = max_sequence_length
+
+ @property
+ def dataset(self):
+ return self._dataset
+
+ def __len__(self):
+ return len(self._dataset)
+
+ def __getitem__(self, index):
+ sample = self._dataset[index]
+
+ item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1]
+ next_item = sample['item.ids'][-self._max_sequence_length:][-1]
+
+ return {
+ 'user.ids': np.array(sample['user.ids'], dtype=np.int64),
+ 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64),
+ 'item.ids': np.array(item_sequence, dtype=np.int64),
+ 'item.length': np.array([len(item_sequence)], dtype=np.int64),
+ 'labels.ids': np.array([next_item], dtype=np.int64),
+ 'labels.length': np.array([1], dtype=np.int64),
+ 'visited.ids': np.array(sample['item.ids'][:-1], dtype=np.int64),
+ 'visited.length': np.array([len(sample['item.ids'][:-1])], dtype=np.int64),
+ }
+
+
+class ArrowBatchDataset(BaseDataset):
+ def __init__(self, batch_dir, device='cuda', preload=False):
+ self.batch_dir = Path(batch_dir)
+ self.device = device
+
+ all_files = list(self.batch_dir.glob('batch_*_len_*.arrow'))
+
+ batch_files_map = defaultdict(list)
+ for f in all_files:
+ batch_id = int(f.stem.split('_')[1])
+ batch_files_map[batch_id].append(f)
+
+ for batch_id in batch_files_map:
+ batch_files_map[batch_id].sort()
+
+ self.batch_indices = sorted(batch_files_map.keys())
+
+ if preload:
+ print(f"Preloading {len(self.batch_indices)} batches...")
+ self.cached_batches = []
+
+ for idx in range(len(self.batch_indices)):
+ batch = self._load_batch(batch_files_map[self.batch_indices[idx]])
+ self.cached_batches.append(batch)
+ else:
+ self.cached_batches = None
+ self.batch_files_map = batch_files_map
+
+ def _load_batch(self, arrow_files):
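+ # each .arrow file holds one length-group of a saved batch; shape/dtype metadata
+ # written at save time is used to restore the original numpy arrays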
+ batch = {}
+
+ for arrow_file in arrow_files:
+ table = feather.read_table(arrow_file)
+ metadata = table.schema.metadata or {}
+
+ for col_name in table.column_names:
+ col = table.column(col_name)
+
+ shape_key = f'{col_name}_shape'
+ dtype_key = f'{col_name}_dtype'
+
+ if shape_key.encode() in metadata:
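+ # shape was serialized with str(); eval() restores the tuple, which is fine for
+ # locally written files (ast.literal_eval would be the stricter choice)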
+ shape = eval(metadata[shape_key.encode()].decode())
+ dtype = np.dtype(metadata[dtype_key.encode()].decode())
+
+ # check the column type: list columns round-trip through Python lists
+ if pa.types.is_list(col.type) or pa.types.is_large_list(col.type):
+ arr = np.array(col.to_pylist(), dtype=dtype)
+ else:
+ arr = col.to_numpy().reshape(shape).astype(dtype)
+ else:
+ if pa.types.is_list(col.type) or pa.types.is_large_list(col.type):
+ arr = np.array(col.to_pylist())
+ else:
+ arr = col.to_numpy()
+
+ batch[col_name] = torch.from_numpy(arr.copy()).to(self.device)
+
+ return batch
+
+ def __len__(self):
+ return len(self.batch_indices)
+
+ def __getitem__(self, idx):
+ if self.cached_batches is not None:
+ return self.cached_batches[idx]
+ else:
+ batch_id = self.batch_indices[idx]
+ arrow_files = self.batch_files_map[batch_id]
+ return self._load_batch(arrow_files)
diff --git a/scripts/tiger-lsvd/lsvd_train_4.1_plum.py b/scripts/tiger-lsvd/lsvd_train_4.1_plum.py
new file mode 100644
index 0000000..96a08d7
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_train_4.1_plum.py
@@ -0,0 +1,230 @@
+from functools import partial
+import json
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.transforms import Collate, ToDevice
+from irec.data.dataloader import DataLoader
+from irec.runners import TrainingRunner
+from irec.utils import fix_random_seed
+
+from data import ArrowBatchDataset
+from models import TigerModel, CorrectItemsLogitsProcessor
+
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/4-1_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/eval_batches/')
+
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'tiger_4-1_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35'
+
+# MISC
+SEED_VALUE = 42
+DEVICE = 'cuda'
+
+NUM_EPOCHS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+EMBEDDING_DIM = 128
+CODEBOOK_SIZE = 512
+NUM_POSITIONS = 80
+NUM_USER_HASH = 2000
+NUM_HEADS = 6
+NUM_LAYERS = 4
+FEEDFORWARD_DIM = 1024
+KV_DIM = 64
+DROPOUT = 0.2
+NUM_BEAMS = 30
+TOP_K = 20
+NUM_CODEBOOKS = 4
+LR = 0.0001
+
+USE_MICROBATCHING = True
+MICROBATCH_SIZE = 256
+
+torch.set_float32_matmul_precision('high')
+torch._dynamo.config.capture_scalar_outputs = True
+
+import torch._inductor.config as config
+config.triton.cudagraph_skip_dynamic_graphs = True
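+# compile knobs: capture data-dependent scalars in the graph and skip CUDA graphs
+# for dynamic shapes, intended to keep torch.compile('reduce-overhead') stable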
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataloader = DataLoader(
+ ArrowBatchDataset(
+ TRAIN_BATCHES_DIR,
+ device='cpu',
+ preload=True
+ ),
+ batch_size=1,
+ shuffle=True,
+ num_workers=0,
+ pin_memory=True,
+ collate_fn=Collate()
+ ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
+
+ valid_dataloader = ArrowBatchDataset(
+ VALID_BATCHES_DIR,
+ device=DEVICE,
+ preload=True
+ )
+
+ eval_dataloader = ArrowBatchDataset(
+ EVAL_BATCHES_DIR,
+ device=DEVICE,
+ preload=True
+ )
+
+ model = TigerModel(
+ embedding_dim=EMBEDDING_DIM,
+ codebook_size=CODEBOOK_SIZE,
+ sem_id_len=NUM_CODEBOOKS,
+ user_ids_count=NUM_USER_HASH,
+ num_positions=NUM_POSITIONS,
+ num_heads=NUM_HEADS,
+ num_encoder_layers=NUM_LAYERS,
+ num_decoder_layers=NUM_LAYERS,
+ dim_feedforward=FEEDFORWARD_DIM,
+ num_beams=NUM_BEAMS,
+ num_return_sequences=TOP_K,
+ activation='relu',
+ d_kv=KV_DIM,
+ dropout=DROPOUT,
+ layer_norm_eps=1e-6,
+ initializer_range=0.02,
+ logits_processor=partial(
+ CorrectItemsLogitsProcessor,
+ NUM_CODEBOOKS,
+ CODEBOOK_SIZE,
+ mappings,
+ NUM_BEAMS
+ ),
+ use_microbatching=USE_MICROBATCHING,
+ microbatch_size=MICROBATCH_SIZE
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.AdamW(
+ model.parameters(),
+ lr=LR,
+ )
+
+ EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS)
+
+ callbacks = [
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ }, name='train'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=EPOCH_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='validation'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'validation/loss': cb.MeanAccumulator(),
+ 'validation/recall@5': cb.MeanAccumulator(),
+ 'validation/recall@10': cb.MeanAccumulator(),
+ 'validation/recall@20': cb.MeanAccumulator(),
+ 'validation/ndcg@5': cb.MeanAccumulator(),
+ 'validation/ndcg@10': cb.MeanAccumulator(),
+ 'validation/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Validation(
+ dataset=eval_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='eval'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'eval/loss': cb.MeanAccumulator(),
+ 'eval/recall@5': cb.MeanAccumulator(),
+ 'eval/recall@10': cb.MeanAccumulator(),
+ 'eval/recall@20': cb.MeanAccumulator(),
+ 'eval/ndcg@5': cb.MeanAccumulator(),
+ 'eval/ndcg@10': cb.MeanAccumulator(),
+ 'eval/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS * 4),
+
+ cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),
+
+ cb.EarlyStopping(
+ metric='validation/ndcg@20',
+ patience=40 * 4,
+ minimize=False,
+ model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
+ ).every_num_steps(EPOCH_NUM_STEPS)
+
+ # cb.Profiler(
+ # wait=10,
+ # warmup=10,
+ # active=10,
+ # logdir=TENSORBOARD_LOGDIR
+ # ),
+ # cb.StopAfterNumSteps(40)
+
+ ]
+
+ logger.debug('Everything is ready for training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger-lsvd/lsvd_train_4.2_plum.py b/scripts/tiger-lsvd/lsvd_train_4.2_plum.py
new file mode 100644
index 0000000..991f662
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_train_4.2_plum.py
@@ -0,0 +1,230 @@
+from functools import partial
+import json
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.transforms import Collate, ToDevice
+from irec.data.dataloader import DataLoader
+from irec.runners import TrainingRunner
+from irec.utils import fix_random_seed
+
+from data import ArrowBatchDataset
+from models import TigerModel, CorrectItemsLogitsProcessor
+
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/4-2_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/eval_batches/')
+
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'tiger_4-2_vk_lsvd_ods_base_cb_512_ws_2_k_2000_8w_e_35'
+
+# MISC
+SEED_VALUE = 42
+DEVICE = 'cuda'
+
+NUM_EPOCHS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+EMBEDDING_DIM = 128
+CODEBOOK_SIZE = 512
+NUM_POSITIONS = 80
+NUM_USER_HASH = 2000
+NUM_HEADS = 6
+NUM_LAYERS = 4
+FEEDFORWARD_DIM = 1024
+KV_DIM = 64
+DROPOUT = 0.2
+NUM_BEAMS = 30
+TOP_K = 20
+NUM_CODEBOOKS = 4
+LR = 0.0001
+
+USE_MICROBATCHING = True
+MICROBATCH_SIZE = 256
+
+torch.set_float32_matmul_precision('high')
+torch._dynamo.config.capture_scalar_outputs = True
+
+import torch._inductor.config as config
+config.triton.cudagraph_skip_dynamic_graphs = True
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataloader = DataLoader(
+ ArrowBatchDataset(
+ TRAIN_BATCHES_DIR,
+ device='cpu',
+ preload=True
+ ),
+ batch_size=1,
+ shuffle=True,
+ num_workers=0,
+ pin_memory=True,
+ collate_fn=Collate()
+ ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
+
+ valid_dataloader = ArrowBatchDataset(
+ VALID_BATCHES_DIR,
+ device=DEVICE,
+ preload=True
+ )
+
+ eval_dataloader = ArrowBatchDataset(
+ EVAL_BATCHES_DIR,
+ device=DEVICE,
+ preload=True
+ )
+
+ model = TigerModel(
+ embedding_dim=EMBEDDING_DIM,
+ codebook_size=CODEBOOK_SIZE,
+ sem_id_len=NUM_CODEBOOKS,
+ user_ids_count=NUM_USER_HASH,
+ num_positions=NUM_POSITIONS,
+ num_heads=NUM_HEADS,
+ num_encoder_layers=NUM_LAYERS,
+ num_decoder_layers=NUM_LAYERS,
+ dim_feedforward=FEEDFORWARD_DIM,
+ num_beams=NUM_BEAMS,
+ num_return_sequences=TOP_K,
+ activation='relu',
+ d_kv=KV_DIM,
+ dropout=DROPOUT,
+ layer_norm_eps=1e-6,
+ initializer_range=0.02,
+ logits_processor=partial(
+ CorrectItemsLogitsProcessor,
+ NUM_CODEBOOKS,
+ CODEBOOK_SIZE,
+ mappings,
+ NUM_BEAMS
+ ),
+ use_microbatching=USE_MICROBATCHING,
+ microbatch_size=MICROBATCH_SIZE
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.AdamW(
+ model.parameters(),
+ lr=LR,
+ )
+
+ EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS)
+
+ callbacks = [
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ }, name='train'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=EPOCH_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='validation'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'validation/loss': cb.MeanAccumulator(),
+ 'validation/recall@5': cb.MeanAccumulator(),
+ 'validation/recall@10': cb.MeanAccumulator(),
+ 'validation/recall@20': cb.MeanAccumulator(),
+ 'validation/ndcg@5': cb.MeanAccumulator(),
+ 'validation/ndcg@10': cb.MeanAccumulator(),
+ 'validation/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Validation(
+ dataset=eval_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='eval'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'eval/loss': cb.MeanAccumulator(),
+ 'eval/recall@5': cb.MeanAccumulator(),
+ 'eval/recall@10': cb.MeanAccumulator(),
+ 'eval/recall@20': cb.MeanAccumulator(),
+ 'eval/ndcg@5': cb.MeanAccumulator(),
+ 'eval/ndcg@10': cb.MeanAccumulator(),
+ 'eval/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS * 4),
+
+ cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),
+
+ cb.EarlyStopping(
+ metric='validation/ndcg@20',
+ patience=40 * 4,
+ minimize=False,
+ model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
+ ).every_num_steps(EPOCH_NUM_STEPS)
+
+ # cb.Profiler(
+ # wait=10,
+ # warmup=10,
+ # active=10,
+ # logdir=TENSORBOARD_LOGDIR
+ # ),
+ # cb.StopAfterNumSteps(40)
+
+ ]
+
+ logger.debug('Everything is ready for training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger-lsvd/lsvd_train_rqvae.py b/scripts/tiger-lsvd/lsvd_train_rqvae.py
new file mode 100644
index 0000000..aadf225
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_train_rqvae.py
@@ -0,0 +1,230 @@
+from functools import partial
+import json
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.transforms import Collate, ToDevice
+from irec.data.dataloader import DataLoader
+from irec.runners import TrainingRunner
+from irec.utils import fix_random_seed
+
+from data import ArrowBatchDataset
+from models import TigerModel, CorrectItemsLogitsProcessor
+
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/rqvae_vk_lsvd_cz_512_8-weeks_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/eval_batches/')
+
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'tiger_rqvae_vk_lsvd_cz_512_8-weeks'
+
+# MISC
+SEED_VALUE = 42
+DEVICE = 'cuda'
+
+NUM_EPOCHS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+EMBEDDING_DIM = 128
+CODEBOOK_SIZE = 512
+NUM_POSITIONS = 80
+NUM_USER_HASH = 2000
+NUM_HEADS = 6
+NUM_LAYERS = 4
+FEEDFORWARD_DIM = 1024
+KV_DIM = 64
+DROPOUT = 0.2
+NUM_BEAMS = 30
+TOP_K = 20
+NUM_CODEBOOKS = 4
+LR = 0.0001
+
+USE_MICROBATCHING = True
+MICROBATCH_SIZE = 256
+
+torch.set_float32_matmul_precision('high')
+torch._dynamo.config.capture_scalar_outputs = True
+
+import torch._inductor.config as config
+config.triton.cudagraph_skip_dynamic_graphs = True
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataloader = DataLoader(
+ ArrowBatchDataset(
+ TRAIN_BATCHES_DIR,
+ device='cpu',
+ preload=True
+ ),
+ batch_size=1,
+ shuffle=True,
+ num_workers=0,
+ pin_memory=True,
+ collate_fn=Collate()
+ ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
+
+ valid_dataloader = ArrowBatchDataset(
+ VALID_BATCHES_DIR,
+ device=DEVICE,
+ preload=True
+ )
+
+ eval_dataloader = ArrowBatchDataset(
+ EVAL_BATCHES_DIR,
+ device=DEVICE,
+ preload=True
+ )
+
+ model = TigerModel(
+ embedding_dim=EMBEDDING_DIM,
+ codebook_size=CODEBOOK_SIZE,
+ sem_id_len=NUM_CODEBOOKS,
+ user_ids_count=NUM_USER_HASH,
+ num_positions=NUM_POSITIONS,
+ num_heads=NUM_HEADS,
+ num_encoder_layers=NUM_LAYERS,
+ num_decoder_layers=NUM_LAYERS,
+ dim_feedforward=FEEDFORWARD_DIM,
+ num_beams=NUM_BEAMS,
+ num_return_sequences=TOP_K,
+ activation='relu',
+ d_kv=KV_DIM,
+ dropout=DROPOUT,
+ layer_norm_eps=1e-6,
+ initializer_range=0.02,
+ logits_processor=partial(
+ CorrectItemsLogitsProcessor,
+ NUM_CODEBOOKS,
+ CODEBOOK_SIZE,
+ mappings,
+ NUM_BEAMS
+ ),
+ use_microbatching=USE_MICROBATCHING,
+ microbatch_size=MICROBATCH_SIZE
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.AdamW(
+ model.parameters(),
+ lr=LR,
+ )
+
+ EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS)
+
+ callbacks = [
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ }, name='train'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=EPOCH_NUM_STEPS
+ ),
+
+ cb.Validation(
+ dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='validation'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'validation/loss': cb.MeanAccumulator(),
+ 'validation/recall@5': cb.MeanAccumulator(),
+ 'validation/recall@10': cb.MeanAccumulator(),
+ 'validation/recall@20': cb.MeanAccumulator(),
+ 'validation/ndcg@5': cb.MeanAccumulator(),
+ 'validation/ndcg@10': cb.MeanAccumulator(),
+ 'validation/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Validation(
+ dataset=eval_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='eval'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'eval/loss': cb.MeanAccumulator(),
+ 'eval/recall@5': cb.MeanAccumulator(),
+ 'eval/recall@10': cb.MeanAccumulator(),
+ 'eval/recall@20': cb.MeanAccumulator(),
+ 'eval/ndcg@5': cb.MeanAccumulator(),
+ 'eval/ndcg@10': cb.MeanAccumulator(),
+ 'eval/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS * 4),
+
+ cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),
+
+ cb.EarlyStopping(
+ metric='validation/ndcg@20',
+ patience=40 * 4,
+ minimize=False,
+ model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
+ ).every_num_steps(EPOCH_NUM_STEPS)
+
+ # cb.Profiler(
+ # wait=10,
+ # warmup=10,
+ # active=10,
+ # logdir=TENSORBOARD_LOGDIR
+ # ),
+ # cb.StopAfterNumSteps(40)
+
+ ]
+
+ logger.debug('Everything is ready for training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger-lsvd/lsvd_varka_4.1_plum.py b/scripts/tiger-lsvd/lsvd_varka_4.1_plum.py
new file mode 100644
index 0000000..cc35507
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_varka_4.1_plum.py
@@ -0,0 +1,304 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+print("tiger no arrow varka 4.1")
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/base_with_gap_interactions_grouped.parquet"
+INTERACTIONS_VALID_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/val_interactions_grouped.parquet"
+INTERACTIONS_TEST_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/test_interactions_grouped.parquet"
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/4-1_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.1/eval_batches/')
+
+
+# MISC
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_TRAIN_EVENTS = 500
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities
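+# token layout: [0, NUM_CODEBOOKS*CODEBOOK_SIZE) semantic-id tokens, then NUM_USER_HASH
+# user-hash tokens, then the utility tokens (pad/eos/decoder-start) at the top of the vocab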
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+class TigerProcessing(Transform):
+ def __call__(self, batch):
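+ # builds the encoder input (right-aligned semantic ids plus one hashed-user token)
+ # and the decoder target (semantic ids prefixed with the decoder-start token)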
+ input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+ batch_size = attention_mask.shape[0]
+
+ input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # fill padded positions with the PAD token
+
+ input_semantic_ids = np.concatenate([
+ input_semantic_ids,
+ NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+ ], axis=-1)
+
+ attention_mask = np.concatenate([
+ attention_mask,
+ np.ones((batch_size, 1), dtype=attention_mask.dtype)
+ ], axis=-1)
+
+ batch['input.data'] = input_semantic_ids
+ batch['input.mask'] = attention_mask
+
+ target_semantic_ids = batch['labels.semantic.padded']
+ target_semantic_ids = np.concatenate([
+ np.ones(
+ (batch_size, 1),
+ dtype=np.int64,
+ ) * DECODER_START_TOKEN_ID,
+ target_semantic_ids
+ ], axis=-1)
+
+ batch['output.data'] = target_semantic_ids
+
+ return batch
+
+
+class ToMasked(Transform):
+ def __init__(self, prefix, is_right_aligned=False):
+ self._prefix = prefix
+ self._is_right_aligned = is_right_aligned
+
+ def __call__(self, batch):
+ data = batch[f'{self._prefix}.ids']
+ lengths = batch[f'{self._prefix}.length']
+
+ batch_size = lengths.shape[0]
+ max_sequence_length = int(lengths.max())
+
+ if len(data.shape) == 1: # only indices
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len)
+ else:
+ assert len(data.shape) == 2 # embeddings
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length, data.shape[-1]),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len, emb_dim)
+
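+ # mark the valid positions; flipping right-aligns each sequence at the end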
+ mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+ if self._is_right_aligned:
+ mask = np.flip(mask, axis=-1)
+
+ padded_tensor[mask] = data
+
+ batch[f'{self._prefix}.padded'] = padded_tensor
+ batch[f'{self._prefix}.mask'] = mask
+
+ return batch
+
+
+class SemanticIdsMapper(Transform):
+ def __init__(self, mapping, names=()):
+ super().__init__()
+ self._mapping = mapping
+ self._names = names
+
+ max_item_id = max(int(k) for k in mapping.keys())
+ print(f"Mapping: {len(mapping)} keys, id range [{min(int(k) for k in mapping)}, {max_item_id}]")
+ data = []
+ for i in range(max_item_id + 1):
+ if str(i) in mapping:
+ data.append(mapping[str(i)])
+ else:
+ data.append([-1] * NUM_CODEBOOKS)
+
+ self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+ self._semantic_length = self._mapping_tensor.shape[-1]
+
+ missing_count = (max_item_id + 1) - len(mapping)
+ print(f"Mapping: {len(mapping)} items, {missing_count} missing (-1 filled)")
+
+ def __call__(self, batch):
+ for name in self._names:
+ if f'{name}.ids' in batch:
+ ids = batch[f'{name}.ids']
+ lengths = batch[f'{name}.length']
+ assert ids.min() >= 0
+ assert ids.max() < self._mapping_tensor.shape[0]
+ semantic_ids = self._mapping_tensor[ids].flatten()
+
+ assert (semantic_ids != -1).all(), \
+ f"Missing mappings detected in {name}! Invalid positions: {(semantic_ids == -1).sum()} out of {len(semantic_ids)}"
+
+ batch[f'{name}.semantic.ids'] = semantic_ids.numpy()
+ batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+ return batch
+
+
+class UserHashing(Transform):
+ def __init__(self, hash_size):
+ super().__init__()
+ self._hash_size = hash_size
+
+ def __call__(self, batch):
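+ # murmurhash maps arbitrary user ids into a fixed number of hash buckets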
+ batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+ return batch
+
+
+def save_batches_to_arrow(batches, output_dir):
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=False)
+
+ for batch_idx, batch in enumerate(batches):
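+ # group arrays by leading dimension so each Arrow table has equal-length columns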
+ length_groups = defaultdict(dict)
+ metadata_groups = defaultdict(dict)
+
+ for key, value in batch.items():
+ length = len(value)
+
+ metadata_groups[length][f'{key}_shape'] = str(value.shape)
+ metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+ if value.ndim == 1:
+ # 1D array - store as is
+ length_groups[length][key] = value
+ elif value.ndim == 2:
+ # 2D array - store as list of lists
+ length_groups[length][key] = value.tolist()
+ else:
+ # >2D array - flatten and keep the shape in metadata
+ length_groups[length][key] = value.flatten()
+
+ for length, fields in length_groups.items():
+ # pa.array handles both flat numpy arrays and lists of lists
+ arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+ table = pa.table(arrow_dict)
+ if length in metadata_groups:
+ table = table.replace_schema_metadata(metadata_groups[length])
+
+ feather.write_feather(
+ table,
+ output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+ compression='lz4'
+ )
+
+
+def main():
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+ print("варка может начать умирать")
+ data = Dataset.create_timestamp_based_parquet(
+ train_parquet_path=INTERACTIONS_TRAIN_PATH,
+ validation_parquet_path=INTERACTIONS_VALID_PATH,
+ test_parquet_path=INTERACTIONS_TEST_PATH,
+ max_sequence_length=MAX_SEQ_LEN,
+ sampler_type='tiger',
+ min_sample_len=2,
+ is_extended=True,
+ max_train_events=MAX_TRAIN_EVENTS
+ )
+
+ train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+ print("варка не умерла")
+ train_dataloader = DataLoader(
+ dataset=train_dataset,
+ batch_size=TRAIN_BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ valid_dataloader = DataLoader(
+ dataset=valid_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ eval_dataloader = DataLoader(
+ dataset=eval_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ train_batches = []
+ for train_batch in train_dataloader:
+ train_batches.append(train_batch)
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
+ valid_batches = []
+ for valid_batch in valid_dataloader:
+ valid_batches.append(valid_batch)
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
+ eval_batches = []
+ for eval_batch in eval_dataloader:
+ eval_batches.append(eval_batch)
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger-lsvd/lsvd_varka_4.2_plum.py b/scripts/tiger-lsvd/lsvd_varka_4.2_plum.py
new file mode 100644
index 0000000..7de54e4
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_varka_4.2_plum.py
@@ -0,0 +1,304 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+print("tiger no arrow varka 4.1")
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/base_with_gap_interactions_grouped.parquet"
+INTERACTIONS_VALID_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/val_interactions_grouped.parquet"
+INTERACTIONS_TEST_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-days-base-ows/test_interactions_grouped.parquet"
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/4-2_vk_lsvd_ods_base_with_gap_cb_512_ws_2_k_2000_8w_e_35_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-4.2/eval_batches/')
+
+
+# MISC
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_TRAIN_EVENTS = 500
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+class TigerProcessing(Transform):
+ def __call__(self, batch):
+ input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+ batch_size = attention_mask.shape[0]
+
+ input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # fill padded positions with the PAD token
+
+ input_semantic_ids = np.concatenate([
+ input_semantic_ids,
+ NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+ ], axis=-1)
+
+ attention_mask = np.concatenate([
+ attention_mask,
+ np.ones((batch_size, 1), dtype=attention_mask.dtype)
+ ], axis=-1)
+
+ batch['input.data'] = input_semantic_ids
+ batch['input.mask'] = attention_mask
+
+ target_semantic_ids = batch['labels.semantic.padded']
+ target_semantic_ids = np.concatenate([
+ np.ones(
+ (batch_size, 1),
+ dtype=np.int64,
+ ) * DECODER_START_TOKEN_ID,
+ target_semantic_ids
+ ], axis=-1)
+
+ batch['output.data'] = target_semantic_ids
+
+ return batch
+
+
+class ToMasked(Transform):
+ def __init__(self, prefix, is_right_aligned=False):
+ self._prefix = prefix
+ self._is_right_aligned = is_right_aligned
+
+ def __call__(self, batch):
+ data = batch[f'{self._prefix}.ids']
+ lengths = batch[f'{self._prefix}.length']
+
+ batch_size = lengths.shape[0]
+ max_sequence_length = int(lengths.max())
+
+ if len(data.shape) == 1: # only indices
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len)
+ else:
+ assert len(data.shape) == 2 # embeddings
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length, data.shape[-1]),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len, emb_dim)
+
+ mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+ if self._is_right_aligned:
+ mask = np.flip(mask, axis=-1)
+
+ padded_tensor[mask] = data
+
+ batch[f'{self._prefix}.padded'] = padded_tensor
+ batch[f'{self._prefix}.mask'] = mask
+
+ return batch
+
+
+class SemanticIdsMapper(Transform):
+ def __init__(self, mapping, names=()):
+ super().__init__()
+ self._mapping = mapping
+ self._names = names
+
+ max_item_id = max(int(k) for k in mapping.keys())
+ print(f"Mapping: {len(mapping)} keys, id range [{min(int(k) for k in mapping)}, {max_item_id}]")
+ data = []
+ for i in range(max_item_id + 1):
+ if str(i) in mapping:
+ data.append(mapping[str(i)])
+ else:
+ data.append([-1] * NUM_CODEBOOKS)
+
+ self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+ self._semantic_length = self._mapping_tensor.shape[-1]
+
+ missing_count = (max_item_id + 1) - len(mapping)
+ print(f"Mapping: {len(mapping)} items, {missing_count} missing (-1 filled)")
+
+ def __call__(self, batch):
+ for name in self._names:
+ if f'{name}.ids' in batch:
+ ids = batch[f'{name}.ids']
+ lengths = batch[f'{name}.length']
+ assert ids.min() >= 0
+ assert ids.max() < self._mapping_tensor.shape[0]
+ semantic_ids = self._mapping_tensor[ids].flatten()
+
+ assert (semantic_ids != -1).all(), \
+ f"Missing mappings detected in {name}! Invalid positions: {(semantic_ids == -1).sum()} out of {len(semantic_ids)}"
+
+ batch[f'{name}.semantic.ids'] = semantic_ids.numpy()
+ batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+ return batch
+
+
+class UserHashing(Transform):
+ def __init__(self, hash_size):
+ super().__init__()
+ self._hash_size = hash_size
+
+ def __call__(self, batch):
+ batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+ return batch
+
+
+def save_batches_to_arrow(batches, output_dir):
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=False)
+
+ for batch_idx, batch in enumerate(batches):
+ length_groups = defaultdict(dict)
+ metadata_groups = defaultdict(dict)
+
+ for key, value in batch.items():
+ length = len(value)
+
+ metadata_groups[length][f'{key}_shape'] = str(value.shape)
+ metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+ if value.ndim == 1:
+ # 1D array - store as is
+ length_groups[length][key] = value
+ elif value.ndim == 2:
+ # 2D array - store as list of lists
+ length_groups[length][key] = value.tolist()
+ else:
+ # >2D array - flatten and keep the shape in metadata
+ length_groups[length][key] = value.flatten()
+
+ for length, fields in length_groups.items():
+ # pa.array handles both flat numpy arrays and lists of lists
+ arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+ table = pa.table(arrow_dict)
+ if length in metadata_groups:
+ table = table.replace_schema_metadata(metadata_groups[length])
+
+ feather.write_feather(
+ table,
+ output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+ compression='lz4'
+ )
+
+
+def main():
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+ print("варка может начать умирать")
+ data = Dataset.create_timestamp_based_parquet(
+ train_parquet_path=INTERACTIONS_TRAIN_PATH,
+ validation_parquet_path=INTERACTIONS_VALID_PATH,
+ test_parquet_path=INTERACTIONS_TEST_PATH,
+ max_sequence_length=MAX_SEQ_LEN,
+ sampler_type='tiger',
+ min_sample_len=2,
+ is_extended=True,
+ max_train_events=MAX_TRAIN_EVENTS
+ )
+
+ train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+ print("варка не умерла")
+ train_dataloader = DataLoader(
+ dataset=train_dataset,
+ batch_size=TRAIN_BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ valid_dataloader = DataLoader(
+ dataset=valid_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ eval_dataloader = DataLoader(
+ dataset=eval_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ train_batches = []
+ for train_batch in train_dataloader:
+ train_batches.append(train_batch)
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
+ valid_batches = []
+ for valid_batch in valid_dataloader:
+ valid_batches.append(valid_batch)
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
+ eval_batches = []
+ for eval_batch in eval_dataloader:
+ eval_batches.append(eval_batch)
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger-lsvd/lsvd_varka_rqvae.py b/scripts/tiger-lsvd/lsvd_varka_rqvae.py
new file mode 100644
index 0000000..bb5ecc0
--- /dev/null
+++ b/scripts/tiger-lsvd/lsvd_varka_rqvae.py
@@ -0,0 +1,304 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+print("tiger no arrow varka 4.1")
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/base_with_gap_interactions_grouped.parquet"
+INTERACTIONS_VALID_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/val_interactions_grouped.parquet"
+INTERACTIONS_TEST_PATH = "/home/jovyan/IRec/sigir/lsvd_data/8-weeks-base-ows/test_interactions_grouped.parquet"
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/rqvae_vk_lsvd_cz_512_8-weeks_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/lsvd/8-weeks-base-one-week-split-rqvae/eval_batches/')
+
+
+# MISC
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_TRAIN_EVENTS = 500
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 512
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+class TigerProcessing(Transform):
+ def __call__(self, batch):
+ input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+ batch_size = attention_mask.shape[0]
+
+ input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # fill padded positions with the PAD token
+
+ input_semantic_ids = np.concatenate([
+ input_semantic_ids,
+ NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+ ], axis=-1)
+
+ attention_mask = np.concatenate([
+ attention_mask,
+ np.ones((batch_size, 1), dtype=attention_mask.dtype)
+ ], axis=-1)
+
+ batch['input.data'] = input_semantic_ids
+ batch['input.mask'] = attention_mask
+
+ target_semantic_ids = batch['labels.semantic.padded']
+ target_semantic_ids = np.concatenate([
+ np.ones(
+ (batch_size, 1),
+ dtype=np.int64,
+ ) * DECODER_START_TOKEN_ID,
+ target_semantic_ids
+ ], axis=-1)
+
+ batch['output.data'] = target_semantic_ids
+
+ return batch
+
+
+class ToMasked(Transform):
+ def __init__(self, prefix, is_right_aligned=False):
+ self._prefix = prefix
+ self._is_right_aligned = is_right_aligned
+
+ def __call__(self, batch):
+ data = batch[f'{self._prefix}.ids']
+ lengths = batch[f'{self._prefix}.length']
+
+ batch_size = lengths.shape[0]
+ max_sequence_length = int(lengths.max())
+
+ if len(data.shape) == 1: # only indices
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len)
+ else:
+ assert len(data.shape) == 2 # embeddings
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length, data.shape[-1]),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len, emb_dim)
+
+ mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+ if self._is_right_aligned:
+ mask = np.flip(mask, axis=-1)
+
+ padded_tensor[mask] = data
+
+ batch[f'{self._prefix}.padded'] = padded_tensor
+ batch[f'{self._prefix}.mask'] = mask
+
+ return batch
+
+
+class SemanticIdsMapper(Transform):
+ def __init__(self, mapping, names=()):
+ super().__init__()
+ self._mapping = mapping
+ self._names = names
+
+ max_item_id = max(int(k) for k in mapping.keys())
+ print(f"Mapping: {len(mapping)} keys, id range [{min(int(k) for k in mapping)}, {max_item_id}]")
+ data = []
+ for i in range(max_item_id + 1):
+ if str(i) in mapping:
+ data.append(mapping[str(i)])
+ else:
+ data.append([-1] * NUM_CODEBOOKS)
+
+ self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+ self._semantic_length = self._mapping_tensor.shape[-1]
+
+ missing_count = (max_item_id + 1) - len(mapping)
+ print(f"Mapping: {len(mapping)} items, {missing_count} missing (-1 filled)")
+
+ def __call__(self, batch):
+ for name in self._names:
+ if f'{name}.ids' in batch:
+ ids = batch[f'{name}.ids']
+ lengths = batch[f'{name}.length']
+ assert ids.min() >= 0
+ assert ids.max() < self._mapping_tensor.shape[0]
+ semantic_ids = self._mapping_tensor[ids].flatten()
+
+ assert (semantic_ids != -1).all(), \
+ f"Missing mappings detected in {name}! Invalid positions: {(semantic_ids == -1).sum()} out of {len(semantic_ids)}"
+
+ batch[f'{name}.semantic.ids'] = semantic_ids.numpy()
+ batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+ return batch
+
+
+class UserHashing(Transform):
+ def __init__(self, hash_size):
+ super().__init__()
+ self._hash_size = hash_size
+
+ def __call__(self, batch):
+ batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+ return batch
+
+
+def save_batches_to_arrow(batches, output_dir):
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=False)
+
+ for batch_idx, batch in enumerate(batches):
+ length_groups = defaultdict(dict)
+ metadata_groups = defaultdict(dict)
+
+ for key, value in batch.items():
+ length = len(value)
+
+ metadata_groups[length][f'{key}_shape'] = str(value.shape)
+ metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+ if value.ndim == 1:
+ # 1D array - store as is
+ length_groups[length][key] = value
+ elif value.ndim == 2:
+ # 2D array - store as list of lists
+ length_groups[length][key] = value.tolist()
+ else:
+ # >2D array - flatten and keep the shape in metadata
+ length_groups[length][key] = value.flatten()
+
+ for length, fields in length_groups.items():
+ # pa.array handles both flat numpy arrays and lists of lists
+ arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+ table = pa.table(arrow_dict)
+ if length in metadata_groups:
+ table = table.replace_schema_metadata(metadata_groups[length])
+
+ feather.write_feather(
+ table,
+ output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+ compression='lz4'
+ )
+
+
+def main():
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+ print("варка может начать умирать")
+ data = Dataset.create_timestamp_based_parquet(
+ train_parquet_path=INTERACTIONS_TRAIN_PATH,
+ validation_parquet_path=INTERACTIONS_VALID_PATH,
+ test_parquet_path=INTERACTIONS_TEST_PATH,
+ max_sequence_length=MAX_SEQ_LEN,
+ sampler_type='tiger',
+ min_sample_len=2,
+ is_extended=True,
+ max_train_events=MAX_TRAIN_EVENTS
+ )
+
+ train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+ print("варка не умерла")
+ train_dataloader = DataLoader(
+ dataset=train_dataset,
+ batch_size=TRAIN_BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ valid_dataloader = DataLoader(
+ dataset=valid_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ eval_dataloader = DataLoader(
+ dataset=eval_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ train_batches = []
+ for train_batch in train_dataloader:
+ train_batches.append(train_batch)
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
+ valid_batches = []
+ for valid_batch in valid_dataloader:
+ valid_batches.append(valid_batch)
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
+ eval_batches = []
+ for eval_batch in eval_dataloader:
+ eval_batches.append(eval_batch)
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger-lsvd/models.py b/scripts/tiger-lsvd/models.py
new file mode 100644
index 0000000..f89419e
--- /dev/null
+++ b/scripts/tiger-lsvd/models.py
@@ -0,0 +1,223 @@
+import torch
+from transformers import T5ForConditionalGeneration, T5Config, LogitsProcessor
+
+from irec.models import TorchModel
+
+
+class CorrectItemsLogitsProcessor(LogitsProcessor):
+ def __init__(self, num_codebooks, codebook_size, mapping, num_beams, visited_items):
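+ # builds a per-user table of item semantic ids; entries for already-visited items
+ # are overwritten with zeros so they stop being proposed as valid continuations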
+ self.num_codebooks = num_codebooks
+ self.codebook_size = codebook_size
+ self.num_beams = num_beams
+
+ semantic_ids = []
+ for i in range(len(mapping)):
+ assert len(mapping[str(i)]) == num_codebooks, 'All semantic ids must have the same length'
+ semantic_ids.append(mapping[str(i)])
+
+ self.index_semantic_ids = torch.tensor(semantic_ids, dtype=torch.long, device=visited_items.device) # (num_items, semantic_ids)
+
+ batch_size, _ = visited_items.shape
+
+ self.index_semantic_ids = torch.tile(self.index_semantic_ids[None], dims=[batch_size, 1, 1]) # (batch_size, num_items, semantic_ids)
+
+ index = visited_items[..., None].tile(dims=[1, 1, num_codebooks]) # (batch_size, num_rated, semantic_ids)
+ self.index_semantic_ids = torch.scatter(
+ input=self.index_semantic_ids,
+ dim=1,
+ index=index,
+ src=torch.zeros_like(index)
+ ) # (batch_size, num_items, semantic_ids)
+
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
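+ # infer which codebook position is decoded next from the offset of the last
+ # generated token (generation cycles through codebooks 0..num_codebooks-1)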
+ next_sid_codebook_num = (torch.minimum((input_ids[:, -1].max() // self.codebook_size), torch.as_tensor(self.num_codebooks - 1)).item() + 1) % self.num_codebooks
+ a = torch.tile(self.index_semantic_ids[:, None, :, next_sid_codebook_num], dims=[1, self.num_beams, 1]) # (batch_size, num_beams, num_items)
+ a = a.reshape(a.shape[0] * a.shape[1], a.shape[2]) # (batch_size * num_beams, num_items)
+
+ if next_sid_codebook_num != 0:
+ b = torch.tile(self.index_semantic_ids[:, None, :, :next_sid_codebook_num], dims=[1, self.num_beams, 1, 1]) # (batch_size, num_beams, num_items, prefix_len)
+ b = b.reshape(b.shape[0] * b.shape[1], b.shape[2], b.shape[3]) # (batch_size * num_beams, num_items, sid_len)
+
+ current_prefixes = input_ids[:, -next_sid_codebook_num:] # (batch_size * num_beams, sid_len)
+ possible_next_items_mask = (
+ torch.eq(current_prefixes[:, None, :], b).long().sum(dim=-1) == next_sid_codebook_num
+ ) # (batch_size * num_beams, num_items)
+ a[~possible_next_items_mask] = (next_sid_codebook_num + 1) * self.codebook_size
+
+ scores_mask = torch.zeros_like(scores).bool() # (batch_size * num_beams, vocab_size)
+ scores_mask = torch.scatter_add(
+ input=scores_mask,
+ dim=-1,
+ index=a,
+ src=torch.ones_like(a).bool()
+ )
+
+ scores[:, :next_sid_codebook_num * self.codebook_size] = -torch.inf
+ scores[:, (next_sid_codebook_num + 1) * self.codebook_size:] = -torch.inf
+ scores[~(scores_mask.bool())] = -torch.inf
+
+ return scores
+
+
+class TigerModel(TorchModel):
+ def __init__(
+ self,
+ embedding_dim,
+ codebook_size,
+ sem_id_len,
+ num_positions,
+ user_ids_count,
+ num_heads,
+ num_encoder_layers,
+ num_decoder_layers,
+ dim_feedforward,
+ num_beams=100,
+ num_return_sequences=20,
+ d_kv=64,
+ layer_norm_eps=1e-6,
+ activation='relu',
+ dropout=0.1,
+ initializer_range=0.02,
+ logits_processor=None,
+ use_microbatching=False,
+ microbatch_size=128
+ ):
+ super().__init__()
+ self._embedding_dim = embedding_dim
+ self._codebook_size = codebook_size
+ self._num_positions = num_positions
+ self._num_heads = num_heads
+ self._num_encoder_layers = num_encoder_layers
+ self._num_decoder_layers = num_decoder_layers
+ self._dim_feedforward = dim_feedforward
+ self._num_beams = num_beams
+ self._num_return_sequences = num_return_sequences
+ self._d_kv = d_kv
+ self._layer_norm_eps = layer_norm_eps
+ self._activation = activation
+ self._dropout = dropout
+ self._sem_id_len = sem_id_len
+ self.user_ids_count = user_ids_count
+ self.logits_processor = logits_processor
+ self._use_microbatching = use_microbatching
+ self._microbatch_size = microbatch_size
+
+ unified_vocab_size = codebook_size * self._sem_id_len + self.user_ids_count + 10 # 10 for utilities
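+ # the three utility tokens at the top of the vocab are pad/eos/decoder-start;
+ # they must line up with the PAD/EOS/DECODER_START ids used by the varka scripts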
+ self.config = T5Config(
+ vocab_size=unified_vocab_size,
+ d_model=self._embedding_dim,
+ d_kv=self._d_kv,
+ d_ff=self._dim_feedforward,
+ num_layers=self._num_encoder_layers,
+ num_decoder_layers=self._num_decoder_layers,
+ num_heads=self._num_heads,
+ dropout_rate=self._dropout,
+ is_encoder_decoder=True,
+ use_cache=False,
+ pad_token_id=unified_vocab_size - 1,
+ eos_token_id=unified_vocab_size - 2,
+ decoder_start_token_id=unified_vocab_size - 3,
+ layer_norm_epsilon=self._layer_norm_eps,
+ feed_forward_proj=self._activation,
+ tie_word_embeddings=False
+ )
+ self.model = T5ForConditionalGeneration(config=self.config)
+ self._init_weights(initializer_range)
+
+ self.model = torch.compile(
+ self.model,
+ mode='reduce-overhead',
+ fullgraph=False,
+ dynamic=True
+ )
+
+ def forward(self, inputs):
+ input_semantic_ids = inputs['input.data']
+ attention_mask = inputs['input.mask']
+ target_semantic_ids = inputs['output.data']
+
+ decoder_input_ids = target_semantic_ids[:, :-1].contiguous()
+ labels = target_semantic_ids[:, 1:].contiguous()
+
+ model_output = self.model(
+ input_ids=input_semantic_ids,
+ attention_mask=attention_mask,
+ decoder_input_ids=decoder_input_ids,
+ labels=labels
+ )
+ loss = model_output['loss']
+
+ metrics = {'loss': loss.detach()}
+
+ if not self.training and not self._use_microbatching:
+ visited_batch = inputs['visited.padded']
+
+ output = self.model.generate(
+ input_ids=input_semantic_ids,
+ attention_mask=attention_mask,
+ num_beams=self._num_beams,
+ num_return_sequences=self._num_return_sequences,
+ max_length=self._sem_id_len + 1,
+ decoder_start_token_id=self.config.decoder_start_token_id,
+ eos_token_id=self.config.eos_token_id,
+ pad_token_id=self.config.pad_token_id,
+ do_sample=False,
+ early_stopping=False,
+ logits_processor=[self.logits_processor(visited_items=visited_batch)] if self.logits_processor is not None else [],
+ )
+
+ predictions = output[:, 1:].reshape(-1, self._num_return_sequences, self._sem_id_len)
+
+ all_hits = (torch.eq(predictions, labels[:, None]).sum(dim=-1)) # (batch_size, top_k)
+ elif not self.training and self._use_microbatching:
+ visited_batch = inputs['visited.padded']
+ batch_size = input_semantic_ids.shape[0]
+
+            inference_batch_size = self._microbatch_size  # instead of the full batch_size
+
+ all_predictions = []
+ all_labels = []
+ # print(f"start to infer batch of shape {input_semantic_ids.shape} with new batch {inference_batch_size}")
+ for batch_idx in range(0, batch_size, inference_batch_size):
+ batch_end = min(batch_idx + inference_batch_size, batch_size)
+ batch_slice = slice(batch_idx, batch_end)
+
+ input_ids_batch = input_semantic_ids[batch_slice]
+ attention_mask_batch = attention_mask[batch_slice]
+ visited_batch_subset = visited_batch[batch_slice]
+ labels_batch = labels[batch_slice]
+
+ with torch.inference_mode():
+ output = self.model.generate(
+ input_ids=input_ids_batch,
+ attention_mask=attention_mask_batch,
+ num_beams=self._num_beams,
+ num_return_sequences=self._num_return_sequences,
+ max_length=self._sem_id_len + 1,
+ decoder_start_token_id=self.config.decoder_start_token_id,
+ eos_token_id=self.config.eos_token_id,
+ pad_token_id=self.config.pad_token_id,
+ do_sample=False,
+ early_stopping=False,
+ logits_processor=[self.logits_processor(visited_items=visited_batch_subset)] if self.logits_processor is not None else [],
+ )
+
+ predictions_batch = output[:, 1:].reshape(-1, self._num_return_sequences, self._sem_id_len)
+ all_predictions.append(predictions_batch)
+ all_labels.append(labels_batch)
+ # print("end infer of batch")
+
+ predictions = torch.cat(all_predictions, dim=0) # (batch_size, num_return_sequences, sem_id_len)
+ labels_full = torch.cat(all_labels, dim=0) # (batch_size, sem_id_len)
+ all_hits = (torch.eq(predictions, labels_full[:, None]).sum(dim=-1)) # (batch_size, top_k)
+
+ if not self.training:
+ for k in [5, 10, 20]:
+ hits = (all_hits[:, :k] == self._sem_id_len).float() # (batch_size, k)
+ recall = hits.sum(dim=-1) # (batch_size)
+ discount_factor = 1 / torch.log2(torch.arange(1, k + 1, 1).float() + 1.).to(hits.device) # (k)
+
+ metrics[f'recall@{k}'] = recall.cpu().float()
+ metrics[f'ndcg@{k}'] = torch.einsum('bk,k->b', hits, discount_factor).cpu().float()
+
+ return loss, metrics
\ No newline at end of file
diff --git a/scripts/tiger-yambda/data.py b/scripts/tiger-yambda/data.py
new file mode 100644
index 0000000..87ff07d
--- /dev/null
+++ b/scripts/tiger-yambda/data.py
@@ -0,0 +1,498 @@
+import ast
+from collections import defaultdict
+import json
+from loguru import logger
+import numpy as np
+from pathlib import Path
+
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.base import BaseDataset
+
+
+class Dataset:
+ def __init__(
+ self,
+ train_sampler,
+ validation_sampler,
+ test_sampler,
+ num_items,
+ max_sequence_length
+ ):
+ self._train_sampler = train_sampler
+ self._validation_sampler = validation_sampler
+ self._test_sampler = test_sampler
+ self._num_items = num_items
+ self._max_sequence_length = max_sequence_length
+
+ @classmethod
+ def create_timestamp_based(
+ cls,
+ train_json_path,
+ validation_json_path,
+ test_json_path,
+ max_sequence_length,
+ sampler_type,
+ min_sample_len=2,
+ is_extended=False,
+ max_train_events=50
+ ):
+ max_item_id = 0
+ train_dataset, validation_dataset, test_dataset = [], [], []
+ print("started to load datasets")
+ with open(train_json_path, 'r') as f:
+ train_data = json.load(f)
+ with open(validation_json_path, 'r') as f:
+ validation_data = json.load(f)
+ with open(test_json_path, 'r') as f:
+ test_data = json.load(f)
+
+ all_users = set(train_data.keys()) | set(validation_data.keys()) | set(test_data.keys())
+ print(f"all users count: {len(all_users)}")
+ us_count = 0
+ for user_id_str in all_users:
+ if us_count % 100 == 0:
+ print(f"user id {us_count}/{len(all_users)}: {user_id_str}")
+ user_id = int(user_id_str)
+
+ train_items = train_data.get(user_id_str, [])
+ validation_items = validation_data.get(user_id_str, [])
+ test_items = test_data.get(user_id_str, [])
+
+ full_sequence = train_items + validation_items + test_items
+ if full_sequence:
+ max_item_id = max(max_item_id, max(full_sequence))
+
+ if us_count % 100 == 0:
+ print(f"full sequence len: {len(full_sequence)}")
+ us_count += 1
+ assert len(full_sequence) >= 2, f'Core-5 dataset is used, user {user_id} has only {len(full_sequence)} items'
+
+            # truncate train to the last max_train_events events
+ train_items = train_items[-max_train_events:] if len(train_items) > max_train_events else train_items
+
+ if is_extended:
+ # sample = [1, 2]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ # sample = [1, 2, 3, 4, 5, 6, 7]
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ for prefix_length in range(min_sample_len, len(train_items) + 1):
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': train_items[:prefix_length],
+ })
+ else:
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': train_items,
+ })
+
+            # validation
+
+            # expand each validation item into a separate sample
+            # Example: Train=[1,2], Valid=[3,4]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+
+ current_history = train_items.copy()
+ valid_small_history = 0
+ for item in validation_items:
+                # the eval dataset trims the target off by itself later
+ sample_sequence = current_history + [item]
+
+ if len(sample_sequence) >= min_sample_len:
+ validation_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': sample_sequence,
+ })
+ else:
+ valid_small_history += 1
+ current_history.append(item)
+
+            # expand each test item into a separate sample
+            # Example: Train=[1,2], Valid=[3,4], Test=[5, 6]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ current_history = train_items + validation_items
+ test_small_history = 0
+ for item in test_items:
+ sample_sequence = current_history + [item]
+ if len(sample_sequence) >= min_sample_len:
+ test_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': sample_sequence,
+ })
+ else:
+ test_small_history += 1
+ current_history.append(item)
+
+ print(f"Train dataset size: {len(train_dataset)}")
+ print(f"Validation dataset size: {len(validation_dataset)} with skipped {valid_small_history}")
+ print(f"Test dataset size: {len(test_dataset)} with skipped {test_small_history}")
+
+ logger.debug(f'Train dataset size: {len(train_dataset)}')
+ logger.debug(f'Validation dataset size: {len(validation_dataset)}')
+ logger.debug(f'Test dataset size: {len(test_dataset)}')
+
+ train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length)
+ validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length)
+ test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length)
+
+ return cls(
+ train_sampler=train_sampler,
+ validation_sampler=validation_sampler,
+ test_sampler=test_sampler,
+ num_items=max_item_id + 1, # +1 added because our ids are 0-indexed
+ max_sequence_length=max_sequence_length
+ )
+
+ @classmethod
+ def create_timestamp_based_with_one_valid(
+ cls,
+ train_json_path,
+ validation_json_path,
+ test_json_path,
+ max_sequence_length,
+ sampler_type,
+ min_sample_len=2,
+ is_extended=False,
+ max_train_events=50,
+ max_valid_events=50
+ ):
+ max_item_id = 0
+ train_dataset, validation_dataset, test_dataset = [], [], []
+ print("started to load datasets")
+ with open(train_json_path, 'r') as f:
+ train_data = json.load(f)
+ with open(validation_json_path, 'r') as f:
+ validation_data = json.load(f)
+ with open(test_json_path, 'r') as f:
+ test_data = json.load(f)
+
+ all_users = set(train_data.keys()) | set(validation_data.keys()) | set(test_data.keys())
+ print(f"all users count: {len(all_users)}")
+ us_count = 0
+ for user_id_str in all_users:
+ if us_count % 100 == 0:
+ print(f"user id {us_count}/{len(all_users)}: {user_id_str}")
+ user_id = int(user_id_str)
+
+ train_items = train_data.get(user_id_str, [])
+ validation_items = validation_data.get(user_id_str, [])
+ test_items = test_data.get(user_id_str, [])
+
+ full_sequence = train_items + validation_items + test_items
+ if full_sequence:
+ max_item_id = max(max_item_id, max(full_sequence))
+
+ if us_count % 100 == 0:
+ print(f"full sequence len: {len(full_sequence)}")
+
+ assert len(full_sequence) >= 2, f'Core-5 dataset is used, user {user_id} has only {len(full_sequence)} items'
+
+            # truncate train to the last max_train_events events
+ train_items = train_items[-max_train_events:] if len(train_items) > max_train_events else train_items
+
+ if is_extended:
+ # sample = [1, 2]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ # sample = [1, 2, 3, 4, 5, 6, 7]
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ for prefix_length in range(min_sample_len, len(train_items) + 1):
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': train_items[:prefix_length],
+ })
+ else:
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': train_items,
+ })
+
+            # validation
+
+            # expand each validation item into a separate sample
+            # Example: Train=[1,2], Valid=[3,4]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+
+ current_history = train_items.copy()
+ if us_count % 100 == 0:
+ print(f"validation data length {len(validation_items[:max_valid_events])}")
+ us_count += 1
+ for item in validation_items[:max_valid_events]:
+                # the eval dataset trims the target off by itself later
+ sample_sequence = current_history + [item]
+
+ if len(sample_sequence) >= min_sample_len:
+ validation_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': sample_sequence,
+ })
+ current_history.append(item)
+
+            # expand each test item into a separate sample
+            # Example: Train=[1,2], Valid=[3,4], Test=[5, 6]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ current_history = train_items + validation_items
+ for item in test_items:
+ sample_sequence = current_history + [item]
+ if len(sample_sequence) >= min_sample_len:
+ test_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': sample_sequence,
+ })
+ current_history.append(item)
+
+ print(f"Train dataset size: {len(train_dataset)}")
+ print(f"Validation dataset size: {len(validation_dataset)}")
+ print(f"Test dataset size: {len(test_dataset)}")
+
+ logger.debug(f'Train dataset size: {len(train_dataset)}')
+ logger.debug(f'Validation dataset size: {len(validation_dataset)}')
+ logger.debug(f'Test dataset size: {len(test_dataset)}')
+
+ train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length)
+ validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length)
+ test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length)
+
+ return cls(
+ train_sampler=train_sampler,
+ validation_sampler=validation_sampler,
+ test_sampler=test_sampler,
+ num_items=max_item_id + 1, # +1 added because our ids are 0-indexed
+ max_sequence_length=max_sequence_length
+ )
+
+ @classmethod
+ def create(cls, inter_json_path, max_sequence_length, sampler_type, is_extended=False):
+ max_item_id = 0
+ train_dataset, validation_dataset, test_dataset = [], [], []
+
+ with open(inter_json_path, 'r') as f:
+ user_interactions = json.load(f)
+
+ for user_id_str, item_ids in user_interactions.items():
+ user_id = int(user_id_str)
+
+ if item_ids:
+ max_item_id = max(max_item_id, max(item_ids))
+
+ assert len(item_ids) >= 5, f'Core-5 dataset is used, user {user_id} has only {len(item_ids)} items'
+
+ # sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (leave one out scheme, 8 - train, 9 - valid, 10 - test)
+ if is_extended:
+ # sample = [1, 2]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ # sample = [1, 2, 3, 4, 5, 6, 7]
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ for prefix_length in range(2, len(item_ids) - 2 + 1):
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids[:prefix_length],
+ })
+ else:
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids[:-2],
+ })
+
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+ validation_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids[:-1],
+ })
+
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ test_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': item_ids,
+ })
+
+ logger.debug(f'Train dataset size: {len(train_dataset)}')
+ logger.debug(f'Validation dataset size: {len(validation_dataset)}')
+ logger.debug(f'Test dataset size: {len(test_dataset)}')
+ logger.debug(f'Max item id: {max_item_id}')
+
+ train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length)
+ validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length)
+ test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length)
+
+ return cls(
+ train_sampler=train_sampler,
+ validation_sampler=validation_sampler,
+ test_sampler=test_sampler,
+ num_items=max_item_id + 1, # +1 added because our ids are 0-indexed
+ max_sequence_length=max_sequence_length
+ )
+
+ def get_datasets(self):
+ return self._train_sampler, self._validation_sampler, self._test_sampler
+
+ @property
+ def num_items(self):
+ return self._num_items
+
+ @property
+ def max_sequence_length(self):
+ return self._max_sequence_length
+
+
+class TrainDataset(BaseDataset):
+ def __init__(self, dataset, prediction_type, max_sequence_length):
+ self._dataset = dataset
+ self._prediction_type = prediction_type
+ self._max_sequence_length = max_sequence_length
+
+ self._transforms = {
+ 'sasrec': self._all_items_transform,
+ 'tiger': self._last_item_transform
+ }
+
+ def _all_items_transform(self, sample):
+ item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1]
+ next_item_sequence = sample['item.ids'][-self._max_sequence_length:][1:]
+ return {
+ 'user.ids': np.array(sample['user.ids'], dtype=np.int64),
+ 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64),
+ 'item.ids': np.array(item_sequence, dtype=np.int64),
+ 'item.length': np.array([len(item_sequence)], dtype=np.int64),
+ 'labels.ids': np.array(next_item_sequence, dtype=np.int64),
+ 'labels.length': np.array([len(next_item_sequence)], dtype=np.int64)
+ }
+
+ def _last_item_transform(self, sample):
+ item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1]
+ last_item = sample['item.ids'][-self._max_sequence_length:][-1]
+ return {
+ 'user.ids': np.array(sample['user.ids'], dtype=np.int64),
+ 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64),
+ 'item.ids': np.array(item_sequence, dtype=np.int64),
+ 'item.length': np.array([len(item_sequence)], dtype=np.int64),
+ 'labels.ids': np.array([last_item], dtype=np.int64),
+ 'labels.length': np.array([1], dtype=np.int64),
+ }
+
+ def __getitem__(self, index):
+ return self._transforms[self._prediction_type](self._dataset[index])
+
+ def __len__(self):
+ return len(self._dataset)
+
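+# Illustrative sketch (toy ids, not used by the pipeline): the 'tiger'
+# transform keeps only the last item as the label, while 'sasrec' shifts the
+# whole sequence by one position.
+def _demo_train_transforms():
+    sample = {'user.ids': [7], 'item.ids': [1, 2, 3, 4]}
+    tiger = TrainDataset([sample], 'tiger', max_sequence_length=3)
+    sasrec = TrainDataset([sample], 'sasrec', max_sequence_length=3)
+    return tiger[0]['labels.ids'], sasrec[0]['labels.ids']  # [4] vs [3, 4]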
+
+class EvalDataset(BaseDataset):
+ def __init__(self, dataset, max_sequence_length):
+ self._dataset = dataset
+ self._max_sequence_length = max_sequence_length
+
+ @property
+ def dataset(self):
+ return self._dataset
+
+ def __len__(self):
+ return len(self._dataset)
+
+ def __getitem__(self, index):
+ sample = self._dataset[index]
+
+ item_sequence = sample['item.ids'][-self._max_sequence_length:][:-1]
+ next_item = sample['item.ids'][-self._max_sequence_length:][-1]
+
+ return {
+ 'user.ids': np.array(sample['user.ids'], dtype=np.int64),
+ 'user.length': np.array([len(sample['user.ids'])], dtype=np.int64),
+ 'item.ids': np.array(item_sequence, dtype=np.int64),
+ 'item.length': np.array([len(item_sequence)], dtype=np.int64),
+ 'labels.ids': np.array([next_item], dtype=np.int64),
+ 'labels.length': np.array([1], dtype=np.int64),
+ 'visited.ids': np.array(sample['item.ids'][:-1], dtype=np.int64),
+ 'visited.length': np.array([len(sample['item.ids'][:-1])], dtype=np.int64),
+ }
+
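+# Illustrative sketch (toy ids): the model input is cropped to
+# max_sequence_length, but visited.ids keeps the full history minus the
+# target, so the logits processor can exclude already-seen items.
+def _demo_eval_sample():
+    ds = EvalDataset([{'user.ids': [7], 'item.ids': [1, 2, 3, 4, 5]}],
+                     max_sequence_length=3)
+    s = ds[0]
+    return s['item.ids'], s['labels.ids'], s['visited.ids']  # [3 4], [5], [1 2 3 4]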
+
+class ArrowBatchDataset(BaseDataset):
+ def __init__(self, batch_dir, device='cuda', preload=False):
+ self.batch_dir = Path(batch_dir)
+ self.device = device
+
+ all_files = list(self.batch_dir.glob('batch_*_len_*.arrow'))
+
+ batch_files_map = defaultdict(list)
+ for f in all_files:
+ batch_id = int(f.stem.split('_')[1])
+ batch_files_map[batch_id].append(f)
+
+ for batch_id in batch_files_map:
+ batch_files_map[batch_id].sort()
+
+ self.batch_indices = sorted(batch_files_map.keys())
+
+ if preload:
+ print(f"Preloading {len(self.batch_indices)} batches...")
+ self.cached_batches = []
+
+ for idx in range(len(self.batch_indices)):
+ batch = self._load_batch(batch_files_map[self.batch_indices[idx]])
+ self.cached_batches.append(batch)
+ else:
+ self.cached_batches = None
+ self.batch_files_map = batch_files_map
+
+ def _load_batch(self, arrow_files):
+ batch = {}
+
+ for arrow_file in arrow_files:
+ table = feather.read_table(arrow_file)
+ metadata = table.schema.metadata or {}
+
+ for col_name in table.column_names:
+ col = table.column(col_name)
+
+ shape_key = f'{col_name}_shape'
+ dtype_key = f'{col_name}_dtype'
+
+ if shape_key.encode() in metadata:
+                    shape = ast.literal_eval(metadata[shape_key.encode()].decode())  # safer than eval
+ dtype = np.dtype(metadata[dtype_key.encode()].decode())
+
+                    # check the column type: list columns need to_pylist()
+ if pa.types.is_list(col.type) or pa.types.is_large_list(col.type):
+ arr = np.array(col.to_pylist(), dtype=dtype)
+ else:
+ arr = col.to_numpy().reshape(shape).astype(dtype)
+ else:
+ if pa.types.is_list(col.type) or pa.types.is_large_list(col.type):
+ arr = np.array(col.to_pylist())
+ else:
+ arr = col.to_numpy()
+
+ batch[col_name] = torch.from_numpy(arr.copy()).to(self.device)
+
+ return batch
+
+ def __len__(self):
+ return len(self.batch_indices)
+
+ def __getitem__(self, idx):
+ if self.cached_batches is not None:
+ return self.cached_batches[idx]
+ else:
+ batch_id = self.batch_indices[idx]
+ arrow_files = self.batch_files_map[batch_id]
+ return self._load_batch(arrow_files)
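+
+
+# Hypothetical usage sketch (the path is a placeholder): each group of
+# batch_*_len_*.arrow files is reassembled by _load_batch into a dict of
+# torch tensors, so indexing the dataset yields a ready-made batch.
+def _demo_arrow_batches():
+    ds = ArrowBatchDataset('/tmp/train_batches', device='cpu', preload=False)
+    batch = ds[0]  # dict: column name -> torch.Tensor on the requested device
+    return {name: tuple(tensor.shape) for name, tensor in batch.items()}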
diff --git a/scripts/tiger-yambda/models.py b/scripts/tiger-yambda/models.py
new file mode 100644
index 0000000..8fd0f76
--- /dev/null
+++ b/scripts/tiger-yambda/models.py
@@ -0,0 +1,223 @@
+import torch
+from transformers import T5ForConditionalGeneration, T5Config, LogitsProcessor
+
+from irec.models import TorchModel
+
+
+class CorrectItemsLogitsProcessor(LogitsProcessor):
+ def __init__(self, num_codebooks, codebook_size, mapping, num_beams, visited_items):
+ self.num_codebooks = num_codebooks
+ self.codebook_size = codebook_size
+ self.num_beams = num_beams
+
+ semantic_ids = []
+ for i in range(len(mapping)):
+ assert len(mapping[str(i)]) == num_codebooks, 'All semantic ids must have the same length'
+ semantic_ids.append(mapping[str(i)])
+
+ self.index_semantic_ids = torch.tensor(semantic_ids, dtype=torch.long, device=visited_items.device) # (num_items, semantic_ids)
+
+ batch_size, _ = visited_items.shape
+
+ self.index_semantic_ids = torch.tile(self.index_semantic_ids[None], dims=[batch_size, 1, 1]) # (batch_size, num_items, semantic_ids)
+
+ index = visited_items[..., None].tile(dims=[1, 1, num_codebooks]) # (batch_size, num_rated, semantic_ids)
+ self.index_semantic_ids = torch.scatter(
+ input=self.index_semantic_ids,
+ dim=1,
+ index=index,
+ src=torch.zeros_like(index)
+ ) # (batch_size, num_items, semantic_ids)
+
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+ next_sid_codebook_num = (torch.minimum((input_ids[:, -1].max() // self.codebook_size), torch.as_tensor(self.num_codebooks - 1)).item() + 1) % self.num_codebooks
+ a = torch.tile(self.index_semantic_ids[:, None, :, next_sid_codebook_num], dims=[1, self.num_beams, 1]) # (batch_size, num_beams, num_items)
+ a = a.reshape(a.shape[0] * a.shape[1], a.shape[2]) # (batch_size * num_beams, num_items)
+
+ if next_sid_codebook_num != 0:
+            b = torch.tile(self.index_semantic_ids[:, None, :, :next_sid_codebook_num], dims=[1, self.num_beams, 1, 1])  # (batch_size, num_beams, num_items, prefix_len)
+            b = b.reshape(b.shape[0] * b.shape[1], b.shape[2], b.shape[3])  # (batch_size * num_beams, num_items, prefix_len)
+
+            current_prefixes = input_ids[:, -next_sid_codebook_num:]  # (batch_size * num_beams, prefix_len)
+            possible_next_items_mask = (
+                torch.eq(current_prefixes[:, None, :], b).long().sum(dim=-1) == next_sid_codebook_num
+            )  # (batch_size * num_beams, num_items)
+ a[~possible_next_items_mask] = (next_sid_codebook_num + 1) * self.codebook_size
+
+        scores_mask = torch.zeros_like(scores).bool()  # (batch_size * num_beams, vocab_size)
+ scores_mask = torch.scatter_add(
+ input=scores_mask,
+ dim=-1,
+ index=a,
+ src=torch.ones_like(a).bool()
+ )
+
+ scores[:, :next_sid_codebook_num * self.codebook_size] = -torch.inf
+ scores[:, (next_sid_codebook_num + 1) * self.codebook_size:] = -torch.inf
+        scores[~scores_mask] = -torch.inf
+
+ return scores
+
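+# A minimal sketch (toy sizes; assumes the mapping stores per-codebook offset
+# tokens, as the masking ranges above imply) of how the processor constrains
+# beam search: after the first code is emitted, only items whose semantic-id
+# prefix matches it keep finite scores.
+def _demo_prefix_masking():
+    proc = CorrectItemsLogitsProcessor(
+        num_codebooks=2, codebook_size=4,
+        mapping={'0': [0, 5], '1': [0, 6], '2': [1, 7]},
+        num_beams=1,
+        visited_items=torch.empty((1, 0), dtype=torch.long),  # nothing visited
+    )
+    input_ids = torch.tensor([[8, 0]])  # [decoder_start, first emitted code 0]
+    scores = proc(input_ids, torch.zeros(1, 11))  # vocab: 2*4 codes + 3 utility
+    return torch.isfinite(scores).nonzero()  # tokens 5 and 6: items '0' and '1'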
+
+class TigerModel(TorchModel):
+ def __init__(
+ self,
+ embedding_dim,
+ codebook_size,
+ sem_id_len,
+ num_positions,
+ user_ids_count,
+ num_heads,
+ num_encoder_layers,
+ num_decoder_layers,
+ dim_feedforward,
+ num_beams=100,
+ num_return_sequences=20,
+ d_kv=64,
+ layer_norm_eps=1e-6,
+ activation='relu',
+ dropout=0.1,
+ initializer_range=0.02,
+ logits_processor=None,
+ use_microbatching=False,
+ microbatch_size=128
+ ):
+ super().__init__()
+ self._embedding_dim = embedding_dim
+ self._codebook_size = codebook_size
+ self._num_positions = num_positions
+ self._num_heads = num_heads
+ self._num_encoder_layers = num_encoder_layers
+ self._num_decoder_layers = num_decoder_layers
+ self._dim_feedforward = dim_feedforward
+ self._num_beams = num_beams
+ self._num_return_sequences = num_return_sequences
+ self._d_kv = d_kv
+ self._layer_norm_eps = layer_norm_eps
+ self._activation = activation
+ self._dropout = dropout
+ self._sem_id_len = sem_id_len
+ self.user_ids_count = user_ids_count
+ self.logits_processor = logits_processor
+ self._use_microbatching = use_microbatching
+ self._microbatch_size = microbatch_size
+
+ unified_vocab_size = codebook_size * self._sem_id_len + self.user_ids_count + 10 # 10 for utilities
+ self.config = T5Config(
+ vocab_size=unified_vocab_size,
+ d_model=self._embedding_dim,
+ d_kv=self._d_kv,
+ d_ff=self._dim_feedforward,
+ num_layers=self._num_encoder_layers,
+ num_decoder_layers=self._num_decoder_layers,
+ num_heads=self._num_heads,
+ dropout_rate=self._dropout,
+ is_encoder_decoder=True,
+ use_cache=False,
+ pad_token_id=unified_vocab_size - 1,
+ eos_token_id=unified_vocab_size - 2,
+ decoder_start_token_id=unified_vocab_size - 3,
+ layer_norm_epsilon=self._layer_norm_eps,
+ feed_forward_proj=self._activation,
+ tie_word_embeddings=False
+ )
+ self.model = T5ForConditionalGeneration(config=self.config)
+ self._init_weights(initializer_range)
+
+ self.model = torch.compile(
+ self.model,
+ mode='reduce-overhead',
+ fullgraph=False,
+ dynamic=True
+ )
+
+ def forward(self, inputs):
+ input_semantic_ids = inputs['input.data']
+ attention_mask = inputs['input.mask']
+ target_semantic_ids = inputs['output.data']
+
+ decoder_input_ids = target_semantic_ids[:, :-1].contiguous()
+ labels = target_semantic_ids[:, 1:].contiguous()
+
+ model_output = self.model(
+ input_ids=input_semantic_ids,
+ attention_mask=attention_mask,
+ decoder_input_ids=decoder_input_ids,
+ labels=labels
+ )
+ loss = model_output['loss']
+
+ metrics = {'loss': loss.detach()}
+
+ if not self.training and not self._use_microbatching:
+ visited_batch = inputs['visited.padded']
+
+ output = self.model.generate(
+ input_ids=input_semantic_ids,
+ attention_mask=attention_mask,
+ num_beams=self._num_beams,
+ num_return_sequences=self._num_return_sequences,
+ max_length=self._sem_id_len + 1,
+ decoder_start_token_id=self.config.decoder_start_token_id,
+ eos_token_id=self.config.eos_token_id,
+ pad_token_id=self.config.pad_token_id,
+ do_sample=False,
+ early_stopping=False,
+ logits_processor=[self.logits_processor(visited_items=visited_batch)] if self.logits_processor is not None else [],
+ )
+
+ predictions = output[:, 1:].reshape(-1, self._num_return_sequences, self._sem_id_len)
+
+ all_hits = (torch.eq(predictions, labels[:, None]).sum(dim=-1)) # (batch_size, top_k)
+ elif not self.training and self._use_microbatching:
+ visited_batch = inputs['visited.padded']
+ batch_size = input_semantic_ids.shape[0]
+
+            inference_batch_size = self._microbatch_size  # instead of the full batch_size
+
+ all_predictions = []
+ all_labels = []
+ # print(f"start to infer batch of shape {input_semantic_ids.shape} with new batch {inference_batch_size}")
+ for batch_idx in range(0, batch_size, inference_batch_size):
+ batch_end = min(batch_idx + inference_batch_size, batch_size)
+ batch_slice = slice(batch_idx, batch_end)
+
+ input_ids_batch = input_semantic_ids[batch_slice]
+ attention_mask_batch = attention_mask[batch_slice]
+ visited_batch_subset = visited_batch[batch_slice]
+ labels_batch = labels[batch_slice]
+
+ with torch.inference_mode():
+ output = self.model.generate(
+ input_ids=input_ids_batch,
+ attention_mask=attention_mask_batch,
+ num_beams=self._num_beams,
+ num_return_sequences=self._num_return_sequences,
+ max_length=self._sem_id_len + 1,
+ decoder_start_token_id=self.config.decoder_start_token_id,
+ eos_token_id=self.config.eos_token_id,
+ pad_token_id=self.config.pad_token_id,
+ do_sample=False,
+ early_stopping=False,
+ logits_processor=[self.logits_processor(visited_items=visited_batch_subset)] if self.logits_processor is not None else [],
+ )
+
+ predictions_batch = output[:, 1:].reshape(-1, self._num_return_sequences, self._sem_id_len)
+ all_predictions.append(predictions_batch)
+ all_labels.append(labels_batch)
+ # print("end infer of batch")
+
+ predictions = torch.cat(all_predictions, dim=0) # (batch_size, num_return_sequences, sem_id_len)
+ labels_full = torch.cat(all_labels, dim=0) # (batch_size, sem_id_len)
+ all_hits = (torch.eq(predictions, labels_full[:, None]).sum(dim=-1)) # (batch_size, top_k)
+
+ if not self.training:
+ for k in [5, 10, 20]:
+ hits = (all_hits[:, :k] == self._sem_id_len).float() # (batch_size, k)
+ recall = hits.sum(dim=-1) # (batch_size)
+ discount_factor = 1 / torch.log2(torch.arange(1, k + 1, 1).float() + 1.).to(hits.device) # (k)
+
+ metrics[f'recall@{k}'] = recall.cpu().float()
+ metrics[f'ndcg@{k}'] = torch.einsum('bk,k->b', hits, discount_factor).cpu().float()
+
+ return loss, metrics
\ No newline at end of file
diff --git a/scripts/tiger-yambda/yambda_train_4.1_plum.py b/scripts/tiger-yambda/yambda_train_4.1_plum.py
new file mode 100644
index 0000000..607c0e3
--- /dev/null
+++ b/scripts/tiger-yambda/yambda_train_4.1_plum.py
@@ -0,0 +1,230 @@
+from functools import partial
+import json
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.transforms import Collate, ToDevice
+from irec.data.dataloader import DataLoader
+from irec.runners import TrainingRunner
+from irec.utils import fix_random_seed
+
+from data import ArrowBatchDataset
+from models import TigerModel, CorrectItemsLogitsProcessor
+
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir_yambda/4-1_filtered_yambda_gpu_quantile_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_eval_batches/')
+
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'TEST_tiger_yambda_filtered_day-split_plum_ws_2_dp_0.2_max_300_256_1024'
+
+# OTHER SETTINGS
+SEED_VALUE = 42
+DEVICE = 'cuda'
+
+NUM_EPOCHS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+EMBEDDING_DIM = 128
+CODEBOOK_SIZE = 256
+NUM_POSITIONS = 20
+NUM_USER_HASH = 2000
+NUM_HEADS = 6
+NUM_LAYERS = 4
+FEEDFORWARD_DIM = 1024
+KV_DIM = 64
+DROPOUT = 0.2
+NUM_BEAMS = 30
+TOP_K = 20
+NUM_CODEBOOKS = 4
+LR = 0.0001
+
+USE_MICROBATCHING = True
+MICROBATCH_SIZE = 128
+
+torch.set_float32_matmul_precision('high')
+torch._dynamo.config.capture_scalar_outputs = True
+
+import torch._inductor.config as config
+config.triton.cudagraph_skip_dynamic_graphs = True
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataloader = DataLoader(
+ ArrowBatchDataset(
+ TRAIN_BATCHES_DIR,
+ device='cpu',
+ preload=True
+ ),
+ batch_size=1,
+ shuffle=True,
+ num_workers=0,
+ pin_memory=True,
+ collate_fn=Collate()
+ ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
+
+    valid_dataloader = ArrowBatchDataset(
+        VALID_BATCHES_DIR,
+        device=DEVICE,
+        preload=True
+    )
+
+    eval_dataloader = ArrowBatchDataset(
+        EVAL_BATCHES_DIR,
+        device=DEVICE,
+        preload=True
+    )
+
+ model = TigerModel(
+ embedding_dim=EMBEDDING_DIM,
+ codebook_size=CODEBOOK_SIZE,
+ sem_id_len=NUM_CODEBOOKS,
+ user_ids_count=NUM_USER_HASH,
+ num_positions=NUM_POSITIONS,
+ num_heads=NUM_HEADS,
+ num_encoder_layers=NUM_LAYERS,
+ num_decoder_layers=NUM_LAYERS,
+ dim_feedforward=FEEDFORWARD_DIM,
+ num_beams=NUM_BEAMS,
+ num_return_sequences=TOP_K,
+ activation='relu',
+ d_kv=KV_DIM,
+ dropout=DROPOUT,
+ layer_norm_eps=1e-6,
+ initializer_range=0.02,
+ logits_processor=partial(
+ CorrectItemsLogitsProcessor,
+ NUM_CODEBOOKS,
+ CODEBOOK_SIZE,
+ mappings,
+ NUM_BEAMS
+ ),
+ use_microbatching=USE_MICROBATCHING,
+ microbatch_size=MICROBATCH_SIZE
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.AdamW(
+ model.parameters(),
+ lr=LR,
+ )
+
+ EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS)
+
+ callbacks = [
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ }, name='train'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=EPOCH_NUM_STEPS
+ ),
+
+ cb.Validation(
+            dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _:{
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='validation'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'validation/loss': cb.MeanAccumulator(),
+ 'validation/recall@5': cb.MeanAccumulator(),
+ 'validation/recall@10': cb.MeanAccumulator(),
+ 'validation/recall@20': cb.MeanAccumulator(),
+ 'validation/ndcg@5': cb.MeanAccumulator(),
+ 'validation/ndcg@10': cb.MeanAccumulator(),
+ 'validation/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Validation(
+            dataset=eval_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='eval'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'eval/loss': cb.MeanAccumulator(),
+ 'eval/recall@5': cb.MeanAccumulator(),
+ 'eval/recall@10': cb.MeanAccumulator(),
+ 'eval/recall@20': cb.MeanAccumulator(),
+ 'eval/ndcg@5': cb.MeanAccumulator(),
+ 'eval/ndcg@10': cb.MeanAccumulator(),
+ 'eval/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS * 4),
+
+ cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),
+
+ cb.EarlyStopping(
+ metric='validation/ndcg@20',
+ patience=40 * 4,
+ minimize=False,
+ model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
+ ).every_num_steps(EPOCH_NUM_STEPS)
+
+ # cb.Profiler(
+ # wait=10,
+ # warmup=10,
+ # active=10,
+ # logdir=TENSORBOARD_LOGDIR
+ # ),
+ # cb.StopAfterNumSteps(40)
+
+ ]
+
+ logger.debug('Everything is ready for training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger-yambda/yambda_varka_4.1_plum.py b/scripts/tiger-yambda/yambda_varka_4.1_plum.py
new file mode 100644
index 0000000..9c00704
--- /dev/null
+++ b/scripts/tiger-yambda/yambda_varka_4.1_plum.py
@@ -0,0 +1,304 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+print("tiger no arrow varka 4.1")
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'data/Yambda/day-splits/merged_for_exps_filtered/exp_4_0.9_inter_tiger_train.json')
+INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'data/Yambda/day-splits/merged_for_exps_filtered/valid_set.json')
+INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'data/Yambda/day-splits/merged_for_exps_filtered/test_set.json')
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir_yambda/4-1_filtered_yambda_gpu_quantile_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Yambda/day-splits/test/yambda_quantile_tiger_T_eval_batches/')
+
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_TRAIN_EVENTS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+class TigerProcessing(Transform):
+ def __call__(self, batch):
+ input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+ batch_size = attention_mask.shape[0]
+
+        input_semantic_ids[~attention_mask] = PAD_TOKEN_ID  # TODO: double-check; padded positions get the PAD token
+
+ input_semantic_ids = np.concatenate([
+ input_semantic_ids,
+ NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+ ], axis=-1)
+
+ attention_mask = np.concatenate([
+ attention_mask,
+ np.ones((batch_size, 1), dtype=attention_mask.dtype)
+ ], axis=-1)
+
+ batch['input.data'] = input_semantic_ids
+ batch['input.mask'] = attention_mask
+
+ target_semantic_ids = batch['labels.semantic.padded']
+ target_semantic_ids = np.concatenate([
+ np.ones(
+ (batch_size, 1),
+ dtype=np.int64,
+ ) * DECODER_START_TOKEN_ID,
+ target_semantic_ids
+ ], axis=-1)
+
+ batch['output.data'] = target_semantic_ids
+
+ return batch
+
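+# Minimal sketch (toy values): TigerProcessing appends the hashed-user token
+# after the flattened semantic ids, so the encoder input for one item is
+# [sid_0, ..., sid_{L-1}, user_token].
+def _demo_encoder_input_layout():
+    sem = np.array([[3, 260, 515, 777]])            # one item, 4 offset codes
+    user_token = NUM_CODEBOOKS * CODEBOOK_SIZE + 7  # hashed user id 7
+    return np.concatenate([sem, [[user_token]]], axis=-1)  # shape (1, 5)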
+
+class ToMasked(Transform):
+ def __init__(self, prefix, is_right_aligned=False):
+ self._prefix = prefix
+ self._is_right_aligned = is_right_aligned
+
+ def __call__(self, batch):
+ data = batch[f'{self._prefix}.ids']
+ lengths = batch[f'{self._prefix}.length']
+
+ batch_size = lengths.shape[0]
+ max_sequence_length = int(lengths.max())
+
+ if len(data.shape) == 1: # only indices
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len)
+ else:
+ assert len(data.shape) == 2 # embeddings
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length, data.shape[-1]),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len, emb_dim)
+
+ mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+ if self._is_right_aligned:
+ mask = np.flip(mask, axis=-1)
+
+ padded_tensor[mask] = data
+
+ batch[f'{self._prefix}.padded'] = padded_tensor
+ batch[f'{self._prefix}.mask'] = mask
+
+ return batch
+
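+# Illustrative sketch of the right-aligned mask built by ToMasked: validity is
+# flipped to the right edge, so shorter sequences end up left-padded.
+def _demo_right_aligned_mask():
+    lengths = np.array([2, 3])
+    mask = np.arange(3)[None] < lengths[:, None]  # left-aligned validity
+    return np.flip(mask, axis=-1)  # [[False, True, True], [True, True, True]]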
+
+class SemanticIdsMapper(Transform):
+ def __init__(self, mapping, names=[]):
+ super().__init__()
+ self._mapping = mapping
+ self._names = names
+
+ max_item_id = max(int(k) for k in mapping.keys())
+        print(f"mapping: {len(mapping)} keys, item id range [{min(int(k) for k in mapping.keys())}, {max_item_id}]")
+ data = []
+ for i in range(max_item_id + 1):
+ if str(i) in mapping:
+ data.append(mapping[str(i)])
+ else:
+ data.append([-1] * NUM_CODEBOOKS)
+
+ self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+ self._semantic_length = self._mapping_tensor.shape[-1]
+
+ missing_count = (max_item_id + 1) - len(mapping)
+ print(f"Mapping: {len(mapping)} items, {missing_count} missing (-1 filled)")
+
+ def __call__(self, batch):
+ for name in self._names:
+ if f'{name}.ids' in batch:
+ ids = batch[f'{name}.ids']
+ lengths = batch[f'{name}.length']
+ assert ids.min() >= 0
+ assert ids.max() < self._mapping_tensor.shape[0]
+ semantic_ids = self._mapping_tensor[ids].flatten()
+
+ assert (semantic_ids != -1).all(), \
+ f"Missing mappings detected in {name}! Invalid positions: {(semantic_ids == -1).sum()} out of {len(semantic_ids)}"
+
+ batch[f'{name}.semantic.ids'] = semantic_ids.numpy()
+ batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+ return batch
+
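+# Minimal sketch (toy mapping) of the dense lookup table built above: missing
+# items are filled with -1 so unmapped ids are caught by the assert.
+def _demo_semantic_lookup():
+    mapping = {'0': [0, 260], '2': [1, 300]}  # item 1 has no mapping
+    table = torch.full((3, 2), -1, dtype=torch.long)
+    for key, codes in mapping.items():
+        table[int(key)] = torch.tensor(codes)
+    return table[torch.tensor([0, 2])].flatten()  # tensor([0, 260, 1, 300])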
+
+class UserHashing(Transform):
+ def __init__(self, hash_size):
+ super().__init__()
+ self._hash_size = hash_size
+
+ def __call__(self, batch):
+ batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+ return batch
+
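+# Minimal sketch: user ids are bucketed into NUM_USER_HASH stable buckets, so
+# previously unseen users still map to a valid user token at inference time.
+def _demo_user_hashing():
+    hasher = UserHashing(hash_size=NUM_USER_HASH)
+    batch = hasher({'user.ids': np.array([123456789], dtype=np.int64)})
+    return batch['user.hashed.ids']  # a stable value in [0, NUM_USER_HASH)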
+
+def save_batches_to_arrow(batches, output_dir):
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=False)
+
+ for batch_idx, batch in enumerate(batches):
+ length_groups = defaultdict(dict)
+ metadata_groups = defaultdict(dict)
+
+ for key, value in batch.items():
+ length = len(value)
+
+ metadata_groups[length][f'{key}_shape'] = str(value.shape)
+ metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+            if value.ndim == 1:
+                # 1D array - store as-is
+                length_groups[length][key] = value
+            elif value.ndim == 2:
+                # 2D array - store as a list of lists
+                length_groups[length][key] = value.tolist()
+            else:
+                # >2D array - flatten; the shape is kept in the metadata
+                length_groups[length][key] = value.flatten()
+
+ for length, fields in length_groups.items():
+            # pa.array handles both flat 1D columns and 2D list-of-lists
+            arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+ table = pa.table(arrow_dict)
+ if length in metadata_groups:
+ table = table.replace_schema_metadata(metadata_groups[length])
+
+ feather.write_feather(
+ table,
+ output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+ compression='lz4'
+ )
+
+
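+# Sketch of the shape/dtype round-trip that save_batches_to_arrow relies on
+# (hypothetical temp path): the strings written into the schema metadata let
+# ArrowBatchDataset._load_batch in data.py rebuild the original numpy arrays.
+def _demo_metadata_roundtrip(path='/tmp/_demo_batch.arrow'):
+    arr = np.arange(6, dtype=np.int64).reshape(2, 3)
+    table = pa.table({'x': pa.array(arr.tolist())})
+    table = table.replace_schema_metadata({'x_shape': str(arr.shape),
+                                           'x_dtype': str(arr.dtype)})
+    feather.write_feather(table, path, compression='lz4')
+    back = feather.read_table(path)
+    dtype = np.dtype(back.schema.metadata[b'x_dtype'].decode())
+    restored = np.array(back.column('x').to_pylist(), dtype=dtype)
+    return restored.shape  # (2, 3), matching the stored 'x_shape'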
+
+def main():
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ data = Dataset.create_timestamp_based(
+ train_json_path=INTERACTIONS_TRAIN_PATH,
+ validation_json_path=INTERACTIONS_VALID_PATH,
+ test_json_path=INTERACTIONS_TEST_PATH,
+ max_sequence_length=MAX_SEQ_LEN,
+ sampler_type='tiger',
+ min_sample_len=2,
+ is_extended=True,
+ max_train_events=MAX_TRAIN_EVENTS
+ )
+
+ train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+
+ train_dataloader = DataLoader(
+ dataset=train_dataset,
+ batch_size=TRAIN_BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ valid_dataloader = DataLoader(
+ dataset=valid_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ eval_dataloader = DataLoader(
+ dataset=eval_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ train_batches = []
+ for train_batch in train_dataloader:
+ train_batches.append(train_batch)
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
+ valid_batches = []
+ for valid_batch in valid_dataloader:
+ valid_batches.append(valid_batch)
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
+ eval_batches = []
+ for eval_batch in eval_dataloader:
+ eval_batches.append(eval_batch)
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger/beauty_exps/train_4.1_plum.py b/scripts/tiger/beauty_exps/train_4.1_plum.py
new file mode 100644
index 0000000..8daf273
--- /dev/null
+++ b/scripts/tiger/beauty_exps/train_4.1_plum.py
@@ -0,0 +1,225 @@
+from functools import partial
+import json
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.transforms import Collate, ToDevice
+from irec.data.dataloader import DataLoader
+from irec.runners import TrainingRunner
+from irec.utils import fix_random_seed
+
+from data import ArrowBatchDataset
+from models import TigerModel, CorrectItemsLogitsProcessor
+
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-1_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_eval_batches/')
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'tiger_beauty_updated_quantile_4-1_plum_ws_2_dp_0.2'
+
+# OTHER SETTINGS
+SEED_VALUE = 42
+DEVICE = 'cuda'
+
+NUM_EPOCHS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+EMBEDDING_DIM = 128
+CODEBOOK_SIZE = 256
+NUM_POSITIONS = 20
+NUM_USER_HASH = 2000
+NUM_HEADS = 6
+NUM_LAYERS = 4
+FEEDFORWARD_DIM = 1024
+KV_DIM = 64
+DROPOUT = 0.2
+NUM_BEAMS = 30
+TOP_K = 20
+NUM_CODEBOOKS = 4
+LR = 0.0001
+
+
+torch.set_float32_matmul_precision('high')
+torch._dynamo.config.capture_scalar_outputs = True
+
+import torch._inductor.config as config
+config.triton.cudagraph_skip_dynamic_graphs = True
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataloader = DataLoader(
+ ArrowBatchDataset(
+ TRAIN_BATCHES_DIR,
+ device='cpu',
+ preload=True
+ ),
+ batch_size=1,
+ shuffle=True,
+ num_workers=0,
+ pin_memory=True,
+ collate_fn=Collate()
+ ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
+
+    valid_dataloader = ArrowBatchDataset(
+        VALID_BATCHES_DIR,
+        device=DEVICE,
+        preload=True
+    )
+
+    eval_dataloader = ArrowBatchDataset(
+        EVAL_BATCHES_DIR,
+        device=DEVICE,
+        preload=True
+    )
+
+ model = TigerModel(
+ embedding_dim=EMBEDDING_DIM,
+ codebook_size=CODEBOOK_SIZE,
+ sem_id_len=NUM_CODEBOOKS,
+ user_ids_count=NUM_USER_HASH,
+ num_positions=NUM_POSITIONS,
+ num_heads=NUM_HEADS,
+ num_encoder_layers=NUM_LAYERS,
+ num_decoder_layers=NUM_LAYERS,
+ dim_feedforward=FEEDFORWARD_DIM,
+ num_beams=NUM_BEAMS,
+ num_return_sequences=TOP_K,
+ activation='relu',
+ d_kv=KV_DIM,
+ dropout=DROPOUT,
+ layer_norm_eps=1e-6,
+ initializer_range=0.02,
+ logits_processor=partial(
+ CorrectItemsLogitsProcessor,
+ NUM_CODEBOOKS,
+ CODEBOOK_SIZE,
+ mappings,
+ NUM_BEAMS
+ )
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.AdamW(
+ model.parameters(),
+ lr=LR,
+ )
+
+ EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS)
+
+ callbacks = [
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ }, name='train'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=EPOCH_NUM_STEPS
+ ),
+
+ cb.Validation(
+            dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _:{
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='validation'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'validation/loss': cb.MeanAccumulator(),
+ 'validation/recall@5': cb.MeanAccumulator(),
+ 'validation/recall@10': cb.MeanAccumulator(),
+ 'validation/recall@20': cb.MeanAccumulator(),
+ 'validation/ndcg@5': cb.MeanAccumulator(),
+ 'validation/ndcg@10': cb.MeanAccumulator(),
+ 'validation/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Validation(
+            dataset=eval_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='eval'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'eval/loss': cb.MeanAccumulator(),
+ 'eval/recall@5': cb.MeanAccumulator(),
+ 'eval/recall@10': cb.MeanAccumulator(),
+ 'eval/recall@20': cb.MeanAccumulator(),
+ 'eval/ndcg@5': cb.MeanAccumulator(),
+ 'eval/ndcg@10': cb.MeanAccumulator(),
+ 'eval/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),
+
+ cb.EarlyStopping(
+ metric='eval/ndcg@20',
+ patience=40,
+ minimize=False,
+ model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
+ ).every_num_steps(EPOCH_NUM_STEPS)
+
+ # cb.Profiler(
+ # wait=10,
+ # warmup=10,
+ # active=10,
+ # logdir=TENSORBOARD_LOGDIR
+ # ),
+ # cb.StopAfterNumSteps(40)
+
+ ]
+
+ logger.debug('Everything is ready for training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger/beauty_exps/train_4.2_plum.py b/scripts/tiger/beauty_exps/train_4.2_plum.py
new file mode 100644
index 0000000..580bcb5
--- /dev/null
+++ b/scripts/tiger/beauty_exps/train_4.2_plum.py
@@ -0,0 +1,225 @@
+from functools import partial
+import json
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.transforms import Collate, ToDevice
+from irec.data.dataloader import DataLoader
+from irec.runners import TrainingRunner
+from irec.utils import fix_random_seed
+
+from data import ArrowBatchDataset
+from models import TigerModel, CorrectItemsLogitsProcessor
+
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-2_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_eval_batches/')
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'tiger_beauty_updated_quantile_4-2_plum_ws_2_dp_0.2'
+
+# OTHER SETTINGS
+SEED_VALUE = 42
+DEVICE = 'cuda'
+
+NUM_EPOCHS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+EMBEDDING_DIM = 128
+CODEBOOK_SIZE = 256
+NUM_POSITIONS = 20
+NUM_USER_HASH = 2000
+NUM_HEADS = 6
+NUM_LAYERS = 4
+FEEDFORWARD_DIM = 1024
+KV_DIM = 64
+DROPOUT = 0.2
+NUM_BEAMS = 30
+TOP_K = 20
+NUM_CODEBOOKS = 4
+LR = 0.0001
+
+
+torch.set_float32_matmul_precision('high')
+torch._dynamo.config.capture_scalar_outputs = True
+
+import torch._inductor.config as config
+config.triton.cudagraph_skip_dynamic_graphs = True
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataloader = DataLoader(
+ ArrowBatchDataset(
+ TRAIN_BATCHES_DIR,
+ device='cpu',
+ preload=True
+ ),
+ batch_size=1,
+ shuffle=True,
+ num_workers=0,
+ pin_memory=True,
+ collate_fn=Collate()
+ ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
+
+    valid_dataloader = ArrowBatchDataset(
+        VALID_BATCHES_DIR,
+        device=DEVICE,
+        preload=True
+    )
+
+    eval_dataloader = ArrowBatchDataset(
+        EVAL_BATCHES_DIR,
+        device=DEVICE,
+        preload=True
+    )
+
+ model = TigerModel(
+ embedding_dim=EMBEDDING_DIM,
+ codebook_size=CODEBOOK_SIZE,
+ sem_id_len=NUM_CODEBOOKS,
+ user_ids_count=NUM_USER_HASH,
+ num_positions=NUM_POSITIONS,
+ num_heads=NUM_HEADS,
+ num_encoder_layers=NUM_LAYERS,
+ num_decoder_layers=NUM_LAYERS,
+ dim_feedforward=FEEDFORWARD_DIM,
+ num_beams=NUM_BEAMS,
+ num_return_sequences=TOP_K,
+ activation='relu',
+ d_kv=KV_DIM,
+ dropout=DROPOUT,
+ layer_norm_eps=1e-6,
+ initializer_range=0.02,
+ logits_processor=partial(
+ CorrectItemsLogitsProcessor,
+ NUM_CODEBOOKS,
+ CODEBOOK_SIZE,
+ mappings,
+ NUM_BEAMS
+ )
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.AdamW(
+ model.parameters(),
+ lr=LR,
+ )
+
+ EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS)
+
+ callbacks = [
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ }, name='train'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=EPOCH_NUM_STEPS
+ ),
+
+ cb.Validation(
+            dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _:{
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='validation'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'validation/loss': cb.MeanAccumulator(),
+ 'validation/recall@5': cb.MeanAccumulator(),
+ 'validation/recall@10': cb.MeanAccumulator(),
+ 'validation/recall@20': cb.MeanAccumulator(),
+ 'validation/ndcg@5': cb.MeanAccumulator(),
+ 'validation/ndcg@10': cb.MeanAccumulator(),
+ 'validation/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Validation(
+            dataset=eval_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='eval'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'eval/loss': cb.MeanAccumulator(),
+ 'eval/recall@5': cb.MeanAccumulator(),
+ 'eval/recall@10': cb.MeanAccumulator(),
+ 'eval/recall@20': cb.MeanAccumulator(),
+ 'eval/ndcg@5': cb.MeanAccumulator(),
+ 'eval/ndcg@10': cb.MeanAccumulator(),
+ 'eval/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),
+
+ cb.EarlyStopping(
+ metric='eval/ndcg@20',
+ patience=40,
+ minimize=False,
+ model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
+ ).every_num_steps(EPOCH_NUM_STEPS)
+
+ # cb.Profiler(
+ # wait=10,
+ # warmup=10,
+ # active=10,
+ # logdir=TENSORBOARD_LOGDIR
+ # ),
+ # cb.StopAfterNumSteps(40)
+
+ ]
+
+ logger.debug('Everything is ready for training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger/beauty_exps/train_4.3_plum.py b/scripts/tiger/beauty_exps/train_4.3_plum.py
new file mode 100644
index 0000000..f98e9fd
--- /dev/null
+++ b/scripts/tiger/beauty_exps/train_4.3_plum.py
@@ -0,0 +1,225 @@
+from functools import partial
+import json
+from loguru import logger
+import os
+
+import torch
+
+import irec.callbacks as cb
+from irec.data.transforms import Collate, ToDevice
+from irec.data.dataloader import DataLoader
+from irec.runners import TrainingRunner
+from irec.utils import fix_random_seed
+
+from data import ArrowBatchDataset
+from models import TigerModel, CorrectItemsLogitsProcessor
+
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-3_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_eval_batches/')
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'tiger_beauty_updated_quantile_4-3_plum_ws_2_dp_0.2'
+
+# OTHER SETTINGS
+SEED_VALUE = 42
+DEVICE = 'cuda'
+
+NUM_EPOCHS = 300
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+EMBEDDING_DIM = 128
+CODEBOOK_SIZE = 256
+NUM_POSITIONS = 20
+NUM_USER_HASH = 2000
+NUM_HEADS = 6
+NUM_LAYERS = 4
+FEEDFORWARD_DIM = 1024
+KV_DIM = 64
+DROPOUT = 0.2
+NUM_BEAMS = 30
+TOP_K = 20
+NUM_CODEBOOKS = 4
+LR = 0.0001
+
+
+torch.set_float32_matmul_precision('high')
+torch._dynamo.config.capture_scalar_outputs = True
+
+import torch._inductor.config as config
+config.triton.cudagraph_skip_dynamic_graphs = True
+
+
+def main():
+ fix_random_seed(SEED_VALUE)
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataloader = DataLoader(
+ ArrowBatchDataset(
+ TRAIN_BATCHES_DIR,
+ device='cpu',
+ preload=True
+ ),
+ batch_size=1,
+ shuffle=True,
+ num_workers=0,
+ pin_memory=True,
+ collate_fn=Collate()
+ ).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
+
+    valid_dataloader = ArrowBatchDataset(
+ VALID_BATCHES_DIR,
+ device=DEVICE,
+ preload=True
+ )
+
+    eval_dataloader = ArrowBatchDataset(
+ EVAL_BATCHES_DIR,
+ device=DEVICE,
+ preload=True
+ )
+
+ model = TigerModel(
+ embedding_dim=EMBEDDING_DIM,
+ codebook_size=CODEBOOK_SIZE,
+ sem_id_len=NUM_CODEBOOKS,
+ user_ids_count=NUM_USER_HASH,
+ num_positions=NUM_POSITIONS,
+ num_heads=NUM_HEADS,
+ num_encoder_layers=NUM_LAYERS,
+ num_decoder_layers=NUM_LAYERS,
+ dim_feedforward=FEEDFORWARD_DIM,
+ num_beams=NUM_BEAMS,
+ num_return_sequences=TOP_K,
+ activation='relu',
+ d_kv=KV_DIM,
+ dropout=DROPOUT,
+ layer_norm_eps=1e-6,
+ initializer_range=0.02,
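+        # assumption from the class name: the processor constrains beam search
+        # to semantic-id sequences that decode to real items, which is why the
+        # collision-free mapping loaded above is passed in here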
+ logits_processor=partial(
+ CorrectItemsLogitsProcessor,
+ NUM_CODEBOOKS,
+ CODEBOOK_SIZE,
+ mappings,
+ NUM_BEAMS
+ )
+ ).to(DEVICE)
+
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ logger.debug(f'Overall parameters: {total_params:,}')
+ logger.debug(f'Trainable parameters: {trainable_params:,}')
+
+ optimizer = torch.optim.AdamW(
+ model.parameters(),
+ lr=LR,
+ )
+
+ EPOCH_NUM_STEPS = 1024 # int(len(train_dataloader) // NUM_EPOCHS)
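+    # every EPOCH_NUM_STEPS optimizer steps count as one pseudo-epoch; all the
+    # periodic callbacks below fire on this cadence via every_num_steps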
+
+ callbacks = [
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ }, name='train'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'train/loss': cb.MeanAccumulator(),
+ },
+ reset_every_num_steps=EPOCH_NUM_STEPS
+ ),
+
+ cb.Validation(
+            dataset=valid_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _:{
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='validation'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'validation/loss': cb.MeanAccumulator(),
+ 'validation/recall@5': cb.MeanAccumulator(),
+ 'validation/recall@10': cb.MeanAccumulator(),
+ 'validation/recall@20': cb.MeanAccumulator(),
+ 'validation/ndcg@5': cb.MeanAccumulator(),
+ 'validation/ndcg@10': cb.MeanAccumulator(),
+ 'validation/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Validation(
+            dataset=eval_dataloader,
+ callbacks=[
+ cb.BatchMetrics(metrics=lambda model_outputs, _: {
+ 'loss': model_outputs['loss'].item(),
+ 'recall@5': model_outputs['recall@5'].tolist(),
+ 'recall@10': model_outputs['recall@10'].tolist(),
+ 'recall@20': model_outputs['recall@20'].tolist(),
+ 'ndcg@5': model_outputs['ndcg@5'].tolist(),
+ 'ndcg@10': model_outputs['ndcg@10'].tolist(),
+ 'ndcg@20': model_outputs['ndcg@20'].tolist(),
+ }, name='eval'),
+ cb.MetricAccumulator(
+ accumulators={
+ 'eval/loss': cb.MeanAccumulator(),
+ 'eval/recall@5': cb.MeanAccumulator(),
+ 'eval/recall@10': cb.MeanAccumulator(),
+ 'eval/recall@20': cb.MeanAccumulator(),
+ 'eval/ndcg@5': cb.MeanAccumulator(),
+ 'eval/ndcg@10': cb.MeanAccumulator(),
+ 'eval/ndcg@20': cb.MeanAccumulator(),
+ },
+ ),
+ ],
+ ).every_num_steps(EPOCH_NUM_STEPS),
+
+ cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),
+
+ cb.EarlyStopping(
+ metric='eval/ndcg@20',
+ patience=40,
+ minimize=False,
+ model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
+ ).every_num_steps(EPOCH_NUM_STEPS)
+
+ # cb.Profiler(
+ # wait=10,
+ # warmup=10,
+ # active=10,
+ # logdir=TENSORBOARD_LOGDIR
+ # ),
+ # cb.StopAfterNumSteps(40)
+
+ ]
+
+ logger.debug('Everything is ready for training process!')
+
+ runner = TrainingRunner(
+ model=model,
+ optimizer=optimizer,
+ dataset=train_dataloader,
+ callbacks=callbacks,
+ )
+ runner.run()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger/beauty_exps/varka_4.1_plum.py b/scripts/tiger/beauty_exps/varka_4.1_plum.py
new file mode 100644
index 0000000..302e04e
--- /dev/null
+++ b/scripts/tiger/beauty_exps/varka_4.1_plum.py
@@ -0,0 +1,287 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json')
+INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/valid_set.json')
+INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json')
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-1_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-1_eval_batches/')
+
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
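+# Assumed token-id layout: ids [0, NUM_CODEBOOKS * CODEBOOK_SIZE) are item
+# semantic tokens, the next NUM_USER_HASH ids are hashed user tokens, and the
+# utility tokens (decoder start / EOS / PAD) sit at the top of the range.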
+
+
+
+class TigerProcessing(Transform):
+ def __call__(self, batch):
+ input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+ batch_size = attention_mask.shape[0]
+
+        input_semantic_ids[~attention_mask] = PAD_TOKEN_ID  # fill padded positions with the PAD token
+
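+        # append the hashed user id as one extra input token; the offset
+        # NUM_CODEBOOKS * CODEBOOK_SIZE places user tokens right after the
+        # item semantic vocabulary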
+ input_semantic_ids = np.concatenate([
+ input_semantic_ids,
+ NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+ ], axis=-1)
+
+ attention_mask = np.concatenate([
+ attention_mask,
+ np.ones((batch_size, 1), dtype=attention_mask.dtype)
+ ], axis=-1)
+
+ batch['input.data'] = input_semantic_ids
+ batch['input.mask'] = attention_mask
+
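+        # targets: prepend the decoder start token to every label sequence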
+ target_semantic_ids = batch['labels.semantic.padded']
+ target_semantic_ids = np.concatenate([
+ np.ones(
+ (batch_size, 1),
+ dtype=np.int64,
+ ) * DECODER_START_TOKEN_ID,
+ target_semantic_ids
+ ], axis=-1)
+
+ batch['output.data'] = target_semantic_ids
+
+ return batch
+
+
+class ToMasked(Transform):
+ def __init__(self, prefix, is_right_aligned=False):
+ self._prefix = prefix
+ self._is_right_aligned = is_right_aligned
+
+ def __call__(self, batch):
+ data = batch[f'{self._prefix}.ids']
+ lengths = batch[f'{self._prefix}.length']
+
+ batch_size = lengths.shape[0]
+ max_sequence_length = int(lengths.max())
+
+ if len(data.shape) == 1: # only indices
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len)
+ else:
+ assert len(data.shape) == 2 # embeddings
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length, data.shape[-1]),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len, emb_dim)
+
+ mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+ if self._is_right_aligned:
+ mask = np.flip(mask, axis=-1)
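+            # e.g. lengths=[2,3], max_len=3: [[T,T,F],[T,T,T]] becomes
+            # [[F,T,T],[T,T,T]]; the row-major boolean assignment below then
+            # right-aligns each sequence while preserving its order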
+
+ padded_tensor[mask] = data
+
+ batch[f'{self._prefix}.padded'] = padded_tensor
+ batch[f'{self._prefix}.mask'] = mask
+
+ return batch
+
+
+class SemanticIdsMapper(Transform):
+    def __init__(self, mapping, names=()):
+ super().__init__()
+ self._mapping = mapping
+ self._names = names
+
+ data = []
+ for i in range(len(mapping)):
+ data.append(mapping[str(i)])
+ self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+ self._semantic_length = self._mapping_tensor.shape[-1]
+
+ def __call__(self, batch):
+ for name in self._names:
+ if f'{name}.ids' in batch:
+ ids = batch[f'{name}.ids']
+ lengths = batch[f'{name}.length']
+ assert ids.min() >= 0
+ assert ids.max() < self._mapping_tensor.shape[0]
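+                # each item id expands into _semantic_length codes, flattened
+                # into one long sequence, so per-user lengths scale accordingly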
+ batch[f'{name}.semantic.ids'] = self._mapping_tensor[ids].flatten().numpy()
+ batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+ return batch
+
+
+class UserHashing(Transform):
+ def __init__(self, hash_size):
+ super().__init__()
+ self._hash_size = hash_size
+
+ def __call__(self, batch):
+ batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+ return batch
+
+
+def save_batches_to_arrow(batches, output_dir):
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=False)
+
+ for batch_idx, batch in enumerate(batches):
+ length_groups = defaultdict(dict)
+ metadata_groups = defaultdict(dict)
+
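+        # group fields by their leading dimension so that each Arrow table
+        # holds equally long columns; shapes/dtypes go into schema metadata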
+ for key, value in batch.items():
+ length = len(value)
+
+ metadata_groups[length][f'{key}_shape'] = str(value.shape)
+ metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+            if value.ndim == 1:
+                # 1D array - store as is
+                length_groups[length][key] = value
+            elif value.ndim == 2:
+                # 2D array - store as a list of lists
+                length_groups[length][key] = value.tolist()
+            else:
+                # >2D array - flatten; the shape is recorded in the metadata above
+                length_groups[length][key] = value.flatten()
+
+        for length, fields in length_groups.items():
+            # pa.array handles flat arrays and lists of lists (2D) alike
+            arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+ table = pa.table(arrow_dict)
+ if length in metadata_groups:
+ table = table.replace_schema_metadata(metadata_groups[length])
+
+ feather.write_feather(
+ table,
+ output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+ compression='lz4'
+ )
+
+
+def main():
+ data = Dataset.create_timestamp_based(
+ train_json_path=INTERACTIONS_TRAIN_PATH,
+ validation_json_path=INTERACTIONS_VALID_PATH,
+ test_json_path=INTERACTIONS_TEST_PATH,
+ max_sequence_length=MAX_SEQ_LEN,
+ sampler_type='tiger',
+ min_sample_len=2,
+ is_extended=True
+ )
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+
+ train_dataloader = DataLoader(
+ dataset=train_dataset,
+ batch_size=TRAIN_BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ valid_dataloader = DataLoader(
+ dataset=valid_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ eval_dataloader = DataLoader(
+ dataset=eval_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ train_batches = []
+ for train_batch in train_dataloader:
+ train_batches.append(train_batch)
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
+ valid_batches = []
+ for valid_batch in valid_dataloader:
+ valid_batches.append(valid_batch)
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
+ eval_batches = []
+ for eval_batch in eval_dataloader:
+ eval_batches.append(eval_batch)
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger/beauty_exps/varka_4.2_plum.py b/scripts/tiger/beauty_exps/varka_4.2_plum.py
new file mode 100644
index 0000000..b00fef2
--- /dev/null
+++ b/scripts/tiger/beauty_exps/varka_4.2_plum.py
@@ -0,0 +1,288 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json')
+INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/valid_set.json')
+INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json')
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-2_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-2_eval_batches/')
+
+
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+
+class TigerProcessing(Transform):
+ def __call__(self, batch):
+ input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+ batch_size = attention_mask.shape[0]
+
+        input_semantic_ids[~attention_mask] = PAD_TOKEN_ID  # fill padded positions with the PAD token
+
+ input_semantic_ids = np.concatenate([
+ input_semantic_ids,
+ NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+ ], axis=-1)
+
+ attention_mask = np.concatenate([
+ attention_mask,
+ np.ones((batch_size, 1), dtype=attention_mask.dtype)
+ ], axis=-1)
+
+ batch['input.data'] = input_semantic_ids
+ batch['input.mask'] = attention_mask
+
+ target_semantic_ids = batch['labels.semantic.padded']
+ target_semantic_ids = np.concatenate([
+ np.ones(
+ (batch_size, 1),
+ dtype=np.int64,
+ ) * DECODER_START_TOKEN_ID,
+ target_semantic_ids
+ ], axis=-1)
+
+ batch['output.data'] = target_semantic_ids
+
+ return batch
+
+
+class ToMasked(Transform):
+ def __init__(self, prefix, is_right_aligned=False):
+ self._prefix = prefix
+ self._is_right_aligned = is_right_aligned
+
+ def __call__(self, batch):
+ data = batch[f'{self._prefix}.ids']
+ lengths = batch[f'{self._prefix}.length']
+
+ batch_size = lengths.shape[0]
+ max_sequence_length = int(lengths.max())
+
+ if len(data.shape) == 1: # only indices
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len)
+ else:
+ assert len(data.shape) == 2 # embeddings
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length, data.shape[-1]),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len, emb_dim)
+
+ mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+ if self._is_right_aligned:
+ mask = np.flip(mask, axis=-1)
+
+ padded_tensor[mask] = data
+
+ batch[f'{self._prefix}.padded'] = padded_tensor
+ batch[f'{self._prefix}.mask'] = mask
+
+ return batch
+
+
+class SemanticIdsMapper(Transform):
+    def __init__(self, mapping, names=()):
+ super().__init__()
+ self._mapping = mapping
+ self._names = names
+
+ data = []
+ for i in range(len(mapping)):
+ data.append(mapping[str(i)])
+ self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+ self._semantic_length = self._mapping_tensor.shape[-1]
+
+ def __call__(self, batch):
+ for name in self._names:
+ if f'{name}.ids' in batch:
+ ids = batch[f'{name}.ids']
+ lengths = batch[f'{name}.length']
+ assert ids.min() >= 0
+ assert ids.max() < self._mapping_tensor.shape[0]
+ batch[f'{name}.semantic.ids'] = self._mapping_tensor[ids].flatten().numpy()
+ batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+ return batch
+
+
+class UserHashing(Transform):
+ def __init__(self, hash_size):
+ super().__init__()
+ self._hash_size = hash_size
+
+ def __call__(self, batch):
+ batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+ return batch
+
+
+def save_batches_to_arrow(batches, output_dir):
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=False)
+
+ for batch_idx, batch in enumerate(batches):
+ length_groups = defaultdict(dict)
+ metadata_groups = defaultdict(dict)
+
+ for key, value in batch.items():
+ length = len(value)
+
+ metadata_groups[length][f'{key}_shape'] = str(value.shape)
+ metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+            if value.ndim == 1:
+                # 1D array - store as is
+                length_groups[length][key] = value
+            elif value.ndim == 2:
+                # 2D array - store as a list of lists
+                length_groups[length][key] = value.tolist()
+            else:
+                # >2D array - flatten; the shape is recorded in the metadata above
+                length_groups[length][key] = value.flatten()
+
+        for length, fields in length_groups.items():
+            # pa.array handles flat arrays and lists of lists (2D) alike
+            arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+ table = pa.table(arrow_dict)
+ if length in metadata_groups:
+ table = table.replace_schema_metadata(metadata_groups[length])
+
+ feather.write_feather(
+ table,
+ output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+ compression='lz4'
+ )
+
+
+def main():
+ data = Dataset.create_timestamp_based(
+ train_json_path=INTERACTIONS_TRAIN_PATH,
+ validation_json_path=INTERACTIONS_VALID_PATH,
+ test_json_path=INTERACTIONS_TEST_PATH,
+ max_sequence_length=MAX_SEQ_LEN,
+ sampler_type='tiger',
+ min_sample_len=2,
+ is_extended=True
+ )
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+
+ train_dataloader = DataLoader(
+ dataset=train_dataset,
+ batch_size=TRAIN_BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ valid_dataloader = DataLoader(
+ dataset=valid_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ eval_dataloader = DataLoader(
+ dataset=eval_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ train_batches = []
+ for train_batch in train_dataloader:
+ train_batches.append(train_batch)
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
+ valid_batches = []
+ for valid_batch in valid_dataloader:
+ valid_batches.append(valid_batch)
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
+ eval_batches = []
+ for eval_batch in eval_dataloader:
+ eval_batches.append(eval_batch)
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger/beauty_exps/varka_4.3_plum.py b/scripts/tiger/beauty_exps/varka_4.3_plum.py
new file mode 100644
index 0000000..2a96339
--- /dev/null
+++ b/scripts/tiger/beauty_exps/varka_4.3_plum.py
@@ -0,0 +1,289 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json')
+INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/valid_set.json')
+INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json')
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-3_updated_quantile_plum_rqvae_beauty_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/updated_quantile_tiger_4-3_eval_batches/')
+
+
+
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+
+class TigerProcessing(Transform):
+ def __call__(self, batch):
+ input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+ batch_size = attention_mask.shape[0]
+
+        input_semantic_ids[~attention_mask] = PAD_TOKEN_ID  # fill padded positions with the PAD token
+
+ input_semantic_ids = np.concatenate([
+ input_semantic_ids,
+ NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+ ], axis=-1)
+
+ attention_mask = np.concatenate([
+ attention_mask,
+ np.ones((batch_size, 1), dtype=attention_mask.dtype)
+ ], axis=-1)
+
+ batch['input.data'] = input_semantic_ids
+ batch['input.mask'] = attention_mask
+
+ target_semantic_ids = batch['labels.semantic.padded']
+ target_semantic_ids = np.concatenate([
+ np.ones(
+ (batch_size, 1),
+ dtype=np.int64,
+ ) * DECODER_START_TOKEN_ID,
+ target_semantic_ids
+ ], axis=-1)
+
+ batch['output.data'] = target_semantic_ids
+
+ return batch
+
+
+class ToMasked(Transform):
+ def __init__(self, prefix, is_right_aligned=False):
+ self._prefix = prefix
+ self._is_right_aligned = is_right_aligned
+
+ def __call__(self, batch):
+ data = batch[f'{self._prefix}.ids']
+ lengths = batch[f'{self._prefix}.length']
+
+ batch_size = lengths.shape[0]
+ max_sequence_length = int(lengths.max())
+
+ if len(data.shape) == 1: # only indices
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len)
+ else:
+ assert len(data.shape) == 2 # embeddings
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length, data.shape[-1]),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len, emb_dim)
+
+ mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+ if self._is_right_aligned:
+ mask = np.flip(mask, axis=-1)
+
+ padded_tensor[mask] = data
+
+ batch[f'{self._prefix}.padded'] = padded_tensor
+ batch[f'{self._prefix}.mask'] = mask
+
+ return batch
+
+
+class SemanticIdsMapper(Transform):
+    def __init__(self, mapping, names=()):
+ super().__init__()
+ self._mapping = mapping
+ self._names = names
+
+ data = []
+ for i in range(len(mapping)):
+ data.append(mapping[str(i)])
+ self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+ self._semantic_length = self._mapping_tensor.shape[-1]
+
+ def __call__(self, batch):
+ for name in self._names:
+ if f'{name}.ids' in batch:
+ ids = batch[f'{name}.ids']
+ lengths = batch[f'{name}.length']
+ assert ids.min() >= 0
+ assert ids.max() < self._mapping_tensor.shape[0]
+ batch[f'{name}.semantic.ids'] = self._mapping_tensor[ids].flatten().numpy()
+ batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+ return batch
+
+
+class UserHashing(Transform):
+ def __init__(self, hash_size):
+ super().__init__()
+ self._hash_size = hash_size
+
+ def __call__(self, batch):
+ batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+ return batch
+
+
+def save_batches_to_arrow(batches, output_dir):
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=False)
+
+ for batch_idx, batch in enumerate(batches):
+ length_groups = defaultdict(dict)
+ metadata_groups = defaultdict(dict)
+
+ for key, value in batch.items():
+ length = len(value)
+
+ metadata_groups[length][f'{key}_shape'] = str(value.shape)
+ metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+            if value.ndim == 1:
+                # 1D array - store as is
+                length_groups[length][key] = value
+            elif value.ndim == 2:
+                # 2D array - store as a list of lists
+                length_groups[length][key] = value.tolist()
+            else:
+                # >2D array - flatten; the shape is recorded in the metadata above
+                length_groups[length][key] = value.flatten()
+
+        for length, fields in length_groups.items():
+            # pa.array handles flat arrays and lists of lists (2D) alike
+            arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+ table = pa.table(arrow_dict)
+ if length in metadata_groups:
+ table = table.replace_schema_metadata(metadata_groups[length])
+
+ feather.write_feather(
+ table,
+ output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+ compression='lz4'
+ )
+
+
+def main():
+ data = Dataset.create_timestamp_based(
+ train_json_path=INTERACTIONS_TRAIN_PATH,
+ validation_json_path=INTERACTIONS_VALID_PATH,
+ test_json_path=INTERACTIONS_TEST_PATH,
+ max_sequence_length=MAX_SEQ_LEN,
+ sampler_type='tiger',
+ min_sample_len=2,
+ is_extended=True
+ )
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+
+ train_dataloader = DataLoader(
+ dataset=train_dataset,
+ batch_size=TRAIN_BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ valid_dataloader = DataLoader(
+ dataset=valid_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ eval_dataloader = DataLoader(
+ dataset=eval_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ train_batches = []
+ for train_batch in train_dataloader:
+ train_batches.append(train_batch)
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
+ valid_batches = []
+ for valid_batch in valid_dataloader:
+ valid_batches.append(valid_batch)
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
+ eval_batches = []
+ for eval_batch in eval_dataloader:
+ eval_batches.append(eval_batch)
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/tiger/data.py b/scripts/tiger/data.py
index 188993a..a34accd 100644
--- a/scripts/tiger/data.py
+++ b/scripts/tiger/data.py
@@ -28,6 +28,116 @@ def __init__(
self._num_items = num_items
self._max_sequence_length = max_sequence_length
+ @classmethod
+ def create_timestamp_based(
+ cls,
+ train_json_path,
+ validation_json_path,
+ test_json_path,
+ max_sequence_length,
+ sampler_type,
+ min_sample_len=2,
+ is_extended=False
+ ):
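+        """Build train/validation/test samplers from per-user JSON splits.
+
+        Each JSON file maps a user id to a chronologically ordered item list;
+        validation and test items are unrolled one target at a time on top of
+        the growing interaction history (see the inline examples below).
+        """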
+ max_item_id = 0
+ train_dataset, validation_dataset, test_dataset = [], [], []
+
+ with open(train_json_path, 'r') as f:
+ train_data = json.load(f)
+ with open(validation_json_path, 'r') as f:
+ validation_data = json.load(f)
+ with open(test_json_path, 'r') as f:
+ test_data = json.load(f)
+
+ all_users = set(train_data.keys()) | set(validation_data.keys()) | set(test_data.keys())
+ print(f"all users count: {len(all_users)}")
+ for user_id_str in all_users:
+ user_id = int(user_id_str)
+
+ train_items = train_data.get(user_id_str, [])
+ validation_items = validation_data.get(user_id_str, [])
+ test_items = test_data.get(user_id_str, [])
+
+ full_sequence = train_items + validation_items + test_items
+ if full_sequence:
+ max_item_id = max(max_item_id, max(full_sequence))
+
+            assert len(full_sequence) >= 5, f'Expected a 5-core dataset, but user {user_id} has only {len(full_sequence)} items'
+
+ if is_extended:
+ # sample = [1, 2]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ # sample = [1, 2, 3, 4, 5, 6, 7]
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ for prefix_length in range(min_sample_len, len(train_items) + 1):
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': train_items[:prefix_length],
+ })
+ else:
+ # sample = [1, 2, 3, 4, 5, 6, 7, 8]
+ train_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': train_items,
+ })
+
+        # unroll each validation item into its own sample
+        # Example: Train=[1,2], Valid=[3,4]
+ # sample = [1, 2, 3]
+ # sample = [1, 2, 3, 4]
+
+ current_history = train_items.copy()
+ for item in validation_items:
+ # эвал датасет сам отрезает таргет потом
+ sample_sequence = current_history + [item]
+
+ if len(sample_sequence) >= min_sample_len:
+ validation_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': sample_sequence,
+ })
+ current_history.append(item)
+
+        # unroll each test item into its own sample
+        # Example: Train=[1,2], Valid=[3,4], Test=[5, 6]
+ # sample = [1, 2, 3, 4, 5]
+ # sample = [1, 2, 3, 4, 5, 6]
+ current_history = train_items + validation_items
+
+ for item in test_items:
+            # the eval dataset cuts the target off the end by itself later
+ sample_sequence = current_history + [item]
+
+ if len(sample_sequence) >= min_sample_len:
+ test_dataset.append({
+ 'user.ids': [user_id],
+ 'item.ids': sample_sequence,
+ })
+
+ current_history.append(item)
+
+ logger.debug(f'Train dataset size: {len(train_dataset)}')
+ logger.debug(f'Validation dataset size: {len(validation_dataset)}')
+ logger.debug(f'Test dataset size: {len(test_dataset)}')
+ print(f'Train dataset size: {len(train_dataset)}')
+ print(f'Validation dataset size: {len(validation_dataset)}')
+ print(f'Test dataset size: {len(test_dataset)}')
+
+ train_sampler = TrainDataset(train_dataset, sampler_type, max_sequence_length=max_sequence_length)
+ validation_sampler = EvalDataset(validation_dataset, max_sequence_length=max_sequence_length)
+ test_sampler = EvalDataset(test_dataset, max_sequence_length=max_sequence_length)
+
+ return cls(
+ train_sampler=train_sampler,
+ validation_sampler=validation_sampler,
+ test_sampler=test_sampler,
+ num_items=max_item_id + 1, # +1 added because our ids are 0-indexed
+ max_sequence_length=max_sequence_length
+ )
+
@classmethod
def create(cls, inter_json_path, max_sequence_length, sampler_type, is_extended=False):
max_item_id = 0
diff --git a/scripts/tiger/train.py b/scripts/tiger/train.py
index f436dd4..1a2d347 100644
--- a/scripts/tiger/train.py
+++ b/scripts/tiger/train.py
@@ -14,10 +14,23 @@
from data import ArrowBatchDataset
from models import TigerModel, CorrectItemsLogitsProcessor
+
+# PATHS
+IREC_PATH = '../../'
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-1_plum_rqvae_beauty_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_eval_batches/')
+
+TENSORBOARD_LOGDIR = os.path.join(IREC_PATH, 'tensorboard_logs')
+CHECKPOINTS_DIR = os.path.join(IREC_PATH, 'checkpoints')
+
+EXPERIMENT_NAME = 'tiger_beauty_4-1_plum_ws_2_dp_0.2'
+
+# OTHER SETTINGS
SEED_VALUE = 42
DEVICE = 'cuda'
-EXPERIMENT_NAME = 'tiger_beauty'
NUM_EPOCHS = 300
MAX_SEQ_LEN = 20
TRAIN_BATCH_SIZE = 256
@@ -30,13 +43,12 @@
NUM_LAYERS = 4
FEEDFORWARD_DIM = 1024
KV_DIM = 64
-DROPOUT = 0.1
+DROPOUT = 0.2
NUM_BEAMS = 30
TOP_K = 20
NUM_CODEBOOKS = 4
-LR = 3e-4
+LR = 0.0001
-IREC_PATH = '../../'
torch.set_float32_matmul_precision('high')
torch._dynamo.config.capture_scalar_outputs = True
@@ -48,30 +60,30 @@
def main():
fix_random_seed(SEED_VALUE)
- with open(os.path.join(IREC_PATH, 'results/rqvae_beauty_best_clusters_colisionless.json'), 'r') as f:
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
mappings = json.load(f)
-
+
train_dataloader = DataLoader(
ArrowBatchDataset(
- os.path.join(IREC_PATH, 'data/Beauty/tiger_train_batches/'),
- device='cpu',
+ TRAIN_BATCHES_DIR,
+ device='cpu',
preload=True
),
- batch_size=1,
- shuffle=True,
+ batch_size=1,
+ shuffle=True,
num_workers=0,
- pin_memory=True,
+ pin_memory=True,
collate_fn=Collate()
).map(ToDevice(DEVICE)).repeat(NUM_EPOCHS)
valid_dataloder = ArrowBatchDataset(
- os.path.join(IREC_PATH, 'data/Beauty/tiger_valid_batches/'),
+ VALID_BATCHES_DIR,
device=DEVICE,
preload=True
)
eval_dataloder = ArrowBatchDataset(
- os.path.join(IREC_PATH, 'data/Beauty/tiger_eval_batches/'),
+ EVAL_BATCHES_DIR,
device=DEVICE,
preload=True
)
@@ -177,22 +189,22 @@ def main():
),
],
).every_num_steps(EPOCH_NUM_STEPS),
-
+
cb.Logger().every_num_steps(EPOCH_NUM_STEPS),
- cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=os.path.join(IREC_PATH, 'tensorboard_logs')),
+ cb.TensorboardLogger(experiment_name=EXPERIMENT_NAME, logdir=TENSORBOARD_LOGDIR),
cb.EarlyStopping(
- metric='eval/ndcg@20',
+ metric='eval/ndcg@20',
patience=40,
minimize=False,
- model_path=os.path.join(IREC_PATH, 'checkpoints', EXPERIMENT_NAME)
+ model_path=os.path.join(CHECKPOINTS_DIR, EXPERIMENT_NAME)
).every_num_steps(EPOCH_NUM_STEPS)
# cb.Profiler(
# wait=10,
# warmup=10,
# active=10,
- # logdir=os.path.join(IREC_PATH, 'tensorboard_logs')
+ # logdir=TENSORBOARD_LOGDIR
# ),
# cb.StopAfterNumSteps(40)
diff --git a/scripts/tiger/varka.py b/scripts/tiger/varka.py
index ed47595..4dc3e02 100644
--- a/scripts/tiger/varka.py
+++ b/scripts/tiger/varka.py
@@ -15,6 +15,20 @@
from data import Dataset
+
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_PATH = os.path.join(IREC_PATH, 'data/Beauty/inter.json')
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results/rqvae_beauty_best_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_eval_batches/')
+
+
+# OTHER SETTINGS
+
SEED_VALUE = 42
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
@@ -32,8 +46,6 @@
DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3,
-IREC_PATH = '../../'
-
class TigerProcessing(Transform):
def __call__(self, batch):
@@ -42,12 +54,12 @@ def __call__(self, batch):
input_semantic_ids[~attention_mask] = PAD_TOKEN_ID # TODO ???
- input_semantic_ids = np.concat([
+ input_semantic_ids = np.concatenate([
input_semantic_ids,
NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
], axis=-1)
- attention_mask = np.concat([
+ attention_mask = np.concatenate([
attention_mask,
np.ones((batch_size, 1), dtype=attention_mask.dtype)
], axis=-1)
@@ -56,7 +68,7 @@ def __call__(self, batch):
batch['input.mask'] = attention_mask
target_semantic_ids = batch['labels.semantic.padded']
- target_semantic_ids = np.concat([
+ target_semantic_ids = np.concatenate([
np.ones(
(batch_size, 1),
dtype=np.int64,
@@ -73,7 +85,7 @@ class ToMasked(Transform):
def __init__(self, prefix, is_right_aligned=False):
self._prefix = prefix
self._is_right_aligned = is_right_aligned
-
+
def __call__(self, batch):
data = batch[f'{self._prefix}.ids']
lengths = batch[f'{self._prefix}.length']
@@ -92,7 +104,7 @@ def __call__(self, batch):
(batch_size, max_sequence_length, data.shape[-1]),
dtype=data.dtype
) # (batch_size, max_seq_len, emb_dim)
-
+
mask = np.arange(max_sequence_length)[None] < lengths[:, None]
if self._is_right_aligned:
@@ -117,10 +129,10 @@ def __init__(self, mapping, names=[]):
data.append(mapping[str(i)])
self._mapping_tensor = torch.tensor(data, dtype=torch.long)
self._semantic_length = self._mapping_tensor.shape[-1]
-
+
def __call__(self, batch):
for name in self._names:
- if f'{name}.ids' in batch:
+ if f'{name}.ids' in batch:
ids = batch[f'{name}.ids']
lengths = batch[f'{name}.length']
assert ids.min() >= 0
@@ -135,7 +147,7 @@ class UserHashing(Transform):
def __init__(self, hash_size):
super().__init__()
self._hash_size = hash_size
-
+
def __call__(self, batch):
batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
return batch
@@ -144,7 +156,7 @@ def __call__(self, batch):
def save_batches_to_arrow(batches, output_dir):
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=False)
-
+
for batch_idx, batch in enumerate(batches):
length_groups = defaultdict(dict)
metadata_groups = defaultdict(dict)
@@ -164,7 +176,7 @@ def save_batches_to_arrow(batches, output_dir):
else:
# >2D массив - flatten и сохраняем shape
length_groups[length][key] = value.flatten()
-
+
for length, fields in length_groups.items():
arrow_dict = {}
for k, v in fields.items():
@@ -173,11 +185,11 @@ def save_batches_to_arrow(batches, output_dir):
arrow_dict[k] = pa.array(v)
else:
arrow_dict[k] = pa.array(v)
-
+
table = pa.table(arrow_dict)
if length in metadata_groups:
table = table.replace_schema_metadata(metadata_groups[length])
-
+
feather.write_feather(
table,
output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
@@ -186,7 +198,7 @@ def save_batches_to_arrow(batches, output_dir):
# arrow_dict = {k: pa.array(v) for k, v in fields.items()}
# table = pa.table(arrow_dict)
-
+
# feather.write_feather(
# table,
# output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
@@ -196,15 +208,15 @@ def save_batches_to_arrow(batches, output_dir):
def main():
data = Dataset.create(
- inter_json_path=os.path.join(IREC_PATH, 'data/Beauty/inter.json'),
+ inter_json_path=INTERACTIONS_PATH,
max_sequence_length=MAX_SEQ_LEN,
sampler_type='tiger',
is_extended=True
)
- with open(os.path.join(IREC_PATH, 'results/rqvae_beauty_best_clusters_colisionless.json'), 'r') as f:
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
mappings = json.load(f)
-
+
train_dataset, valid_dataset, eval_dataset = data.get_datasets()
train_dataloader = DataLoader(
@@ -219,7 +231,7 @@ def main():
.map(ToMasked('item.semantic', is_right_aligned=True)) \
.map(ToMasked('labels.semantic', is_right_aligned=True)) \
.map(TigerProcessing())
-
+
valid_dataloader = DataLoader(
dataset=valid_dataset,
batch_size=VALID_BATCH_SIZE,
@@ -251,17 +263,18 @@ def main():
train_batches = []
for train_batch in train_dataloader:
train_batches.append(train_batch)
- save_batches_to_arrow(train_batches, os.path.join(IREC_PATH, 'data/Beauty/tiger_train_batches/'))
-
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
valid_batches = []
for valid_batch in valid_dataloader:
valid_batches.append(valid_batch)
- save_batches_to_arrow(valid_batches, os.path.join(IREC_PATH, 'data/Beauty/tiger_valid_batches/'))
-
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
eval_batches = []
for eval_batch in eval_dataloader:
eval_batches.append(eval_batch)
- save_batches_to_arrow(eval_batches, os.path.join(IREC_PATH, 'data/Beauty/tiger_eval_batches/'))
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
if __name__ == '__main__':
diff --git a/scripts/tiger/varka_timestamp_based.py b/scripts/tiger/varka_timestamp_based.py
new file mode 100644
index 0000000..11343ea
--- /dev/null
+++ b/scripts/tiger/varka_timestamp_based.py
@@ -0,0 +1,287 @@
+from collections import defaultdict
+import json
+import murmurhash
+import numpy as np
+import os
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.feather as feather
+
+import torch
+
+from irec.data.transforms import Collate, Transform
+from irec.data.dataloader import DataLoader
+
+from data import Dataset
+
+
+
+# PATHS
+
+IREC_PATH = '../../'
+INTERACTIONS_TRAIN_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/splits/exp_data/exp_4_inter_tiger_train.json')
+INTERACTIONS_VALID_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/splits/exp_data/valid_skip_set.json')
+INTERACTIONS_TEST_PATH = os.path.join(IREC_PATH, 'sigir/Beauty_new/splits/exp_data/test_set.json')
+
+SEMANTIC_MAPPING_PATH = os.path.join(IREC_PATH, 'results_sigir/4-1_plum_rqvae_beauty_ws_2_clusters_colisionless.json')
+TRAIN_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_train_batches/')
+VALID_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_valid_batches/')
+EVAL_BATCHES_DIR = os.path.join(IREC_PATH, 'data/Beauty/tiger_4-1_eval_batches/')
+
+
+# OTHER SETTINGS
+
+SEED_VALUE = 42
+DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+
+MAX_SEQ_LEN = 20
+TRAIN_BATCH_SIZE = 256
+VALID_BATCH_SIZE = 1024
+NUM_USER_HASH = 2000
+CODEBOOK_SIZE = 256
+NUM_CODEBOOKS = 4
+
+UNIFIED_VOCAB_SIZE = CODEBOOK_SIZE * NUM_CODEBOOKS + NUM_USER_HASH + 10 # 10 for utilities
+PAD_TOKEN_ID = UNIFIED_VOCAB_SIZE - 1
+EOS_TOKEN_ID = UNIFIED_VOCAB_SIZE - 2
+DECODER_START_TOKEN_ID = UNIFIED_VOCAB_SIZE - 3
+
+
+
+class TigerProcessing(Transform):
+ def __call__(self, batch):
+ input_semantic_ids, attention_mask = batch['item.semantic.padded'], batch['item.semantic.mask']
+ batch_size = attention_mask.shape[0]
+
+        input_semantic_ids[~attention_mask] = PAD_TOKEN_ID  # fill padded positions with the PAD token
+
+ input_semantic_ids = np.concatenate([
+ input_semantic_ids,
+ NUM_CODEBOOKS * CODEBOOK_SIZE + batch['user.hashed.ids'][:, None]
+ ], axis=-1)
+
+ attention_mask = np.concatenate([
+ attention_mask,
+ np.ones((batch_size, 1), dtype=attention_mask.dtype)
+ ], axis=-1)
+
+ batch['input.data'] = input_semantic_ids
+ batch['input.mask'] = attention_mask
+
+ target_semantic_ids = batch['labels.semantic.padded']
+ target_semantic_ids = np.concatenate([
+ np.ones(
+ (batch_size, 1),
+ dtype=np.int64,
+ ) * DECODER_START_TOKEN_ID,
+ target_semantic_ids
+ ], axis=-1)
+
+ batch['output.data'] = target_semantic_ids
+
+ return batch
+
+
+class ToMasked(Transform):
+ def __init__(self, prefix, is_right_aligned=False):
+ self._prefix = prefix
+ self._is_right_aligned = is_right_aligned
+
+ def __call__(self, batch):
+ data = batch[f'{self._prefix}.ids']
+ lengths = batch[f'{self._prefix}.length']
+
+ batch_size = lengths.shape[0]
+ max_sequence_length = int(lengths.max())
+
+ if len(data.shape) == 1: # only indices
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len)
+ else:
+ assert len(data.shape) == 2 # embeddings
+ padded_tensor = np.zeros(
+ (batch_size, max_sequence_length, data.shape[-1]),
+ dtype=data.dtype
+ ) # (batch_size, max_seq_len, emb_dim)
+
+ mask = np.arange(max_sequence_length)[None] < lengths[:, None]
+
+ if self._is_right_aligned:
+ mask = np.flip(mask, axis=-1)
+
+ padded_tensor[mask] = data
+
+ batch[f'{self._prefix}.padded'] = padded_tensor
+ batch[f'{self._prefix}.mask'] = mask
+
+ return batch
+
+
+class SemanticIdsMapper(Transform):
+    def __init__(self, mapping, names=()):
+ super().__init__()
+ self._mapping = mapping
+ self._names = names
+
+ data = []
+ for i in range(len(mapping)):
+ data.append(mapping[str(i)])
+ self._mapping_tensor = torch.tensor(data, dtype=torch.long)
+ self._semantic_length = self._mapping_tensor.shape[-1]
+
+ def __call__(self, batch):
+ for name in self._names:
+ if f'{name}.ids' in batch:
+ ids = batch[f'{name}.ids']
+ lengths = batch[f'{name}.length']
+ assert ids.min() >= 0
+ assert ids.max() < self._mapping_tensor.shape[0]
+ batch[f'{name}.semantic.ids'] = self._mapping_tensor[ids].flatten().numpy()
+ batch[f'{name}.semantic.length'] = lengths * self._semantic_length
+
+ return batch
+
+
+class UserHashing(Transform):
+ def __init__(self, hash_size):
+ super().__init__()
+ self._hash_size = hash_size
+
+ def __call__(self, batch):
+ batch['user.hashed.ids'] = np.array([murmurhash.hash(str(x)) % self._hash_size for x in batch['user.ids']], dtype=np.int64)
+ return batch
+
+
+def save_batches_to_arrow(batches, output_dir):
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=False)
+
+ for batch_idx, batch in enumerate(batches):
+ length_groups = defaultdict(dict)
+ metadata_groups = defaultdict(dict)
+
+ for key, value in batch.items():
+ length = len(value)
+
+ metadata_groups[length][f'{key}_shape'] = str(value.shape)
+ metadata_groups[length][f'{key}_dtype'] = str(value.dtype)
+
+            if value.ndim == 1:
+                # 1D array - store as is
+                length_groups[length][key] = value
+            elif value.ndim == 2:
+                # 2D array - store as a list of lists
+                length_groups[length][key] = value.tolist()
+            else:
+                # >2D array - flatten; the shape is recorded in the metadata above
+                length_groups[length][key] = value.flatten()
+
+        for length, fields in length_groups.items():
+            # pa.array handles flat arrays and lists of lists (2D) alike
+            arrow_dict = {k: pa.array(v) for k, v in fields.items()}
+
+ table = pa.table(arrow_dict)
+ if length in metadata_groups:
+ table = table.replace_schema_metadata(metadata_groups[length])
+
+ feather.write_feather(
+ table,
+ output_dir / f"batch_{batch_idx:06d}_len_{length}.arrow",
+ compression='lz4'
+ )
+
+
+def main():
+ data = Dataset.create_timestamp_based(
+ train_json_path=INTERACTIONS_TRAIN_PATH,
+ validation_json_path=INTERACTIONS_VALID_PATH,
+ test_json_path=INTERACTIONS_TEST_PATH,
+ max_sequence_length=MAX_SEQ_LEN,
+ sampler_type='tiger',
+ min_sample_len=2,
+ is_extended=True
+ )
+
+ with open(SEMANTIC_MAPPING_PATH, 'r') as f:
+ mappings = json.load(f)
+
+ train_dataset, valid_dataset, eval_dataset = data.get_datasets()
+
+ train_dataloader = DataLoader(
+ dataset=train_dataset,
+ batch_size=TRAIN_BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ valid_dataloader = DataLoader(
+ dataset=valid_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ eval_dataloader = DataLoader(
+ dataset=eval_dataset,
+ batch_size=VALID_BATCH_SIZE,
+ shuffle=False,
+ drop_last=False
+ ) \
+ .map(Collate()) \
+ .map(UserHashing(NUM_USER_HASH)) \
+ .map(SemanticIdsMapper(mappings, names=['item', 'labels'])) \
+ .map(ToMasked('item.semantic', is_right_aligned=True)) \
+ .map(ToMasked('labels.semantic', is_right_aligned=True)) \
+ .map(ToMasked('visited', is_right_aligned=True)) \
+ .map(TigerProcessing())
+
+ train_batches = []
+ for train_batch in train_dataloader:
+ train_batches.append(train_batch)
+ save_batches_to_arrow(train_batches, TRAIN_BATCHES_DIR)
+
+ valid_batches = []
+ for valid_batch in valid_dataloader:
+ valid_batches.append(valid_batch)
+ save_batches_to_arrow(valid_batches, VALID_BATCHES_DIR)
+
+ eval_batches = []
+ for eval_batch in eval_dataloader:
+ eval_batches.append(eval_batch)
+ save_batches_to_arrow(eval_batches, EVAL_BATCHES_DIR)
+
+
+
+if __name__ == '__main__':
+ main()
diff --git a/sigir/Beauty/DatasetProcessing.ipynb b/sigir/Beauty/DatasetProcessing.ipynb
new file mode 100644
index 0000000..b49f4ab
--- /dev/null
+++ b/sigir/Beauty/DatasetProcessing.ipynb
@@ -0,0 +1,856 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "3bdb292f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from collections import defaultdict\n",
+ "\n",
+ "import json\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import pickle\n",
+ "import polars as pl\n",
+ "\n",
+ "from transformers import LlamaModel, LlamaTokenizer\n",
+ "\n",
+ "import torch\n",
+ "from torch.utils.data import DataLoader\n",
+ "\n",
+ "from tqdm import tqdm as tqdm"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "66d9b312",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "interactions_dataset_path = '../data/Beauty/Beauty_5.json'\n",
+ "metadata_path = '../data/Beauty/metadata.json'\n",
+ "\n",
+ "interactions_output_json_path = '../data/Beauty_new/inter_new.json'\n",
+ "interactions_output_parquet_path = '../data/Beauty_new/inter_new.parquet'\n",
+ "embeddings_output_path = '../data/Beauty_new/content_embeddings.pkl'\n",
+ "item_ids_mapping_output_path = '../data/Beauty_new/item_ids_mapping.json'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "6ed4dffb",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of events: 198502\n"
+ ]
+ }
+ ],
+ "source": [
+ "df = defaultdict(list)\n",
+ "\n",
+ "with open(interactions_dataset_path, 'r') as f:\n",
+ " for line in f.readlines():\n",
+ " review = json.loads(line)\n",
+ " df['user_id'].append(review['reviewerID'])\n",
+ " df['item_id'].append(review['asin'])\n",
+ " df['timestamp'].append(review['unixReviewTime'])\n",
+ "\n",
+ "print(f'Number of events: {len(df[\"user_id\"])}')\n",
+ "\n",
+ "df = pl.from_dict(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "c26746c4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| user_id | item_id | timestamp |
|---|
| str | str | i64 |
| "A1YJEY40YUW4SE" | "7806397051" | 1391040000 |
| "A60XNB876KYML" | "7806397051" | 1397779200 |
| "A3G6XNM240RMWA" | "7806397051" | 1378425600 |
| "A1PQFP6SAJ6D80" | "7806397051" | 1386460800 |
| "A38FVHZTNQ271F" | "7806397051" | 1382140800 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌────────────────┬────────────┬────────────┐\n",
+ "│ user_id ┆ item_id ┆ timestamp │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ i64 │\n",
+ "╞════════════════╪════════════╪════════════╡\n",
+ "│ A1YJEY40YUW4SE ┆ 7806397051 ┆ 1391040000 │\n",
+ "│ A60XNB876KYML ┆ 7806397051 ┆ 1397779200 │\n",
+ "│ A3G6XNM240RMWA ┆ 7806397051 ┆ 1378425600 │\n",
+ "│ A1PQFP6SAJ6D80 ┆ 7806397051 ┆ 1386460800 │\n",
+ "│ A38FVHZTNQ271F ┆ 7806397051 ┆ 1382140800 │\n",
+ "└────────────────┴────────────┴────────────┘"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "adcf5713",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_df = df.clone()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c0bbf9ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Processing dataset to get core-5 state in case full dataset is provided\n",
+ "is_changed = True\n",
+ "threshold = 5\n",
+ "good_users = set()\n",
+ "good_items = set()\n",
+ "\n",
+ "while is_changed:\n",
+ " user_counts = filtered_df.group_by('user_id').agg(\n",
+ " pl.len().alias('user_count'),\n",
+ " )\n",
+ " item_counts = filtered_df.group_by('item_id').agg(\n",
+ " pl.len().alias('item_count'),\n",
+ " )\n",
+ "\n",
+ " good_users = user_counts.filter(pl.col('user_count') >= threshold).select(\n",
+ " 'user_id',\n",
+ " )\n",
+ " good_items = item_counts.filter(pl.col('item_count') >= threshold).select(\n",
+ " 'item_id',\n",
+ " )\n",
+ "\n",
+ " old_size = len(filtered_df)\n",
+ "\n",
+ " new_df = filtered_df.join(good_users, on='user_id', how='inner')\n",
+ " new_df = new_df.join(good_items, on='item_id', how='inner')\n",
+ "\n",
+ " new_size = len(new_df)\n",
+ "\n",
+ " filtered_df = new_df\n",
+ " is_changed = old_size != new_size\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "218a9348",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| user_id | item_id | timestamp |
|---|
| i64 | i64 | i64 |
| 0 | 0 | 1391040000 |
| 1 | 0 | 1397779200 |
| 2 | 0 | 1378425600 |
| 3 | 0 | 1386460800 |
| 4 | 0 | 1382140800 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌─────────┬─────────┬────────────┐\n",
+ "│ user_id ┆ item_id ┆ timestamp │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ i64 ┆ i64 │\n",
+ "╞═════════╪═════════╪════════════╡\n",
+ "│ 0 ┆ 0 ┆ 1391040000 │\n",
+ "│ 1 ┆ 0 ┆ 1397779200 │\n",
+ "│ 2 ┆ 0 ┆ 1378425600 │\n",
+ "│ 3 ┆ 0 ┆ 1386460800 │\n",
+ "│ 4 ┆ 0 ┆ 1382140800 │\n",
+ "└─────────┴─────────┴────────────┘"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "unique_values = filtered_df[\"user_id\"].unique(maintain_order=True).to_list()\n",
+ "user_ids_mapping = {value: i for i, value in enumerate(unique_values)}\n",
+ "\n",
+ "filtered_df = filtered_df.with_columns(\n",
+ " pl.col(\"user_id\").replace_strict(user_ids_mapping)\n",
+ ")\n",
+ "\n",
+ "unique_values = filtered_df[\"item_id\"].unique(maintain_order=True).to_list()\n",
+ "item_ids_mapping = {value: i for i, value in enumerate(unique_values)}\n",
+ "\n",
+ "filtered_df = filtered_df.with_columns(\n",
+ " pl.col(\"item_id\").replace_strict(item_ids_mapping)\n",
+ ")\n",
+ "\n",
+ "filtered_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "34604fe6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 2)| old_item_id | new_item_id |
|---|
| str | i64 |
| "7806397051" | 0 |
| "9759091062" | 1 |
| "9788072216" | 2 |
| "9790790961" | 3 |
| "9790794231" | 4 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 2)\n",
+ "┌─────────────┬─────────────┐\n",
+ "│ old_item_id ┆ new_item_id │\n",
+ "│ --- ┆ --- │\n",
+ "│ str ┆ i64 │\n",
+ "╞═════════════╪═════════════╡\n",
+ "│ 7806397051 ┆ 0 │\n",
+ "│ 9759091062 ┆ 1 │\n",
+ "│ 9788072216 ┆ 2 │\n",
+ "│ 9790790961 ┆ 3 │\n",
+ "│ 9790794231 ┆ 4 │\n",
+ "└─────────────┴─────────────┘"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "item_ids_mapping_df = pl.from_dict({\n",
+ " 'old_item_id': list(item_ids_mapping.keys()),\n",
+ " 'new_item_id': list(item_ids_mapping.values())\n",
+ "})\n",
+ "item_ids_mapping_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "99b54db807b9495c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(item_ids_mapping_output_path, 'w') as f:\n",
+ " json.dump({str(k): v for k, v in item_ids_mapping.items()}, f)"
+ ]
+ },
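+ {
+ "cell_type": "markdown",
+ "id": "d4f82e10",
+ "metadata": {},
+ "source": [
+ "JSON object keys are always strings, which is why the dump above casts them explicitly. A minimal round-trip sketch (assumes `item_ids_mapping_output_path` from earlier in the notebook):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4f82e11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reload the mapping; keys come back as strings\n",
+ "with open(item_ids_mapping_output_path) as f:\n",
+ "    loaded_mapping = json.load(f)\n",
+ "assert len(loaded_mapping) == len(item_ids_mapping)"
+ ]
+ },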
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "6017e65c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| user_id | item_id | timestamp |
|---|
| i64 | i64 | i64 |
| 0 | 0 | 1391040000 |
| 1 | 0 | 1397779200 |
| 2 | 0 | 1378425600 |
| 3 | 0 | 1386460800 |
| 4 | 0 | 1382140800 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌─────────┬─────────┬────────────┐\n",
+ "│ user_id ┆ item_id ┆ timestamp │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ i64 ┆ i64 │\n",
+ "╞═════════╪═════════╪════════════╡\n",
+ "│ 0 ┆ 0 ┆ 1391040000 │\n",
+ "│ 1 ┆ 0 ┆ 1397779200 │\n",
+ "│ 2 ┆ 0 ┆ 1378425600 │\n",
+ "│ 3 ┆ 0 ┆ 1386460800 │\n",
+ "│ 4 ┆ 0 ┆ 1382140800 │\n",
+ "└─────────┴─────────┴────────────┘"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "filtered_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "9efd1983",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "filtered_df = filtered_df.sort([\"user_id\", \"timestamp\"])\n",
+ "\n",
+ "grouped_filtered_df = filtered_df.group_by(\"user_id\", maintain_order=True).agg(pl.all())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "fd51c525",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 2)| old_item_id | new_item_id |
|---|
| str | i64 |
| "7806397051" | 0 |
| "9759091062" | 1 |
| "9788072216" | 2 |
| "9790790961" | 3 |
| "9790794231" | 4 |
"
+ ],
+ "text/plain": [
+ "shape: (5, 2)\n",
+ "┌─────────────┬─────────────┐\n",
+ "│ old_item_id ┆ new_item_id │\n",
+ "│ --- ┆ --- │\n",
+ "│ str ┆ i64 │\n",
+ "╞═════════════╪═════════════╡\n",
+ "│ 7806397051 ┆ 0 │\n",
+ "│ 9759091062 ┆ 1 │\n",
+ "│ 9788072216 ┆ 2 │\n",
+ "│ 9790790961 ┆ 3 │\n",
+ "│ 9790794231 ┆ 4 │\n",
+ "└─────────────┴─────────────┘"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "item_ids_mapping_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "8b0821da",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| user_id | item_id | timestamp |
|---|
| i64 | list[i64] | list[i64] |
| 0 | [6845, 7872, … 0] | [1318896000, 1318896000, … 1391040000] |
| 1 | [815, 10405, … 232] | [1392422400, 1396224000, … 1397779200] |
| 2 | [6049, 0, … 6608] | [1378425600, 1378425600, … 1400284800] |
| 3 | [5521, 5160, … 0] | [1379116800, 1380931200, … 1386460800] |
| 4 | [0, 10469, … 11389] | [1382140800, 1383523200, … 1388966400] |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌─────────┬─────────────────────┬─────────────────────────────────┐\n",
+ "│ user_id ┆ item_id ┆ timestamp │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ list[i64] ┆ list[i64] │\n",
+ "╞═════════╪═════════════════════╪═════════════════════════════════╡\n",
+ "│ 0 ┆ [6845, 7872, … 0] ┆ [1318896000, 1318896000, … 139… │\n",
+ "│ 1 ┆ [815, 10405, … 232] ┆ [1392422400, 1396224000, … 139… │\n",
+ "│ 2 ┆ [6049, 0, … 6608] ┆ [1378425600, 1378425600, … 140… │\n",
+ "│ 3 ┆ [5521, 5160, … 0] ┆ [1379116800, 1380931200, … 138… │\n",
+ "│ 4 ┆ [0, 10469, … 11389] ┆ [1382140800, 1383523200, … 138… │\n",
+ "└─────────┴─────────────────────┴─────────────────────────────────┘"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped_filtered_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "dc222d59",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Users count: 22363\n",
+ "Items count: 12101\n",
+ "Actions count: 198502\n",
+ "Avg user history len: 8.876358270357287\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Users count:', filtered_df.select('user_id').unique().shape[0])\n",
+ "print('Items count:', filtered_df.select('item_id').unique().shape[0])\n",
+ "print('Actions count:', filtered_df.shape[0])\n",
+ "print('Avg user history len:', np.mean(list(map(lambda x: x[0], grouped_filtered_df.select(pl.col('item_id').list.len()).rows()))))"
+ ]
+ },
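+ {
+ "cell_type": "markdown",
+ "id": "e5a93f22",
+ "metadata": {},
+ "source": [
+ "The average history length can also be computed with a single polars expression, avoiding the Python-side list; a sketch:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e5a93f23",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Same statistic, computed inside polars\n",
+ "print('Avg user history len:',\n",
+ "      grouped_filtered_df.select(pl.col('item_id').list.len().mean()).item())"
+ ]
+ },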
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "a272855d-84b2-4414-ba9f-62647e1151cf",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "shape: (5, 3)\n",
+ "┌─────┬─────────────────────┬─────────────────────────────────┐\n",
+ "│ uid ┆ item_ids ┆ timestamps │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ list[i64] ┆ list[i64] │\n",
+ "╞═════╪═════════════════════╪═════════════════════════════════╡\n",
+ "│ 0 ┆ [6845, 7872, … 0] ┆ [1318896000, 1318896000, … 139… │\n",
+ "│ 1 ┆ [815, 10405, … 232] ┆ [1392422400, 1396224000, … 139… │\n",
+ "│ 2 ┆ [6049, 0, … 6608] ┆ [1378425600, 1378425600, … 140… │\n",
+ "│ 3 ┆ [5521, 5160, … 0] ┆ [1379116800, 1380931200, … 138… │\n",
+ "│ 4 ┆ [0, 10469, … 11389] ┆ [1382140800, 1383523200, … 138… │\n",
+ "└─────┴─────────────────────┴─────────────────────────────────┘\n"
+ ]
+ }
+ ],
+ "source": [
+ "inter_new = grouped_filtered_df.select([\n",
+ " pl.col(\"user_id\").alias(\"uid\"),\n",
+ " pl.col(\"item_id\").alias(\"item_ids\"),\n",
+ " pl.col(\"timestamp\").alias(\"timestamps\")\n",
+ "])\n",
+ "\n",
+ "print(inter_new.head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "de5a853a-8ee2-42dd-a71a-6cc6f90d526c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Файл успешно сохранен: ../data/Beauty_new/inter_new.parquet\n"
+ ]
+ }
+ ],
+ "source": [
+ "output_path_parquet = interactions_output_parquet_path\n",
+ "inter_new.write_parquet(output_path_parquet)\n",
+ "\n",
+ "print(f\"Файл успешно сохранен: {output_path_parquet}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "d07a2e91",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "json_data = {}\n",
+ "for user_id, item_ids, _ in grouped_filtered_df.iter_rows():\n",
+ " json_data[user_id] = item_ids\n",
+ "\n",
+ "with open(interactions_output_json_path, 'w') as f:\n",
+ " json.dump(json_data, f, indent=2)"
+ ]
+ },
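+ {
+ "cell_type": "markdown",
+ "id": "f6b04a33",
+ "metadata": {},
+ "source": [
+ "Note that `json.dump` turns the integer user ids into string keys, so a reload should cast them back; a minimal sketch (assumes `interactions_output_json_path` from earlier):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f6b04a34",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Reload the interactions and restore integer user ids\n",
+ "with open(interactions_output_json_path) as f:\n",
+ "    reloaded = {int(k): v for k, v in json.load(f).items()}\n",
+ "assert reloaded == json_data"
+ ]
+ },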
+ {
+ "cell_type": "markdown",
+ "id": "237523fa",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "## Content embedding creation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "6361c7a5",
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[19], line 5\u001b[0m, in \u001b[0;36mgetDF\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlines\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 6\u001b[0m df[i] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28meval\u001b[39m(line)\n",
+ "File \u001b[0;32m/usr/lib/python3.10/codecs.py:319\u001b[0m, in \u001b[0;36mBufferedIncrementalDecoder.decode\u001b[0;34m(self, input, final)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m\n\u001b[0;32m--> 319\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mdecode\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m, final\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[1;32m 320\u001b[0m \u001b[38;5;66;03m# decode input (taking the buffer into account)\u001b[39;00m\n\u001b[1;32m 321\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuffer \u001b[38;5;241m+\u001b[39m \u001b[38;5;28minput\u001b[39m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[19], line 11\u001b[0m\n\u001b[1;32m 7\u001b[0m i \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 9\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pd\u001b[38;5;241m.\u001b[39mDataFrame\u001b[38;5;241m.\u001b[39mfrom_dict(df, orient\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mindex\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 11\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mgetDF\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmetadata_path\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m df\u001b[38;5;241m.\u001b[39mhead()\n",
+ "Cell \u001b[0;32mIn[19], line 5\u001b[0m, in \u001b[0;36mgetDF\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 3\u001b[0m df \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(path, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m \u001b[43mf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadlines\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 6\u001b[0m df[i] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28meval\u001b[39m(line)\n\u001b[1;32m 7\u001b[0m i \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "def getDF(path):\n",
+ " i = 0\n",
+ " df = {}\n",
+ " with open(path, 'r') as f:\n",
+ " for line in f.readlines():\n",
+ " df[i] = eval(line)\n",
+ " i += 1\n",
+ "\n",
+ " return pd.DataFrame.from_dict(df, orient=\"index\")\n",
+ "\n",
+ "df = getDF(metadata_path)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "971fa89c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocess(row: pd.Series):\n",
+ " row = row.fillna(\"None\")\n",
+ " return f\"Title: {row['title']}. Categories: {', '.join(row['categories'][0])}. Description: {row['description']}.\"\n",
+ "\n",
+ "\n",
+ "def get_data(metadata_df, item_ids_mapping_df):\n",
+ " filtered_df = metadata_df.join(\n",
+ " item_ids_mapping_df, \n",
+ " left_on=\"asin\", \n",
+ " right_on='old_item_id', \n",
+ " how=\"inner\"\n",
+ " ).select(pl.col('new_item_id'), pl.col('title'), pl.col('description'), pl.col('categories'))\n",
+ "\n",
+ " filtered_df = filtered_df.to_pandas()\n",
+ " filtered_df[\"combined_text\"] = filtered_df.apply(preprocess, axis=1)\n",
+ "\n",
+ " return filtered_df\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3b0dd5d5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = get_data(pl.from_pandas(df), item_ids_mapping)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "12e622ff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "device = torch.device('cuda:6')\n",
+ "\n",
+ "model_name = \"huggyllama/llama-7b\"\n",
+ "tokenizer = LlamaTokenizer.from_pretrained(model_name)\n",
+ "\n",
+ "if tokenizer.pad_token is None:\n",
+ " tokenizer.pad_token = tokenizer.eos_token\n",
+ "\n",
+ "model = LlamaModel.from_pretrained(model_name)\n",
+ "model = model.to(device)\n",
+ "model = model.eval()\n",
+ "\n",
+ "\n",
+ "class MyDataset:\n",
+ " def __init__(self, data):\n",
+ " self._data = list(zip(data.to_dict()['new_item_id'].values(), data.to_dict()['combined_text'].values()))\n",
+ "\n",
+ " def __len__(self):\n",
+ " return len(self._data)\n",
+ "\n",
+ " def __getitem__(self, idx):\n",
+ " text = self._data[idx][1]\n",
+ " inputs = tokenizer(text, return_tensors=\"pt\", max_length=1024, truncation=True, padding=\"max_length\")\n",
+ " return {\n",
+ " 'item_id': self._data[idx][0],\n",
+ " 'input_ids': inputs['input_ids'][0],\n",
+ " 'attention_mask': inputs['attention_mask'][0]\n",
+ " }\n",
+ " \n",
+ "\n",
+ "dataset = MyDataset(data)\n",
+ "loader = DataLoader(dataset, batch_size=8, drop_last=False, shuffle=False, num_workers=10)\n",
+ "\n",
+ "\n",
+ "new_df = {\n",
+ " 'item_id': [],\n",
+ " 'embedding': []\n",
+ "}\n",
+ "\n",
+ "for batch in tqdm(loader):\n",
+ " with torch.inference_mode():\n",
+ " outputs = model(\n",
+ " input_ids=batch[\"input_ids\"].to(device), \n",
+ " attention_mask=batch[\"attention_mask\"].to(device)\n",
+ " )\n",
+ " embeddings = outputs.last_hidden_state\n",
+ " \n",
+ " embeddings = outputs.last_hidden_state # (bs, sl, ed)\n",
+ " embeddings[(~batch[\"attention_mask\"].bool())] = 0. # (bs, sl, ed)\n",
+ "\n",
+ " new_df['item_id'] += batch['item_id'].tolist()\n",
+ " new_df['embedding'] += embeddings.mean(dim=1).tolist() # (bs, ed)\n",
+ "\n",
+ "\n",
+ "with open(embeddings_output_path, 'wb') as f:\n",
+ " pickle.dump(new_df, f)\n"
+ ]
+ },
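+ {
+ "cell_type": "markdown",
+ "id": "a7c15b44",
+ "metadata": {},
+ "source": [
+ "To consume the pickled embeddings downstream, they can be stacked into a dense matrix indexed by the new item id. A sketch, assuming `embeddings_output_path` from the cell above and that the ids cover the full range 0..n_items-1:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a7c15b45",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build an (n_items, hidden_size) matrix ordered by new item id\n",
+ "with open(embeddings_output_path, 'rb') as f:\n",
+ "    emb = pickle.load(f)\n",
+ "order = np.argsort(emb['item_id'])\n",
+ "item_matrix = np.asarray(emb['embedding'], dtype=np.float32)[order]\n",
+ "print(item_matrix.shape)"
+ ]
+ },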
+ {
+ "cell_type": "markdown",
+ "id": "a6fffc4a-85f1-424e-b460-29e526df3317",
+ "metadata": {},
+ "source": [
+ "# Test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "1f922431-e3c1-4587-86d1-04a58eb8ffee",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[0_1291403520.0).json\n",
+ "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[1291403520.0_1329626880.0).json\n",
+ "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[1329626880.0_1367850240.0).json\n",
+ "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[1367850240.0_inf).json\n",
+ "Интервал 0: 3485 пользователей, 10350 взаимодействий\n",
+ "Интервал 1: 5751 пользователей, 15837 взаимодействий\n",
+ "Интервал 2: 13543 пользователей, 61954 взаимодействий\n",
+ "Интервал 3: 18811 пользователей, 110361 взаимодействий\n"
+ ]
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "import json\n",
+ "from typing import List, Dict\n",
+ "\n",
+ "def split_session_by_timestamps(\n",
+ " df: pl.DataFrame,\n",
+ " time_cutoffs: List[int],\n",
+ " output_dir: str = None,\n",
+ " return_dicts: bool = True\n",
+ ") -> List[Dict[int, List[int]]]:\n",
+ " \"\"\"\n",
+ " Разбивает датасет по временным интервалам и возвращает JSON-подобные словари.\n",
+ " \n",
+ " Args:\n",
+ " df: Polars DataFrame с колонками uid, item_ids (list), timestamps (list)\n",
+ " time_cutoffs: Лист временных точек для разбиения\n",
+ " output_dir: Директория для сохранения JSON файлов (опционально)\n",
+ " return_dicts: Возвращать ли словари (как json_data format)\n",
+ " \n",
+ " Returns:\n",
+ " Лист словарей в формате {user_id: [item_ids для интервала]}\n",
+ " \n",
+ " Example:\n",
+ " >>> df = pl.read_parquet(\"inter_new.parquet\")\n",
+ " >>> cutoffs = [1000000, 2000000, 3000000]\n",
+ " >>> dicts = split_session_by_timestamps(df, cutoffs, output_dir=\"./data\")\n",
+ " >>> # Получим 4 JSON файла за каждый интервал + последний\n",
+ " \"\"\"\n",
+ " \n",
+ " result_dicts = []\n",
+ " \n",
+ " def extract_interval(df_source, start, end=None):\n",
+ " \"\"\"Извлекает данные для одного временного интервала\"\"\"\n",
+ " q = df_source.lazy()\n",
+ " q = q.explode([\"item_ids\", \"timestamps\"])\n",
+ " \n",
+ " if end is not None:\n",
+ " q = q.filter(\n",
+ " (pl.col(\"timestamps\") >= start) & \n",
+ " (pl.col(\"timestamps\") < end)\n",
+ " )\n",
+ " else:\n",
+ " q = q.filter(\n",
+ " pl.col(\"timestamps\") >= start\n",
+ " )\n",
+ " \n",
+ " q = q.group_by(\"uid\").agg([\n",
+ " pl.col(\"item_ids\").alias(\"item_ids\")\n",
+ " ]).sort(\"uid\")\n",
+ " \n",
+ " return q.collect()\n",
+ " \n",
+ " # Генерируем интервалы\n",
+ " intervals = []\n",
+ " current_start = 0\n",
+ " for cutoff in time_cutoffs:\n",
+ " intervals.append((current_start, cutoff))\n",
+ " current_start = cutoff\n",
+ " # Последний интервал от последнего cutoff до бесконечности\n",
+ " intervals.append((current_start, None))\n",
+ " \n",
+ " # Обрабатываем каждый интервал\n",
+ " for start, end in intervals:\n",
+ " subset = extract_interval(df, start, end)\n",
+ " \n",
+ " # Конвертируем в JSON-подобный словарь\n",
+ " json_dict = {}\n",
+ " for user_id, item_ids in subset.iter_rows():\n",
+ " json_dict[user_id] = item_ids\n",
+ " \n",
+ " result_dicts.append(json_dict)\n",
+ " \n",
+ " # Опционально сохраняем файлы\n",
+ " if output_dir:\n",
+ " if end is not None:\n",
+ " filename = f\"inter_new_[{start}_{end}).json\"\n",
+ " else:\n",
+ " filename = f\"inter_new_[{start}_inf).json\"\n",
+ " \n",
+ " filepath = f\"{output_dir}/{filename}\"\n",
+ " with open(filepath, 'w') as f:\n",
+ " json.dump(json_dict, f, indent=2)\n",
+ " \n",
+ " print(f\"✓ Сохранено: {filepath}\")\n",
+ " \n",
+ " return result_dicts\n",
+ "\n",
+ "\n",
+ "# ==========================================\n",
+ "# Использование в ноутбуке\n",
+ "# ==========================================\n",
+ "\n",
+ "# Загружаем сохраненный Parquet файл\n",
+ "df = pl.read_parquet(interactions_output_parquet_path)\n",
+ "\n",
+ "# Определяем временные точки разбиения (можно использовать процентили или конкретные даты)\n",
+ "# Например: разбить на 70%, 80%, 90% от времени\n",
+ "df_timestamps = df.select(\n",
+ " pl.col(\"timestamps\").explode()\n",
+ ")\n",
+ "min_time = df_timestamps.select(pl.col(\"timestamps\").min()).item()\n",
+ "max_time = df_timestamps.select(pl.col(\"timestamps\").max()).item()\n",
+ "\n",
+ "# Разделяем на 4 части (train/val/test/test_final)\n",
+ "cutoffs = [\n",
+ " min_time + (max_time - min_time) * 0.7, # 70%\n",
+ " min_time + (max_time - min_time) * 0.8, # 80%\n",
+ " min_time + (max_time - min_time) * 0.9, # 90%\n",
+ "]\n",
+ "\n",
+ "# Применяем функцию\n",
+ "result_dicts = split_session_by_timestamps(\n",
+ " df, \n",
+ " cutoffs,\n",
+ " output_dir=\"../data/Beauty_new/splits\" # Опционально\n",
+ ")\n",
+ "\n",
+ "# Выводим статистику\n",
+ "for i, json_dict in enumerate(result_dicts):\n",
+ " total_interactions = sum(len(items) for items in json_dict.values())\n",
+ " print(f\"Интервал {i}: {len(json_dict)} пользователей, {total_interactions} взаимодействий\")"
+ ]
+ },
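+ {
+ "cell_type": "markdown",
+ "id": "b8d26c55",
+ "metadata": {},
+ "source": [
+ "Since the intervals cover [0, inf) without overlap, the per-interval dicts should partition the interactions exactly; a quick consistency sketch:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b8d26c56",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Interval splits must add up to the total interaction count\n",
+ "total = sum(sum(len(items) for items in d.values()) for d in result_dicts)\n",
+ "assert total == df.select(pl.col('item_ids').list.len().sum()).item()"
+ ]
+ },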
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "73b5ec51-4d94-4021-9a21-3f4345ecdd26",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✓ Сохранено: ../data/Beauty_new/splits/inter_new_[0_inf).json\n"
+ ]
+ }
+ ],
+ "source": [
+ "split_session_by_timestamps(\n",
+ " df, \n",
+ " [],\n",
+ " output_dir=\"../data/Beauty_new/splits\"\n",
+ ")\n",
+ "None"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/sigir/Beauty/exps_data.ipynb b/sigir/Beauty/exps_data.ipynb
new file mode 100644
index 0000000..2625231
--- /dev/null
+++ b/sigir/Beauty/exps_data.ipynb
@@ -0,0 +1,921 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "e2462a97-6705-44e1-a232-4dd78a5dfc85",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import polars as pl\n",
+ "import json\n",
+ "from typing import List, Dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "fd38624d-5796-4aa5-929f-7e82c5544f6c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "interactions_output_parquet_path = '/home/jovyan/IRec/sigir/Beauty_new/inter_new.parquet'\n",
+ "# 1. Загружаем\n",
+ "df = pl.read_parquet(interactions_output_parquet_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "ee127317-66b8-4f22-9109-94bcb8b1f1ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split_session_by_timestamps(\n",
+ " df: pl.DataFrame,\n",
+ " time_cutoffs: List[int],\n",
+ " output_dir: str = None,\n",
+ " return_dicts: bool = True\n",
+ ") -> List[Dict[int, List[int]]]:\n",
+ " \"\"\"\n",
+ " Разбивает датасет по временным интервалам и возвращает JSON-подобные словари.\n",
+ " \n",
+ " Args:\n",
+ " df: Polars DataFrame с колонками uid, item_ids (list), timestamps (list)\n",
+ " time_cutoffs: Лист временных точек для разбиения\n",
+ " output_dir: Директория для сохранения JSON файлов (опционально)\n",
+ " return_dicts: Возвращать ли словари (как json_data format)\n",
+ " \n",
+ " Returns:\n",
+ " Лист словарей в формате {user_id: [item_ids для интервала]}\n",
+ " \n",
+ " Example:\n",
+ " >>> df = pl.read_parquet(\"inter_new.parquet\")\n",
+ " >>> cutoffs = [1000000, 2000000, 3000000]\n",
+ " >>> dicts = split_session_by_timestamps(df, cutoffs, output_dir=\"./data\")\n",
+ " >>> # Получим 4 JSON файла за каждый интервал + последний\n",
+ " \"\"\"\n",
+ " \n",
+ " result_dicts = []\n",
+ " \n",
+ " def extract_interval(df_source, start, end=None):\n",
+ " \"\"\"Извлекает данные для одного временного интервала\"\"\"\n",
+ " q = df_source.lazy()\n",
+ " q = q.explode([\"item_ids\", \"timestamps\"])\n",
+ " \n",
+ " if end is not None:\n",
+ " q = q.filter(\n",
+ " (pl.col(\"timestamps\") >= start) & \n",
+ " (pl.col(\"timestamps\") < end)\n",
+ " )\n",
+ " else:\n",
+ " q = q.filter(\n",
+ " pl.col(\"timestamps\") >= start\n",
+ " )\n",
+ " \n",
+ " q = q.group_by(\"uid\").agg([\n",
+ " pl.col(\"item_ids\").alias(\"item_ids\")\n",
+ " ]).sort(\"uid\")\n",
+ " \n",
+ " return q.collect()\n",
+ " \n",
+ " # Генерируем интервалы\n",
+ " intervals = []\n",
+ " current_start = 0\n",
+ " for cutoff in time_cutoffs:\n",
+ " intervals.append((current_start, cutoff))\n",
+ " current_start = cutoff\n",
+ " # Последний интервал от последнего cutoff до бесконечности\n",
+ " intervals.append((current_start, None))\n",
+ " \n",
+ " # Обрабатываем каждый интервал\n",
+ " for start, end in intervals:\n",
+ " subset = extract_interval(df, start, end)\n",
+ " \n",
+ " # Конвертируем в JSON-подобный словарь\n",
+ " json_dict = {}\n",
+ " for user_id, item_ids in subset.iter_rows():\n",
+ " json_dict[user_id] = item_ids\n",
+ " \n",
+ " result_dicts.append(json_dict)\n",
+ " \n",
+ " # Опционально сохраняем файлы\n",
+ " if output_dir:\n",
+ " if end is not None:\n",
+ " filename = f\"inter_new_[{start}_{end}).json\"\n",
+ " else:\n",
+ " filename = f\"inter_new_[{start}_inf).json\"\n",
+ " \n",
+ " filepath = f\"{output_dir}/{filename}\"\n",
+ " with open(filepath, 'w') as f:\n",
+ " json.dump(json_dict, f, indent=2)\n",
+ " \n",
+ " print(f\"✓ Сохранено: {filepath}\")\n",
+ " \n",
+ " return result_dicts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "efc8b582-dd8a-4299-9c49-de906251de8a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Cutoffs: [1402444800, 1403654400, 1404864000]\n",
+ "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/splits/test_splits/inter_new_[0_1402444800).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/splits/test_splits/inter_new_[1402444800_1403654400).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/splits/test_splits/inter_new_[1403654400_1404864000).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/splits/test_splits/inter_new_[1404864000_inf).json\n",
+ "Part 0 [Base]: 22029 users\n",
+ "Part 1 [Week -6]: 1854 users\n",
+ "Part 2 [Week -4]: 1945 users\n",
+ "Part 3 [Week -2]: 1381 users\n"
+ ]
+ }
+ ],
+ "source": [
+ "global_max_time = df.select(\n",
+ " pl.col(\"timestamps\").explode().max()\n",
+ ").item()\n",
+ "\n",
+ "# 3. Размер окна (неделя)\n",
+ "days_val = 14\n",
+ "window_sec = days_val * 24 * 3600 \n",
+ "\n",
+ "# 4. Три отсечки с конца\n",
+ "cutoff_test_start = global_max_time - window_sec # T - 2w\n",
+ "cutoff_val_start = global_max_time - 2 * window_sec # T - 4w\n",
+ "cutoff_gap_start = global_max_time - 3 * window_sec # T - 6w\n",
+ "\n",
+ "cutoffs = [\n",
+ " int(cutoff_gap_start), # Граница Part 0 | Part 1\n",
+ " int(cutoff_val_start), # Граница Part 1 | Part 2\n",
+ " int(cutoff_test_start) # Граница Part 2 | Part 3\n",
+ "]\n",
+ "\n",
+ "print(f\"Cutoffs: {cutoffs}\")\n",
+ "\n",
+ "# 5. Разбиваем на 4 файла\n",
+ "# Part 0: Deep History\n",
+ "# Part 1: Pre-Validation (нужна для s1, но выкидывается для 'совсем короткого' s0?)\n",
+ "# *В вашем случае 4.2 просто 'на неделю короче', так что Part 1 все равно войдет в трейн Semantics, \n",
+ "# а выкинется только Part 2. Но если захотите еще короче - можно выкинуть и Part 1.*\n",
+ "# Part 2: Validation (Есть в 4.1, НЕТ в 4.2 для Semantics)\n",
+ "# Part 3: Test\n",
+ "\n",
+ "split_files = split_session_by_timestamps(\n",
+ " df, \n",
+ " cutoffs, \n",
+ " output_dir=\"/home/jovyan/IRec/sigir/Beauty_new/splits/test_splits\"\n",
+ ")\n",
+ "\n",
+ "names = [\"Base\", \"Week -6\", \"Week -4\", \"Week -2\"]\n",
+ "for i, d in enumerate(split_files):\n",
+ " print(f\"Part {i} [{names[i]}]: {len(d)} users\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "d5ba172e-b430-40a3-a4fa-64366d02a015",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def merge_and_save(parts_to_merge, dirr, output_name):\n",
+ " merged = {}\n",
+ " print(f\"Merging {len(parts_to_merge)} files into {output_name}...\")\n",
+ " \n",
+ " for part in parts_to_merge:\n",
+ " # with open(fp, 'r') as f:\n",
+ " # part = json.load(f)\n",
+ " for uid, items in part.items():\n",
+ " if uid not in merged:\n",
+ " merged[uid] = []\n",
+ " merged[uid].extend(items)\n",
+ " \n",
+ " out_path = f\"{dirr}/{output_name}\"\n",
+ " with open(out_path, 'w') as f:\n",
+ " json.dump(merged, f)\n",
+ " print(f\"✓ Done: {out_path} (Users: {len(merged)})\")\n",
+ "\n",
+ "\n",
+ "# p0, p1, p2, p3 = split_files[0], split_files[1], split_files[2], split_files[3]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "d116b7e0-9bf9-4104-86a0-69788a70cc14",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Merging 2 files into exp_4_inter_tiger_train.json...\n",
+ "✓ Done: ../sigir/Beauty_new/splits/exp_data/exp_4_inter_tiger_train.json (Users: 22129)\n",
+ "Merging 2 files into exp_4.1_inter_semantics_train.json...\n",
+ "✓ Done: ../sigir/Beauty_new/splits/exp_data/exp_4.1_inter_semantics_train.json (Users: 22129)\n",
+ "Merging 1 files into exp_4.2_inter_semantics_train_short.json...\n",
+ "✓ Done: ../sigir/Beauty_new/splits/exp_data/exp_4.2_inter_semantics_train_short.json (Users: 22029)\n",
+ "Merging 3 files into exp_4.3_inter_semantics_train_leak.json...\n",
+ "✓ Done: ../sigir/Beauty_new/splits/exp_data/exp_4.3_inter_semantics_train_leak.json (Users: 22265)\n",
+ "Merging 1 files into test_set.json...\n",
+ "✓ Done: ../sigir/Beauty_new/splits/exp_data/test_set.json (Users: 1381)\n",
+ "Merging 1 files into valid_skip_set.json...\n",
+ "✓ Done: ../sigir/Beauty_new/splits/exp_data/valid_skip_set.json (Users: 1945)\n",
+ "\n",
+ "All done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "EXP_DIR = \"../sigir/Beauty_new/splits/exp_data\"\n",
+ "\n",
+ "# Tiger: P0+P1\n",
+ "merge_and_save([p0, p1], EXP_DIR, \"exp_4_inter_tiger_train.json\")\n",
+ "\n",
+ "# 1. Exp 4.1 (Standard)\n",
+ "# Semantics: P0+P1 (Всё кроме пропуска и теста)\n",
+ "merge_and_save([p0, p1], EXP_DIR, \"exp_4.1_inter_semantics_train.json\")\n",
+ "\n",
+ "# 2. Exp 4.2 (Short Semantics)\n",
+ "# Semantics: P0 (Короче на неделю, без P2)\n",
+ "merge_and_save([p0], EXP_DIR, \"exp_4.2_inter_semantics_train_short.json\")\n",
+ "\n",
+ "# 3. Exp 4.3 (Leak)\n",
+ "# Semantics: P0+P1+P2 (Видит валидацию)\n",
+ "merge_and_save([p0, p1, p2], EXP_DIR, \"exp_4.3_inter_semantics_train_leak.json\")\n",
+ "\n",
+ "# 4. Test Set (тест всех моделей)\n",
+ "merge_and_save([p3], EXP_DIR, \"test_set.json\")\n",
+ "\n",
+ "# 4. Valid Set (пропуск, имитируется разница трейна и теста чтобы потом дообучать)\n",
+ "merge_and_save([p2], EXP_DIR, \"valid_skip_set.json\")\n",
+ "\n",
+ "print(\"\\nAll done!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "9ae1d1e5-567d-471a-8f83-4039ecacc8d2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Merging 4 files into all_set.json...\n",
+ "✓ Done: ../sigir/Beauty_new/splits/exp_data/all_set.json (Users: 22363)\n"
+ ]
+ }
+ ],
+ "source": [
+ "merge_and_save([p0, p1, p2, p3], EXP_DIR, \"all_set.json\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "328de16c-f61d-45be-8a72-5f0eaef612e8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Проверка Train сетов (должны быть префиксами):\n",
+ "✅ [ПРЕФИКСЫ] Все 22129 массивов ОК. Полных совпадений: 19410\n",
+ "✅ [ПРЕФИКСЫ] Все 22029 массивов ОК. Полных совпадений: 18191\n",
+ "✅ [ПРЕФИКСЫ] Все 22265 массивов ОК. Полных совпадений: 20982\n",
+ "✅ [ПРЕФИКСЫ] Все 22129 массивов ОК. Полных совпадений: 19410\n",
+ "\n",
+ "Проверка Test сета (должен быть суффиксом):\n",
+ "✅ [СУФФИКСЫ] Все 1381 массивов ОК. Полных совпадений: 98\n",
+ "\n",
+ "(Контроль) Проверка Test сета как префикса (должна упасть):\n",
+ "❌ [ПРЕФИКСЫ] Найдено 1283 ошибок.\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open(\"../data/Beauty/inter_new.json\", 'r') as f:\n",
+ " old_inter_new = json.load(f)\n",
+ "\n",
+ "with open(\"../sigir/Beauty_new/splits/exp_data/exp_4.1_inter_semantics_train.json\", 'r') as ff:\n",
+ " first_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"../sigir/Beauty_new/splits/exp_data/exp_4.2_inter_semantics_train_short.json\", 'r') as ff:\n",
+ " second_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"../sigir/Beauty_new/splits/exp_data/exp_4.3_inter_semantics_train_leak.json\", 'r') as ff:\n",
+ " third_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"../sigir/Beauty_new/splits/exp_data/exp_4_inter_tiger_train.json\", 'r') as ff:\n",
+ " tiger_sem = json.load(ff)\n",
+ "\n",
+ "with open(\"../sigir/Beauty_new/splits/exp_data/test_set.json\", 'r') as ff:\n",
+ " test_sem = json.load(ff)\n",
+ "\n",
+ "def check_prefix_match(full_data, subset_data, check_suffix=False):\n",
+ " \"\"\"\n",
+ " check_suffix=True включит режим проверки суффиксов (для теста).\n",
+ " \"\"\"\n",
+ " mismatch_count = 0\n",
+ " full_match_count = 0\n",
+ " \n",
+ " # Итерируемся по ключам сабсета, так как в full_data может быть больше юзеров\n",
+ " for user, sub_items in subset_data.items():\n",
+ " \n",
+ " # Проверяем есть ли такой юзер в исходнике\n",
+ " if user not in full_data:\n",
+ " print(f\"⚠ Юзер {user} не найден в исходном файле!\")\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " full_items = full_data[user]\n",
+ " \n",
+ " # Логика для проверки ПРЕФИКСА (начало совпадает)\n",
+ " if not check_suffix:\n",
+ " if len(sub_items) > len(full_items):\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " # Сравниваем начало full с sub\n",
+ " if full_items[:len(sub_items)] == sub_items:\n",
+ " if len(full_items) == len(sub_items):\n",
+ " full_match_count += 1\n",
+ " else:\n",
+ " mismatch_count += 1\n",
+ " \n",
+ " # Логика для проверки СУФФИКСА (конец совпадает - для теста)\n",
+ " else:\n",
+ " if len(sub_items) > len(full_items):\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " # Сравниваем конец full с sub\n",
+ " # Срез [-len:] берет последние N элементов\n",
+ " if full_items[-len(sub_items):] == sub_items:\n",
+ " if len(full_items) == len(sub_items):\n",
+ " full_match_count += 1\n",
+ " else:\n",
+ " mismatch_count += 1\n",
+ "\n",
+ " mode = \"СУФФИКСЫ\" if check_suffix else \"ПРЕФИКСЫ\"\n",
+ " \n",
+ " if mismatch_count == 0:\n",
+ " print(f\"✅ [{mode}] Все {len(subset_data)} массивов ОК. Полных совпадений: {full_match_count}\")\n",
+ " else:\n",
+ " print(f\"❌ [{mode}] Найдено {mismatch_count} ошибок.\")\n",
+ "\n",
+ "# --- Запуск проверок ---\n",
+ "print(\"Проверка Train сетов (должны быть префиксами):\")\n",
+ "check_prefix_match(old_inter_new, first_sem)\n",
+ "check_prefix_match(old_inter_new, second_sem)\n",
+ "check_prefix_match(old_inter_new, third_sem)\n",
+ "check_prefix_match(old_inter_new, tiger_sem)\n",
+ "\n",
+ "print(\"\\nПроверка Test сета (должен быть суффиксом):\")\n",
+ "check_prefix_match(old_inter_new, test_sem, check_suffix=True)\n",
+ "\n",
+ "print(\"\\n(Контроль) Проверка Test сета как префикса (должна упасть):\")\n",
+ "check_prefix_match(old_inter_new, test_sem, check_suffix=False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "f2df507d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--- Статистика по временным интервалам (Fixed Time Window) ---\n",
+ "Part 0 [Base]: 186516 events (Start -> 2014-06-11)\n",
+ "Part 1 [Gap (Week -6)]: 4073 events (2014-06-11 -> 2014-06-25)\n",
+ "Part 2 [Pre-Valid (Week -4)]: 4730 events (2014-06-25 -> 2014-07-09)\n",
+ "Part 3 [Test (Week -2)]: 3183 events (2014-07-09 -> Inf)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA/8AAAIjCAYAAABViau2AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAo2ZJREFUeJzs3XlcFPX/B/DXLLDcp9yIiKDigamYF14oCmreeZInqZlHpl8tK+/M1DTP1A5FE9PUNLNCUcH7NjzRkMATBEVuOXd+f/hjcl0EVheWpdfz8dhgP/OZz7xnPrvke+YznxFEURRBRERERERERFWWTNsBEBEREREREVH5YvJPREREREREVMUx+SciIiIiIiKq4pj8ExEREREREVVxTP6JiIiIiIiIqjgm/0RERERERERVHJN/IiIiIiIioiqOyT8RERERERFRFcfkn4iIiIiIiKiKY/JPRERE/3lXr17Fnj17pPdRUVH4/ffftRfQK7p37x5CQkKk9/Hx8QgNDdVeQEREVGkw+Sci0rKQkBAIgiC9jIyMUKdOHUyYMAEPHz7UdnhE/wkZGRkYO3YsTp8+jZiYGHzwwQe4cuWKtsNSmyAIGD9+PPbv34/4+HhMnz4dx44d03ZYRERUCehrOwAiInpm3rx5cHd3R05ODo4fP461a9fijz/+wNWrV2FiYqLt8IiqtFatWkkvAKhTpw5Gjx6t5ajU5+LigtGjRyMwMBAA4OTkhMjISO0GRURElYIgiqKo7SCIiP7LQkJCMHLkSJw7dw7NmjWTyqdOnYply5Zh69atGDx4sBYjJPrvuH79Op4+fQpvb2/I5XJth/PKYmNj8ejRIzRs2BCmpqbaDoeIiCoBDvsnIqqkOnbsCACIi4sDAKSkpOB///sfvL29YWZmBgsLC3Tt2hWXLl1SWTcnJwdz5sxBnTp1YGRkBCcnJ/Tt2xexsbEAnt0H/PytBi++OnToILUVGRkJQRCwfft2fPLJJ3B0dISpqSl69uyJu3fvqmz7zJkzCAwMhKWlJUxMTNC+fXucOHGi2H3s0KFDsdufM2eOSt0tW7bAx8cHxsbGsLGxwaBBg4rdfkn79jyFQoHly5ejQYMGMDIygoODA8aOHYsnT54o1atZsybeeustle1MmDBBpc3iYl+yZInKMQWA3NxczJ49G56enjA0NISrqyumT5+O3NzcYo/V8148bra2tujevTuuXr0q1Xn8+DG6du2K6tWrw9DQEE5OTggKCsLt27eV2vrqq6/QunVrVKtWDcbGxvDx8cHOnTtVtlnWfSv6vAiCgKioKKX69+/fh56eHgRBUNnG4cOH0bZtW5iamsLKygq9evVCdHR0mfb/dT83HTp0kPahfv368PHxwaVLl4r93BS12bx5c5iYmMDa2hrt2rXDgQMHADz7vJT03apZsyYA1c+pgYEBatasiWnTpiEvL0/aVtFtQfHx8VKZQqFAo0aNIAiC0v39I0aMkNr38PBAixYtkJKSAmNjY5U2ijNixAilmKytrdGhQ4dibxv45ptv0KBBAxgaGsLZ2Rnjx49Hampqie0DwJw5c5S2YW5ujubNmyvNtwA865OGDRviwoULaN26NYyNjeHu7o5169aptFnW71LRNpcvX67ShpeXFwRBwIQJE5TKU1NTMXnyZLi6usLQ0BCenp5YtGgRFAqFVKeoL7/66iuVdhs2bFjs9+PF0Rjdu3d/6WeYiEhTOOyfiKiSKkrUq1WrBgD4559/sGfPHvTv3x/u7u54+PAh1q9fj/bt2+P69etwdnYGABQWFuKtt97CoUOHMGjQIHzwwQfIyMhAeHg4rl69Cg8PD2kbgwcPRrdu3ZS2O2PGjGLjWbBgAQRBwEcffYSkpCQsX74c/v7+iIqKgrGxMYBnCVzXrl3h4+OD2bNnQyaTYePGjejYsSOOHTuG5s2bq7RbvXp1LFy4EACQmZmJcePGFbvtmTNnYsCAAXj33XeRnJyMVatWoV27dvjrr79gZWWlss6YMWPQtm1bAMAvv/yC3bt3Ky0fO3asNOpi0qRJiIuLw+rVq/HXX3/hxIkTMDAwKPY4qCM1NVXat+cpFAr07NkTx48fx5gxY1CvXj1cuXIFX3/9Nf7++2+VRKg4Xl5e+PTTTyGKImJjY7Fs2TJ069YNd+7cAQDk5eXB3NwcH3zwAapVq4bY2FisWrUKly9fVrqXfcWKFejZsyeCgoKQl5eHbdu2oX///ti3bx+6d++u9r4VMTIywsaNG7FixQqpbNOmTZDL5cjJyVGqe/DgQXTt2hW1atXCnDlz8PTpU6xatQq+vr64ePGilNA+r7w+N0U++uijYsvnzp2LOXPmoHXr1pg3bx7kcjnOnDmDw4cPo0uXLli+fDkyMzMBANHR0fjiiy/wySefoF69egAAMzMzpfaKPqe5ubnYv38/vvrqKxgZGWH+/Pkvje3HH38s83wEs2bNUjneJbG1tcXXX38N4NnkgStWrEC3bt1w9+5d6XjNmTMHc+fOhb+/P8aNG4ebN29i7dq1OHfuXJm/Oz/++CMA4NGjR/jmm2/Qv39/XL16FXXr1pXqPHnyBN26dcOAAQMwePBg/Pzzzxg3bhzkcjlGjRoFQP3vUtHncvLkyVLZyZMnVU6KAUB2djbat2+P+/fvY+zYsahRowZOnjyJGTNmICEhodiTCK/i6NGj+OOPPzTSFhFRiUQiItKqjRs3igDEgwcPisnJyeLdu3fFbdu2idWqVRONjY3Fe/fuiaIoijk5OWJhYaHSunFxcaKhoaE4b948qWzDhg0iAHHZsmUq21IoFNJ6AMQlS5ao1GnQoIHYvn176X1ERIQIQHRxcRHT09Ol8p9//lkEIK5YsUJqu3bt2mJAQIC0HVEUxezsbNHd3V3s3LmzyrZat24tNmzYUHqfnJwsAhBnz54tlcXHx4t6enriggULlNa9cuWKqK+vr1IeExMjAhA3bdoklc2ePVt8/n95x44dEwGIoaGhSuuGhYWplLu5uYndu3dXiX38+PHii/8bfTH26dOni/b29qKPj4/SMf3xxx9FmUwmHjt2TGn9devWiQDEEydOqGzvee3bt1dqTxRF8ZNPPhEBiElJSS9db/HixSIA8dGjR1JZdna2Up28vDyxYcOGYseOHV9p34o+L4MHDxarVasm5ubmSstq164tDhkyRAQg7tixQypv3LixaG9vLz5+/Fgqu3TpkiiTycRhw4ap7IemPzcvHs8//vhDBCAGBgYq9XFMTIwok8nEPn36qHwXn//Mv3gsIiIiVJYVfQc3btyoVO7s7Cx269ZNel/09yEuLk4UxWd/B2rUqCF27dpVZf3hw4eLbm5u0vurV6+KMplMqlvUxsu8uL4oiuK3334rAhDPnj0riqIoJiUliXK5XOzSpYvSMVi9erUIQNywYUOJ23jxuyiKonjgwAERgPjzzz9LZe3btxcBiEuXLpXKcnNzpc9KXl6eKIrqfZcAiG+//baor68vnj9/XioPDg6WPpfjx4+XyufPny+ampqKf//9t1LbH3/8sai
npyfeuXNHFMVX+3v6/GeiRYsWUh89/xkmItI0DvsnIqok/P39YWdnB1dXVwwaNAhmZmbYvXs3XFxcAACGhoaQyZ792S4sLMTjx49hZmaGunXr4uLFi1I7u3btgq2tLSZOnKiyjeKGMJfVsGHDYG5uLr1/++234eTkJF2xioqKQkxMDIYMGYLHjx/j0aNHePToEbKystCpUyccPXpUaags8Oz2BCMjoxK3+8svv0ChUGDAgAFSm48ePYKjoyNq166NiIgIpfpFQ6YNDQ1f2uaOHTtgaWmJzp07K7Xp4+MDMzMzlTbz8/OV6j169KjUq6n379/HqlWrMHPmTJWrvTt27EC9evXg5eWl1GbRrR4vbr84RTElJyfj1KlT2L17Nxo1agRbW1ulehkZGUhKSsKpU6fw008/oUGDBrCxsZGWF43aAJ5daU1LS0Pbtm2VPlPq7FuRHj16QBAE7N27FwBw7Ngx3Lt3DwMHDlSql5CQgKioKIwYMUIprkaNGqFz587FXhEtj89NEVEUMWPGDPTr1w8tWrRQWrZnzx4oFArMmjVL+i4WedXvVmZmJh49eoT79+/j22+/RWJiIjp16vTS+mvWrMHjx48xe/bsUtueMWMGmjZtiv79+5c5HoVCIR2rqKgobN68GU5OTtLIhYMHDyIvLw+TJ09WOgajR4+GhYVFmR+PWLSN6OhorFu3DqampmjZsqVSHX19fYwdO1Z6L5fLMXbsWCQlJeHChQsA1P8uOTg4oHv37ti4cSOAZ1f3f/75Z4wcOVIlxh07dqBt27awtrZWatvf3x+FhYU4evSoUv3s7GyVvxOFhYUlHodffvkF586dw5dfflmm40ZE9Do47J+IqJJYs2YN6tSpA319fTg4OKBu3bpK/7hWKBRYsWIFvvnmG8TFxSn9o7Lo1gDg2e0CdevWhb6+Zv/E165dW+m9IAjw9PSU7iOOiYkBAAwfPvylbaSlpcHa2lp6/+jRI5V2XxQTEwNRFF9a78UhxkX3Hb8sKS1qMy0tDfb29sUuT0pKUnp/4MAB2NnZlRjni2bPng1nZ2eMHTtW5f72mJgYREdHv7TNF7dfnJMnTyqtX7t2bezZs0clCR09ejS2b98OAHjzzTfxxx9/KNXZt28fPv/8c0RFRSndI11SMlvSvhUxMDDAO++8gw0bNuDtt9/Ghg0b0K9fP1hYWCjVKxpu/fxw7yL16tXD/v37kZWVpTRpXXl8boqEhobi2rVr+Pnnn7F161alZbGxsZDJZKhfv36J21bHxIkTlU7UjRw5Eh9++GGxddPS0vDFF19gypQpcHBwKLHd48eP47fffsOhQ4ekW0HK4u7du0qfKycnJ+zatUv6Pr2sv+RyOWrVqlXs8PniPL8NCwsLhIaGwtXVVamOs7OzymSFderUAfDsPvuWLVu+0ndp5MiRGDlyJJYuXYodO3bA2tpaOlnwvJiYGFy+fLnMbc+ePbvYkzIv66vCwkJ88sknCAoKQqNGjYqtQ0SkSUz+iYgqiebNmyvN9v+iL774AjNnzsSoUaMwf/582NjYQCaTYfLkySpX1LWhKIYlS5agcePGxdZ5PiHPy8tDQkICOnfuXGq7giDgzz//hJ6eXoltAkBiYiIAwNHRscQ27e3tERoaWuzyF/+x36JFC3z++edKZatXr8avv/5a7PrR0dEICQnBli1bik0yFQoFvL29sWzZsmLXfzEJKk6jRo2wdOlSAEBycjJWrlyJDh064OLFi0r7/tlnn2HkyJGIjY3F4sWLMWjQIBw8eBD6+vo4duwYevbsiXbt2uGbb76Bk5MTDAwMsHHjRpXEt6z79rxRo0ahSZMmuHnzJnbs2CGNAngd5fW5KWp75syZCA4OlpLM8jZt2jR06dIFhYWFuHbtGubNmwdRFKUr089btGgRZDIZpk2bhsePH5fY7kcffYSAgAB07NhRaVLA0jg4OGDLli0Anp1s2LBhAwIDA3H8+HF4e3urtW8lCQ8PBwBkZWVh165dGDBgAPbt21dqv77oVb5L3bt3h1wux549e7Bx40YMHz5cZSRHUdudO3fG9OnTi237xc/ImDFjVEZZlPS4yB9++AHx8fHYv3//S+sQEWkSk38iIh2xc+dO+Pn54YcfflAqT01NVRrq7eHhgTNnziA/P18jk9YVKbqyX0QURdy6dUu6YlU0kaCFhQX8/f1Lbe/SpUvIz88v8YRHUbuiKMLd3b1MCdn169chCEKxV5Kfb/PgwYPw9fVVGvb+Mra2tir7VNKkfDNmzEDjxo1Vhrg/v/1Lly6hU6dOrzxc3NraWimmDh06wNnZGRs3blSatLFhw4Zo2LAhAMDb2xvt2rVDeHg4unbtil27dsHIyAj79+9Xuk2iuMSzrPv2PG9vbzRp0gQDBgyAnZ0d/Pz8cOTIEaU6bm5uAICbN2+qrH/jxg3Y2toqXf0tr88N8GwG+6SkpJfOuO7h4QGFQoHr16+/9ASXuurXry/1Y0BAAHJzc/HJJ59gwYIF0iSeAPDgwQOsWLECCxcuhLm5eYnJ/549e3Dq1KkSb914GSMjI6XPVc+ePWFjY4PVq1dj/fr1Sv1Vq1YtqV5eXh7i4uLK9N0HoFSvV69eOHPmDL766iul5P/Bgwcqoz7+/vtvAFB6qoG63yV9fX0MHToUCxYswLVr17Bhw4Zi63l4eCAzM7PM+1S7dm2Vui97zGJ2djbmzp2L999/XzqmRETljff8ExHpCD09PYiiqFS2Y8cO3L9/X6msX79+ePToEVavXq3Sxovrq2Pz5s3IyMiQ3u/cuRMJCQno2rUrAMDHxwceHh746quvpNnOn5ecnKwSu56eXrGP0Xte3759oaenh7lz56rEL4qiUhJUUFCAXbt2oXnz5iUO+x8wYAAKCwuLnVG9oKCgTI8se5lTp07h119/xZdffvnSZGTAgAG4f/8+vvvuO5VlT58+RVZWltrbffr0KQCU+KjAR48eKdUpeuze87eQxMfHv/TERln27UWjRo3C5cuXpcfIvcjJyQmNGzfGpk2blI771atXceDAAZWnUZTH5wZ4NjfCggUL8OGHH7501Ejv3r0hk8kwb948ldE2r/Pdel5RPz7/uD/g2VMGHBwc8N5775W4ftFQ8iFDhmjkBEVeXh4KCgqkz4y/vz/kcjlWrlyptM8//PAD0tLSSnxCREkx5+XlqXx2CwoKsH79eqVY1q9fDzs7O/j4+AB49e/SqFGjcOXKFbRr107pJMbzBgwYgFOnThV7ZT41NRUFBQVl3scXrVixAllZWfj0009fuQ0iInXxyj8RkY546623MG/ePIwcORKtW7fGlStXEBoaqvIP12HDhmHz5s2YMmUKzp49i7Zt2yIrKwsHDx7E+++/j169er3S9m1sbNCmTRuMHDkSDx8+xPLly+Hp6SkNa5XJZPj+++/RtWtXNGjQACNHjoSLiwvu37+PiIgIWFhY4LfffkNWVhbWrFmDlStXok6dOkrPuy46aXD58mWcOnUKrVq1goeHBz7//HPMmDED8fHx6N27N8zNzREXF4fdu3djzJgx+N///o
eDBw9i5syZuHz5Mn777bcS96V9+/YYO3YsFi5ciKioKHTp0gUGBgaIiYnBjh07sGLFCrz99tuvdJwOHDiAzp07l3i1cOjQofj555/x3nvvISIiAr6+vigsLMSNGzfw888/Y//+/aVe2X748KE0PPvRo0dYv3499PX1paT4u+++w9GjR9G0aVNYWFjg+vXr+O677+Dk5CRNKNe9e3csW7YMgYGBGDJkCJKSkrBmzRp4enri8uXLr7RvLxo9ejT69+8PS0vLl9ZZsmQJunbtilatWiE4OFh61J+lpaV0Fb68PjdFLl68CFtb25cO8QYAT09PfPrpp5g/fz7atm2Lvn37wtDQEOfOnYOzs3OJjz58mVOnTkFfX18a9r9q1So0adJE5fGGBw4cQGhoKORyeYnt3bt3D3K5/JUfHZeVlaU07P/HH39ETk4O+vTpA+DZLTEzZszA3LlzERgYiJ49e+LmzZv45ptv8Oabb+Kdd94p03aKtpGVlYU9e/YgPj5e6fF7wLN7/hctWoT4+HjUqVMH27dvR1RUFL799ltpVNOrfpfq1auHR48elTjyZ9q0adi7dy/eeustjBgxAj4+PsjKysKVK1ewc+dOxMfHq0ywWVYHDhzAggULlOZrISIqd1p4wgARET2n6FFe586dK7FeTk6OOHXqVNHJyUk0NjYWfX19xVOnThX72Lfs7Gzx008/Fd3d3UUDAwPR0dFRfPvtt8XY2FhRFF/t0VQ//fSTOGPGDNHe3l40NjYWu3fvLt6+fVtl/b/++kvs27evWK1aNdHQ0FB0c3MTBwwYIB46dEhp26W9hg8frtTurl27xDZt2oimpqaiqamp6OXlJY4fP168efOmKIqiOHHiRLFdu3ZiWFiYSkzFPV5MFJ89xszHx0c0NjYWzc3NRW9vb3H69OnigwcPpDrqPupPEATxwoULSuXF9VFeXp64aNEisUGDBqKhoaFobW0t+vj4iHPnzhXT0tJUtvdie88fKysrK9HX11f8448/pDpHjhwR27ZtK1pZWYmGhoZizZo1xdGjR6s87u2HH34Qa9euLRoaGopeXl7ixo0biz1eZd23os/L84/ye97Llh88eFD09fUVjY2NRQsLC7FHjx7i9evXpeXl9bl5/nh+/fXXSuu+7HOzYcMGsUmTJlK/tW/fXgwPD3/pvpb0qL+il0wmE6tXry4OHz5cerynKP7796Fx48ZKjxMs7lGBw4cPFwGIH3zwgdK2Xnxc4MsUrV/0MjMzE5s2bSr++OOPKnVXr14tenl5iQYGBqKDg4M4btw48cmTJyW2L4r/HtOil7GxsVi/fn3x66+/Vtq/9u3biw0aNBDPnz8vtmrVSjQyMhLd3NzE1atXq7RZ1u8SXniU34uKW56RkSHOmDFD9PT0FOVyuWhrayu2bt1a/Oqrr6THDb7K31MnJycxKytLZft81B8RlSdBFDU0To2IiKqkyMhI+Pn5YceOHa98Nfx58fHxcHd3R1xcnMrVzSJz5sxBfHy8WhOVUdXGz81/S4cOHfDo0SNcvXpV26EQEVUZvOefiIiIiIiIqIrjPf9ERFShzMzMEBQUVOKEfI0aNVKa6ZyInxsiIqLXw+SfiIgqlK2trTTZ18v07du3gqIhXcHPDRER0evhPf9EREREREREVZxW7/k/evQoevToAWdnZwiC8NLnCgPAe++9B0EQsHz5cqXylJQUBAUFwcLCAlZWVggODlZ5vvTly5fRtm1bGBkZwdXVFYsXL1Zpf8eOHfDy8oKRkRG8vb1f+RE5RERERERERJWNVpP/rKwsvPHGG1izZk2J9Xbv3o3Tp08Xex9fUFAQrl27hvDwcOzbtw9Hjx7FmDFjpOXp6eno0qUL3NzccOHCBSxZsgRz5szBt99+K9U5efIkBg8ejODgYPz111/o3bs3evfuzRlmiYiIiIiIqEqoNMP+BUHA7t270bt3b6Xy+/fvo0WLFti/fz+6d++OyZMnY/LkyQCA6Oho1K9fH+fOnUOzZs0AAGFhYejWrRvu3bsHZ2dnrF27Fp9++ikSExMhl8sBAB9//DH27NmDGzduAAAGDhyIrKws7Nu3T9puy5Yt0bhxY6xbt65M8SsUCjx48ADm5uYQBOE1jwYRERERERFRyURRREZGBpydnSGTlXxtv1JP+KdQKDB06FBMmzYNDRo0UFl+6tQpWFlZSYk/APj7+0Mmk+HMmTPo06cPTp06hXbt2kmJPwAEBARg0aJFePLkCaytrXHq1ClMmTJFqe2AgIASb0PIzc1Fbm6u9P7+/fuoX7/+a+wtERERERERkfru3r2L6tWrl1inUif/ixYtgr6+PiZNmlTs8sTERNjb2yuV6evrw8bGBomJiVIdd3d3pToODg7SMmtrayQmJkplz9cpaqM4CxcuxNy5c1XKL168WOJjiKhiKBQKpKenw8LCotQzYFS5sO90E/tNd7HvdBf7Tjex33QX+043vazfbNu2hd7Dhyh0cACWAHq5D+F1W0BCgQgHUwccG3hMi1GXXWZmJpo2bQpzc/NS61ba5P/ChQtYsWIFLl68WCmH0c+YMUNptEB6ejpcXV3h7u4OCwsLLUZGwLMveXJyMuzs7PjHWcew73QT+013se90F/tON7HfdBf7Tje9rN8EfX0IAER9fcAMEPQAmbEA5IvQN9GHh4eH9oJWQ3p6OgCUKWeutMn/sWPHkJSUhBo1akhlhYWFmDp1KpYvX474+Hg4OjoiKSlJab2CggKkpKTA0dERAODo6IiHDx8q1Sl6X1qdouXFMTQ0hKGhoUq5TCbjH4NKQhAE9oeOYt/pJvab7mLf6S72nW5iv+ku9p1uKqnfXpYy60ofqxNnpd2joUOH4vLly4iKipJezs7OmDZtGvbv3w8AaNWqFVJTU3HhwgVpvcOHD0OhUKBFixZSnaNHjyI/P1+qEx4ejrp168La2lqqc+jQIaXth4eHo1WrVuW9m0RERERERETlTqtX/jMzM3Hr1i3pfVxcHKKiomBjY4MaNWqgWrVqSvUNDAzg6OiIunXrAgDq1auHwMBAjB49GuvWrUN+fj4mTJiAQYMGSY8FHDJkCObOnYvg4GB89NFHuHr1KlasWIGvv/5aaveDDz5A+/btsXTpUnTv3h3btm3D+fPnlR4HSERERERERKSrtJr8nz9/Hn5+ftL7onvohw8fjpCQkDK1ERoaigkTJqBTp06QyWTo168fVq5cKS23tLTEgQMHMH78ePj4+MDW1hazZs3CmDFjpDqtW7fG1q1b8dlnn+GTTz5B7dq1sWfPHjRs2FAzO/r/RFFEQUEBCgsLNdouqVIoFMjPz0dOTo7ODNmhZyq67wwMDKCnp1fu2yEiIiKiSuTcOaCwENDTA6wAiIU4l/UIhYa20BOq5r8NBVEURW0HURWkp6fD0tISaWlpxU74l5eXh4SEBGRnZ2shuv8eURShUCggk8kq5YSR9HIV3XeCIKB69ep8SsdrUigUSEpKgr29PU+46Rj2ne5i3+km9pvuYt/ppqreb6Xloc+rtBP+VSUKhQJxcXHQ09ODs7Mz5
HI5E9JyVjTKQl9fn8dax1Rk34miiOTkZNy7dw+1a9fmCAAiIiIiqrKY/FeAvLw8KBQKuLq6wsTERNvh/Ccw+dddFd13dnZ2iI+PR35+PpN/IiIiIqqymPxXoKo4zIRI1/HkEBEREdF/0LffApmZgJkZ0BFAfia+jb+ITKumMJObYYzPmFKb0DVM/omIiIiIiOi/Zd484P59wMUFsAPw9D7mxctwPz8ULuYuVTL556VoIiIiIiIioiqOV/61bEdsWoVtq7+HZYVti4iIiIiIiCoPXvmnEo0YMQKCIKi8AgMDKyyGOXPmoHHjxhW2PSIiIiIioqqGV/6pVIGBgdi4caNSmaGhoZaiISIiIiIiInXxyj+VytDQEI6Ojkova2trDBkyBAMHDlSqm5+fD1tbW2zevBkAoFAosHDhQri7u8PY2BhvvPEGdu7cKdWPjIyEIAg4dOgQmjVrBhMTE7Ru3Ro3b94EAISEhGDu3Lm4dOmSNOogJCQEoihizpw5qFGjBgwNDeHs7IxJkyZV3EEhIiIiIiLSIbzyT68sKCgI/fv3R2ZmJszMzAAA+/fvR3Z2Nvr06QMAWLhwIbZs2YJ169ahdu3aOHr0KN555x3Y2dmhffv2Uluffvopli5dCjs7O7z33nsYNWoUTpw4gYEDB+Lq1asICwvDwYMHAQCWlpbYtWsXvv76a2zbtg0NGjRAYmIiLl26VPEHgYiIiIiISAcw+adS7du3T0rui3zyySeYPn06TE1NsXv3bgwdOhQAsHXrVvTs2RPm5ubIzc3FF198gYMHD6JVq1YAgFq1auH48eNYv369UvK/YMEC6f3HH3+M7t27IycnB8bGxjAzM4O+vj4cHR2l+nfu3IGjoyP8/f1hYGCAGjVqoHnz5uV9KIiIiIiIiHQSk38qlZ+fH9auXatUZmNjA319fQwYMAChoaEYOnQosrKy8Ouvv2Lbtm0AgFu3biE7OxudO3dWWjcvLw9NmjRRKmvUqJH0u5OTEwAgKSkJNWrUKDam/v37Y/ny5ahVqxYCAwPRrVs39OjRA/r6/EgTERERERG9iJkSlcrU1BSenp7FLgsKCkL79u2RlJSE8PBwGBsbS08CyMzMBAD8/vvvcHFxUVrvxQkDDQwMpN8FQQDwbL6Al3F1dcXNmzdx8OBBhIeH4/3338eSJUtw5MgRpbaIiIiIiIhU1KkDWFoCDg6ABQC5JeqYPICl3BkOpg7ajq5cMPmn19K6dWu4urpi+/bt+PPPP9G/f38p+a5fvz4MDQ1x584dpSH+6pLL5SgsLFQpNzY2Ro8ePdCjRw+MHz8eXl5euHLlCpo2bfrK2yIiIiIiov+Aw4dVi7QQRkVi8k+lys3NRWJiolKZvr4+bG1tAQBDhgzBunXr8PfffyMiIkKqY25ujv/973/48MMPoVAo0KZNG6SlpeHEiROwsLDA8OHDy7T9mjVrIi4uDlFRUahevTrMzc3x008/obCwEC1atICJiQm2bNkCY2NjuLm5aW7HiYiIiIioUtoRm1bi8v4elhUUie5g8q9luvChDAsLk+7DL1K3bl3cuHEDwLOh/wsWLICbmxt8fX2V6s2fPx92dnZYuHAh/vnnH1hZWaFp06b45JNPyrz9fv364ZdffoGfnx9SU1OxceNGWFlZ4csvv8SUKVNQWFgIb29v/Pbbb6hWrdrr7zAREREREVEVI4iiKGo7iKogPT0dlpaWSEtLg4WFhdKynJwcxMXFwd3dHUZGRlqK8L9FFEUUFBRAX19fmkOAdENF9x2/n5qhUCiQlJQEe3t7yGQybYdDamDf6S72nW5iv+ku9l3lUtYr/1W930rKQ1/EK/9ERERERET03xIUBDx6BNjaAu8DyH2EoJibeGRaF7YmtgjtG6rtCDWOyT8RERERERH9txw5Aty/D7i4AG8DeHofRx7LcD/xNlzMXUpdXRdVvXEPRERERERERKSEyT8RERERERFRFcfkn4iIiIiIiKiKY/JPREREREREVMUx+SciIiIiIiKq4pj8ExEREREREVVxTP6JiIiIiIiIqjgm/1TpnThxAt7e3jAwMEDv3r1fWqYrfvjhB3Tp0kXbYWDEiBEVcuzCwsLQuHFjKBSKct8WEREREREVj8k/lSoxMRETJ05ErVq1YGhoCFdXV/To0QOHDh0qcxuRkZEQBAGpqalqb3/KlClo3Lgx4uLiEBIS8tKyskhMTMQHH3wAT09PGBkZwcHBAb6+vli7di2ys7PVjk1dOTk5mDlzJmbPng3gWWIsCAISExOV6jk5OaFmzZpKZfHx8RAEQa3jXlFCQkLQqFEjGBkZwd7eHuPHj5eWBQYGwsDAAKGhoVqMkIiIiIh0xY7YtFJfr230aODDD5/99BwN1P0Qoz3b4cOWH2J009Gv334lpK/tAKhyi4+Ph6+vL6ysrLBkyRJ4e3sjPz8f+/fvx/jx43Hjxo1yjyE2NhbvvfceqlevXmJZaf755x9pX7744gt4e3vD0NAQV65cwbfffgsXFxf07NmzPHZBsnPnTlhYWMDX1xcA0KZNG+jr6yMyMhKDBg0CAERHR+Pp06fIzs5GfHy8dBIgIiIChoaG0rqVxbJly7B06VIsWbIELVq0QFZWFuLj45XqjBgxAitXrsTQoUO1EyQRERER0fP+/2KcUpGPFuKoQLzyTyV6//33IQgCzp49i379+qFOnTpo0KABpkyZgtOnTwP494p0VFSUtF5qaioEQUBkZCTi4+Ph5+cHALC2toYgCBgxYgQAIDc3F5MmTYK9vT2MjIzQpk0bnDt3Tqndx48fY9SoURAEASEhIcWWlXVf9PX1cf78eQwYMAD16tVDrVq10KtXL/z+++/o0aOHVHfZsmXw9vaGqakpXF1d8f777yMzM1NaHhISAisrK+zZswe1a9eGkZERAgICcPfu3RJj2LZtm9J2zMzM8OabbyIyMlIqi4yMRJs2beDr66tS3rJlSxgZGQEAvv/+e9SrVw9GRkbw8vLCN998o7Stu3fvYsCAAbCysoKNjQ169eqlkpQ/79y5c7Czs8OiRYtK3IfnPXnyBJ999hk2b96MIUOGwMPDA40aNVI5idKjRw+cP38esbGxZW6biIiIiIg0h8m/ti1bBlSvXvqruCvSPXuWbd1ly14ptJSUFISFhWH8+PEwNTVVWW5lZVWmdlxdXbFr1y4AwM2bN5GQkIAVK1YAAKZPn45du3Zh06ZNuHjxIjw9PREQEICUlBS4uroiISEBFhYWWL58ORISEtC/f3+VsoEDB0onBV7m8ePHOHDgwEv3BYDS+jKZDCtXrsS1a9ewadMmHD58GNOnT1eqn52djQULFmDz5s04ceIEUlNTpav3L3P8+HE0a9ZMqczPzw8RERHS+4iICHTo0AHt27dXKo+MjJROooSGhmLWrFlYsGABoqOj8cUXX2DmzJnYtGkTACA/Px8BAQEwNzfHsWPHcOLECZiZmSEwMBB5eXkqcR0+fBidO3fGggUL8NFHH5W4D88LDw+HQqHA/fv3Ua9ePVSvXh0DBgxQOQlSo0YNODg44NixY2Vu
m4iIiIiINIfD/rUtPR24f7/0eq6uqmXJyWVbNz1d/bgA3Lp1C6IowsvL65XWL6KnpwcbGxsAgL29vXTSICsrC2vXrkVISAi6du0KAPjuu+8QHh6OH374AdOmTYOjoyMEQYClpSUcHR0BAKampipllpaWqFu3bqn78mIdW1tb5OTkAADGjx8vXfWePHmyVKdmzZr4/PPP8d577yldXc/Pz8fq1avRokULAMCmTZtQr149nD17Fs2bN1eJITU1FWlpaXB2dlYq9/PzwxdffIGEhAQ4OTnhyJEjmDZtGgoKCrB27VoAz25ZuHPnjpT8z549G0uXLkXfvn0BAO7u7rh+/TrWr1+P4cOHY/v27VAoFPj++++lkxobN26ElZUVIiMjlSYc3L17N4YNG4bvv/8eAwcOfOkxLM4///wDhUKBL774AitWrIClpSU+++wzdO7cGZcvX4ZcLpfqOjs74/bt22q1T0REREREmsHkX9ssLAAXl9Lr2dkVX1aWdS0s1I8LgCiKr7ReWcXGxiI/P1/pHnYDAwM0b94c0dHRarXVp08f9OnTR+0Yzp49C4VCgaCgIOTm5krlBw8exMKFC3Hjxg2kp6ejoKAAOTk5yM7OhomJCQBAX18fb775prSOl5cXrKysEB0dXWzy//TpUwCQhu0Xad26NeRyOSIjI/HGG2/g6dOnaNq0KRQKBZKTkxEXF4fIyEgYGxujZcuWyMrKQmxsLIKDgzF69L+TkRQUFMDS0hIAcOnSJdy6dQvm5uZK28rJyVEaen/mzBns27cPO3fuLHXm/65du0pX7t3c3HDt2jUoFArk5+dj5cqV0gmFn376CY6OjoiIiEBAQIC0vrGxcYVMqkhEREREVKrq1Z9dSHVxAVYBeHof1eNluJ+vgIu5C+5NuaftCDWOyb+2TZny7PUq9u7VbCwvqF27NgRBKHVSP5ns2d0jz58syM/PL9fY1OXp6QlBEHDz5k2l8lq1agF4lpgWiY+Px1tvvYVx48ZhwYIFsLGxwfHjxxEcHIy8vDwp+VdXtWrVIAgCnjx5olRuYmKC5s2bIyIiAikpKWjTpg309PSgp6eH1q1bIyIiAhEREfD19YVcLpfW/+6776RRB0X09PQAAJmZmfDx8Sl2hn27504keXh4oFq1atiwYQO6d+8OAwODl8b//fffSycwiuo5OTkBAOrXr6/Uvq2tLe7cuaO0fkpKitK2iYiIiIio4vCef3opGxsbBAQEYM2aNcjKylJZXvTYvqKELiEhQVr2/OR/AKTh34WFhVKZh4cH5HI5Tpw4IZXl5+fj3LlzSsmkJlSrVg2dO3fG6tWri92X5124cAEKhQJLly5Fy5YtUadOHTx48EClXkFBAc6fPy+9v3nzJlJTU1GvXr1i25XL5ahfvz6uX7+usszPzw+RkZGIjIxEhw4dpPJ27dohMjISR44ckYb8Ozg4wNnZGf/88w88PT2VXu7u7gCApk2bIiYmBvb29ip1ikYHAM9uezh8+DBu3bqFAQMGlHjSxsXFRWrDzc0NAKRRG8+fVElJScGjR4+kOsC/Iw6aNGny0vaJiIiIiKj8MPmnEq1ZswaFhYVo3rw5du3ahZiYGERHR2PlypVo1aoVAEjD0b/88ktER0fjyJEj+Oyzz5TacXNzgyAI2LdvH5KTk5GZmQlTU1OMGzcO06ZNQ1hYGK5fv47Ro0cjOzsbwcHBasW5e/fuUucm+Oabb1BQUIBmzZph+/btiI6Oxs2bN7FlyxbcuHFDumru6emJ/Px8rFq1Cv/88w9+/PFHrFu3TqU9AwMDTJw4EWfOnMGFCxcwYsQItGzZstgh/0UCAgJw/PhxlXI/Pz/ExMRg//79aN++vVTevn177NmzB3fv3pWSfwCYO3cuFi5ciJUrV+Lvv//GlStXsHHjRiz7/8kdg4KCYGtri169euHYsWPSrQOTJk3CvXvKQ5js7e1x+PBh3LhxA4MHD0ZBQUGJx/F5derUQa9evfDBBx/g5MmTuHr1KoYPHw4vLy+leE+fPg1DQ0PpM0NERERERBWLyT+VqFatWrh48SL8/PwwdepUNGzYEJ07d8ahQ4ekyegAYMOGDSgoKICPjw8mT56Mzz//XKkdFxcXzJ07Fx9//DEcHBwwYcIEAMCXX36Jfv36YejQoWjatClu3bqF/fv3w9raWq0409LSVIb0v8jDwwN//fUX/P39MWPGDLzxxhto1qwZVq1ahf/973+YP38+AOCNN97AsmXLsGjRIjRs2BChoaFYuHChSnsmJib46KOPMGTIEPj6+sLMzAzbt28vMYbg4GD88ccfSEtLUypv1aoVDA0NIYoifHz+fcBoixYtkJ+fLz0SsMi7776L77//Hhs3boS3tzfat2+PkJAQ6cq/iYkJjh49iho1aqBv376oV68egoODkZOTA4ti5oBwdHTE4cOHceXKFQQFBSmN0CjN5s2b0aJFC3Tv3h3t27eHgYEBwsLClG4h+OmnnxAUFPTKt0wQEREREdHrEcTyntXtPyI9PR2WlpZIS0tTSa5ycnIQFxcHd3d3lcneqHyIooiCggLo6+uX+AjAVxUSEoLJkydLtz6oo3///mjatClmzJih8bgqo0ePHqFu3bo4f/68dHKiJOXddy/i91MzFAoFkpKSYG9vL80DQrqBfae72He6if2mu9h3FWdHbFrplUrR3+PZra4v7bcqMuFfSXnoi/ipJapgS5YsgZmZmbbDqDDx8fH45ptvypT4ExERERFR+WDyT1TBatasiYkTJ2o7jArTrFkzDBw4UNthEBERERFJsgtE6WfR70Vj4p8WihoZfVDZMPknegUjRox4pSH/RERERERE2sDkn4iIiIiIiKiK09d2AEREREREREQV6ezS9ZDl5UEhlwOegEyRh2l2N5FsVBcGMrm2wysXTP6JiIiIiIjoPyW5ZVuVMherTnDRQiwVhcP+iYiIiIiIiKo4Jv9EREREREREVRyH/RMREREREVVypT16rr+HZQVFUjXYnT727z3/9Z/d838x5d97/hs4qt4WoOt45Z8qvRMnTsDb2xsGBgbo3bv3S8uqkjlz5qBx48bS+xEjRpS6nx06dMDkyZNLbbtdu3bYunXr6wWoAYIgYM+ePeW+nZYtW2LXrl3lvh0iIiIi0h3Np45Fu1H90HzqWDS/NRbtbvbDkrOf4YvD/bDq5Fhth1cumPxTqRITEzFx4kTUqlULhoaGcHV1RY8ePXDo0KEytxEZGQlBEJCamqr29qdMmYLGjRsjLi4OISEhLy0rzYgRIyAIAgRBgFwuh6enJ+bNm4eCggK1Y3qZpUuXwtraGjk5OSrLsrOzYWFhgZUrV6rd7ooVK8q8nyXZu3cvHj58iEGDBgEABg0ahMDAQKU6YWFhEAQBc+bMUSqfM2cOatSo8doxlJdt27ZBEASVkySfffYZPv74YygUCu0ERkRERERUCTD5pxLFx8fDx8cHhw8fxpIlS3DlyhWEhYXBz88P48ePr5AYYmNj0bFjR1SvXh1WVlYvLSuLwMBAJCQkICYmBlOnTsWcOXOwZMmSYuv
m5eWpHevQoUORlZWFX375RWXZzp07kZeXh3feeUftdi0tLdXaz5dZuXIlRo4cCZns2Vffz88PJ06cUDoBEhERAVdXV0RGRiqtGxERAT8/v9eOoTzEx8fjf//7H9q2VR2e1bVrV2RkZODPP//UQmRERERERJUDk38q0fvvvw9BEHD27Fn069cPderUQYMGDTBlyhScPn0awLPESxAEREVFSeulpqZCEARERkYiPj5eShqtra0hCAJGjBgBAMjNzcWkSZNgb28PIyMjtGnTBufOnVNq9/Hjxxg1ahQEQUBISEixZWVlaGgIR0dHuLm5Ydy4cfD398fevXsB/Du0fsGCBXB2dkbdunUBAHfv3sWAAQNgZWUFGxsb9OrVC/Hx8cW2b29vjx49emDDhg0qyzZs2IDevXvDxsYGH330EerUqQMTExPUqlULM2fORH5+/kvjfnHYf1ZWFoYNGwYzMzM4OTlh6dKlpe57cnIyDh8+jB49ekhlfn5+yMzMxPnz56WyyMhIfPzxxzhz5ow0giEnJwdnzpyR+jE1NRXvvvsu7OzsYGFhgY4dO+LSpUtK2/v111/RtGlTGBkZoVatWpg7d26Joyxmz54NJycnXL58udR9eV5hYSGCgoIwd+5c1KpVS2W5np4eunXrhm3btqnVLhERERFRVcLkX9uilwG7q5f+OtJTdd0jPcu2bvSyVwotJSUFYWFhGD9+PExNTVWWl/VKtKurq3TP9c2bN5GQkIAVK1YAAKZPn45du3Zh06ZNuHjxIjw9PREQEICUlBS4uroiISEBFhYWWL58ORISEtC/f3+VsoEDB0onBdRlbGysdIX/0KFDuHnzJsLDw7Fv3z7k5+cjICAA5ubmOHbsGE6cOAEzMzMEBga+dGRAcHAwDh8+jNu3b0tl//zzD44ePYrg4GAAgLm5OUJCQnD9+nWsWLEC3333Hb7++usyxz1t2jQcOXIEv/76Kw4cOIDIyEhcvHixxHWOHz8OExMT1KtXTyqrU6cOnJ2dERERAQDIyMjAxYsX0b9/f9SsWROnTp0CAJw8eRK5ublS8t+/f38kJSXhzz//xIULF9C0aVN06tQJKSkpAIBjx45h2LBh+OCDD3D9+nWsX78eISEhWLBggUpcoihi4sSJ2Lx5M44dO4ZGjRqV+TgAwLx582Bvby8d2+I0b94cx44dU6tdIiIiIqKqhLP9a1t+OvD0fun1clyLKUsu27r56erHBeDWrVsQRRFeXl6vtH4RPT092NjYAHh2ZbzopEFWVhbWrl2LkJAQdO3aFQDw3XffITw8HD/88AOmTZsGR0dHCIIAS0tLODo6AgBMTU1VyiwtLaUr9WUhiiIOHTqE/fv3Y+LEiVK5qakpvv/+e8jlcgDAli1boFAo8P3330snFzZu3AgrKytERkaiS5cuKm0HBATA2dkZGzdulO6bDwkJgaurKzp16gTg2X3oRWrWrIn//e9/2LZtG6ZPn15q7JmZmfjhhx+wZcsWqb1NmzahevXqJa53+/ZtODg4SEP+i/j5+SEyMhIzZszAsWPHUKdOHdjZ2aFdu3aIjIyUlru7u8PNzQ3Hjx/H2bNnkZSUBENDQwDAV199hT179mDnzp0YM2YM5s6di48//hjDhw8HANSqVQvz58/H9OnTMXv2bGnbBQUFeOedd/DXX3/h+PHjcHFxgSiKpR6DIsePH8cPP/ygNOqkOM7Ozrh79y4UCoXK/hMRERFpW2kz+RNpApN/bTOwAIxdSq9nZFd8WVnWNbBQPy5ArSTsVcTGxiI/Px++vr5SmYGBAZo3b47o6Gi12urTpw/69OlTar19+/bBzMwM+fn5UCgUGDJkiNLEdt7e3lLiDwCXLl3CrVu3YG5urtROTk4OYmNjcezYMenEBQCsX78eQUFBGD58OEJCQjB79myIoohNmzYp3Wu/fft2rFy5ErGxscjMzERBQQEsLMrWT7GxscjLy0OLFi2kMhsbm1JPfjx9+hRGRkYq5UVPCcjPz0dkZCQ6dOgAAGjfvj3Wr18PANJJgKJjkpmZiWrVqqm0HxsbK9U5ceKE0pX+wsJC5OTkIDs7GyYmJgCADz/8EIaGhjh9+jRsbW1fGvudO3dQv3596f0nn3yCiRMnYujQofjuu+9KXBd4NsJDoVAgNzcXxsbGJdYlIiIiIqqKmPxrW70pz16vov1ezcbygtq1a0MQBNy4caPEekUJ7fMnC0q6f12b/Pz8sHbtWsjlcjg7O0NfX/kr8OLtDZmZmfDx8UFoaKhKW3Z2dpDL5UpXnR0cHAAAo0aNwsKFC3H48GEoFArcvXsXI0eOBACcOnVKukc9ICAAlpaW2LZtW5nu238dtra2ePLkiUq5n58fsrKycO7cOURERGDatGkAniX/o0aNQkpKCs6cOYOxY5898iQzMxNOTk4qEwIC/94KkpmZiblz56Jv374qdZ4/AdG5c2f89NNP2L9/P4KCgl4au7Ozs9JxtrGxQWxsLOLj45XmMCia0V9fXx83b96Eh4cHgGe3sJiamjLxJyIiIqL/LK2Ofz169Ch69OgBZ2dnlWd+5+fn46OPPoK3tzdMTU3h7OyMYcOG4cGDB0ptpKSkICgoCBYWFrCyskJwcDAyMzOV6ly+fBlt27aFkZERXF1dsXjxYpVYduzYAS8vLxgZGcHb2xt//PFHueyzLrGxsUFAQADWrFmDrKwsleVFj+2zs3s2KiEhIUFa9uIw7KKr6YWFhVKZh4cH5HI5Tpw4IZXl5+fj3LlzSld5NcnU1BSenp6oUaOGSuJfnKZNmyImJgb29vbw9PRUellaWsLY2FiprGiEgIeHB9q3b48NGzZg48aN8Pf3h5ubG4Bn98+7ubnh008/RbNmzVC7dm2l+QFK4+HhAQMDA5w5c0Yqe/LkCf7+++8S12vSpAkSExNVTgB4eHjA1dUVe/fuRVRUFNq3bw8AcHFxgYuLC5YuXYq8vDzpyn/Tpk2RmJgIfX19lWNSdAW+adOmuHnzpspyT09PpWH3PXv2xNatW/Huu++WOCHfi9uysbGBl5cXrly5gqioKOnVs2dP+Pn5ISoqCq6u/94qc/XqVTRp0qSMR5iIiIiIdN2O2DTsiE3Drn/ScDwhC7v+SZPK/qu3WWg1+c/KysIbb7yBNWvWqCzLzs7GxYsXMXPmTFy8eBG//PILbt68iZ49lSe+CwoKwrVr16QJ2o4ePYoxY8ZIy9PT09GlSxe4ubnhwoULWLJkCebMmYNvv/1WqnPy5EkMHjwYwcHB+Ouvv9C7d2/07t0bV69eLb+d1xFr1qxBYWEhmjdvjl27diEmJgbR0dFYuXIlWrVqBeDZkOqWLVviyy+/RHR0NI4cOaJ0TzsAuLm5QRAE7Nu3D8nJycjMzISpqSnGjRuHadOmISwsDNevX8fo0aORnZ1d4uRtxdm9e/drz01QnKCgINja2qJXr144duwY4uLiEBkZiUmTJuHevXslrhscHIxffvkFu3fvVtqf2rVr486dO9i2bRtiY2OxcuVK7N69u8wxmZmZITg4GNOmTcPhw4dx9epVjBgxot
[... base64 PNG payload truncated: histogram 'Distribution of interactions over time' (x: Timestamp, y: Number of events) with three cutoff lines ...]",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from datetime import datetime\n",
+ "\n",
+ "\n",
+ "# 2. Статистика по текущему временному разбиению\n",
+ "global_max_time = df.select(pl.col(\"timestamps\").explode().max()).item()\n",
+ "days_val = 14\n",
+ "window_sec = days_val * 24 * 3600 \n",
+ "\n",
+ "cutoffs = [\n",
+ " int(global_max_time - 3 * window_sec),\n",
+ " int(global_max_time - 2 * window_sec),\n",
+ " int(global_max_time - 1 * window_sec)\n",
+ "]\n",
+ "\n",
+ "print(\"--- Статистика по временным интервалам (Fixed Time Window) ---\")\n",
+ "intervals = [0] + cutoffs + [None]\n",
+ "labels = [\"Base\", \"Gap (Week -6)\", \"Pre-Valid (Week -4)\", \"Test (Week -2)\"]\n",
+ "\n",
+ "# Считаем события в каждом интервале\n",
+ "counts = []\n",
+ "for i in range(len(intervals)-1):\n",
+ " start, end = intervals[i], intervals[i+1]\n",
+ " \n",
+ " q = df.lazy().explode([\"timestamps\"])\n",
+ " if end is not None:\n",
+ " q = q.filter((pl.col(\"timestamps\") >= start) & (pl.col(\"timestamps\") < end))\n",
+ " else:\n",
+ " q = q.filter(pl.col(\"timestamps\") >= start)\n",
+ " \n",
+ " count = q.select(pl.len()).collect().item()\n",
+ " counts.append(count)\n",
+ " \n",
+ " end_str = datetime.fromtimestamp(end).strftime('%Y-%m-%d') if end else \"Inf\"\n",
+ " start_str = datetime.fromtimestamp(start).strftime('%Y-%m-%d') if start > 0 else \"Start\"\n",
+ " \n",
+ " print(f\"Part {i} [{labels[i]}]: {count} events ({start_str} -> {end_str})\")\n",
+ "\n",
+ "# 3. Гистограмма распределения событий во времени\n",
+ "all_timestamps = df.select(pl.col(\"timestamps\").explode()).to_series().to_numpy()\n",
+ "\n",
+ "plt.figure(figsize=(12, 6))\n",
+ "plt.hist(all_timestamps, bins=100, color='skyblue', alpha=0.7, label='Events')\n",
+ "\n",
+ "# Рисуем линии отсечек\n",
+ "colors = ['red', 'orange', 'green']\n",
+ "for cutoff, color, label in zip(cutoffs, colors, labels[1:]):\n",
+ " plt.axvline(x=cutoff, color=color, linestyle='--', linewidth=2, label=f'Cutoff: {label}')\n",
+ "\n",
+ "plt.title(\"Распределение взаимодействий во времени\")\n",
+ "plt.xlabel(\"Timestamp\")\n",
+ "plt.ylabel(\"Количество событий\")\n",
+ "plt.legend()\n",
+ "plt.grid(True, alpha=0.3)\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "901e7400",
+ "metadata": {},
+ "source": [
+ "# QUANTILE CUTOFF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "8c691891",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_quantile_cutoffs(df, num_parts=4, base_ratio=None):\n",
+ " \"\"\"\n",
+ " Считает cutoffs так, чтобы разбить данные на части.\n",
+ " \n",
+ " Args:\n",
+ " num_parts: На сколько частей делить \"хвост\" истории.\n",
+ " base_ratio: Какую долю данных отдать в Base (самую первую часть). \n",
+ " Если None, делит всё поровну.\n",
+ " \"\"\"\n",
+ " # Достаем все таймстемпы в один плоский массив\n",
+ " # Это может занять память, если данных очень много (>100M), но для Beauty (2M) это ок\n",
+ " all_ts = df.select(pl.col(\"timestamps\").explode()).to_series().sort()\n",
+ " total_events = len(all_ts)\n",
+ " \n",
+ " print(f\"Всего событий: {total_events}\")\n",
+ " \n",
+ " cutoffs = []\n",
+ " \n",
+ " if base_ratio:\n",
+ " # Сценарий: Base занимает X% (например 80%), а остаток делим поровну на 3 части (Valid, Gap, Test)\n",
+ " # Остаток = 1 - base_ratio\n",
+ " # Каждая малая часть = (1 - base_ratio) / num_parts_tail\n",
+ " \n",
+ " base_idx = int(total_events * base_ratio)\n",
+ " cutoffs.append(all_ts[base_idx]) # Первый cutoff отделяет Base\n",
+ " \n",
+ " remaining_events = total_events - base_idx\n",
+ " part_size = remaining_events // num_parts # Делим остаток на 3 части (P1, P2, P3)\n",
+ " \n",
+ " current_idx = base_idx\n",
+ " for _ in range(num_parts-1): # Нам нужно еще 2 границы, чтобы получить 3 части\n",
+ " current_idx += part_size\n",
+ " cutoffs.append(all_ts[current_idx])\n",
+ " \n",
+ " else:\n",
+ " # Сценарий: Просто делим всё на N равных частей\n",
+ " step = total_events // num_parts\n",
+ " for i in range(1, num_parts):\n",
+ " idx = i * step\n",
+ " cutoffs.append(all_ts[idx])\n",
+ " \n",
+ " return cutoffs\n"
+ ]
+ },
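+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick sanity check (a minimal sketch, assuming `df`, `pl`, and `np` from the cells above): the cutoffs returned by `get_quantile_cutoffs` are event-count quantiles of the flat timestamp array, so `np.quantile` should reproduce them up to off-by-one indexing. The probabilities below correspond to `base_ratio=0.8`, `num_parts=4`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# base_ratio=0.8 with num_parts=4 corresponds to the 0.80/0.85/0.90/0.95 quantiles\n",
+ "all_ts_flat = df.select(pl.col('timestamps').explode()).to_series().to_numpy()\n",
+ "# method='lower' (numpy >= 1.22) returns actual timestamps instead of interpolated values\n",
+ "print(np.quantile(all_ts_flat, [0.80, 0.85, 0.90, 0.95], method='lower').astype(int))"
+ ]
+ },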
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "13c1466f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Всего событий: 198502\n",
+ "\n",
+ "--- Новые Cutoffs (по количеству событий) ---\n",
+ "Cutoffs: [1394150400, 1397001600, 1399939200, 1403049600]\n",
+ "[0, 1394150400, 1397001600, 1399939200, 1403049600, None]\n",
+ "Проверка количества событий в новых частях:\n",
+ "Part 0: 158689 events\n",
+ "Part 1: 9965 events\n",
+ "Part 2: 9701 events\n",
+ "Part 3: 10137 events\n",
+ "Part 4: 10010 events\n"
+ ]
+ }
+ ],
+ "source": [
+ "equal_event_cutoffs = get_quantile_cutoffs(df, num_parts=4, base_ratio=0.8)\n",
+ "\n",
+ "print(\"\\n--- Новые Cutoffs (по количеству событий) ---\")\n",
+ "print(f\"Cutoffs: {equal_event_cutoffs}\")\n",
+ "\n",
+ "# Проверка распределения\n",
+ "intervals_eq = [0] + equal_event_cutoffs + [None]\n",
+ "print(intervals_eq)\n",
+ "print(\"Проверка количества событий в новых частях:\")\n",
+ "for i in range(len(intervals_eq)-1):\n",
+ " start, end = intervals_eq[i], intervals_eq[i+1]\n",
+ " q = df.lazy().explode([\"timestamps\"])\n",
+ " if end:\n",
+ " q = q.filter((pl.col(\"timestamps\") >= start) & (pl.col(\"timestamps\") < end))\n",
+ " else:\n",
+ " q = q.filter(pl.col(\"timestamps\") >= start)\n",
+ " count = q.select(pl.len()).collect().item()\n",
+ " print(f\"Part {i}: {count} events\")"
+ ]
+ },
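+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`split_session_by_timestamps` is defined earlier in the notebook. For reference, a minimal sketch of the assumed behavior (the column names `user_id` and `item_ids` are assumptions): partition each user's interactions by the cutoffs, keep the within-user order, write one JSON file per part, and return the per-part `{user: items}` dicts."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import os\n",
+ "\n",
+ "def split_session_by_timestamps_sketch(df, cutoffs, output_dir,\n",
+ "                                       user_col='user_id', item_col='item_ids'):\n",
+ "    # NOTE: a sketch of the helper defined earlier; column names are assumptions\n",
+ "    bounds = [0] + list(cutoffs) + [float('inf')]\n",
+ "    parts = [dict() for _ in range(len(bounds) - 1)]\n",
+ "    for user, items, stamps in df.select([user_col, item_col, 'timestamps']).iter_rows():\n",
+ "        for item, ts in zip(items, stamps):\n",
+ "            for i in range(len(bounds) - 1):\n",
+ "                if bounds[i] <= ts < bounds[i + 1]:\n",
+ "                    parts[i].setdefault(user, []).append(item)  # keeps within-user order\n",
+ "                    break\n",
+ "    os.makedirs(output_dir, exist_ok=True)\n",
+ "    for i, part in enumerate(parts):\n",
+ "        hi = 'inf' if bounds[i + 1] == float('inf') else bounds[i + 1]\n",
+ "        path = os.path.join(output_dir, f'inter_new_[{bounds[i]}_{hi}).json')\n",
+ "        with open(path, 'w') as f:\n",
+ "            json.dump(part, f)\n",
+ "        print(f'✓ Saved: {path}')\n",
+ "    return parts"
+ ]
+ },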
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "4e7f7b46",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw/inter_new_[0_1394150400).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw/inter_new_[1394150400_1399939200).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw/inter_new_[1399939200_1403049600).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw/inter_new_[1403049600_inf).json\n",
+ "0 Base 20825 158689 \n",
+ "1 Gap 6816 19666 \n",
+ "2 Valid 3817 10137 \n",
+ "3 Test 3626 10010 \n"
+ ]
+ }
+ ],
+ "source": [
+ "new_split_files = split_session_by_timestamps(\n",
+ " df, \n",
+ " [1394150400, 1399939200, 1403049600], \n",
+ " output_dir=\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/raw\"\n",
+ ")\n",
+ "\n",
+ "names = [\"Base\", \"Gap\", \"Valid\", \"Test\"]\n",
+ "for i, d in enumerate(new_split_files):\n",
+ " num_users = len(d)\n",
+ " \n",
+ " num_events = sum(len(items) for items in d.values())\n",
+ " \n",
+ " print(f\"{i:<10} {names[i]:<10} {num_users:<10} {num_events:<10}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "82fd2bca",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Merging 2 files into exp_4_0.9_inter_tiger_train.json...\n",
+ "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json (Users: 21760)\n",
+ "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json...\n",
+ "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json (Users: 21760)\n",
+ "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json...\n",
+ "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json (Users: 20825)\n",
+ "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json...\n",
+ "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json (Users: 22079)\n",
+ "Merging 1 files into test_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json (Users: 3626)\n",
+ "Merging 1 files into valid_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/valid_set.json (Users: 3817)\n",
+ "Merging 4 files into all_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/all_set.json (Users: 22363)\n",
+ "All done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "EXP_DIR = \"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps\"\n",
+ "\n",
+ "base_p, gap_p, valid_p, test_p = new_split_files[0], new_split_files[1], new_split_files[2], new_split_files[3]\n",
+ "\n",
+ "# Tiger: base + gap\n",
+ "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4_0.9_inter_tiger_train.json\")\n",
+ "\n",
+ "# 1. Exp 4.1 (Standard)\n",
+ "# Semantics: base + gap (Всё кроме валидации и теста)\n",
+ "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4-1_0.9_inter_semantics_train.json\")\n",
+ "\n",
+ "# 2. Exp 4.2 (Short Semantics)\n",
+ "# Semantics: base (Короче на пропуск, без gap)\n",
+ "merge_and_save([base_p], EXP_DIR, \"exp_4-2_0.8_inter_semantics_train.json\")\n",
+ "\n",
+ "# 3. Exp 4.3 (Leak)\n",
+ "# Semantics: base + gap + valid (Видит валидацию)\n",
+ "merge_and_save([base_p, gap_p, valid_p], EXP_DIR, \"exp_4-3_0.95_inter_semantics_train.json\")\n",
+ "\n",
+ "# 4. Test Set (тест всех моделей)\n",
+ "merge_and_save([test_p], EXP_DIR, \"test_set.json\")\n",
+ "\n",
+ "# 4. Valid Set (валидационный набор)\n",
+ "merge_and_save([valid_p], EXP_DIR, \"valid_set.json\")\n",
+ "\n",
+ "# 4. All Set (все данные)\n",
+ "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR, \"all_set.json\")\n",
+ "\n",
+ "print(\"All done!\")"
+ ]
+ },
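+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`merge_and_save` is also defined earlier in the notebook. A minimal sketch of the assumed behavior: concatenate the per-user item lists of the given parts in order (Base before Gap, and so on) and dump the merged dict to JSON."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import os\n",
+ "\n",
+ "def merge_and_save_sketch(parts, out_dir, filename):\n",
+ "    # NOTE: a sketch of the helper defined earlier in the notebook\n",
+ "    merged = {}\n",
+ "    for part in parts:  # parts are ordered: earlier time ranges first\n",
+ "        for user, items in part.items():\n",
+ "            merged.setdefault(user, []).extend(items)\n",
+ "    os.makedirs(out_dir, exist_ok=True)\n",
+ "    path = os.path.join(out_dir, filename)\n",
+ "    with open(path, 'w') as f:\n",
+ "        json.dump(merged, f)\n",
+ "    print(f'Merging {len(parts)} files into {filename}...')\n",
+ "    print(f'✓ Done: {path} (Users: {len(merged)})')"
+ ]
+ },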
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "d34b1c55",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Проверка Train сетов (должны быть префиксами):\n",
+ "доля событий всего 0.90:\n",
+ "✅ [ПРЕФИКСЫ] Все 21760 массивов ОК. Полных совпадений: 16175\n",
+ "доля событий всего 0.80:\n",
+ "✅ [ПРЕФИКСЫ] Все 20825 массивов ОК. Полных совпадений: 12129\n",
+ "доля событий всего 0.95:\n",
+ "✅ [ПРЕФИКСЫ] Все 22079 массивов ОК. Полных совпадений: 18737\n",
+ "доля событий всего 0.90:\n",
+ "✅ [ПРЕФИКСЫ] Все 21760 массивов ОК. Полных совпадений: 16175\n",
+ "\n",
+ "Проверка Test сета (должен быть суффиксом):\n",
+ "доля событий всего 0.05:\n",
+ "✅ [СУФФИКСЫ] Все 3626 массивов ОК. Полных совпадений: 284\n",
+ "\n",
+ "(Контроль) Проверка Test сета как префикса (должна упасть):\n",
+ "доля событий всего 0.05:\n",
+ "❌ [ПРЕФИКСЫ] Найдено 3342 ошибок.\n",
+ "доля событий всего 1.00:\n",
+ "✅ [ПРЕФИКСЫ] Все 22363 массивов ОК. Полных совпадений: 22363\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open(\"/home/jovyan/IRec/data/Beauty/inter_new.json\", 'r') as f:\n",
+ " old_inter_new = json.load(f)\n",
+ "\n",
+ "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n",
+ " first_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n",
+ " second_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n",
+ " third_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n",
+ " tiger_sem = json.load(ff)\n",
+ "\n",
+ "with open(\"//home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/test_set.json\", 'r') as ff:\n",
+ " test_sem = json.load(ff)\n",
+ "\n",
+ "with open(\"/home/jovyan/IRec/sigir/Beauty_new/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as ff:\n",
+ " all_test_data = json.load(ff)\n",
+ "\n",
+ "def check_prefix_match(full_data, subset_data, check_suffix=False):\n",
+ " \"\"\"\n",
+ " check_suffix=True включит режим проверки суффиксов (для теста).\n",
+ " \"\"\"\n",
+ " mismatch_count = 0\n",
+ " full_match_count = 0\n",
+ "\n",
+ " num_events_full_data = sum(len(items) for items in full_data.values())\n",
+ " num_events_subset_data = sum(len(items) for items in subset_data.values())\n",
+ " print(f\"доля событий всего {(num_events_subset_data/num_events_full_data):.2f}:\")\n",
+ " \n",
+ " # Итерируемся по ключам сабсета, так как в full_data может быть больше юзеров\n",
+ " for user, sub_items in subset_data.items():\n",
+ " \n",
+ " # Проверяем есть ли такой юзер в исходнике\n",
+ " if user not in full_data:\n",
+ " print(f\"⚠ Юзер {user} не найден в исходном файле!\")\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " full_items = full_data[user]\n",
+ " \n",
+ " # Логика для проверки ПРЕФИКСА (начало совпадает)\n",
+ " if not check_suffix:\n",
+ " if len(sub_items) > len(full_items):\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " # Сравниваем начало full с sub\n",
+ " if full_items[:len(sub_items)] == sub_items:\n",
+ " if len(full_items) == len(sub_items):\n",
+ " full_match_count += 1\n",
+ " else:\n",
+ " mismatch_count += 1\n",
+ " \n",
+ " # Логика для проверки СУФФИКСА (конец совпадает - для теста)\n",
+ " else:\n",
+ " if len(sub_items) > len(full_items):\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " # Сравниваем конец full с sub\n",
+ " # Срез [-len:] берет последние N элементов\n",
+ " if full_items[-len(sub_items):] == sub_items:\n",
+ " if len(full_items) == len(sub_items):\n",
+ " full_match_count += 1\n",
+ " else:\n",
+ " mismatch_count += 1\n",
+ "\n",
+ " mode = \"СУФФИКСЫ\" if check_suffix else \"ПРЕФИКСЫ\"\n",
+ " \n",
+ " if mismatch_count == 0:\n",
+ " print(f\"✅ [{mode}] Все {len(subset_data)} массивов ОК. Полных совпадений: {full_match_count}\")\n",
+ " else:\n",
+ " print(f\"❌ [{mode}] Найдено {mismatch_count} ошибок.\")\n",
+ "\n",
+ "# --- Запуск проверок ---\n",
+ "print(\"Проверка Train сетов (должны быть префиксами):\")\n",
+ "check_prefix_match(old_inter_new, first_sem)\n",
+ "check_prefix_match(old_inter_new, second_sem)\n",
+ "check_prefix_match(old_inter_new, third_sem)\n",
+ "check_prefix_match(old_inter_new, tiger_sem)\n",
+ "\n",
+ "print(\"\\nПроверка Test сета (должен быть суффиксом):\")\n",
+ "check_prefix_match(old_inter_new, test_sem, check_suffix=True)\n",
+ "\n",
+ "print(\"\\n(Контроль) Проверка Test сета как префикса (должна упасть):\")\n",
+ "check_prefix_match(old_inter_new, test_sem, check_suffix=False)\n",
+ "\n",
+ "check_prefix_match(old_inter_new, all_test_data)\n"
+ ]
+ },
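+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A tiny synthetic example (hypothetical data) of the prefix/suffix semantics that `check_prefix_match` verifies above:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hypothetical two-user histories, just to illustrate the checks\n",
+ "toy_full = {'u1': [1, 2, 3, 4], 'u2': [5, 6]}\n",
+ "check_prefix_match(toy_full, {'u1': [1, 2]})                     # prefix: OK\n",
+ "check_prefix_match(toy_full, {'u1': [3, 4]}, check_suffix=True)  # suffix: OK\n",
+ "check_prefix_match(toy_full, {'u1': [3, 4]})                     # prefix: should fail"
+ ]
+ },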
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "501fae46",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Part 0 [Base]: 19666 events (2014-03-07 -> 2014-05-13)\n",
+ "Part 1 [Gap]: 10137 events (2014-05-13 -> 2014-06-18)\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA/8AAAIjCAYAAABViau2AAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjAsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvlHJYcgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAlp5JREFUeJzs3XmcjXX/x/H3NftiFsOsmhj7nqWslSVrIpJSKsVNiSTdlBahpFSyluoXWriTlBbdGCSSLRIiodGCmcGY3azn+v3hnpPjjJlzmDFzptfz8Zh7nO/3c33P5zrfc6b7c67r+l6GaZqmAAAAAABAheVW1gkAAAAAAIDSRfEPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAD+8fbt26cVK1ZYH+/evVsrV64su4Qu0V9//aVFixZZHx89elSLFy8uu4QAAOUGxT8AlLFFixbJMAzrj4+Pj+rWratRo0YpISGhrNMD/hHS0tL04IMPauvWrTp06JAeffRR7d27t6zTcpphGBo5cqRWr16to0ePavz48dq0aVNZpwUAKAc8yjoBAMA5U6ZMUUxMjLKysvTdd9/pzTff1Ndff619+/bJz8+vrNMDKrS2bdtafySpbt26GjZsWBln5bxq1app2LBh6tGjhyQpMjJSGzZsKNukAADlgmGaplnWSQDAP9miRYv0wAMPaMeOHbr22mut7Y8//rhmzJihJUuW6K677irDDIF/jv379+vs2bNq0qSJvLy8yjqdS3bkyBGdOnVKjRs3lr+/f1mnAwAoBzjtHwDKqc6dO0uS4uLiJElJSUn697//rSZNmqhSpUoKDAxUz5499dNPP9ltm5WVpUmTJqlu3bry8fFRZGSkbrvtNh05ckTSueuAz7/U4MKfjh07WsfasGGDDMPQ0qVL9dRTTykiIkL+/v7q06eP/vzzT7vn3rZtm3r06KGgoCD5+fmpQ4cO2rx5c6H72LFjx0Kff9KkSXaxH374oVq2bClfX1+FhIRo4MCBhT5/Uft2PovFopkzZ6pRo0by8fFReHi4HnzwQZ05c8YmrkaNGrrlllvsnmfUqFF2YxaW+yuvvGL3mkpSdna2nnvuOdWuXVve3t6Kjo7W+PHjlZ2dXehrdb4LX7eqVauqV69e2rdvnzXm9OnT6tmzp6666ip5e3srMjJSgwYN0u+//24z1quvvqp27dqpSpUq8vX1VcuWLfXJJ5/YPaej+1bwfjEMQ7t377aJP3bsmNzd3WUYht1zrF+/XjfccIP8/f0VHBysW2+9VQcOHHBo/y/3fdOxY0frPjRs2FAtW7bUTz/9VOj7pmDMVq1ayc/PT5UrV9aNN96oNWvWSDr3finqs1WjRg1J9u9TT09P1ahRQ+PGjVNOTo71uQouCzp69Ki1zWKxqGnTpjIMw+b6/vvvv986fq1atdS6dWslJSXJ19fXbozC3H///TY5Va5cWR07diz0soE33nhDjRo1kre3t6KiojRy5EglJycXOb4kTZo0yeY5AgIC1KpVK5v1FqRzc9K4cWPt3LlT7dq1k6+vr2JiYjR//ny7MR39LBU858yZM+3GqF+/vgzD0KhRo2zak5OTNWbMGEVHR8vb21u1a9fWyy+/LIvFYo0pmMtXX33VbtzGjRsX+vm48GyMXr16XfQ9DAAlhdP+AaCcKijUq1SpIkn67bfftGLFCg0YMEAxMTFKSEjQW2+9pQ4dOmj//v2KioqSJOXn5+uWW27RunXrNHDgQD366KNKS0tTbGys9u3bp1q1almf46677tLNN99s87wTJkwoNJ+pU6fKMAw98cQTSkxM1MyZM9WlSxft3r1bvr6+ks4VcD179lTLli313HPPyc3NTQsXLlTnzp21adMmtWrVym7cq666StOmTZMkpaena8SIEYU+97PPPqs77rhD//rXv3Ty5EnNmTNHN954o3788UcFBwfbbTN8+HDdcMMNkqRPP/1Un332mU3/gw8+aD3rYvTo0YqLi9PcuXP1448/avPmzfL09Cz0dXBGcnKydd/OZ7FY1KdPH3333XcaPny4GjRooL179+r111/Xr7/+alcIFaZ+/fp6+umnZZqmjhw5ohkzZujmm2/WH3/8IUnKyclRQECAHn30UVWpUkVHjhzRnDlztGfPHptr2WfNmqU+ffpo0KBBysnJ0UcffaQBAwboq6++Uq9evZzetwI+Pj5auHChZs2aZW1777335OXlpaysLJvYtWvXqmfPnqpZs6YmTZqks2fPas6cOWrfvr127dplLWjPV1rvmwJPPPFEoe2TJ0/WpEmT1K5dO02ZMkVeXl7atm2b1q9fr27dumnmzJlKT0+XJB04cEAvvviinnrqKTVo0ECSVKlSJZvxCt6n2dnZWr16tV599VX5+Pjo+eefv2huH3zwgcPrEUycONHu9S5K1apV9frrr0s6t3jgrFmzdPPNN+vPP/+0vl6TJk3S5MmT1aVLF40YMUIHDx7Um2++qR07djj82fnggw8kSadOndIbb7yhAQMGaN++fapXr5415syZM7r55pt1xx136K677tLHH3+sESNGyMvLS0OGDJHk/Gep4H05ZswYa9v3339v96WYJGVmZqpDhw46duyYHnzwQV199dX6/vvvNWHCBJ04caLQLxEuxcaNG/X111+XyFgAUCQTAFCmFi5caEoy165da548edL8888/zY8++sisUqWK6evra/7111+maZpmVlaWmZ+fb7NtXFyc6e3tbU6ZMsXatmDBAlOSOWPGDLvnslgs1u0kma+88opdTKNGjcwOHTpYH3/zzTemJLNatWpmamqqtf3jjz82JZmzZs2yjl2nTh2ze/fu1ucxTdPMzMw0Y2JizK5du9o9V7t27czGjRtbH588edKUZD733HPWtqNHj5ru7u7m1KlTbbbdu3ev6eHhYdd+6NAhU5L53nvvWduee+458/z/5G3atMmUZC5evNhm21WrVtm1V69e3ezVq5dd7iNHjjQv/M/ohbmPHz/eDAsLM1u2bGnzmn7wwQemm5ubuWnTJpvt58+fb0oyN2/ebPd85+vQoYPNeKZpmk899ZQpyUxMTLzodtOnTzclmadOnbK2ZWZm2sTk5OSYjRs3Njt37nxJ+1bwfrnrrrvMKlWqmNnZ2da+OnXqmHfffbcpyVy2bJm1vVmzZmZYWJh5+vRpa9tPP/1kurm5mffdd5/dfpT0++bC1/Prr782JZk9evSwmeNDhw6Zbm5uZr9+/ew+i+e/5y98Lb755hu7voLP4MKFC23ao6KizJtvvtn6uODvQ1xcnGma5/4OXH311WbPnj3tth88eLBZvXp16+N9+/aZbm5u1tiCMS7mwu1N0zTffvttU5K5fft20zRNMzEx0fTy8jK7detm8xrMnTvXlGQuWLCgyOe48LNomqa5Zs0aU5L58ccfW9s6dOhgSjJfe+01a1t2drb1vZKTk2OapnOfJUnm7bffbnp
4eJg//PCDtX3o0KHW9+XIkSOt7c8//7zp7+9v/vrrrzZjP/nkk6a7u7v5xx9/mKZ5aX9Pz39PtG7d2jpH57+HAaCkcdo/AJQTXbp0UWhoqKKjozVw4EBVqlRJn332mapVqyZJ8vb2lpvbuT/b+fn5On36tCpVqqR69epp165d1nGWL1+uqlWr6pFHHrF7jsJOYXbUfffdp4CAAOvj22+/XZGRkdYjVrt379ahQ4d099136/Tp0zp16pROnTqljIwM3XTTTdq4caPNqbLSucsTfHx8inzeTz/9VBaLRXfccYd1zFOnTikiIkJ16tTRN998YxNfcMq0t7f3RcdctmyZgoKC1LVrV5sxW7ZsqUqVKtmNmZubaxN36tSpYo+mHjt2THPmzNGzzz5rd7R32bJlatCggerXr28zZsGlHhc+f2EKcjp58qS2bNmizz77TE2bNlXVqlVt4tLS0pSYmKgtW7boP//5jxo1aqSQkBBrf8FZG9K5I60pKSm64YYbbN5Tzuxbgd69e8swDH3xxReSpE2bNumvv/7SnXfeaRN34sQJ7d69W/fff79NXk2bNlXXrl0LPSJaGu+bAqZpasKECerfv79at25t07dixQpZLBZNnDjR+lkscKmfrfT0dJ06dUrHjh3T22+/rfj4eN10000XjZ83b55Onz6t5557rtixJ0yYoBYtWmjAgAEO52OxWKyv1e7du/X+++8rMjLSeubC2rVrlZOTozFjxti8BsOGDVNgYKDDt0cseI4DBw5o/vz58vf3V5s2bWxiPDw89OCDD1ofe3l56cEHH1RiYqJ27twpyfnPUnh4uHr16qWFCxdKOnd0/+OPP9YDDzxgl+OyZct0ww03qHLlyjZjd+nSRfn5+dq4caNNfGZmpt3fifz8/CJfh08//VQ7duzQSy+95NDrBgCXg9P+AaCcmDdvnurWrSsPDw+Fh4erXr16Nv/n2mKxaNasWXrjjTcUFxdn838qCy4NkM5dLlCvXj15eJTsn/g6derYPDYMQ7Vr17ZeR3zo0CFJ0uDBgy86RkpKiipXrmx9fOrUKbtxL3To0CGZpnnRuAtPMS647vhiRWnBmCkpKQoLCyu0PzEx0ebxmjVrFBoaWmSeF3ruuecUFRWlBx980O769kOHDunAgQMXHfPC5y/M999/b7N9nTp1tGLFCrsidNiwYVq6dKkk6brrrtPXX39tE/PVV1/phRde0O7du22ukS6qmC1q3wp4enrqnnvu0YIFC3T77bdrwYIF6t+/vwIDA23iCk63Pv907wINGjTQ6tWrlZGRYbNoXWm8bwosXrxYP//8sz7++GMtWbLEpu/IkSNyc3NTw4YNi3xuZzzyyCM2X9Q98MADeuyxxwqNTUlJ0YsvvqixY8cqPDy8yHG/++47ffnll1q3bp31UhBH/Pnnnzbvq8jISC1fvtz6ebrYfHl5ealmzZqFnj5fmPOfIzAwUIsXL1Z0dLRNTFRUlN1ihXXr1pV07jr7Nm3aXNJn6YEHHtADDzyg1157TcuWLVPlypWtXxac79ChQ9qzZ4/DYz/33HOFfilzsbnKz8/XU089pUGDBqlp06aFxgBASaL4B4ByolWrVjar/V/oxRdf1LPPPqshQ4bo+eefV0hIiNzc3DRmzBi7I+ploSCHV155Rc2aNSs05vyCPCcnRydOnFDXrl2LHdcwDP33v/+Vu7t7kWNKUnx8vCQpIiKiyDHDwsK0ePHiQvsv/D/7rVu31gsvvGDTNnfuXH3++eeFbn/gwAEtWrRIH374YaFFpsViUZMmTTRjxoxCt7+wCCpM06ZN9dprr0mSTp48qdmzZ6tjx47atWuXzb4/88wzeuCBB3TkyBFNnz5dAwcO1Nq1a+Xh4aFNmzapT58+uvHGG/XGG28oMjJSnp6eWrhwoV3h6+i+nW/IkCFq3ry5Dh48qGXLllnPArgcpfW+KRj72Wef1dChQ61FZmkbN26cunXrpvz8fP3888+aMmWKTNO0Hpk+38svvyw3NzeNGzdOp0+fLnLcJ554Qt27d1fnzp1tFgUsTnh4uD788ENJ575sWLBggXr06KHvvvtOTZo0cWrfihIbGytJysjI0PLly3XHHXfoq6++KnZeL3Qpn6VevXrJy8tLK1as0MKFCzV48GC7MzkKxu7atavGjx9f6NgXvkeGDx9ud5ZFUbeLfPfdd3X06FGtXr36ojEAUJIo/gHARXzyySfq1KmT3n33XZv25ORkm1O9a9WqpW3btik3N7dEFq0rUHBkv4Bpmjp8+LD1iFXBQoKBgYHq0qVLseP99NNPys3NLfILj4JxTdNUTEyMQwXZ/v37ZRhGoUeSzx9z7dq1at++vc1p7xdTtWpVu30qalG+CRMmqFmzZnanuJ///D/99JNuuummSz5dvHLlyjY5dezYUVFRUVq4cKHNoo2NGzdW48aNJUlNmjTRjTfeqNjYWPXs2VPLly+Xj4+PVq9ebXOZRGGFp6P7dr4mTZqoefPmuuOOOxQaGqpOnTrp22+/tYmpXr26JOngwYN22//yyy+qWrWqzdHf0nrfSOdWsE9MTLzoiuu1atWSxWLR/v37L/oFl7MaNmxoncfu3bsrOztbTz31lKZOnWpdxFOSjh8/rlmzZmnatGkKCAgosvhfsWKFtmzZUuSlGxfj4+Nj877q06ePQkJCNHfuXL311ls281WzZk1rXE5OjuLi4hz67Euyibv11lu1bds2vfrqqzbF//Hjx+3O+vj1118lyeauBs5+ljw8PHTvvfdq6tSp+vnnn7VgwYJC42rVqqX09HSH96lOnTp2sRe7zWJmZqYmT56shx9+2PqaAkBp45p/AHAR7u7uMk3Tpm3ZsmU6duyYTVv//v116tQpzZ07126MC7d3xvvvv6+0tDTr408++UQnTpxQz549JUktW7ZUrVq19Oqrr1pXOz/fyZMn7XJ3d3cv9DZ657vtttvk7u6uyZMn2+VvmqZNEZSXl6fly5erVatWRZ72f8cddyg/P7/QFdXz8vIcumXZxWzZskWff/65XnrppYsWI3fccYeOHTumd955x67v7NmzysjIcPp5z549K0lF3irw1KlTNjEFt907/xKSo0ePXvSLDUf27UJDhgzRnj17rLeRu1BkZKSaNWum9957z+Z137dvn9asWWN3N4rSeN9I59ZGmDp1qh577LGLnjXSt29fubm5acqUKXZn21zOZ+t8BfN4/u3+pHN3GQgPD9dDDz1U5PYFp5LffffdJfIFRU5OjvLy8qzvmS5dusjLy0uzZ8+22ed3331XKSkpRd4hoqicc3Jy7N67eXl5euutt2xyeeuttxQaGqqWLVtKuvTP0pAhQ7R3717deOONNl9inO+OO+7Qli1bCj0yn5ycrLy8PIf38UKzZs1SRkaGnn766UseAwCcxZF/AHARt9xyi6ZMmaIHHnhA7dq10969e7V48WK7/+N633336f3339fYsWO1fft23XDDDcrIyNDatWv18MMP69Zbb72k5w8JCdH111+vBx54QAkJCZo5c6Zq165tPa3Vzc1N//d//6
eePXuqUaNGeuCBB1StWjUdO3ZM33zzjQIDA/Xll18qIyND8+bN0+zZs1W3bl2b+10XfGmwZ88ebdmyRW3btlWtWrX0wgsvaMKECTp69Kj69u2rgIAAxcXF6bPPPtPw4cP173//W2vXrtWzzz6rPXv26MsvvyxyXzp06KAHH3xQ06ZN0+7du9WtWzd5enrq0KFDWrZsmWbNmqXbb7/9kl6nNWvWqGvXrkUeLbz33nv18ccf66GHHtI333yj9u3bKz8/X7/88os+/vhjrV69utgj2wkJCdbTs0+dOqW33npLHh4e1qL4nXfe0caNG9WiRQsFBgZq//79eueddxQZGWldUK5Xr16aMWOGevToobvvvluJiYmaN2+eateurT179lzSvl1o2LBhGjBggIKCgi4a88orr6hnz55q27athg4dar3VX1BQkPUofGm9bwrs2rVLVatWvegp3pJUu3ZtPf3003r++ed1ww036LbbbpO3t7d27NihqKioIm99eDFbtmyRh4eH9bT/OXPmqHnz5na3N1yzZo0WL14sLy+vIsf766+/5OXldcm3jsvIyLA57f+DDz5QVlaW+vXrJ+ncJTETJkzQ5MmT1aNHD/Xp00cHDx7UG2+8oeuuu0733HOPQ89T8BwZGRlasWKFjh49anP7PencNf8vv/yyjh49qrp162rp0qXavXu33n77betZTZf6WWrQoIFOnTpV5Jk/48aN0xdffKFbbrlF999/v1q2bKmMjAzt3btXn3zyiY4ePWq3wKaj1qxZo6lTp9qs1wIApa4M7jAAADhPwa28duzYUWRcVlaW+fjjj5uRkZGmr6+v2b59e3PLli2F3vYtMzPTfPrpp82YmBjT09PTjIiIMG+//XbzyJEjpmle2q2p/vOf/5gTJkwww8LCTF9fX7NXr17m77//brf9jz/+aN52221mlSpVTG9vb7N69ermHXfcYa5bt87muYv7GTx4sM24y5cvN6+//nrT39/f9Pf3N+vXr2+OHDnSPHjwoGmapvnII4+YN954o7lq1Sq7nAq7vZhpnruNWcuWLU1fX18zICDAbNKkiTl+/Hjz+PHj1hhnb/VnGIa5c+dOm/bC5ignJ8d8+eWXzUaNGpne3t5m5cqVzZYtW5qTJ082U1JS7J7vwvHOf62Cg4PN9u3bm19//bU15ttvvzVvuOEGMzg42PT29jZr1KhhDhs2zO52b++++65Zp04d09vb26xfv765cOHCQl8vR/et4P1y/q38znex/rVr15rt27c3fX19zcDAQLN3797m/v37rf2l9b45//V8/fXXbba92PtmwYIFZvPmza3z1qFDBzM2Nvai+1rUrf4Kftzc3MyrrrrKHDx4sPX2nqb599+HZs2a2dxOsLBbBQ4ePNiUZD766KM2z3Xh7QIvpmD7gp9KlSqZLVq0MD/44AO72Llz55r169c3PT09zfDwcHPEiBHmmTNnihzfNP9+TQt+fH19zYYNG5qvv/66zf516NDBbNSokfnDDz+Ybdu2NX18fMzq1aubc+fOtRvT0c+SLriV34UK609LSzMnTJhg1q5d2/Ty8jKrVq1qtmvXznz11Vettxu8lL+nkZGRZkZGht3zc6s/AKXJMM0SOk8NAFAhbdiwQZ06ddKyZcsu+Wj4+Y4ePaqYmBjFxcXZHd0sMGnSJB09etSphcpQsfG++Wfp2LGjTp06pX379pV1KgBQYXDNPwAAAAAAFRzX/AMArqhKlSpp0KBBRS7I17RpU5uVzgHeNwAAXB6KfwDAFVW1alXrYl8Xc9ttt12hbOAqeN8AAHB5uOYfAAAAAIAKrkyv+d+4caN69+6tqKgoGYZx0fsKS9JDDz0kwzA0c+ZMm/akpCQNGjRIgYGBCg4O1tChQ+3uL71nzx7dcMMN8vHxUXR0tKZPn243/rJly1S/fn35+PioSZMml3yLHAAAAAAAypsyLf4zMjJ0zTXXaN68eUXGffbZZ9q6dWuh1/ENGjRIP//8s2JjY/XVV19p48aNGj58uLU/NTVV3bp1U/Xq1bVz50698sormjRpkt5++21rzPfff6+77rpLQ4cO1Y8//qi+ffuqb9++rDALAAAAAKgQys1p/4Zh6LPPPlPfvn1t2o8dO6bWrVtr9erV6tWrl8aMGaMxY8ZIkg4cOKCGDRtqx44duvbaayVJq1at0s0336y//vpLUVFRevPNN/X0008rPj5eXl5ekqQnn3xSK1as0C+//CJJuvPOO5WRkaGvvvrK+rxt2rRRs2bNNH/+fIfyt1gsOn78uAICAmQYxmW+GgAAAAAAFM00TaWlpSkqKkpubkUf2y/XC/5ZLBbde++9GjdunBo1amTXv2XLFgUHB1sLf0nq0qWL3NzctG3bNvXr109btmzRjTfeaC38Jal79+56+eWXdebMGVWuXFlbtmzR2LFjbcbu3r17kZchZGdnKzs72/r42LFjatiw4WXsLQAAAAAAzvvzzz911VVXFRlTrov/l19+WR4eHho9enSh/fHx8QoLC7Np8/DwUEhIiOLj460xMTExNjHh4eHWvsqVKys+Pt7adn5MwRiFmTZtmiZPnmzXvmvXriJvQ4Qrw2KxKDU1VYGBgcV+A4byhblzTcyb62LuXBdz55qYN9fF3JUPVW+4Qe4JCcoPD9epTZuKj99yvdxzEpXvFaZTbb+z6bth6Q1KyEhQuH+4Nt1Z/FjlUXp6ulq0aKGAgIBiY8tt8b9z507NmjVLu3btKpen0U+YMMHmbIHU1FRFR0crJiZGgYGBZZgZpHN/nE+ePKnQ0FD+OLsY5s41MW+ui7lzXcyda2LeXBdzVz4YHh4yJJkeHgqqVav4+H2eMs5Kpq+nXbyHn4eUf+53LQfGKo9SU1MlyaGaudwW/5s2bVJiYqKuvvpqa1t+fr4ef/xxzZw5U0ePHlVERIQSExNttsvLy1NSUpIiIiIkSREREUpISLCJKXhcXExBf2G8vb3l7e1t1+7m5sYfg3LCMAzmw0Uxd66JeXNdzJ3rYu5cE/Pmupi78sOQZDgwD+cvcFfUvLnqnDqTd7ndw3vvvVd79uzR7t27rT9RUVEaN26cVq9eLUlq27atkpOTtXPnTut269evl8ViUevWra0xGzduVG5urjUmNjZW9erVU+XKla0x69ats3n+2NhYtW3btrR3EwAAAACAUlemR/7T09N1+PBh6+O4uDjt3r1bISEhuvrqq1WlShWbeE9PT0VERKhevXqSpAYNGqhHjx4aNmyY5s+fr9zcXI0aNUoDBw603hbw7rvv1uTJkzV06FA98cQT2rdvn2bNmqXXX3/dOu6jjz6qDh066LXXXlOvXr300Ucf6YcffrC5HSAAAAAAAK6qTIv/H374QZ06dbI+LriGfvDgwVq0aJFDYyxevFijRo3STTfdJDc3N/Xv31+zZ8+29gcFBWnNmjUaOXKkWrZsqapVq2rixIkaPny4NaZdu3ZasmSJnnnmGT311FOqU6eOVqxYocaNG5fMjv6PaZrKy8tTfn5+iY4LexaLRbm5ucrKyiq3p/C4u7vLw8OjX
K5pAQAAAJRLO3ZI+fmSu7tD4Wa3bTp5MkFVQ8N14f/r3jFsh/LNfLkbjo3l6gzTNM3iw1Cc1NRUBQUFKSUlpdAF/3JycnTixAllZmaWQXb/PKZpymKxyM3NrVwX135+foqMjLS5FeU/ncViUWJiosLCwsrtFzewx7y5LubOdTF3rol5c13MnWuq6PNWXB16vnK74F9FYrFYFBcXJ3d3d0VFRcnLy6tcF6QVQcFZFuX1yLppmsrJydHJkycVFxenOnXqVMg/RgAAAADKB4r/KyAnJ0cWi0XR0dHy8/Mr63T+Ecp78S9Jvr6+8vT01O+//66cnBz5+PiUdUoAAAAAKiiK/yuII7u4EO8JAAAAwAlvvy2lp0uVKknnreN2UYffll9yvJQaIdV9yHaonW8rPSddlbwqaXhLB8ZycRT/AAAAAADXMGWKdOyYVK2aQ8W/8fMLCjx7TKZvNbvif8q3U3Qs7ZiqBVT7RxT/HHYEAAAAAKCC48h/GVt2JOWKPdeAWkFX7LkAAAAAAOUHR/5RpPvvv1+GYdj99OjR44rlMGnSJDVr1uyKPR8AAAAAVDQc+UexevTooYULF9q0eXt7l1E2AAAAAABnceQfxfL29lZERITNT+XKlXX33XfrzjvvtInNzc1V1apV9f7770uSLBaLpk2bppiYGPn6+uqaa67RJ598Yo3fsGGDDMPQunXrdO2118rPz0/t2rXTwYMHJUmLFi3S5MmT9dNPP1nPOli0aJFM09SkSZN09dVXy9vbW1FRURo9evSVe1EAAAAAwIVw5B+XbNCgQRowYIDS09NVqVIlSdLq1auVmZmpfv36SZKmTZumDz/8UPPnz1edOnW0ceNG3XPPPQoNDVWHDh2sYz399NN67bXXFBoaqoceekhDhgzR5s2bdeedd2rfvn1atWqV1q5dK0kKCgrS8uXL9frrr+ujjz5So0aNFB8fr59++unKvwgAAAAA4AIo/lGsr776ylrcF3jqqac0fvx4+fv767PPPtO9994rSVqyZIn69OmjgIAAZWdn68UXX9TatWvVtm1bSVLNmjX13Xff6a233rIp/qdOnWp9/OSTT6pXr17KysqSr6+vKlWqJA8PD0VERFjj//jjD0VERKhLly7y9PTU1VdfrVatWpX2SwEAAAAALoniH8Xq1KmT3nzzTZu2kJAQeXh46I477tDixYt17733KiMjQ59//rk++ugjSdLhw4eVmZmprl272mybk5Oj5s2b27Q1bdrU+u/IyEhJUmJioq6++upCcxowYIBmzpypmjVrqkePHrr55pvVu3dveXjwlgYAAACAC1EpoVj+/v6qXbt2oX2DBg1Shw4dlJiYqNjYWPn6+lrvBJCeni5JWrlypapVq2az3YULBnp6elr/bRiGpHPrBVxMdHS0Dh48qLVr1yo2NlYPP/ywXnnlFX377bc2YwEAAACoQOrWlYKCpPBwx+ID6irXzV8e/tXsuupWqasgnyCF+zs4louj+MdladeunaKjo7V06VL997//1YABA6zFd8OGDeXt7a0//vjD5hR/Z3l5eSk/P9+u3dfXV71791bv3r01cuRI1a9fX3v37lWLFi0u+bkAAAAAlGPr1zsVbnZeq9OJiQoLC5Nx4VCDnRvL1VH8o1jZ2dmKj4+3afPw8FDVqlUlSXfffbfmz5+vX3/9Vd988401JiAgQP/+97/12GOPyWKx6Prrr1dKSoo2b96swMBADR482KHnr1GjhuLi4rR7925dddVVCggI0H/+8x/l5+erdevW8vPz04cffihfX19Vr1695HYcAAAAQLm07EhKkf0DagVdoUxcB8V/GXOFN+WqVaus1+EXqFevnn755RdJ5079nzp1qqpXr6727dvbxD3//PMKDQ3VtGnT9Ntvvyk4OFgtWrTQU0895fDz9+/fX59++qk6deqk5ORkLVy4UMHBwXrppZc0duxY5efnq0mTJvryyy9VpUqVy99hAAAAAKhgDNM0zbJOoiJITU1VUFCQUlJSFBgYaNOXlZWluLg4xcTEyMfHp4wy/GcxTVN5eXny8PCwriFQHvHesGexWJT4v1Oz3NzcyjodOIh5c13Mneti7lwT8+a6mLvyxdEj/xV93oqqQy/EkX8AAAAAgGsYNEg6dUqtfIK0fcY7xYYbW+5R5bTjMgKipPZLbIf6dJBOZZ5SVb+qWnzb4tLKuNyg+AcAAAAAuIZvv5WOHVNoeJRj8Ykb5X32mMyz9qv9f3v0Wx1LO6ZqAfZ9FVHFO+8BAAAAAADYoPgHAAAAAKCCo/gHAAAAAKCCo/gHAAAAAKCCo/gHAAAAAKCCo/gHAAAAAKCCo/gHAAAAAKCCo/hHubd582Y1adJEnp6e6tu370XbAAAAAACF8yjrBFD+xcfHa+rUqVq5cqWOHTumsLAwNWvWTGPGjNFNN93k0BgbNmxQp06ddObMGQUHBzv1/GPHjlWzZs303//+V5UqVbpom6P7Mm3aNK1cuVJ//fWXgoKCVLt2bd1zzz0aPHiw/Pz8nMoNAAAAQMladiTlon0N+98rz7RU5QYEOjSWWetfykw5Id+gSBkX9A1rMUwp2SkK8g66jGxdB8U/inT06FG1b99ewcHBeuWVV9SkSRPl5uZq9erVGjlypH755ZdSz+HIkSN66KGHdNVVVxXZVpzffvvNui8vvviimjRpIm9vb+3du1dvv/22qlWrpj59+pTGLgAAAAAoAftHP+ncBo0nKi0xUb5hYXZdz3V8roSycg2c9o8iPfzwwzIMQ9u3b1f//v1Vt25dNWrUSGPHjtXWrVslnfuCwDAM7d6927pdcnKyDMPQhg0bdPToUXXq1EmSVLlyZRmGofvvv1+SlJ2drdGjRyssLEw+Pj66/vrrtWPHDptxT58+rSFDhsgwDC1atKjQNkf3xcPDQz/88IPuuOMONWjQQDVr1tStt96qlStXqnfv3tbYGTNmqEmTJvL391d0dLQefvhhpaenW/sXLVqk4OBgrVixQnXq1JGPj4+6d++uP//88zJebQAAAAAoHRT/ZW3GDOmqq4r/KeyIdJ8+jm07Y8YlpZaUlKRVq1Zp5MiR8vf3t+t39PT96OhoLV++XJJ08OBBnThxQrNmzZIkjR8/XsuXL9d7772nXbt2qXbt2urevbuSkpIUHR2tEydOKDAwUDNnztSJEyc0YMAAu7Y777zT+qXAxZw+fVpr1qy56L5Istnezc1Ns2fP1s8//6z33ntP69ev1/jx423iMzMzNXXqVL3//vvavHmzkpOTNXDgQIdeEwAAAAC4kjjtv6ylpkrHjhUfFx1t33bypGPbpqY6n5ekw4cPyzRN1a9f/5K2L+Du7q6QkBBJUlhYmPVLg4yMDL355ptatGiRevbsKUl65513FBsbq3fffVfjxo1TRESEDMNQUFCQIiIiJEn+/v52bUFBQapXr16x+3JhTNWqVZWVlSVJGjlypF5++WVJ0pgxY6wxNWrU0AsvvKCHHnpIb7zxhrU9NzdXc+fOVevWrSVJ7733nho0aKDt27erVatWl/pyAQAAAECJo/gv
a4GBUrVqxceFhhbe5si2gY4thnEh0zQvaTtHHTlyRLm5uWrfvr21zdPTU61atdKBAwecGqtfv37q16+f0zls375dFotFgwYNUnZ2trV97dq1mjZtmn755RelpqYqLy9PWVlZyszMtC4K6OHhoeuuu866Tf369RUcHKwDBw5Q/AMAAACloFf7hvJLOK7M8Cit3Ly/2Hjj86sVcfaYTN9qUr+/bPqumnGVjqUdU7WAavpr7F8XGaHioPgva2PHnvu5FF98UbK5XKBOnToyDKPYRf3c3M5dPXL+lwW5ubmlmpuzateuLcMwdPDgQZv2mjVrSpJ8fX2tbUePHtUtt9yiESNGaOrUqQoJCdF3332noUOHKicnhzsCAAAAAHA5XPOPiwoJCVH37t01b948ZWRk2PUnJydLkkL/d1bCiRMnrH3nL/4nSV5eXpKk/Px8a1utWrXk5eWlzZs3W9tyc3O1Y8cONWzYsKR2Q5JUpUoVde3aVXPnzi10X863c+dOWSwWvfbaa2rTpo3q1q2r48eP28Xl5eXphx9+sD4+ePCgkpOT1aBBgxLNHQAAAAAuF8U/ijRv3jzl5+erVatWWr58uQ4dOqQDBw5o9uzZatu2raRzR83btGmjl156SQcOHNC3336rZ555xmac6tWryzAMffXVVzp58qTS09Pl7++vESNGaNy4cVq1apX279+vYcOGKTMzU0OHDnUqz88++6zYtQneeOMN5eXl6dprr9XSpUt14MABHTx4UB9++KF++eUXubu7Szp3lkBubq7mzJmj3377TR988IHmz59vN56np6ceeeQRbdu2TTt37tT999+vNm3acMo/AAAAgHKH4h9Fqlmzpnbt2qVOnTrp8ccfV+PGjdW1a1etW7dOb775pjVuwYIFysvLU8uWLTVmzBi98MILNuNUq1ZNkydP1pNPPqnw8HCNGjVKkvTSSy+pf//+uvfee9WiRQsdPnxYq1evVuXKlZ3KMyUlxe6U/gvVqlVLP/74o7p06aIJEybommuu0bXXXqs5c+bo3//+t55//nlJ0jXXXKMZM2bo5ZdfVuPGjbV48WJNmzbNbjw/Pz898cQTuvvuu9W+fXtVqlRJS5cudSpvAAAAALgSDLO0V3X7h0hNTVVQUJBSUlIUeMECe1lZWYqLi1NMTIx8fHzKKMN/FtM0lZeXJw8PjyJvAXipFi1apDFjxlgvfbhUvDfsWSwWJSYmKiwszLqeBMo/5s11MXeui7lzTcyb62LurpxlR1Iu2ufogn8DagVJkszPrpLxvwX/jAq44F9RdeiFeNcCAAAAAFDBsdo/AAAAAOAfZdmRFJ3NP3cS/Nl8s9CzDQrOHqgoOPIPXIL777//sk/5BwAAAIArheIfAAAAAIAKjtP+AQAAAAAuYftrb8ktJ0cWLy+H4s027+tMUoKCQ8J14TLgj7R7S7mWHHm6OTaWq6P4BwAAAAC4hJNtbnBug/COyjESpbAwu65GEU6O5eI47R8AAAAAgAqO4h8AAAAAgAqO0/4BAAAAoJwr7FZ056tot6W7mNCtm6zX/Dt0CUDCBnklJUhmuBTZ2abr5/hN1mv+/wmXAHDkH+Xe5s2b1aRJE3l6eqpv374XbbsSOnbsqDFjxlgf16hRQzNnzixyG8MwtGLFilLNCwAAAPgnaPX4g7pxSH+1evxBh+KNrfcp5Ke7ZWy9z65vzvcP6sX1/TXne8fGcnUU/yhWfHy8HnnkEdWsWVPe3t6Kjo5W7969tW7dOofH2LBhgwzDUHJystPPP3bsWDVr1kxxcXFatGjRRduK0rt3b/Xo0aPQvk2bNskwDO3Zs8fp3Hbs2KHhw4c7vR0AAAAAXEkU/yjS0aNH1bJlS61fv16vvPKK9u7dq1WrVqlTp04aOXLkFcnhyJEj6ty5s6666ioFBwdftK0oQ4cOVWxsrP766y+7voULF+raa69V06ZNnc4tNDRUfn5+Tm8HAAAAAFcSxT+K9PDDD8swDG3fvl39+/dX3bp11ahRI40dO1Zbt26VdO4LAsMwtHv3but2ycnJMgxDGzZs0NGjR9WpUydJUuXKlWUYhu6//35JUnZ2tkaPHq2wsDD5+Pjo+uuv144dO2zGPX36tIYMGSLDMLRo0aJC24pzyy23KDQ01C42PT1dy5Yt09ChQ3X69Gndddddqlatmvz8/NSkSRP95z//KXLcC0/7P3TokG688Ub5+PioYcOGio2NLTY3AAAAAChtLPhX1g7MkH6ZUXxcSAupwxe2bd/2kZJ2Fb9t/bFSg7FOp5aUlKRVq1Zp6tSp8vf3t+t35Ii7JEVHR2v58uXq37+/Dh48qMDAQPn6+kqSxo8fr+XLl+u9995T9erVNX36dHXv3l2HDx9WdHS0Tpw4oXr16mnKlCm68847FRAQoB49eti0BQUFadGiRXrggQdkmmahOXh4eOi+++7TokWL9PTTT8swDEnSsmXLlJ+fr7vuukvp6elq2bKlnnjiCQUGBmrlypW69957VatWLbVq1arY/bRYLLrtttsUHh6ubdu2KSUlxWZ9AAAAAAAoKxT/ZS03VTp7rPi4rOhC2k46tm1uqvN5STp8+LBM01T9+vUvafsC7u7uCgkJkSSFhYVZvzTIyMjQm2++qUWLFqlnz56SpHfeeUexsbF69913NW7cOEVERMgwDAUFBSkiIkKS5O/vb9cWFBSkevXqFZnHkCFD9Morr+jbb79Vx44dJZ075b9///4KCgpSUFCQ/v3vf1vjH3nkEa1evVoff/yxQ8X/2rVr9csvv2j16tWKioqSJL344ovWfQMAAAAKU9xK/kBJoPgva56Bkm+14uN8Qgtvc2Rbz0Dn85IuehS9pBw5ckS5ublq3769tc3T01OtWrXSgQMHnBqrX79+6tevX5Ex9evXV7t27bRgwQJ17NhRhw8f1qZNmzRlyhRJUn5+vl588UV9/PHHOnbsmHJycpSdne3wNf0HDhxQdHS0tfCXpLZt2zq1HwAAAABQGij+y1qDSzslX5L9ZQAlrE6dOjIMQ7/88kuRcW5u55aOOP/Lgtzc3FLN7VINHTpUjzzyiObNm6eFCxeqVq1a6tChgyTplVde0axZszRz5kw1adJE/v7+GjNmjHJycso4awAAAAC4PGW64N/GjRvVu3dvRUVF2d0LPTc3V0888YS1CIuKitJ9992n48eP24yRlJSkQYMGKTAwUMHBwRo6dKjS09NtYvbs2aMbbrhBPj4+io6O1vTp0+1yWbZsmerXry8fHx81adJEX3/9danssysJCQlR9+7dNW/ePGVkZNj1F9y2LzT03FkJJ06csPadv/ifJHl5eUk6d3S9QK1ateTl5aXNmzdb23Jzc7Vjxw41bNiwpHbDxh133CE3NzctWbJE77//vnXRQEnavHmzbr31Vt1zzz265pprVLNmTf36668Oj92gQQP9+eefNq9DwaKIAAAAAK6cZUdStOxIis7mWSRJZ/Ms1rZ/6mUWZVr8Z2Rk6JprrtG8efPs+jIzM7Vr1y49++yz2rVrlz799FMdPHhQffr0sYkbNGiQfv7
5Z8XGxuqrr77Sxo0bbe67npqaqm7duql69erauXOnXnnlFU2aNElvv/22Neb777/XXXfdpaFDh+rHH39U37591bdvX+3bt6/0dt5FzJs3T/n5+WrVqpWWL1+uQ4cO6cCBA5o9e7b1lHZfX1+1adNGL730kg4cOKBvv/1WzzzzjM041atXl2EY+uqrr3Ty5Emlp6fL399fI0aM0Lhx47Rq1Srt379fw4YNU2ZmpoYOHepUnp999plDaxNUqlRJd955pyZMmKATJ05Y7zognTvTITY2Vt9//70OHDigBx98UAkJCQ7n0KVLF9WtW1eDBw/WTz/9pE2bNunpp592aj8AAAAAoDSU6Wn/PXv2vOhiaEFBQXa3SZs7d65atWqlP/74Q1dffbUOHDigVatWaceOHbr22mslSXPmzNHNN9+sV199VVFRUVq8eLFycnK0YMECeXl5qVGjRtq9e7dmzJhh/ZJg1qxZ6tGjh8aNGydJev755xUbG6u5c+dq/vz5pfgKlH81a9bUrl27NHXqVD3++OM6ceKEQkND1bJlS7355pvWuAULFmjo0KFq2bKl6tWrp+nTp6tbt27W/mrVqmny5Ml68skn9cADD1hX3n/ppZdksVh07733Ki0tTddee61Wr16typUrO5VnSkqKDh486FDs0KFD9e677+rmm2+2uT7/mWee0W+//abu3bvLz89Pw4cPV9++fZWS4tg3g25ubvrss880dOhQtWrVSjVq1NDs2bPVo0cPp/YFAAAAQOFWbt7vXHzzn2WkJckMCLHrm3+bc2O5OsMs7VXdHGQYhj777DP17dv3ojFr165Vt27dlJycrMDAQC1YsECPP/64zpw5Y43Jy8uTj4+Pli1bpn79+um+++5TamqqzSUF33zzjTp37qykpCRVrlxZV199tcaOHWtzW7bnnntOK1as0E8//VRoLtnZ2crOzrY+Tk1NVXR0tM6cOaPAQNsF9rKysnT06FHFxMTIx8fHuRcGlyw3N1eenp5lnUaRsrKyFBcXpxo1avDe+B+LxaKTJ08qNDTUup4Eyj/mzXUxd66LuXNNzJvrKs25W/7b5Z+G3r9mUAlkUj6UxOthZZp/F///u9zXUa7wmqampqpy5cpKSUmxq0Mv5DIL/mVlZemJJ57QXXfdZd2p+Ph4hYWF2cR5eHgoJCRE8fHx1piYmBibmPDwcGtf5cqVFR8fb207P6ZgjMJMmzZNkydPtms/efKksrKybNpyc3NlsViUl5envLw8B/cYl8M0Tev6AoaTH/IrKS8vTxaLRadPny73X1RcKRaLRSkpKTJNk/9T5EKYN9fF3Lku5s41MW+uqzTnzkizX1/LWYmJ2cUHuYiSeD3+Zso4myYZ0v/+x2Gu8JqmpaU5HOsSxX9ubq7uuOMOmaZpc6p5WZowYYLGjv17lf6CI/+hoaGFHvlPS0uTh4eHPDxc4iWvMMp7Qe3h4SE3NzdVqVKFI///Y7FYZBgGR0RcDPPmupg718XcuSbmzXWV5tyZ6Zd/pDssrPwfpXZUSbwefw9mSqZkVnL+yL8rvKbO1BDlvhItKPx///13rV+/3qawjoiIUGJiok18Xl6ekpKSFBERYY25cNG2gsfFxRT0F8bb21ve3t527W5ubnZ/DNzc3GQYhvUHpc80TetrXZ5f84L3RGHvm38yXhPXxLy5LubOdTF3rol5c12lNnfG5Y9Xod5PRbweDWe/JM+0VOUGBGr/6CeLHarhsWnyPJuo3LQw7b9qgk3fsj0vKTMnVX5egRrQ1H4sV3hNncmxXO9NQeF/6NAhrV27VlWqVLHpb9u2rZKTk7Vz505r2/r162WxWNS6dWtrzMaNG23uOx8bG6t69epZF5Vr27at1q1bZzN2bGysdTV7AAAAAEDZi1n6vuoufEMxS993LD7xfdVNWqiYRPv4dYff18pf3tC6w46N5erKtPhPT0/X7t27rfeEj4uL0+7du/XHH38oNzdXt99+u3744QctXrxY+fn5io+PV3x8vHJyciSdu696jx49NGzYMG3fvl2bN2/WqFGjNHDgQOsq7nfffbe8vLw0dOhQ/fzzz1q6dKlmzZplc8r+o48+qlWrVum1117TL7/8okmTJumHH37QqFGjrvhrAgAAAABASSvT4v+HH35Q8+bN1bx5c0nS2LFj1bx5c02cOFHHjh3TF198ob/++kvNmjVTZGSk9ef777+3jrF48WLVr19fN910k26++WZdf/31evvtt639QUFBWrNmjeLi4tSyZUs9/vjjmjhxovU2f5LUrl07LVmyRG+//bauueYaffLJJ1qxYoUaN2585V4MAAAAAABKSZle89+xY0cVdadBR+5CGBISoiVLlhQZ07RpU23atKnImAEDBmjAgAHFPh8AAAAAAK6mXF/zDwAAAAAALh/FPwAAAAAAFRzFP8q9zZs3q0mTJvL09FTfvn0v2gYAAAAAKBzFP4oVHx+vRx55RDVr1pS3t7eio6PVu3dvu9sjFmXDhg0yDEPJyclOP//YsWPVrFkzxcXFadGiRRdtK4phGEX+TJo0yem8zh97xYoVl7w9AAAAAJS2Ml3wD+Xf0aNH1b59ewUHB+uVV15RkyZNlJubq9WrV2vkyJH65ZdfSj2HI0eO6KGHHtJVV11VZFtRTpw4Yf330qVLNXHiRB08eNDaVqlSpZJLGAAAAADKGY78o0gPP/ywDMPQ9u3b1b9/f9WtW1eNGjXS2LFjtXXrVknnviAwDEO7d++2bpecnCzDMLRhwwYdPXpUnTp1kiRVrlxZhmHo/vvvlyRlZ2dr9OjRCgsLk4+Pj66//nrt2LHDZtzTp09ryJAhMgxDixYtKrStOBEREdafoKAgGYZh0/bRRx+pQYMG8vHxUf369fXGG29Yt83JydGoUaMUGRkpHx8fVa9eXdOmTZMk1ahRQ5LUr18/GYZhfQwAAACg5J1s3V7x13fWydbtHYsPbK94/+t1MtA+vmFYe10T2VkNwxwby9Vx5L+MzdgyQzO2zCg2rkVkC31x1xc2bX3+00e7TuwqdtuxbcdqbNuxTueWlJSkVatWaerUqfL397frDw4Odmic6OhoLV++XP3799fBgwcVGBgoX19fSdL48eO1fPlyvffee6pevbqmT5+u7t276/Dhw4qOjtaJEydUr149TZkyRXfeeacCAgLUo0cPm7agoCAtWrRIDzzwgEO3h7zQ4sWLNXHiRM2dO1fNmzfXjz/+qGHDhsnf31+DBw/W7Nmz9cUXX+jjjz/W1VdfrT///FN//vmnJGnHjh0KCwvTwoUL1aNHD7m7uzv9/AAAAAAcs33GO87F13pbRlqSzIAQu77R1zs3lquj+C9jqdmpOpZ2rNi46KBou7aTmScd2jY1O/WScjt8+LBM01T9+vUvafsC7u7uCgk592ELCwuzfmmQkZGhN998U4sWLVLPnj0lSe+8845iY2P17rvvaty4cYqIiJBhGAoKClJERIQkyd/f364tKChI9erVu6T8nnvuOb322mu67bbbJEkxMTHav3+/3nrrLQ0ePFh//PGH6tSpo+
uvv16GYah69erWbUNDQyWd+yKkIBcAAAAAKG8o/stYoHegqgVUKzYu1C+00DZHtg30Dryk3C7lKLozjhw5otzcXLVv//dpNp6enmrVqpUOHDjg1Fj9+vVTv379nM4hIyNDR44c0dChQzVs2DBre15enoKCgiRJ999/v7p27ap69eqpR48euuWWW9StWzennwsAAACAtOxISlmn8I9E8V/GLvWUfEl2lwGUtDp16sgwjGIX9XNzO7d0xPlfFuTm5pZqbiUlPT1d0rkzDlq3bm3TV3AKf4sWLRQXF6f//ve/Wrt2re644w516dJFn3zyyRXPFwAAAAAuBQv+4aJCQkLUvXt3zZs3TxkZGXb9BbftKzj1/fwV9c9f/E+SvLy8JEn5+fnWtlq1asnLy0ubN2+2tuXm5mrHjh1q2LBhSe1GkcLDwxUVFaXffvtNtWvXtvmJiYmxxgUGBurOO+/UO++8o6VLl2r58uVKSkqSdO5shfP3CwAAAEDp6HBPb3Xr0UYd7untWPyBW9X1SA91OHCrXd/k2N4a+2UbTY51bCxXx5F/FGnevHlq3769WrVqpSlTpqhp06bKy8tTbGys3nzzTR04cEC+vr5q06aNXnrpJcXExCgxMVHPPPOMzTjVq1eXYRj66quvdPPNN8vX11eVKlXSiBEjNG7cOIWEhOjqq6/W9OnTlZmZqaFDhzqV52effaYJEyZc0q0HJ0+erNGjRysoKEg9evRQdna2fvjhB505c0Zjx47VjBkzFBkZqebNm8vNzU3Lli1TRESEde2CGjVqaN26dWrfvr28vb1VuXJlp3MAAAAAULxKcUfkl3BcmWmOrWtWKeuw/HJPKNO0P5h5Iu2IkjKPKzP30tZIczUc+UeRatasqV27dqlTp056/PHH1bhxY3Xt2lXr1q3Tm2++aY1bsGCB8vLy1LJlS40ZM0YvvPCCzTjVqlXT5MmT9eSTTyo8PFyjRo2SJL300kvq37+/7r33XrVo0UKHDx/W6tWrnS6gU1JSdPDgwUvax3/961/6v//7Py1cuFBNmjRRhw4dtGjRIuuR/4CAAE2fPl3XXnutrrvuOh09elRff/219XKH1157TbGxsYqOjlbz5s0vKQcAAAAAKE2GWdqruv1DpKamKigoSCkpKQoMtF1gLysrS3FxcYqJiZGPj08ZZfjPYpqm8vLy5OHhIcMwyjqdi+K9Yc9isSgxMVFhYWHWL1hQ/jFvrou5c13MnWti3lxXac5dSSyAN6BWUAlkcmVczv72at/w3JH/8Cit3Ly/+PhdDc4d+feM1MoWtouKP/RpQyVlHleIX5Tm32Y/liu8pkXVoRfiLw4AAAAAABUcxT8AAAAAABUcxT8AAAAAABUcxT8AAAAAABUcxf8VxNqKuBDvCQAAAABXAsX/FeDp6SlJyszMLONMUN4UvCcK3iMAAAAAUBo8yjqBfwJ3d3cFBwcrMTFRkuTn51eubz9XEZT3W/2ZpqnMzEwlJiYqODhY7u7uZZ0SAAAAUO7tf2S8PDLSledfybH4auPkmXlSuX6hdn23NxmvrNx0+Xg6Nparo/i/QiIiIiTJ+gUASpdpmrJYLHJzcyuXxX+B4OBg63sDAAAAQNHiBt7vXHzY/TLSkmQGhNj1danj3FiujuL/CjEMQ5GRkQoLC1Nubm5Zp1PhWSwWnT59WlWqVJGbW/m8usXT05Mj/gAAAACuCIr/K8zd3Z2C7wqwWCzy9PSUj49PuS3+AQAAAOBKofgHAAAAALgEn8R4Gfn5Mt3dlRVW/OWzPjnxcss9LUtOjrK8o2z6zmTGy2Lmy81wV2W/in8pLsU/AAAAAMAl3NSvs/wSjiszPEorN+8vPv7nm+SXe0KZnpFa2eKATd+EVZ2VlHlcIX5Rmn9b8WO5Os6HBgAAAACggqP4BwAAAACggqP4BwAAAACggqP4BwAAAACggqP4BwAAAACggqP4BwAAAACggqP4BwAAAACggqP4BwAAAACggqP4BwAAAACggvMo6wQAAAAAAHDEtx98Lre8PFk8HCtlv62/Qu4Zp5XvX8Wub+JNnyvfzJO78c8oi/8ZewkAAAAAcHnpNes4F+9bR0ZeFZm+IXZ9UUHOjeXqOO0fAAAAAIAKjuIfAAAAAIAKjtP+AQAAAAAuIfqLZfI4e1Z5vr76s8+A4uNPfSKPzJPKyw7Vn6F32PR9F7dM2Xln5e3hq+tjih/L1VH8AwAAAABcQtOXn5NfwnFlhkc5VPw3/fM5+eWeUKZnpF3x/+GPzykp87hC/KL+EcU/p/0DAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBeZR1AgAAAAAAOCIrNMzmd7HxnuGSaTn3+wLBPmE2vys6in8AAAAAgEtYt2KDc/GN18tIS5IZEGLX99LNzo3l6ij+AQAAAAAlYtmRlLJOARfBNf8AAAAAAFRwFP8AAAAAAFRwnPYPAAAAAHAJLZ4ZI6/kM8oJrqxdL8wsPj7uMXlnJSjbJ1y7as6y6Xt76xil55xRJa/KGt6m+LFcXZke+d+4caN69+6tqKgoGYahFStW2PSbpqmJEycqMjJSvr6+6tKliw4dOmQTk5SUpEGDBikwMFDBwcEaOnSo0tPTbWL27NmjG264QT4+PoqOjtb06dPtclm2bJnq168vHx8fNWnSRF9//XWJ7y8AAACAf55lR1KK/IHjIr9Zo+hVnyvymzWOxSev0VVpqxSZbB+/6/gabf3jc+067thYrq5Mi/+MjAxdc801mjdvXqH906dP1+zZszV//nxt27ZN/v7+6t69u7KysqwxgwYN0s8//6zY2Fh99dVX2rhxo4YPH27tT01NVbdu3VS9enXt3LlTr7zyiiZNmqS3337bGvP999/rrrvu0tChQ/Xjjz+qb9++6tu3r/bt21d6Ow8AAAAAwBVSpqf99+zZUz179iy0zzRNzZw5U88884xuvfVWSdL777+v8PBwrVixQgMHDtSBAwe0atUq7dixQ9dee60kac6cObr55pv16quvKioqSosXL1ZOTo4WLFggLy8vNWrUSLt379aMGTOsXxLMmjVLPXr00Lhx4yRJzz//vGJjYzV37lzNnz//CrwSAAAAAACUnnJ7zX9cXJzi4+PVpUsXa1tQUJBat26tLVu2aODAgdqyZYuCg4Othb8kdenSRW5ubtq2bZv69eunLVu26MYbb5SXl5c1pnv37nr55Zd15swZVa5cWVu2bNHYsWNtnr979+52lyGcLzs7W9nZ2dbHqampkiSLxSKLxXK5u4/LZLFYZJomc+GCmDvXxLy5LubOdTF3rol5c12XNXdm6c93uXlPlfq+mn//dva5Low3zb9/FzJWuXlNi+BMjuW2+I+Pj5ckhYeH27SHh4db++Lj4xUWFmbT7+HhoZCQEJuYmJgYuzEK+ipXrqz4+Pgin6cw06ZN0+TJk+3aT548aXNZAsqGxWJRSkqKTNOUmxs3tXAlzJ1rYt5cF3Pnupg718S8ua7Lm
TsjLaOUsvpbYmJ28UFXQKnva0GRblpkpCVdXnwxY5WX17QoaWlpDseW2+K/vJswYYLN2QKpqamKjo5WaGioAgMDyzAzSOf+OBuGodDQUP7D6mKYO9fEvLku5s51MXeuiXlzXZczd2Z66S/qFxYWVOrP4YhS31fDzfrbDAi5vPhixiovr2lRfHx8HI4tt8V/RESEJCkhIUGRkZHW9oSEBDVr1swak5iYaLNdXl6ekpKSrNtHREQoISHBJqbgcXExBf2F8fb2lre3t127m5sbf8jLCcMwmA8Xxdy5JubNdTF3rou5c03Mm+u65LkzSn+uy837qdT31fj7t7PPdWG8Yfz9u5Cxys1rWgRnciy3exMTE6OIiAitW7fO2paamqpt27apbdu2kqS2bdsqOTlZO3futMasX79eFotFrVu3tsZs3LhRubm51pjY2FjVq1dPlStXtsac/zwFMQXPAwAAAACAKyvT4j89PV27d+/W7t27JZ1b5G/37t36448/ZBiGxowZoxdeeEFffPGF9u7dq/vuu09RUVHq27evJKlBgwbq0aOHhg0bpu3bt2vz5s0aNWqUBg4cqKioKEnS3XffLS8vLw0dOlQ///yzli5dqlmzZtmcsv/oo49q1apVeu211/TLL79o0qRJ+uGHHzRq1Kgr/ZIAAAAAAFDiyvS0/x9++EGdOnWyPi4oyAcPHqxFixZp/PjxysjI0PDhw5WcnKzrr79eq1atsrmuYfHixRo1apRuuukmubm5qX///po9e7a1PygoSGvWrNHIkSPVsmVLVa1aVRMnTrTe5k+S2rVrpyVLluiZZ57RU089pTp16mjFihVq3LjxFXgVAAAAAACO+LN3f3mmJCs3KNix+Cr95XU2QTm+4XZ97av3V0ZOsvy9HBvL1RmmWXB/A1yO1NRUBQUFKSUlhQX/ygGLxaLExESFhYW5xLU6+Btz55qYN9fF3Lku5s41MW+u63LmbtmR0l/wb0Ct8rE43ZXYV6f8byV/MyDE6TUCystrWhRn6lD+4gAAAAAAUMGV29X+AQAAAAAlp7ij8q5wpBuXjiP/AAAAAABUcBz5BwAAAAC4hO7drpNvQrzOhkdo9ZodxcfvaS3fnOM66xWl1dfYxo/54jqdORuvyr4Rmtmn+LFcHUf+AQAAAAAuwSMjQ54ZafLIyHAsPj9dnpYMeeSn2/Vl5WXobG6asvIcG8vVceQfAAAAAFxcuVtlH+UOR/4BAAAAAKjgOPIPAAAAAHDo7AHuCOC6OPIPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFx2r/AAAAAACXsOv5GXLPylK+j49j8TVmyD3zpPL9Qu36hrWaoZz8LHm5OzaWq6P4BwAAAAC4hBOdezgXX7m7DI8kmQEhdn0tr3JuLFfHaf8AAAAAAFRwFP8AAAAAAFRwTp/2P3v27CL7R48efcnJAAAAAABwMcH7dsstJ0cWLy8lN25WfHzGbrlnnlK+W1UlV2ph0/fb6d3Ks+TIw81LNasUP5arc6j4379/vxo2bChJGjNmjPz8/BQWFibTNG3iDMOg+AcAAAAAlIr2D94tv4TjygyP0srN+4uP/3WQ/HJPKNMzUitbHLDpm/7t3UrKPK4QvyjNv634sVydQ6f9P/TQQ7r99tslSU8//bTc3NzUpUsXbd26VXFxcdaf3377rVSTBQAAAAAAznOo+F+zZo2++OILnTp1Ss8//7wOHDignJwc1atXT1OnTlV2dnZp5wkAAAAAAC6RQ8X/sWPH5O7urkqVKkmSqlWrpkWLFmn9+vVat26dateurffff79UEwUAAAAAAJfGoWv+77vvPi1cuFA+Pj7as2fP3xt7eGjmzJn6/PPPNWrUKM2aNUs7d+4stWQBAAAAAIDzHCr+N2/ebP13s2bNZBiGdbG/8/+9e/fuks8QAAAAAABcFqdv9RcXF1caeQAAAAAAgFLidPFfvXr10sgDAAAAAACUEqeL/y+++KLI/j59+lxyMgAAAAAAoOQ5Xfz37dtXhmFIkvVa/wKGYSg/P79kMgMAAAAAACXC6eJ/0KBB+vLLLzV+/Hg9/vjj8vb2Lo28AAAAAACwsXr1Nsk0pf8dkC42vulWGWlJMgNC7Ppe771NpmlaD25XdG7ObvDBBx9o3bp1WrNmjerWravFixeXRl4AAAAAANjIqxSgvIBA5VUKcCzePcD6cyFfzwD5eQXK19OxsVyd08W/JLVs2VIbNmzQrFmzNGXKFF177bXauHFjSecGAAAAAABKgNPFf2pqqvWnc+fO2rx5s2699Vbdcsst6tu3bymkCAAAAAAALofT1/wHBwcXek2EaZr68ssvSyQpAAAAAAAuVOfdufJMT1NupQAdGjqq+PgT8+R1NlE56WE6FPWITd9X++cqMzdNfp4BuqVh8WO5OqeL/2+++aY08gAAAAAAoEh1F7whv4TjygyPcqj4rxv/hvxyTyjTM9K++P/lDSVlHleIXxTFf2FiYmIUHR39j1kREQAAAAAAV+f0Nf8xMTE6efJkaeQCAAAAAABKgdPFv2mapZEHAAAAAAAoJU6f9i9Jf/31l7Kysgrtu/rqqy8rIQAAAAAAULIuqfi/7rrr7NpM05RhGMrPz7/spAAAAAAAQMm5pOJ/27ZtCg0NLelcAAAAAABAKXC6+DcMQ1dffbXCwsJKIx8AAAAAAFDCnC7+WfAPAAAAAP6Zlh1JKesUcImcLv7j4uI45R8AAAAAcMUlN2qqs5HVlB1SxbF4/2t0Njtc2d4Rdn0xlZuqil81BXo7Nparc7r4//333/X7779ftP/GG2+8rIQAAAAAACjM5rc/ci6+7hIZaUkyA0Ls+p7o5NxYrs7p4r9jx44X7WO1fwAAAAAAyh+ni/8zZ86URh4AAAAAAKCUOF38BwUFWf+dlpamp59+Wrt371aTJk304osvlmhyAAAAAADg8jld/J/v8ccfV2xsrAYMGKD//ve/euSRR/T++++XVG4AAAAAAFi1Hz5Q3kmnlR1SxaHr/9v/ere8s+OV7R2hzfVs41/+ZqBSs08r0LvKP+L6/8sq/teuXat3331XnTt31pAhQ9ShQ4eSygsAAAAAABvBP++RX8JxZYZHORaf8ZP8ck8oMy/Sri/uzB4lZR5XiJ9jY7k6t8vZ+NSpU6pRo4YkKSYmRqdOnSqJnAAAAAAAQAly+sh/amqqzeP09HSlpqYqKyurxJICAAAAAAAlx+niPzg4WIZhSJJM01Tz5s2t/y5oBwAAAAAA5YfTxf8333xTGnkAAAAAAIBS4nTxz6J+AAAAAAC4FoeK/7S0NAUEBFgfJyQkaN68edq/f78Mw1CjRo00YsQIhYeHl1qiAAAAAADg0hS72n9eXp4iIiJ04sQJSdLmzZtVu3ZtLV26VAEBAapUqZL+85//qE6dOtq6dWuJJpefn69nn31WMTEx8vX1Va1atfT888/LNE1rjGmamjhxoiIjI+Xr66suXbro0KFDNuMkJSVp0KBBCgwMVHBwsIYOHar09HSbmD179uiGG26Q
j4+PoqOjNX369BLdFwAAAAAAykqxR/49PDzk7e2ts2fPSpL+/e9/65577tEbb7xhs/DfiBEj9Pjjj2vz5s0lltzLL7+sN998U++9954aNWqkH374QQ888ICCgoI0evRoSdL06dM1e/Zsvffee4qJidGzzz6r7t27a//+/fLx8ZEkDRo0SCdOnFBsbKxyc3P1wAMPaPjw4VqyZImkc3cw6Natm7p06aL58+dr7969GjJkiIKDgzV8+PAS2x8AAAAAAMqCQ6f9h4aGKi0tTZK0e/duLVq0yGZlf8MwNGbMGOvK/yXl+++/16233qpevXpJkmrUqKH//Oc/2r59u6RzXzrMnDlTzzzzjG699VZJ0vvvv6/w8HCtWLFCAwcO1IEDB7Rq1Srt2LFD1157rSRpzpw5uvnmm/Xqq68qKipKixcvVk5OjhYsWCAvLy81atRIu3fv1owZMyj+AQAAAKCc+HXIw/JMT1NupYDigyX9GvGwvM4mKsc3zK7vlvoPKzM3TX6ejo3l6hwq/tu2baulS5fqmmuuUXh4uI4ePap69erZxMTFxSkwMLBEk2vXrp3efvtt/frrr6pbt65++uknfffdd5oxY4b1OePj49WlSxfrNkFBQWrdurW2bNmigQMHasuWLQoODrYW/pLUpUsXubm5adu2berXr5+2bNmiG2+8UV5eXtaY7t276+WXX9aZM2dUuXJlu9yys7OVnZ1tfZyamipJslgsslgsJfo6wHkWi0WmaTIXLoi5c03Mm+ti7lwXc+eamDfXdVlzZzLfJeXQkIf/fuDA63oo4mEZaUkyA0Ls4m9pUPRYrvA5dSZHh4r/MWPG6IYbblCdOnV011136V//+pdeffVVtWvXTtK5dQDGjRunO+6449Iyvognn3xSqampql+/vtzd3ZWfn6+pU6dq0KBBkqT4+HhJsltoMDw83NoXHx+vsDDbb3k8PDwUEhJiExMTE2M3RkFfYcX/tGnTNHnyZLv2kydPKisr61J2FyXIYrEoJSVFpmnKza3YpS1QjjB3rol5c13Mneti7lwT8+a6LmfujLSMUsoKxTNlnE2TDOl//+OwxMTs4oPKWMEZ+o5wqPhv1qyZvvzySw0fPly///67cnNzddddd1lP/Xd3d9ewYcNKfJG8jz/+WIsXL9aSJUusp+KPGTNGUVFRGjx4cIk+l7MmTJigsWPHWh+npqYqOjpaoaGhJX4GBJxnsVhkGIZCQ0P5D6uLYe5cE/Pmupg718XcuSbmzXVdztyZ6SmllBWKZZqSKZmVQiTDueI/LCyolJIqOQXr3DnCoeJfkjp27Khff/1Vv/zyi06ePGk9vaBy5cqqVauW/P39nc+0GOPGjdOTTz6pgQMHSpKaNGmi33//XdOmTdPgwYMVEREh6dytByMjI63bJSQkqFmzZpKkiIgIJSYm2oybl5enpKQk6/YRERFKSEiwiSl4XBBzIW9vb3l7e9u1u7m58Ye8nDAMg/lwUcyda2LeXBdz57qYO9fEvLmuS547g7kuKR7paecKesNQngPX/Xvkp8iwpMu0eCnPw7aYP5ubJtM0ZRiGfAu57t8VPqPO5Ohw8V+gfv36ql+/viRZb7lnOPkNiqMyMzPtdsbd3d36xUNMTIwiIiK0bt06a7Gfmpqqbdu2acSIEZLOrVeQnJysnTt3qmXLlpKk9evXy2KxqHXr1taYp59+Wrm5ufL09JQkxcbGql69eoWe8g8AAAAAuPK6d28tv4TjygyP0srN+4uP39NGfrknlOkZqZUtDtj0PfZlayVlHleIX5Tm31b8WK7ukr7KeP/999WkSRP5+vrK19dXTZs21QcffFDSual3796aOnWqVq5cqaNHj+qzzz7TjBkz1K9fP0l/32XghRde0BdffKG9e/fqvvvuU1RUlPr27StJatCggXr06KFhw4Zp+/bt2rx5s0aNGqWBAwcqKipKknT33XfLy8tLQ4cO1c8//6ylS5dq1qxZNqf1AwAAAADgqpw+8j9jxgw9++yzGjVqlNq3by9J+u677/TQQw/p1KlTeuyxx0osuTlz5ujZZ5/Vww8/rMTEREVFRenBBx/UxIkTrTHjx49XRkaGhg8fruTkZF1//fVatWqVzbUPixcv1qhRo3TTTTfJzc1N/fv31+zZs639QUFBWrNmjUaOHKmWLVuqatWqmjhxIrf5AwAAAABUCE4X/3PmzNGbb76p++67z9rWp08fNWrUSJMmTSrR4j8gIEAzZ87UzJkzLxpjGIamTJmiKVOmXDQmJCRES5YsKfK5mjZtqk2bNl1qqgAAAAAAlFtOn/Z/4sQJ6y3+zteuXTudOHGiRJICAAAAAAAlx+niv3bt2vr444/t2pcuXao6deqUSFIAAAAAAKDkOH3a/+TJk3XnnXdq48aN1mv+N2/erHXr1hX6pQAAAAAAAChbTh/579+/v7Zt26aqVatqxYoVWrFihapWrart27dbV+EHAAAAAADlh9NH/iWpZcuW+vDDD0s6FwAAAAAAUAqcPvL/9ddfa/Xq1Xbtq1ev1n//+98SSQoAAAAAAJQcp4/8P/nkk3rppZfs2k3T1JNPPqmePXuWSGIAAAAAAJxv81tL5JaTI4uXl2PxdRfLPf2U8itVtesb32GJ8iw58nBzbCxX53Txf+jQITVs2NCuvX79+jp8+HCJJAUAAAAAwIWSGzdzLt6/mQxLkkz/ELu+mlWcG8vVOX3af1BQkH777Te79sOHD8vf379EkgIAAAAAACXH6eL/1ltv1ZgxY3TkyBFr2+HDh/X444+rT58+JZocAAAAAAC4fE6f9j99+nT16NFD9evX11VXXSVJ+uuvv3TDDTfo1VdfLfEEAQAAAACQpMj1q+SelaV8Hx+d6Nyj+Pgzq+WeeVL5eaE6EWK7Pt3Ov1YpJz9LXu4+anlV8WO5OqeL/6CgIH3//feKjY3VTz/9JF9fXzVt2lQ33nhjaeQHAAAAAIAkqcWzY+WXcFyZ4VFa6UDx3+LoWPnlnlCmZ6RWXlD8v7N9rJIyjyvEL4ri/2IMw1C3bt3UrVu3ks4HAAAAAACUMKev+QcAAAAAAK6F4h8AAAAAgAqO4h8AAAAAgAqO4h8AAAAAgArukhb8y8/P14oVK3TgwAFJUqNGjdSnTx+5u7uXaHIAAAAAAODyOV38Hz58WL169dJff/2levXqSZKmTZum6OhorVy5UrVq1SrxJAEAAAAAwKVz+rT/0aNHq2bNmvrzzz+1a9cu7dq1S3/88YdiYmI0evTo0sgRAAAAAABcBqeP/H/77bfaunWrQkJCrG1VqlTRSy+9pPbt25docgAAAAAAFMjz91euf4Dy/P0di3evpNx8f+W5V7Lr8/Hwl69ngHw8HBvL1Tld/Ht7eystLc2uPT09XV5eXiWSFAAAAAAAF1q9Zodz8U23yUhLkhkQYtc3s49zY7k6p0/7v+WWWzR8+HBt27ZNpmnKNE1t3bpVDz30kPr06VMaOQIAAAAAgMvgdPE/e/Zs1apVS23btpWPj498fHzUvn171a5dW7NmzSqNHAEAAAAAwGVw+rT
/4OBgff755zp06JB++eUXSVKDBg1Uu3btEk8OAAAAAABcPqeL/wJ16tRRnTp1JEn5+fkllhAAAAAAAIVp+tKz8kxJVm5QsPY8+Xzx8X9MlNfZBOX4hmtP9Rds+j7Y+awycpLl7xWse1sWP5arc/q0/7i4ON11110aMWKEzpw5oz59+sjb21v16tXTnj17SiNHAAAAAAAU/eVy1Vz2gaK/XO5Y/OnliklZpujT9vGbf1+u9Uc+0ObfHRvL1Tld/D/44IM6cOCA9u3bp86dOysnJ0eff/65GjZsqDFjxpRCigAAAAAA4HI4fdr/tm3btGnTJlWvXl0hISHasWOHWrRoodq1a6t169alkSMAAAAAALgMTh/5T0tLU2RkpIKCguTn56fg4GBJ5xYCTEtLK+n8AAAAAADAZbqkBf9WrVqloKAgWSwWrVu3Tvv27VNycnIJpwYAAAAAAErCJRX/gwcPtv77wQcftP7bMIzLzwgAAAAAAJQop4t/i8VSGnkAAAAAAIBS4vQ1/++//76ys7NLIxcAAAAAAFAKnD7y/8ADD6hHjx4KCwsrjXwAAAAAwKUsO5JS1ikAxXK6+DdNszTyAAAAAACgSCc6dZNX8hnlBFd2LD64m7yzEpTtE27X1yKqm9JzzqiSl2NjubpLWvDv448/VmBgYKF9991332UlBAAAAABAYXa9MNO5+JjXZaQlyQwIsesb3sa5sVzdJRX/06dPl7u7u127YRgU/wAAAAAAlDOXVPz/8MMPXPMPAAAAAICLcHq1fwAAAAAA4FqcPvJfvXr1Qk/5BwAAAACgNN3Ut6N8TiYqKzRM61ZsKD5+X2f55JxQllek1jWxjX/y645KzkpUsE+YXrq5+LFcndPFf1xcXGnkAQAAAABAkXxOJsov4bjj8bkJ8stLkAz7k96TsxKVlOn4WK7O6dP+R48erdmzZ9u1z507V2PGjCmJnAAAAAAAQAlyuvhfvny52rdvb9ferl07ffLJJyWSFAAAAAAAKDlOF/+nT59WUFCQXXtgYKBOnTpVIkkBAAAAAICS43TxX7t2ba1atcqu/b///a9q1qxZIkkBAAAAAICS4/SCf2PHjtWoUaN08uRJde7cWZK0bt06vfbaa5o5c2ZJ5wcAAAAAAC6T08X/kCFDlJ2dralTp+r555+XJNWoUUNvvvmm7rvvvhJPEAAAAAAAXB6ni39JGjFihEaMGKGTJ0/K19dXlSpVKum8AAAAAABACXH6mn9JysvL09q1a/Xpp5/KNE1J0vHjx5Wenl6iyQEAAAAAgMvn9JH/33//XT169NAff/yh7Oxsde3aVQEBAXr55ZeVnZ2t+fPnl0aeAAAAAIB/uD1PTJbH2bPK8/V1LD56sjwyTyrPL9Su757mk5Wdd1beHo6N5eqcLv4fffRRXXvttfrpp59UpUoVa3u/fv00bNiwEk0OAAAAAIACf/YZ4Fx81dtlpCXJDAix67s+xrmxXJ3Txf+mTZv0/fffy8vLy6a9Ro0aOnbsWIklBgAAAAAASobT1/xbLBbl5+fbtf/1118KCAgokaQAAAAAAEDJcbr479atm2bOnGl9bBiG0tPT9dxzz+nmm28uydwkSceOHdM999yjKlWqyNfXV02aNNEPP/xg7TdNUxMnTlRkZKR8fX3VpUsXHTp0yGaMpKQkDRo0SIGBgQoODtbQoUPtFifcs2ePbrjhBvn4+Cg6OlrTp08v8X0BAAAAAFy6Sr8dUuCvB1Tpt0PFB0uqdPaQArN/VaWz9vHHUw7pz+QDOp7i2FiuzunT/l977TV1795dDRs2VFZWlu6++24dOnRIVatW1X/+858STe7MmTNq3769OnXqpP/+978KDQ3VoUOHVLlyZWvM9OnTNXv2bL333nuKiYnRs88+q+7du2v//v3y8fGRJA0aNEgnTpxQbGyscnNz9cADD2j48OFasmSJJCk1NVXdunVTly5dNH/+fO3du1dDhgxRcHCwhg8fXqL7BAAAAAC4NB3uvVV+CceVGR6llZv3Fx//S1/55Z5QpmekVrY4YNM3Zd2tSso8rhC/KM2/rfixXJ3Txf9VV12ln376SR999JH27Nmj9PR0DR06VIMGDZKvgysuOurll19WdHS0Fi5caG2LiYmx/ts0Tc2cOVPPPPOMbr31VknS+++/r/DwcK1YsUIDBw7UgQMHtGrVKu3YsUPXXnutJGnOnDm6+eab9eqrryoqKkqLFy9WTk6OFixYIC8vLzVq1Ei7d+/WjBkzKP4BAAAAAC7P6eJfkjw8PHTPPfeUdC52vvjiC3Xv3l0DBgzQt99+q2rVqunhhx+23lUgLi5O8fHx6tKli3WboKAgtW7dWlu2bNHAgQO1ZcsWBQcHWwt/SerSpYvc3Ny0bds29evXT1u2bNGNN95os4hh9+7d9fLLL+vMmTM2ZxoUyM7OVnZ2tvVxamqqpHNrIlgslhJ/LeAci8Ui0zSZCxfE3Lkm5s11MXeui7lzTcyb6ypy7kzm88ox//7t7Ot+Ybxp/v27kLFc4XPqTI5OF/9ffPFFkf19+vRxdsiL+u233/Tmm29q7Nixeuqpp7Rjxw6NHj1aXl5eGjx4sOLj4yVJ4eHhNtuFh4db++Lj4xUWFmbT7+HhoZCQEJuY888oOH/M+Pj4Qov/adOmafLkyXbtJ0+eVFZW1iXuMUqKxWJRSkqKTNOUm5vTS1ugDDF3rol5c13Mneti7lwT8+a6ipo7Iy2jjLL6Byoo0k2LjLSky4svZqzExGy7tvImLS3N4Vini/++ffvaPDYMQ+b/vjExDKPQOwFcKovFomuvvVYvvviiJKl58+bat2+f5s+fr8GDB5fY81yKCRMmaOzYsdbHqampio6OVmhoqAIDA8swM0jn3juGYSg0NJT/sLoY5s41MW+ui7lzXcyda2LeXFdRc2emp5RRVv9Ahpv1txkQcnnxxYwVFhZ0GYleGQXr3DnC6eL/wtMKAgIC9NNPP6lmzZrODlWsyMhINWzY0KatQYMGWr58uSQpIiJCkpSQkKDIyEhrTEJCgpo1a2aNSUxMtBkjLy9PSUlJ1u0jIiKUkJBgE1PwuCDmQt7e3vL29rZrd3Nz4w95OWEYBvPhopg718S8uS7mznUxd66JeXNdF507g7m8coy/fzv7ul8Ybxh//y5kLFf4jDqT42XvjVHwgpWC9u3b6+DBgzZtv/76q6pXry7p3OJ/ERERWrdunbU/NTVV27ZtU9u2bSVJbdu2VXJysnbu3GmNWb9+vSwWi1q3bm2N2bhxo3Jzc60xsbGxqlevXqGn/AMAAAAA4Eouq/g/evSoMjIyFBAQUFL52Hjssce0detWvfjiizp8+LCWLFmit99+WyNHjpR07ouHMWPG6IUXXtAXX3yhvXv36r777lNUVJT18oQGDRqoR48eGjZsmLZv367Nmzdr1KhRGjhwoKKioiRJd999t7y8vDR06FD9/PPPWrp0qWbNmmVzWj8AAAAAAK7K6dP+b7vtNknS2bNntXXrVt10000KDQ0t8cQk6brrrtNnn32mCRMmaMqUKYqJidHMmT
M1aNAga8z48eOVkZGh4cOHKzk5Wddff71WrVplc+3D4sWLNWrUKN10001yc3NT//79NXv2bGt/UFCQ1qxZo5EjR6ply5aqWrWqJk6cyG3+AAAAAAAVgtPFf1DQuUUPIiIi1Lt3bw0ZMqTEkzrfLbfcoltuueWi/YZhaMqUKZoyZcpFY0JCQrRkyZIin6dp06batGnTJecJAAAAAEB55XTxv3DhwtLIAwAAAACAIq37bL2M/HyZ7u6OxTdaJ7f007JUqmLXN63HelnMfLkZjo3l6pwu/lNTU4vs5zZ3AAAAAIDSkBVW+N3YLhrvFSHD00uml/2t/Cr7OTeWq3O6+A8ODi50hX/TNGUYhvLz80skMQAAAAAAUDKcLv5r1qypxMREPfnkk2rfvn1p5AQAAAAAAEqQ08X/gQMHNGfOHE2dOlU//vijpk+frpiYmNLIDQAAAAAAq5iPFskjI115/pUUN/D+4uMTF8kz86Ryz4YqLtx2sfq1hxYpKzddPp6V1KVO8WO5OjdnN/D09NTYsWN16NAhVatWTU2bNtXjjz+u5OTkUkgPAAAAAIBzGs6ZrmbTnlHDOdMdiz/2iq5JnKaGx16x6/tk73S9v+sZfbLXsbFcndPFf4GQkBDNnDlTP/74o44eParatWtr5syZJZgaAAAAAAAoCU6f9t+8eXO7Bf9M01R2drYef/xxjRkzpqRyAwAAAAAAJcDp4r9v376lkAYAAAAAACgtThf/zz33XGnkAQAAAAAASonTxX9qamqR/YGBgZecDAAAAAAAKHlOF//BwcF21/xL5677NwxD+fn5JZIYAAAAAAAoGU4X/5L0ySefKCQkpKRzAQAAAAAApeCSiv/27dsrLCyspHMBAAAAAACl4JKK//379+v06dPy9/dXRESEvLy8SjovAAAAAABspMfUUm5AoLKrhjoW71NbuYa/sr0j7foiA2rJzzNQQT6OjeXqLqn4v+mmm6zX+Lu5ual+/foaMmSIHnvssZLODwAAAAAASdK3H37pXHyDz2WkJckMsL9s/bmuzo3l6pwu/uPi4mSapnJzc5Wamqrjx49r+/btevbZZ5WXl6dx48aVRp4AAAAAAOASOV38V69e3eZxy5Yt1bt3b9WtW1dTpkyh+AcAAAAAoJy5pNP+CzNw4EA1atSopIYDAAAAAAAl5JKL/507d+rAgQOSpIYNG6pFixZq0aJFiSUGAAAAAMD5Wo0dJu+k08oOqaLtM94pPv7IcHlnxSvbJ0Lba/+fTd/s74YpLfu0AryraPT1xY/l6pwu/hMTEzVw4EBt2LBBwcHBkqTk5GR16tRJH330kUJD/xkrJQIAAAAArqzQbZvll3BcmeFRjsWnbpZf7gll5tiv9r8/cbOSMo8rxM+xsVydm7MbPPLII0pLS9PPP/+spKQkJSUlad++fUpNTdXo0aNLI0cAAAAAAHAZnD7yv2rVKq1du1YNGjSwtjVs2FDz5s1Tt27dSjQ5AAAAAABw+Zw+8m+xWOTp6WnX7unpKYvFUiJJAQAAAACAkuN08d+5c2c9+uijOn78uLXt2LFjeuyxx3TTTTeVaHIAAAAAAODyOV38z507V6mpqapRo4Zq1aqlWrVqKSYmRqmpqZozZ05p5AgAAAAAAC6D09f8R0dHa9euXVq7dq1++eUXSVKDBg3UpUuXEk8OAAAAAABcPoeL/7S0NAUEBEiSDMNQ165d1bVrV5uYHTt26LrrrivZDAEAAAAAwGVx+LT/bt26KT09vdC+vLw8PfPMM2rfvn2JJQYAAAAAAEqGU0f+u3TpojVr1igwMNDavm/fPt177706efKkVqxYURo5AgAAAACguDvvk2daqnIDAosPlhQXdp88zyYq1zfMru+m2vcpMydVfl6OjeXqHC7+v/nmG3Xu3Fldu3ZVbGysAgICNH36dD333HO67bbbtH79elWuXLk0cwUAAAAA/IPtH/2kc/HVnpCRliQzIMSub0BT58ZydQ4X/6GhoVq/fr26dOmizp07y9vbW4cOHdKHH36o22+/vTRzBAAAAAAAl8Gp1f5DQ0O1bt06denSRfv27dPu3btVv3790soNAAAAAACUAIcX/CtQtWpVrV+/Xg0bNtTdd9+tM2fOlEZeAAAAAACghDh85P+2226zeRwYGKiNGzeqVatWatKkibX9008/LbnsAAAAAAD4n17tG8ov4bgyw6O0cvP+4uN/bCS/3BPK9IzUyhYHbPoe+rShkjKPK8QvSvNvK34sV+dw8R8UFGT3OCYmpsQTAgAAAAAAJcvh4n/hwoWlmQcAAAAAACglTl/zDwAAAAAAXAvFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFZxHWScAAAAAAIAjtr/2ltxycmTx8nIsvtZbcs84pXz/qnZ9j7R7S7mWHHm6OTaWq6P4BwAAAAC4hJNtbnAuPvB6GUaSzIAQu75GEc6N5eo47R8AAAAAgAqO4h8AAAAAgAqO0/4BAAAAVEjLjqQUGzOgVtAVyAQlJXTrJus1/45cAhCa+t25a/7NqjoZdKNN38/xm6zX/P8TLgGg+AcAAAAAuIRWjz8ov4TjygyP0srN+4uPP/Kg/HJPKNMzUitbHLDpm/P9g0rKPK4QvyjNv634sVwdp/0DAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBuVTx/9JLL8kwDI0ZM8balpWVpZEjR6pKlSqqVKmS+vfvr4SEBJvt/vjjD/Xq1Ut+fn4KCwvTuHHjlJeXZxOzYcMGtWjRQt7e3qpdu7YWLVp0BfYIAAAAAIDS5zLF/44dO/TWW2+padOmNu2PPfaYvvzySy1btkzffvutjh8/rttuu83an5+fr169eiknJ0fff/+93nvvPS1atEgTJ060xsTFxalXr17q1KmTdu/erTFjxuhf//qXVq9efcX2DwAAAACA0uISxX96eroGDRqkd955R5UrV7a2p6Sk6N1339WMGTPUuXNntWzZUgsXLtT333+vrVu3SpLWrFmj/fv368MPP1SzZs3Us2dPPf/885o3b55ycnIkSfPnz1dMTIxee+01NWjQQKNGjdLtt9+u119/vUz2FwAAAACAkuRR1gk4YuTIkerVq5e6dOmiF154wdq+c+dO5ebmqkuXLta2+vXr6+qrr9aWLVvUpk0bbdmyRU2aNFF4eLg1pnv37hoxYoR+/vlnNW/eXFu2bLEZoyDm/MsLLpSdna3s7Gzr49TUVEmSxWKRxWK53F3GZbJYLDJNk7lwQcyda2LeXBdz57qYO9fEvF1hZvGvs6NzUeTcOfA8KCnm37+dfd0vjDfNv38XMpYrfE6dybHcF/8fffSRdu3apR07dtj1xcfHy8vLS8HBwTbt4eHhio+Pt8acX/gX9Bf0FRWTmpqqs2fPytfX1+65p02bpsmTJ9u1nzx5UllZWY7vIEqFxWJRSkqKTNOUm5tLnOCC/2HuXBPz5rqYO9fF3Lkm5u3KMtIyio1JTMwuNkYqe
u4ceR6UjK//u8n6byMtqfj42htlZKbJ9Auwi3+r699jqZCxHH1vlKW0tDSHY8t18f/nn3/q0UcfVWxsrHx8fMo6HRsTJkzQ2LFjrY9TU1MVHR2t0NBQBQYGlmFmkM79cTYMQ6GhofyH1cUwd66JeXNdzJ3rYu5cE/N2ZZnpKcXGhIUFOTRWUXPnyPOgjJjmuZMEKoVIhuHUpo6+N8qSM3VyuS7+d+7cqcTERLVo0cLalp+fr40bN2ru3LlavXq1cnJylJycbHP0PyEhQREREZKkiIgIbd++3WbcgrsBnB9z4R0CEhISFBgYWOhRf0ny9vaWt7e3Xbubmxt/yMsJwzCYDxfF3Lkm5s11MXeui7lzTczbFWQU/xo7Mw8XnTsHngdlxXKu6DcMp+fJFT6jzuRYrvfmpptu0t69e7V7927rz7XXXqtBgwZZ/+3p6al169ZZtzl48KD++OMPtW3bVpLUtm1b7d27V4mJidaY2NhYBQYGqmHDhtaY88coiCkYAwAAAAAAV1auj/wHBASocePGNm3+/v6qUqWKtX3o0KEaO3asQkJCFBgYqEceeURt27ZVmzZtJEndunVTw4YNde+992r69OmKj4/XM888o5EjR1qP3D/00EOaO3euxo8fryFDhmj9+vX6+OOPtXLlyiu7wwAAAACAi2o4+yV5pqUqNyBQ+0c/WXz8sZfleTZRualh2n/VBJu+ZXteUmZOqvy8AjWgafFjubpyXfw74vXXX5ebm5v69++v7Oxsde/eXW+88Ya1393dXV999ZVGjBihtm3byt/fX4MHD9aUKVOsMTExMVq5cqUee+wxzZo1S1dddZX+7//+T927dy+LXQIAAAAAFCJm6fvySziuzPAoh4r/mMT35Zd7QpmekXbF/7rD7ysp87hC/KIo/sujDRs22Dz28fHRvHnzNG/evItuU716dX399ddFjtuxY0f9+OOPJZEiAAAAAADlSrm+5h8AAAAAAFw+lzvyDwAAAAAlZdmRom/TN6BW+b/dG+AIjvwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBUfwDAAAAAFDBsdo/AAAAAFyE9W4ApkVGWobM9BTJ4BhqWTnZur28k04rO6SKY/GB7eWdFa9snwi7voZh7ZWWfVoB3o6N5eoo/gEAAAAALmH7jHeci6/1toy0JJkBIXZ9o693bixXx1dWAAAAAABUcBT/AAAAAABUcJz2DwAAAMAlWa/HB1Asin8AAAAAgEvocE9veZ86qeyqofr2wy+Ljz9wq7yzTyjbO1LfNrSNnxzbWylZJxXkE6rnuhY/lquj+AcAAAAAuIRKcUfkl3BcmWmpjsVnHZZf7gllmhl2fSfSjigp87gycx0by9VxzT8AAAAAABUcxT8AAAAAABUcxT8AAAAAABUcxT8AAAAAABUcxT8AAAAAABUcxT8AAAAAABUcxT8AAAAAABUcxT8AAAAAABWcR1knAAAAAACAI/Y/Ml4eGenK86/kWHy1cfLMPKlcv1C7vtubjFdWbrp8PB0by9VR/AMAAAAAXELcwPudiw+7X0ZaksyAELu+LnWcG8vVcdo/AAAAAAAVHMU/AAAAAAAVHKf9AwAAAABcgk9ivIz8fJnu7soKiyg+PidebrmnZcnJUZZ3lE3fmcx4Wcx8uRnuquxX/FiujuIfAAAAAOASburXWX4Jx5UZHqWVm/cXH//zTfLLPaFMz0itbHHApm/Cqs5KyjyuEL8ozb+t+LFcHaf9AwAAAABQwVH8AwAAAABQwVH8AwAAAABQwVH8AwAAAABQwbHgHwAAAIByZ9mRlLJOAahQOPIPAAAAAEAFR/EPAAAAAEAFR/EPAAAAAEAFxzX/AAAAAEqUI9frD6gVdAUyAVCA4h8AAAAA4BK+/eBzueXlyeLhWCn7bf0Vcs84rXz/KnZ9E2/6XPlmntyNf0ZZ/M/YSwAAAACAy0uvWce5eN86MvKqyPQNseuLCnJuLFfHNf8AAAAAAFRwFP8AAAAAAFRwnPYPAAAAAHAJ0V8sk8fZs8rz9dWffQYUH3/qE3lknlRedqj+DL3Dpu+7uGXKzjsrbw9fXR9T/FiujuIfAAAAwBXnyB0BgAs1ffk5+SUcV2Z4lEPFf9M/n5Nf7gllekbaFf8f/vickjKPK8Qv6h9R/HPaPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRy3+gMAAADgFG7TB7gejvwDAAAAAFDBceQfAAAAAOASskLDbH4XG+8ZLpmWc78vEOwTZvO7oqP4BwAAAAC4hHUrNjgX33i9jLQkmQEhdn0v3ezcWK6O0/4BAAAAAKjgKP4BAAAAAKjgynXxP23aNF133XUKCAhQWFiY+vbtq4MHD9rEZGVlaeTIkapSpYoqVaqk/v37KyEhwSbmjz/+UK9eveTn56ewsDCNGzdOeXl5NjEbNmxQixYt5O3trdq1a2vRokWlvXsAAAAAAFwR5br4//bbbzVy5Eht3bpVsbGxys3NVbdu3ZSRkWGNeeyxx/Tll19q2bJl+vbbb3X8+HHddttt1v78/Hz16tVLOTk5+v777/Xee+9p0aJFmjhxojUmLi5OvXr1UqdOnbR7926NGTNG//rXv7R69eorur8AAAAAgItr8cwYtRk1WC2eGeNYfNxjavPXKLWIe8yu7+2tYzRj42C9vdWxsVxduV7wb9WqVTaPFy1apLCwMO3cuVM33nijUlJS9O6772rJkiXq3LmzJGnhwoVq0KCBtm7dqjZt2mjNmjXav3+/1q5dq/DwcDVr1kzPP/+8nnjiCU2aNEleXl6aP3++YmJi9Nprr0mSGjRooO+++06vv/66unfvfsX3GwAAAABgL/KbNfJLOK7M8CjH4pPXyC/3hDKzIu36dh1fo6TM4wrxc2wsV1eui/8LpaSkSJJCQs6t1Lhz507l5uaqS5cu1pj69evr6quv1pYtW9SmTRtt2bJFTZo0UXj437d26N69u0aMGKGff/5ZzZs315YtW2zGKIgZM2bMRXPJzs5Wdna29XFqaqokyWKxyGKxXPa+4vJYLBaZpslcuCDmzjUxb66LuXNdzJ1rqjDzZrp4/pfCNP/+0T9w/8sN8+/fzr4PL4w3zb9/FzKWK3xOncnRZYp/i8WiMWPGqH379mrcuLEkKT4+Xl5eXgoODraJDQ8PV3x8vDXm/MK/oL+gr6iY1NRUnT17Vr6+vnb5TJs2TZMnT7ZrP3nypLKysi5tJ1FiLBaLUlJSZJqm3NzK9dUtuABz55qYN9fF3Lku5s41VZR5M9Iyig+qcEwZZ9MkQ/rf/6AsFBTppkVGWtLlxRczVmJitl1beZOWluZwrMsU/yNHjtS+ffv03XfflXUqkqQJEyZo7Nix1sepqamKjo5WaGioAgMDyzAzSOf+w2oYhkJDQ136P6z/RMyda2LeXBdz57qYO9dUUebNTE8p6xSuPNM8d7C5UohkUPyXGcPN+tsMCLm8+GLGCgsLuoxErwwfHx+HY12i+B81apS+
+uorbdy4UVdddZW1PSIiQjk5OUpOTrY5+p+QkKCIiAhrzPbt223GK7gbwPkxF94hICEhQYGBgYUe9Zckb29veXt727W7ubm59B/yisQwDObDRTF3rol5c13Mneti7lxThZg3w4Vzv2SWc0W/YfxD97+8MP7+7ew8XBhf8CXORebUFT6jzuRYrvfGNE2NGjVKn332mdavX6+YmBib/pYtW8rT01Pr1q2zth08eFB//PGH2rZtK0lq27at9u7dq8TERGtMbGysAgMD1bBhQ2vM+WMUxBSMAQAAAACAKyvXR/5HjhypJUuW6PPPP1dAQID1Gv2goCD5+voqKChIQ4cO1dixYxUSEqLAwEA98sgjatu2rdq0aSNJ6tatmxo2bKh7771X06dPV3x8vJ555hmNHDnSeuT+oYce0ty5czV+/HgNGTJE69ev18cff6yVK1eW2b4DAAAAAFBSyvWR/zfffFMpKSnq2LGjIiMjrT9Lly61xrz++uu65ZZb1L9/f914442KiIjQp59+au13d3fXV199JXd3d7Vt21b33HOP7rvvPk2ZMsUaExMTo5UrVyo2NlbXXHONXnvtNf3f//0ft/kDAAAAAFQI5frIv1lw64Ui+Pj4aN68eZo3b95FY6pXr66vv/66yHE6duyoH3/80ekcAQAAAAAo78p18Q8AAADgylp25B+4kj9cxp+9+8szJVm5QcGOxVfpL6+zCcrxDbfra1+9vzJykuXv5dhYro7iHwAAAADgEvY8+bxz8VdPkZGWVOit/O5t6dxYrq5cX/MPAAAAAAAuH8U/AAAAAAAVHKf9AwAAAP8gXNMP/DNR/AMAAAAAXEL3btfJNyFeZ8MjtHrNjuLj97SWb85xnfWK0uprbOPHfHGdzpyNV2XfCM3sU/xYro7iHwAAAKhAOLKPiswjI0OeGWnKzQhwLD4/XZ6WDOXmp9v1ZeVl6Gxumnw9HRvL1XHNPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFRzFPwAAAAAAFZxHWScAAP/f3v1HRVXnfxx/DegghGCggpRCpmgaAsqC+ONoZaKSm9tumrL5o+ybu9pGrMf0iCLb7umYmpapbW6injZQ/Gae8zW1Dptiirqo9FUzS3J1XQF1XZQfhTZzv3+U820cBUb5NTPPxzlzDvfez3x4X97nc+685/O5FwAAUD+5xZebOwQALoriHwAAAGgCucWXJcMqU0WVjMrLkslxEe6T9wc2Q2SA6zj0yuvy/u47Wdq0qV/7iNflXX1BFr8ODseei39dVy3fyexdv75cHcU/AAAA0EIwsw/UruThEc61vztJplaXZLQNcjjW717n+nJ13PMPAAAAAICbo/gHAAAAAMDNsewfAAAAAOAS2h0tktfVq7KazSp/MKbu9lVF8q6+KItXe5X797U79s2/i/S99apaeZnVNbjuvlwdxT8AAAAAwCUMfH6C/MrOqTokTFv3fFF3+69S5HetRNWtO2lr3+N2x17bNUGXqs8pyC9Mbz9Rd1+ujmX/AAAAAAC4OYp/AAAAAADcHMU/AAAAAABujuIfAAAAAAA3R/EPAAAAAICbo/gHAAAAAMDNUfwDAAAAAODmWjV3AAAAAIA7yC2+3NwhAMAtMfMPAAAAAICbY+YfAAAAqAOz+kDLsGPHfskwJJOpfu377JOp4pKMtkEOx5aO3i/DMGSqZ1+ujuIfAAAAAOASvvdv61x777YyeV+T4e34Pt/WzvXl6lj2DwAAAACAm6P4BwAAAADAzbHsHwAAAB6Pe/oB19D93bfUurJC1/zb6utnZ9TdvmSFzN+e19XKjvo67AW7Y//zxVuqvlYhv9Zt9VivuvtydRT/AAAAAACXELlmpfzKzqk6JKxexX9k6Ur5XStRdetOjsX/lyt1qfqcgvzCPKL4Z9k/AAAAAABujpl/AAAAtFj1WY7/5P2BTRAJALg2Zv4BAAAAAHBzzPwDAADApfGwPgCoGzP/AAAAAAC4OYp/AAAAAADcHMv+AQAA0GxYsg8ATYPiHwAAALeFJ/EDgOug+AcAAGhAdRXEnlYMM7MPoCGV9+6jbzvdo5qg4Pq1vyta39aEqMYn1OHYfXf3UbDfPQrwqV9fro7iHwAA4EdNUagyWw4At2/POznOtY98X6aKSzLaBjkce/kh5/pydRT/AAAAHogvIQDAs1D8AwAAl9AQy+ldZQk6tw4AABoaxT8AAIAbcpUvOgAATYPiHwAAuAWK3YbH3xRASzPwv56Sz6V/qyYouF73/w/8aoJ8akpV4xOqPT3s2y/89Cldqfm3AnyCPeL+f4p/AADQ7CgynZNbfFkyrDJVVMmovCyZvJo7JABoEu2O/a/8ys6pOiSsfu2rPpfftRJVf9/J4dip//yvLlWfU5Bf/fpydVwpAAAAAABwc8z8AwDgwf77m8t3PHvMw+cAAGj5KP4BAMAdYck+AAAtH8v+AQAAAABwc8z832DFihVatGiRSktLFR0dreXLlys+Pr65wwIAwEF9ZtxZkg8AACSKfzsbNmxQWlqa3n77bSUkJGjZsmVKSkrSiRMn1LFjx+YODwAAp7EkHwAASBT/dl5//XU999xzmjJliiTp7bff1tatW7VmzRrNnj27maMDANQHxS4AAIAjiv8fXb16VQcPHtScOXNs+7y8vDRs2DAVFBQ4tK+pqVFNTY1t+/LlHz5slpeXy2q1Nn7AqJXVatWVK1dkNpvl5eUaj7bYcqr2guXx++586W5dv6M+6hPHnf0eQ6aK/+ixOnLXEOeChvRD3oyS7ySZmjsYOMWQqeKKDHmL3LkacueayJvrInctwRWrRd9LqrZaVH2lvO721VZ9f02qbm11aG98a5G+kwzTzfsqLzcaJObGdOXKFUmSYdQdK8X/jy5evCiLxaKQkBC7/SEhIfryyy8d2r/66qvKzMx02B8eHt5oMQIAAAAAJF0ok/pGOPGGMkk3b/8flWlyhjN9tTwVFRUKDKx9ko7i/zbNmTNHaWlptm2r1apLly4pODhYJhPfBDa3K1euqHPnzvrnP/+pgICA5g4HTiB3rom8uS5y57rInWsib66L3Lkmd8+bYRiqqKhQWFhYnW0p/n/Uvn17eXt7q6yszG5/WVmZQkNDHdr7+PjIx8fHbl+7du0aM0TchoCAALcc5J6A3Lkm8ua6yJ3rIneuiby5LnLnmtw5b3XN+F/nGjdDNwGz2ax+/fopLy/Pts9qtSovL0+JiYnNGBkAAAAAAHeGmf+fSEtL06RJkxQXF6f4+HgtW7ZMVVVVtqf/AwAAAADgiij+f2LcuHG6cOGC5s+fr9LSUsXExGj79u0ODwFEy+fj46OMjAyHWzPQ8pE710TeXBe5c13kzjWRN9dF7lwTeft/JqM+/xMAAAAAAAC4LO75BwAAAADAzVH8AwAAAADg5ij+AQAAAABwcxT/AAAAAAC4OYp/tHj5+fkaPXq0wsLCZDKZ9OGHH9b5np07d6pv377y8fFRt27dtHbtWrvjCxYskMlksnv17NmzcU7
Agzmbu5KSEk2YMEGRkZHy8vJSamrqTdvl5uaqZ8+eatOmjaKiovTRRx81fPAerjFyt3btWodx16ZNm8Y5AQ/lbN4++OADPfroo+rQoYMCAgKUmJioHTt2OLRbsWKFIiIi1KZNGyUkJOjAgQONdAaeqzFyx7Wu8Tmbt88++0wDBw5UcHCwfH191bNnTy1dutShHWOu8TVG7hhzTeN2aoPr9uzZo1atWikmJsbhmCeMO4p/tHhVVVWKjo7WihUr6tX+1KlTSk5O1kMPPaSioiKlpqZq6tSpDh+KevfurZKSEtvrs88+a4zwPZqzuaupqVGHDh2Unp6u6Ojom7bZu3evxo8fr2effVaHDx/WmDFjNGbMGB09erQhQ/d4jZE7SQoICLAbd6dPn26okCHn85afn69HH31UH330kQ4ePKiHHnpIo0eP1uHDh21tNmzYoLS0NGVkZOjQoUOKjo5WUlKSzp8/31in4ZEaI3cS17rG5mze7rrrLs2YMUP5+fk6fvy40tPTlZ6ernfeecfWhjHXNBojdxJjrik4m7vrysvLNXHiRD3yyCMOxzxm3BmAC5FkbN68udY2s2bNMnr37m23b9y4cUZSUpJtOyMjw4iOjm6ECHEr9cndTw0ZMsR48cUXHfaPHTvWSE5OttuXkJBgPP/883cYIW6loXKXlZVlBAYGNlhcqJ2zebuuV69eRmZmpm07Pj7emD59um3bYrEYYWFhxquvvtoQYeImGip3XOua1u3m7Re/+IXx61//2rbNmGt6DZU7xlzTcyZ348aNM9LT02+aJ08Zd8z8w+0UFBRo2LBhdvuSkpJUUFBgt+/rr79WWFiYunbtqpSUFJ05c6Ypw8Rtqm9+0TJVVlYqPDxcnTt31uOPP65jx441d0j4CavVqoqKCgUFBUmSrl69qoMHD9qNOS8vLw0bNowx18LcmLvruNa1bIcPH9bevXs1ZMgQSYw5V3Jj7q5jzLVMWVlZ+uabb5SRkeFwzJPGHcU/3E5paalCQkLs9oWEhOjKlSv69ttvJUkJCQlau3attm/frlWrVunUqVMaPHiwKioqmiNkOOFW+S0tLW2miFBfPXr00Jo1a7Rlyxa99957slqtGjBggM6ePdvcoeFHixcvVmVlpcaOHStJunjxoiwWC2POBdyYO4lrXUt27733ysfHR3FxcZo+fbqmTp0qiTHnCm6VO4kx11J9/fXXmj17tt577z21atXK4bgnjTvHswc8wMiRI20/9+nTRwkJCQoPD9fGjRv17LPPNmNkgPtKTExUYmKibXvAgAF64IEH9Oc//1mvvPJKM0YGSXr//feVmZmpLVu2qGPHjs0dDpxwq9xxrWu5du/ercrKSu3bt0+zZ89Wt27dNH78+OYOC/VQW+4Ycy2PxWLRhAkTlJmZqcjIyOYOp9lR/MPthIaGqqyszG5fWVmZAgIC5Ovre9P3tGvXTpGRkTp58mRThIg7cKv8hoaGNlNEuF2tW7dWbGws464FyMnJ0dSpU5Wbm2u37LF9+/by9vZmzLVgt8rdzXCtaznuu+8+SVJUVJTKysq0YMECjR8/njHnAm6Vu5thzDW/iooKFRYW6vDhw5oxY4akH26TMgxDrVq10scff6xBgwZ5zLhj2T/cTmJiovLy8uz2ffLJJ3YzjjeqrKxUcXGxOnXq1Njh4Q7dTn7RMlksFh05coRx18yys7M1ZcoUZWdnKzk52e6Y2WxWv3797Mac1WpVXl4eY64FqC13N8O1rmWyWq2qqamRxJhzNT/N3c0w5ppfQECAjhw5oqKiIttr2rRp6tGjh4qKipSQkOBR446Zf7R4lZWVdt+Ynjp1SkVFRQoKClKXLl00Z84c/etf/9L69eslSdOmTdNbb72lWbNm6ZlnntHf/vY3bdy4UVu3brX1MXPmTI0ePVrh4eE6d+6cMjIy5O3tzZK7BuZs7iSpqKjI9t4LFy6oqKhIZrNZvXr1kiS9+OKLGjJkiJYsWaLk5GTl5OSosLDQ4V/t4M40Ru7+8Ic/qH///urWrZvKy8u1aNEinT592u5+SdwZZ/P2/vvva9KkSXrjjTeUkJBgu7fR19dXgYGBkqS0tDRNmjRJcXFxio+P17Jly1RVVaUpU6Y0/Qm6scbIHde6xuds3lasWKEuXbrY/vd7fn6+Fi9erN/97ne2PhhzTaMxcseYaxrO5M7Ly0sPPvig3fs7duyoNm3a2O33mHHX3P9uAKjLp59+akhyeE2aNMkwDMOYNGmSMWTIEIf3xMTEGGaz2ejatauRlZVld3zcuHFGp06dDLPZbNxzzz3GuHHjjJMnTzbNCXmQ28ndzdqHh4fbtdm4caMRGRlpmM1mo3fv3sbWrVub5oQ8SGPkLjU11ejSpYthNpuNkJAQY9SoUcahQ4ea7qQ8gLN5GzJkSK3tr1u+fLktd/Hx8ca+ffua7qQ8RGPkjmtd43M2b2+++abRu3dvw8/PzwgICDBiY2ONlStXGhaLxa5fxlzja4zcMeaaxu18RvmpW/1LRk8YdybDMIyG+RoBAAAAAAC0RNzzDwAAAACAm6P4BwAAAADAzVH8AwAAAADg5ij+AQAAAABwcxT/AAAAAAC4OYp/AAAAAADcHMU/AAAAAABujuIfAAAAAIBGkJ+fr9GjRyssLEwmk0kffvih031s3LhRMTEx8vPzU3h4uBYtWnRbsVD8AwAAm8mTJ2vMmDHNHQYAAG6hqqpK0dHRWrFixW29f9u2bUpJSdG0adN09OhRrVy5UkuXLtVbb73ldF8mwzCM24oCAAC4FJPJVOvxjIwMvfTSSzIMQ+3atWuaoG5i8uTJKi8vv63ZEQAAWiqTyaTNmzfbfcleU1OjuXPnKjs7W+Xl5XrwwQe1cOFCDR06VJI0YcIEXbt2Tbm5ubb3LF++XK+99prOnDlT57X9p1o11IkAAICWraSkxPbzhg0bNH/+fJ04ccK2z9/fX/7+/s0RGgAAHmnGjBn64osvlJOTo7CwMG3evFkjRozQkSNH1L17d9XU1MjPz8/uPb6+vjp79qxOnz6tiIiIev8ulv0DAOAhQkNDba/AwECZTCa7ff7+/g7L/ocOHaoXXnhBqampuvvuuxUSEqLVq1erqqpKU6ZMUdu2bdWtWzdt27bN7ncdPXpUI0eOlL+/v0JCQvT000/r4sWLtuObNm1SVFSUfH19FRwcrGHDhqmqqkoLFizQunXrtGXLFplMJplMJu3cuVOS9PLLLysyMlJ+fn7q2rWr5s2bp2vXrtn6XLBggWJiYrRmzRp16dJF/v7++u1vfyuLxaLXXntNoaGh6tixo/70pz/ZxWoymbRq1SqNHDlSvr6+6tq1qzZt2tTwCQAA4CfOnDmjrKws5ebmavDgwbr//vs1c+ZMDRo0SFlZWZKkpKQkffDBB8rLy5PVatVXX32lJUuWSLL/Ur8+KP4BAECt1q1bp/bt2+vAgQN64YUX9Jvf/EZPPvmkBgwYoEOHDmn48OF6+umnVV1dLUkqLy/Xww8/rNjYWBUWFmr79u0qKyvT2LFjJf3wYWX8+PF65plndPz4ce
3cuVNPPPGEDMPQzJkzNXbsWI0YMUIlJSUqKSnRgAEDJElt27bV2rVr9cUXX+iNN97Q6tWrtXTpUrtYi4uLtW3bNm3fvl3Z2dl69913lZycrLNnz2rXrl1auHCh0tPTtX//frv3zZs3T7/85S/1+eefKyUlRU899ZSOHz/eBH9dAICnOnLkiCwWiyIjI22r7/z9/bVr1y4VFxdLkp577jnNmDFDjz32mMxms/r376+nnnpKkuTl5Vw5zz3/AAB4oLVr1yo1NVXl5eV2+2+8337o0KGyWCzavXu3JMlisSgwMFBPPPGE1q9fL0kqLS1Vp06dVFBQoP79++uPf/yjdu/erR07dtj6PXv2rDp37qwTJ06osrJS/fr10z/+8Q+Fh4c7xFbfe/4XL16snJwcFRYWSvph5n/RokUqLS1V27ZtJUkjRozQiRMnVFxcbPuQ1LNnT02ePFmzZ8+W9MPM/7Rp07Rq1Spb3/3791ffvn21cuXKev5FAQCo3Y33/G/YsEEpKSk6duyYvL297dr6+/srNDTUtm2xWFRaWqoOHTooLy9Po0aN0vnz59WhQ4d6/37u+QcAALXq06eP7Wdvb28FBwcrKirKti8kJESSdP78eUnS559/rk8//fSmzw8oLi7W8OHD9cgjjygqKkpJSUkaPny4fvWrX+nuu++uNY4NGzbozTffVHFxsSorK/X9998rICDArk1ERISt8L8em7e3t93sSEhIiC3W6xITEx22i4qKao0HAIA7ERsbK4vFovPnz2vw4MG1tvX29tY999wjScrOzlZiYqJThb9E8Q8AAOrQunVru22TyWS37/qThq1WqySpsrJSo0eP1sKFCx366tSpk7y9vfXJJ59o7969+vjjj7V8+XLNnTtX+/fv13333XfTGAoKCpSSkqLMzEwlJSUpMDBQOTk5tvse6xvr9X3XYwUAoDFVVlbq5MmTtu1Tp06pqKhIQUFBioyMVEpKiiZOnKglS5YoNjZWFy5cUF5envr06aPk5GRdvHhRmzZt0tChQ/Xdd9/ZnhGwa9cup2Phnn8AANCg+vbtq2PHjikiIkLdunWze911112SfijABw4cqMzMTB0+fFhms1mbN2+WJJnNZlksFrs+9+7dq/DwcM2dO1dxcXHq3r27Tp8+3WAx79u3z2H7gQceaLD+AQCeqbCwULGxsYqNjZUkpaWlKTY2VvPnz5ckZWVlaeLEifr973+vHj16aMyYMfr73/+uLl262PpYt26d4uLiNHDgQB07dkw7d+5UfHy807Ew8w8AABrU9OnTtXr1ao0fP16zZs1SUFCQTp48qZycHP3lL39RYWGh8vLyNHz4cHXs2FH79+/XhQsXbMV2RESEduzYoRMnTig4OFiBgYHq3r27zpw5o5ycHP3sZz/T1q1bbV8WNITc3FzFxcVp0KBB+utf/6oDBw7o3XffbbD+AQCeaejQoartMXutW7dWZmamMjMzb3q8ffv2KigoaJBYmPkHAAANKiwsTHv27JHFYtHw4cMVFRWl1NRUtWvXTl5eXgoICFB+fr5GjRqlyMhIpaena8mSJRo5cqSkH55s3KNHD8XFxalDhw7as2ePfv7zn+ull17SjBkzFBMTo71792revHkNFnNmZqZycnLUp08frV+/XtnZ2erVq1eD9Q8AQHPjaf8AAMCj3fj0ZQAA3BEz/wAAAAAAuDmKfwAAAAAA3BwP/AMAAB6NOyABAJ6AmX8AAAAAANwcxT8AAAAAAG6O4h8AAAAAADdH8Q8AAAAAgJuj+AcAAAAAwM1R/AMAAAAA4OYo/gEAAAAAcHMU/wAAAAAAuLn/AxvQweiDU8GTAAAAAElFTkSuQmCC",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from datetime import datetime\n",
+ "labels = [\"Base\", \"Gap\", \"Valid\", \"Test\"]\n",
+ "\n",
+ "# Считаем события в каждом интервале\n",
+ "counts = []\n",
+ "intervals = [1394150400, 1399939200, 1403049600]\n",
+ "for i in range(len(intervals)-1):\n",
+ " start, end = intervals[i], intervals[i+1]\n",
+ " \n",
+ " q = df.lazy().explode([\"timestamps\"])\n",
+ " if end is not None:\n",
+ " q = q.filter((pl.col(\"timestamps\") >= start) & (pl.col(\"timestamps\") < end))\n",
+ " else:\n",
+ " q = q.filter(pl.col(\"timestamps\") >= start)\n",
+ " \n",
+ " count = q.select(pl.len()).collect().item()\n",
+ " counts.append(count)\n",
+ " \n",
+ " end_str = datetime.fromtimestamp(end).strftime('%Y-%m-%d') if end else \"Inf\"\n",
+ " start_str = datetime.fromtimestamp(start).strftime('%Y-%m-%d') if start > 0 else \"Start\"\n",
+ " \n",
+ " print(f\"Part {i} [{labels[i]}]: {count} events ({start_str} -> {end_str})\")\n",
+ "\n",
+ "# 3. Гистограмма распределения событий во времени\n",
+ "all_timestamps = df.select(pl.col(\"timestamps\").explode()).to_series().to_numpy()\n",
+ "\n",
+ "plt.figure(figsize=(12, 6))\n",
+ "plt.hist(all_timestamps, bins=100, color='skyblue', alpha=0.7, label='Events')\n",
+ "\n",
+ "# Рисуем линии отсечек\n",
+ "colors = ['red', 'orange', 'green']\n",
+ "for cutoff, color, label in zip(intervals, colors, labels[1:]):\n",
+ " plt.axvline(x=cutoff, color=color, linestyle='--', linewidth=2, label=f'Cutoff: {label}')\n",
+ "\n",
+ "plt.title(\"Распределение взаимодействий во времени\")\n",
+ "plt.xlabel(\"Timestamp\")\n",
+ "plt.ylabel(\"Количество событий\")\n",
+ "plt.legend()\n",
+ "plt.grid(True, alpha=0.3)\n",
+ "plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/sigir/yambda_processing/YambdaDatasetProcessing.ipynb b/sigir/yambda_processing/YambdaDatasetProcessing.ipynb
new file mode 100644
index 0000000..c36af65
--- /dev/null
+++ b/sigir/yambda_processing/YambdaDatasetProcessing.ipynb
@@ -0,0 +1,640 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "SbkKok0dfjjS"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.12/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from collections import defaultdict, Counter\n",
+ "from typing import Any, Dict, List, Optional, Tuple\n",
+ "\n",
+ "from datasets import load_dataset\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "import polars as pl\n",
+ "\n",
+ "import torch\n",
+ "import torch.nn as nn\n",
+ "from torch.utils.data import Dataset, DataLoader"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gwwdsnwBfjjT"
+ },
+ "source": [
+ "## 🛠️ Подготовка данных"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "viKiSaEKfjjT",
+ "outputId": "6229cbba-dc3b-4d15-a8e4-ac08e4e187d6"
+ },
+ "outputs": [],
+ "source": [
+ "format = 'sequential'\n",
+ "size = '50m'\n",
+ "events = 'listens'\n",
+ "# listens_data = load_dataset('yandex/yambda', data_dir=f'{format}/{size}', data_files=f'{events}.parquet')\n",
+ "# yambda_df = pl.from_arrow(listens_data['train'].data.table)\n",
+ "yambda_df = pl.read_parquet(\"/home/jovyan/yambda_sequential_50m/sequential/50m/listens.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VNanksDRfjjT",
+ "outputId": "e118e2b4-0076-475d-9104-5e1565dab7d9"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✅ test_yambda_data_loading: OK\n"
+ ]
+ }
+ ],
+ "source": [
+ "def test_yambda_data_loading():\n",
+ " assert isinstance(yambda_df, pl.DataFrame), 'yambda_df должен быть Polars DataFrame'\n",
+ " assert yambda_df.shape == (9238, 6), f'Неправильный размер: {yambda_df.shape}'\n",
+ "\n",
+ " expected_cols = {'uid', 'timestamp', 'item_id', 'is_organic', 'played_ratio_pct', 'track_length_seconds'}\n",
+ " assert set(yambda_df.columns) == expected_cols, f'Неправильные колонки: {yambda_df.columns}'\n",
+ "\n",
+ " assert yambda_df['item_id'].dtype == pl.List(pl.UInt32), 'item_id должен быть List[UInt32]'\n",
+ " assert yambda_df['timestamp'].dtype == pl.List(pl.UInt32), 'timestamp должен быть List[UInt32]'\n",
+ "\n",
+ " assert yambda_df['item_id'].list.len().min() > 0, 'Есть пустые истории'\n",
+ "\n",
+ " print('✅ test_yambda_data_loading: OK')\n",
+ "\n",
+ "test_yambda_data_loading()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 527
+ },
+ "id": "q33EG4wlc8ev",
+ "outputId": "01d03740-713e-46e8-d8c5-81ebf6b71546"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(shape: (5, 6)\n",
+ " ┌─────┬─────────────────────┬────────────┬─────────────┬─────────────────────┬─────────────────────┐\n",
+ " │ uid ┆ timestamp ┆ item_id ┆ is_organic ┆ played_ratio_pct ┆ track_length_second │\n",
+ " │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ s │\n",
+ " │ u32 ┆ list[u32] ┆ list[u32] ┆ list[u8] ┆ list[u16] ┆ --- │\n",
+ " │ ┆ ┆ ┆ ┆ ┆ list[u32] │\n",
+ " ╞═════╪═════════════════════╪════════════╪═════════════╪═════════════════════╪═════════════════════╡\n",
+ " │ 100 ┆ [39420, 39420, … ┆ [8326270, ┆ [0, 0, … 0] ┆ [100, 100, … 100] ┆ [170, 105, … 165] │\n",
+ " │ ┆ 25966140] ┆ 1441281, … ┆ ┆ ┆ │\n",
+ " │ ┆ ┆ 4734787] ┆ ┆ ┆ │\n",
+ " │ 200 ┆ [14329075, ┆ [3285270, ┆ [1, 1, … 1] ┆ [9, 28, … 100] ┆ [170, 170, … 145] │\n",
+ " │ ┆ 14329075, … ┆ 5253582, … ┆ ┆ ┆ │\n",
+ " │ ┆ 2545672… ┆ 3778807] ┆ ┆ ┆ │\n",
+ " │ 300 ┆ [54090, 54100, … ┆ [618910, ┆ [1, 1, … 1] ┆ [2, 4, … 15] ┆ [270, 130, … 210] │\n",
+ " │ ┆ 25907225] ┆ 8793425, … ┆ ┆ ┆ │\n",
+ " │ ┆ ┆ 9286415] ┆ ┆ ┆ │\n",
+ " │ 500 ┆ [22695440, ┆ [6417502, ┆ [0, 0, … 1] ┆ [100, 37, … 13] ┆ [225, 210, … 230] │\n",
+ " │ ┆ 22695690, … ┆ 6896222, … ┆ ┆ ┆ │\n",
+ " │ ┆ 2486145… ┆ 4077285] ┆ ┆ ┆ │\n",
+ " │ 600 ┆ [1329190, 1329405, ┆ [8077497, ┆ [0, 0, … 0] ┆ [100, 100, … 100] ┆ [245, 215, … 205] │\n",
+ " │ ┆ … 25997540] ┆ 1865247, … ┆ ┆ ┆ │\n",
+ " │ ┆ ┆ 6481452] ┆ ┆ ┆ │\n",
+ " └─────┴─────────────────────┴────────────┴─────────────┴─────────────────────┴─────────────────────┘,\n",
+ " shape: (0, 6)\n",
+ " ┌─────┬───────────┬───────────┬────────────┬──────────────────┬──────────────────────┐\n",
+ " │ uid ┆ timestamp ┆ item_id ┆ is_organic ┆ played_ratio_pct ┆ track_length_seconds │\n",
+ " │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ " │ u32 ┆ list[u32] ┆ list[u32] ┆ list[u8] ┆ list[u16] ┆ list[u32] │\n",
+ " ╞═════╪═══════════╪═══════════╪════════════╪══════════════════╪══════════════════════╡\n",
+ " └─────┴───────────┴───────────┴────────────┴──────────────────┴──────────────────────┘)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(yambda_df.head(), yambda_df.filter(yambda_df['timestamp'].list.len() != yambda_df['item_id'].list.len()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "-9ou8IARfjjT"
+ },
+ "outputs": [
+ {
+ "ename": "ColumnNotFoundError",
+ "evalue": "'explode' on column: 'is_organic' is invalid\n\nSchema at this point: Schema:\nname: _idx, field: UInt32\nname: uid, field: UInt32\nname: timestamp, field: List(UInt32)\nname: item_id, field: List(UInt32)\n",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mColumnNotFoundError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[7], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m yambda_df \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 2\u001b[0m \u001b[43myambda_df\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcol\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43muid\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m%\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m200\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# надо убрать\u001b[39;49;00m\n\u001b[1;32m 4\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_row_index\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m_idx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m----> 5\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexplode\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtimestamp\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mitem_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mis_organic\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mplayed_ratio_pct\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtrack_length_seconds\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;241m.\u001b[39mfilter(\n\u001b[1;32m 13\u001b[0m (pl\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mis_organic\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m) \u001b[38;5;241m&\u001b[39m\n\u001b[1;32m 14\u001b[0m (pl\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mplayed_ratio_pct\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m50\u001b[39m)\n\u001b[1;32m 15\u001b[0m )\n\u001b[1;32m 16\u001b[0m \u001b[38;5;241m.\u001b[39mgroup_by([\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_idx\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124muid\u001b[39m\u001b[38;5;124m'\u001b[39m], maintain_order\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;241m.\u001b[39magg([\n\u001b[1;32m 18\u001b[0m pl\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimestamp\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 19\u001b[0m pl\u001b[38;5;241m.\u001b[39mcol(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mitem_id\u001b[39m\u001b[38;5;124m'\u001b[39m),\n\u001b[1;32m 20\u001b[0m ])\n\u001b[1;32m 21\u001b[0m 
\u001b[38;5;241m.\u001b[39mdrop(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_idx\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 22\u001b[0m )\n",
+ "File \u001b[0;32m/usr/local/lib/python3.12/dist-packages/polars/dataframe/frame.py:8072\u001b[0m, in \u001b[0;36mDataFrame.explode\u001b[0;34m(self, columns, *more_columns)\u001b[0m\n\u001b[1;32m 8015\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexplode\u001b[39m(\n\u001b[1;32m 8016\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 8017\u001b[0m columns: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m|\u001b[39m Expr \u001b[38;5;241m|\u001b[39m Sequence[\u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m|\u001b[39m Expr],\n\u001b[1;32m 8018\u001b[0m \u001b[38;5;241m*\u001b[39mmore_columns: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m|\u001b[39m Expr,\n\u001b[1;32m 8019\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame:\n\u001b[1;32m 8020\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 8021\u001b[0m \u001b[38;5;124;03m Explode the dataframe to long format by exploding the given columns.\u001b[39;00m\n\u001b[1;32m 8022\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 8070\u001b[0m \u001b[38;5;124;03m └─────────┴─────────┘\u001b[39;00m\n\u001b[1;32m 8071\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 8072\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlazy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexplode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmore_columns\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_eager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m/usr/local/lib/python3.12/dist-packages/polars/lazyframe/frame.py:2053\u001b[0m, in \u001b[0;36mLazyFrame.collect\u001b[0;34m(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, streaming, engine, background, _eager, **_kwargs)\u001b[0m\n\u001b[1;32m 2051\u001b[0m \u001b[38;5;66;03m# Only for testing purposes\u001b[39;00m\n\u001b[1;32m 2052\u001b[0m callback \u001b[38;5;241m=\u001b[39m _kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpost_opt_callback\u001b[39m\u001b[38;5;124m\"\u001b[39m, callback)\n\u001b[0;32m-> 2053\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m wrap_df(\u001b[43mldf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcallback\u001b[49m\u001b[43m)\u001b[49m)\n",
+ "\u001b[0;31mColumnNotFoundError\u001b[0m: 'explode' on column: 'is_organic' is invalid\n\nSchema at this point: Schema:\nname: _idx, field: UInt32\nname: uid, field: UInt32\nname: timestamp, field: List(UInt32)\nname: item_id, field: List(UInt32)\n"
+ ]
+ }
+ ],
+ "source": [
+ "yambda_df = (\n",
+ " yambda_df\n",
+ " .filter(pl.col('uid') % 200 == 0) # надо убрать\n",
+ " .with_row_index('_idx')\n",
+ " .explode([\n",
+ " 'timestamp',\n",
+ " 'item_id',\n",
+ " 'is_organic',\n",
+ " 'played_ratio_pct',\n",
+ " 'track_length_seconds',\n",
+ " ])\n",
+ " .filter(\n",
+ " (pl.col('is_organic') == 0) &\n",
+ " (pl.col('played_ratio_pct') >= 50)\n",
+ " )\n",
+ " .group_by(['_idx', 'uid'], maintain_order=True)\n",
+ " .agg([\n",
+ " pl.col('timestamp'),\n",
+ " pl.col('item_id'),\n",
+ " ])\n",
+ " .drop('_idx')\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4HTturbHfjjU",
+ "outputId": "ea8b1d93-4997-441a-c6e3-2628acd11d7f"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✅ test_yambda_filtering: OK\n"
+ ]
+ }
+ ],
+ "source": [
+ "def test_yambda_filtering():\n",
+ " assert yambda_df.shape[0] == 4289, \\\n",
+ " f'Неправильное количество пользователей: {yambda_df.shape[0]}'\n",
+ "\n",
+ " expected_columns = {'uid', 'timestamp', 'item_id'}\n",
+ " actual_columns = set(yambda_df.columns)\n",
+ " assert actual_columns == expected_columns, \\\n",
+ " f'Неправильные колонки. Ожидалось: {expected_columns}, получено: {actual_columns}'\n",
+ "\n",
+ " assert yambda_df['timestamp'].dtype == pl.List(pl.UInt32), \\\n",
+ " f\"timestamp должен быть List[UInt32], получено: {yambda_df['timestamp'].dtype}\"\n",
+ " assert yambda_df['item_id'].dtype == pl.List(pl.UInt32), \\\n",
+ " f\"item_id должен быть List[UInt32], получено: {yambda_df['item_id'].dtype}\"\n",
+ "\n",
+ " seq_lengths = yambda_df['item_id'].list.len()\n",
+ " assert seq_lengths.min() >= 1, \\\n",
+ " f'Минимальная длина последовательности должна быть >= 1, получено: {seq_lengths.min()}'\n",
+ " assert seq_lengths.sum() == 7587469, \\\n",
+ " f'Общее количество событий неверно. Ожидалось: 7587469, получено: {seq_lengths.sum()}'\n",
+ "\n",
+ " unique_items = yambda_df.select('item_id').explode('item_id').unique().shape[0]\n",
+ " assert unique_items == 304787, \\\n",
+ " f'Количество уникальных айтемов неверно. Ожидалось: 304787, получено: {unique_items}'\n",
+ "\n",
+ " print('✅ test_yambda_filtering: OK')\n",
+ "\n",
+ "test_yambda_filtering()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Оригинальные эмбеддинги: (7721749, 3)\n",
+ "Колонки: ['item_id', 'embed', 'normalized_embed']\n",
+ "shape: (5, 3)\n",
+ "┌─────────┬─────────────────────────────────┬─────────────────────────────────┐\n",
+ "│ item_id ┆ embed ┆ normalized_embed │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ u32 ┆ list[f64] ┆ list[f64] │\n",
+ "╞═════════╪═════════════════════════════════╪═════════════════════════════════╡\n",
+ "│ 2 ┆ [-1.534035, -0.366767, … 0.999… ┆ [-0.064638, -0.015454, … 0.042… │\n",
+ "│ 3 ┆ [-3.761467, -1.068254, … -2.66… ┆ [-0.163937, -0.046558, … -0.11… │\n",
+ "│ 4 ┆ [2.445533, -2.523603, … -0.536… ┆ [0.076272, -0.078707, … -0.016… │\n",
+ "│ 5 ┆ [0.832846, 0.116125, … -1.4857… ┆ [0.03149, 0.004391, … -0.05617… │\n",
+ "│ 6 ┆ [-2.431483, -0.56872, … 0.0946… ┆ [-0.10345, -0.024197, … 0.0040… │\n",
+ "└─────────┴─────────────────────────────────┴─────────────────────────────────┘\n"
+ ]
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "import pandas as pd\n",
+ "import pickle\n",
+ "\n",
+ "# === 1. Загрузить оригинальные embeddings ===\n",
+ "embeddings_path = \"/home/jovyan/yambda_embeddings/embeddings.parquet\"\n",
+ "emb_df = pl.read_parquet(embeddings_path)\n",
+ "\n",
+ "print(f\"Оригинальные эмбеддинги: {emb_df.shape}\")\n",
+ "print(f\"Колонки: {emb_df.columns}\")\n",
+ "print(emb_df.head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Валидные item_id: 7721749\n",
+ "Было строк: 4289\n",
+ "Стало строк: 4138\n"
+ ]
+ }
+ ],
+ "source": [
+ "valid_item_ids = set(emb_df['item_id'].to_list())\n",
+ "print(f\"\\nВалидные item_id: {len(valid_item_ids)}\")\n",
+ "valid_ids_pl = pl.Series(list(valid_item_ids))\n",
+ "\n",
+ "valid_item_ids = set(emb_df['item_id'].to_list())\n",
+ "valid_ids_pl = pl.Series(list(valid_item_ids))\n",
+ "\n",
+ "yambda_df_filtered = (\n",
+ " yambda_df\n",
+ " .with_columns(\n",
+ " pl.col(\"item_id\").list.eval(\n",
+ " pl.when(pl.element().is_in(valid_ids_pl))\n",
+ " .then(pl.int_range(pl.len()))\n",
+ " .otherwise(None)\n",
+ " ).list.drop_nulls().alias(\"valid_indices\")\n",
+ " )\n",
+ " .with_columns([\n",
+ " pl.col(\"item_id\").list.gather(pl.col(\"valid_indices\")),\n",
+ " pl.col(\"timestamp\").list.gather(pl.col(\"valid_indices\"))\n",
+ " ])\n",
+ " .drop(\"valid_indices\")\n",
+ " .filter(pl.col(\"item_id\").list.len() > 0)\n",
+ " .rename({\"item_id\": \"item_ids\", \"timestamp\": \"timestamps\"})\n",
+ ")\n",
+ "yambda_df_filtered = yambda_df_filtered.filter(yambda_df_filtered['item_ids'].list.len() >= 5)\n",
+ "\n",
+ "print(f\"Было строк: {yambda_df.shape[0]}\")\n",
+ "print(f\"Стало строк: {yambda_df_filtered.shape[0]}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "3️⃣ Получите все уникальные ID треков из датасета и создайте маппинг: старый_id - новый_id, где новый_id находится в диапазоне от 0 до N - 1.\n",
+ "\n",
+ "Модели глубокого обучения требуют, чтобы категориальные признаки (в нашем случае ID треков) были представлены целыми числами в диапазоне от 0 до N-1, где N — количество уникальных треков. Датасет Yambda содержит оригинальные ID треков, которые могут быть разреженными (например, [100, 5000, 7, 12000, ...]) — это неэффективно для embedding-таблиц."
+ ]
+ },
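+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A hedged aside, not part of the original pipeline: the remapping in the next cell calls `map_elements` with a Python dict, which runs row by row in Python. The sketch below shows the same old_id -> new_id remapping kept inside polars via `Expr.replace_strict` (available in recent polars releases); the `toy` / `toy_unique` names are illustrative only."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sketch only: same old_id -> new_id remapping, kept inside polars.\n",
+ "toy = pl.DataFrame({'item_ids': [[100, 5000], [7, 100, 12000]]})\n",
+ "\n",
+ "# Dense 0..N-1 ids for every distinct item, mirroring the cell below\n",
+ "toy_unique = (\n",
+ " toy.select('item_ids')\n",
+ " .explode('item_ids')\n",
+ " .unique()\n",
+ " .sort('item_ids')\n",
+ " .with_row_index('new_item_ids')\n",
+ ")\n",
+ "\n",
+ "# replace_strict maps every element through the old->new pair of Series and\n",
+ "# raises on unmapped values, so silent id drift is impossible\n",
+ "toy_remapped = toy.with_columns(\n",
+ " pl.col('item_ids').list.eval(\n",
+ " pl.element().replace_strict(\n",
+ " toy_unique['item_ids'],\n",
+ " toy_unique['new_item_ids'],\n",
+ " return_dtype=pl.UInt32,\n",
+ " )\n",
+ " )\n",
+ ")\n",
+ "print(toy_remapped)"
+ ]
+ },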
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "unique_items = (\n",
+ " yambda_df_filtered\n",
+ " .select('item_ids')\n",
+ " .explode('item_ids')\n",
+ " .unique()\n",
+ " .sort('item_ids')\n",
+ ").with_row_index('new_item_ids')\n",
+ "\n",
+ "\n",
+ "item_mapping = dict(zip(unique_items['item_ids'], unique_items['new_item_ids']))\n",
+ "\n",
+ "\n",
+ "yambda_df_filtered = yambda_df_filtered.with_columns([\n",
+ " pl.col('item_ids')\n",
+ " .map_elements(\n",
+ " lambda items: [item_mapping[item] for item in items],\n",
+ " return_dtype=pl.List(pl.UInt32)\n",
+ " )\n",
+ " .alias('item_ids')\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✅ test_item_mapping: OK\n"
+ ]
+ }
+ ],
+ "source": [
+ "def test_item_mapping():\n",
+ " assert unique_items.shape == (292865, 2), f'Неправильный размер unique_items: {unique_items.shape}'\n",
+ " assert set(unique_items.columns) == {'new_item_ids', 'item_ids'}, 'Неправильные колонки unique_items'\n",
+ "\n",
+ " assert len(item_mapping) == 292865, f'Неправильный размер item_mapping: {len(item_mapping)}'\n",
+ " assert item_mapping[50] == 0 and item_mapping[175] == 1 and item_mapping[195] == 2, \\\n",
+ " 'Неверные первые маппинги'\n",
+ "\n",
+ " new_ids = unique_items['new_item_ids']\n",
+ " assert new_ids.min() == 0 and new_ids.max() == 292864, 'new_item_id должны быть в [0, 292865,]'\n",
+ "\n",
+ " all_ids = yambda_df_filtered.select('item_ids').explode('item_ids')['item_ids']\n",
+ " assert all_ids.min() == 0 and all_ids.max() == 292864, 'item_id в yambda_df не обновлены'\n",
+ " assert all_ids.n_unique() == 292865, 'Количество уникальных item_id изменилось'\n",
+ "\n",
+ " print('✅ test_item_mapping: OK')\n",
+ "\n",
+ "test_item_mapping()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Маппинг использует: 292865 уникальных item_id\n",
+ "Переиндексированные эмбеддинги: (292865, 2)\n",
+ "shape: (5, 2)\n",
+ "┌─────────────────────────────────┬─────────┐\n",
+ "│ embedding ┆ item_id │\n",
+ "│ --- ┆ --- │\n",
+ "│ list[f64] ┆ u32 │\n",
+ "╞═════════════════════════════════╪═════════╡\n",
+ "│ [-0.0526, 0.048672, … -0.04217… ┆ 0 │\n",
+ "│ [0.090222, -0.00718, … -0.0862… ┆ 1 │\n",
+ "│ [-0.00822, -0.057882, … 0.2188… ┆ 2 │\n",
+ "│ [-0.107289, -0.034719, … -0.02… ┆ 3 │\n",
+ "│ [0.012762, -0.043315, … 0.1494… ┆ 4 │\n",
+ "└─────────────────────────────────┴─────────┘\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"\\nМаппинг использует: {len(item_mapping)} уникальных item_id\")\n",
+ "\n",
+ "# === 3. Переиндексировать embeddings используя существующий маппинг ===\n",
+ "emb_df_reindexed = emb_df.with_columns(\n",
+ " pl.col('item_id')\n",
+ " .map_elements(\n",
+ " lambda x: item_mapping.get(x, None),\n",
+ " return_dtype=pl.UInt32\n",
+ " )\n",
+ " .alias('new_item_id')\n",
+ ").filter(pl.col('new_item_id').is_not_null()).drop('item_id', 'embed').rename({'new_item_id': 'item_id', 'normalized_embed': 'embedding'})\n",
+ "\n",
+ "print(f\"Переиндексированные эмбеддинги: {emb_df_reindexed.shape}\")\n",
+ "print(emb_df_reindexed.head())\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✅ test_item_mapping: OK\n"
+ ]
+ }
+ ],
+ "source": [
+ "def test_emb_item_mapping():\n",
+ " all_ids = emb_df_reindexed['item_id']\n",
+ " assert all_ids.min() == 0 and all_ids.max() == 292864, 'item_id в yambda_df не обновлены'\n",
+ " assert all_ids.n_unique() == 292865, 'Количество уникальных item_id изменилось'\n",
+ "\n",
+ " print('✅ test_item_mapping: OK')\n",
+ "\n",
+ "test_emb_item_mapping()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "✓ Сохранены embeddings: /home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet\n"
+ ]
+ }
+ ],
+ "source": [
+ "embeddings_output_parquet_path = \"/home/jovyan/IRec/sigir/yambda_data/yambda_embeddings_reindexed.parquet\"\n",
+ "emb_df_reindexed.write_parquet(embeddings_output_parquet_path)\n",
+ "print(f\"\\n✓ Сохранены embeddings: {embeddings_output_parquet_path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Тест пройден: все 4138 строк синхронизированы\n"
+ ]
+ }
+ ],
+ "source": [
+ "def test_integrity(df):\n",
+ " bad_rows = df.filter(\n",
+ " (pl.col(\"item_ids\").list.len() != pl.col(\"timestamps\").list.len()) | (pl.col(\"timestamps\").list.len() < 5)\n",
+ " )\n",
+ " \n",
+ " if bad_rows.height > 0:\n",
+ " print(f\"ОШИБКА: {bad_rows.height} строк рассинхронизированы!\")\n",
+ " raise ValueError(\"Рассинхрон массивов!\")\n",
+ " \n",
+ " print(f\"Тест пройден: все {df.height} строк синхронизированы\")\n",
+ "\n",
+ "test_integrity(yambda_df_filtered)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Сохранён filtered yambda_df: /home/jovyan/IRec/sigir/yambda_data/yambda_sequential_50m_filtered_reindexed.parquet\n"
+ ]
+ }
+ ],
+ "source": [
+ "yambda_output_parquet_path = \"/home/jovyan/IRec/sigir/yambda_data/yambda_sequential_50m_filtered_reindexed.parquet\"\n",
+ "yambda_df_filtered.write_parquet(yambda_output_parquet_path)\n",
+ "print(f\"Сохранён filtered yambda_df: {yambda_output_parquet_path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Сохранён маппинг: /home/jovyan/IRec/sigir/yambda_data/old_to_new_item_id_mapping.json\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "mapping_output_path = \"/home/jovyan/IRec/sigir/yambda_data/old_to_new_item_id_mapping.json\"\n",
+ "\n",
+ "with open(mapping_output_path, 'w') as f:\n",
+ " json.dump({str(k): v for k, v in item_mapping.items()}, f, indent=2)\n",
+ "\n",
+ "print(f\"Сохранён маппинг: {mapping_output_path}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (10, 3)| uid | timestamps | item_ids |
|---|
| u32 | list[u32] | list[u32] |
| 600 | [1329190, 1329405, … 25997540] | [252026, 58171, … 201909] |
| 800 | [121100, 121290, … 25977310] | [20844, 198210, … 60455] |
| 1000 | [11335730, 11335925, … 25972225] | [46643, 57592, … 95670] |
| 1400 | [280570, 280735, … 25993315] | [4634, 213798, … 104891] |
| 1600 | [899275, 930305, … 25941890] | [223933, 154424, … 104876] |
| 2000 | [18814620, 18828965, … 25225145] | [137828, 138498, … 19072] |
| 2200 | [10053900, 10054120, … 25948025] | [4923, 231643, … 28122] |
| 2400 | [14246260, 14246390, … 25999860] | [157350, 217652, … 75038] |
| 2600 | [6089640, 6089915, … 25951140] | [9426, 202953, … 140393] |
| 2800 | [19744285, 19744475, … 25894825] | [123607, 291065, … 272888] |
"
+ ],
+ "text/plain": [
+ "shape: (10, 3)\n",
+ "┌──────┬─────────────────────────────────┬────────────────────────────┐\n",
+ "│ uid ┆ timestamps ┆ item_ids │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ u32 ┆ list[u32] ┆ list[u32] │\n",
+ "╞══════╪═════════════════════════════════╪════════════════════════════╡\n",
+ "│ 600 ┆ [1329190, 1329405, … 25997540] ┆ [252026, 58171, … 201909] │\n",
+ "│ 800 ┆ [121100, 121290, … 25977310] ┆ [20844, 198210, … 60455] │\n",
+ "│ 1000 ┆ [11335730, 11335925, … 2597222… ┆ [46643, 57592, … 95670] │\n",
+ "│ 1400 ┆ [280570, 280735, … 25993315] ┆ [4634, 213798, … 104891] │\n",
+ "│ 1600 ┆ [899275, 930305, … 25941890] ┆ [223933, 154424, … 104876] │\n",
+ "│ 2000 ┆ [18814620, 18828965, … 2522514… ┆ [137828, 138498, … 19072] │\n",
+ "│ 2200 ┆ [10053900, 10054120, … 2594802… ┆ [4923, 231643, … 28122] │\n",
+ "│ 2400 ┆ [14246260, 14246390, … 2599986… ┆ [157350, 217652, … 75038] │\n",
+ "│ 2600 ┆ [6089640, 6089915, … 25951140] ┆ [9426, 202953, … 140393] │\n",
+ "│ 2800 ┆ [19744285, 19744475, … 2589482… ┆ [123607, 291065, … 272888] │\n",
+ "└──────┴─────────────────────────────────┴────────────────────────────┘"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "yambda_df_filtered.head(10)"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/sigir/yambda_processing/yambda_exps_data.ipynb b/sigir/yambda_processing/yambda_exps_data.ipynb
new file mode 100644
index 0000000..e8f8d8b
--- /dev/null
+++ b/sigir/yambda_processing/yambda_exps_data.ipynb
@@ -0,0 +1,1168 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "e2462a97-6705-44e1-a232-4dd78a5dfc85",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import polars as pl\n",
+ "import json\n",
+ "from typing import List, Dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "fd38624d-5796-4aa5-929f-7e82c5544f6c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "interactions_output_parquet_path = '/home/jovyan/IRec/sigir/yambda_data/yambda_sequential_50m_filtered_reindexed.parquet'\n",
+ "df = pl.read_parquet(interactions_output_parquet_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "69066941",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def merge_and_save(parts_to_merge, dirr, output_name):\n",
+ " merged = {}\n",
+ " print(f\"Merging {len(parts_to_merge)} files into {output_name}...\")\n",
+ " \n",
+ " for part in parts_to_merge:\n",
+ " # with open(fp, 'r') as f:\n",
+ " # part = json.load(f)\n",
+ " for uid, items in part.items():\n",
+ " if uid not in merged:\n",
+ " merged[uid] = []\n",
+ " merged[uid].extend(items)\n",
+ " \n",
+ " out_path = f\"{dirr}/{output_name}\"\n",
+ " with open(out_path, 'w') as f:\n",
+ " json.dump(merged, f)\n",
+ " print(f\"✓ Done: {out_path} (Users: {len(merged)})\")\n",
+ "\n",
+ "\n",
+ "def merge_and_save_with_filter(parts_to_merge, dirr, output_name, min_history_len=5):\n",
+ " merged = {}\n",
+ " print(f\"Merging {len(parts_to_merge)} files into {output_name} (min len={min_history_len})...\")\n",
+ " \n",
+ " for part in parts_to_merge:\n",
+ " for uid, items in part.items():\n",
+ " if uid not in merged:\n",
+ " merged[uid] = []\n",
+ " merged[uid].extend(items)\n",
+ "\n",
+ " filtered_merged = {}\n",
+ " filtered_count = 0\n",
+ " \n",
+ " for uid, items in merged.items():\n",
+ " if len(items) >= min_history_len:\n",
+ " filtered_merged[uid] = items\n",
+ " else:\n",
+ " filtered_count += 1\n",
+ " \n",
+ " print(f\"Filtered {filtered_count} users with history < {min_history_len}\")\n",
+ " print(f\"Remaining: {len(filtered_merged)} users\")\n",
+ " \n",
+ " out_path = f\"{dirr}/{output_name}\"\n",
+ " with open(out_path, 'w') as f:\n",
+ " json.dump(filtered_merged, f)\n",
+ " print(f\"Done: {out_path} (Users: {len(filtered_merged)})\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "ee127317-66b8-4f22-9109-94bcb8b1f1ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split_session_by_timestamps(\n",
+ " df: pl.DataFrame,\n",
+ " time_cutoffs: List[int],\n",
+ " output_dir: str = None,\n",
+ " return_dicts: bool = True\n",
+ ") -> List[Dict[int, List[int]]]:\n",
+ " \"\"\"\n",
+ " Args:\n",
+ " df: Polars DataFrame с колонками uid, item_ids (list), timestamps (list)\n",
+ " time_cutoffs: Лист временных точек для разбиения\n",
+ " output_dir: Директория для сохранения JSON файлов (опционально)\n",
+ " return_dicts: Возвращать ли словари (как json_data format)\n",
+ " \n",
+ " Возвращает лист словарей в формате {user_id: [item_ids для интервала]}\n",
+ " \"\"\"\n",
+ " \n",
+ " result_dicts = []\n",
+ " \n",
+ " def extract_interval(df_source, start, end=None):\n",
+ " q = df_source.lazy()\n",
+ " q = q.explode([\"item_ids\", \"timestamps\"])\n",
+ " \n",
+ " if end is not None:\n",
+ " q = q.filter(\n",
+ " (pl.col(\"timestamps\") >= start) & \n",
+ " (pl.col(\"timestamps\") < end)\n",
+ " )\n",
+ " else:\n",
+ " q = q.filter(\n",
+ " pl.col(\"timestamps\") >= start\n",
+ " )\n",
+ " \n",
+ " q = q.group_by(\"uid\").agg([\n",
+ " pl.col(\"item_ids\").alias(\"item_ids\")\n",
+ " ]).sort(\"uid\")\n",
+ " \n",
+ " return q.collect()\n",
+ " \n",
+ " intervals = []\n",
+ " current_start = 0\n",
+ " for cutoff in time_cutoffs:\n",
+ " intervals.append((current_start, cutoff))\n",
+ " current_start = cutoff\n",
+ "\n",
+ " intervals.append((current_start, None))\n",
+ "\n",
+ " for start, end in intervals:\n",
+ " subset = extract_interval(df, start, end)\n",
+ "\n",
+ " json_dict = {}\n",
+ " for user_id, item_ids in subset.iter_rows():\n",
+ " json_dict[user_id] = item_ids\n",
+ " \n",
+ " result_dicts.append(json_dict)\n",
+ "\n",
+ " if output_dir:\n",
+ " if end is not None:\n",
+ " filename = f\"inter_new_[{start}_{end}).json\"\n",
+ " else:\n",
+ " filename = f\"inter_new_[{start}_inf).json\"\n",
+ " \n",
+ " filepath = f\"{output_dir}/{filename}\"\n",
+ " with open(filepath, 'w') as f:\n",
+ " json.dump(json_dict, f, indent=2)\n",
+ " \n",
+ " print(f\"✓ Сохранено: {filepath}\")\n",
+ " \n",
+ " return result_dicts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "6cff8e7b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (5, 3)| uid | timestamps | item_ids |
|---|
| u32 | list[u32] | list[u32] |
| 600 | [1329190, 1329405, … 25997540] | [252026, 58171, … 201909] |
| 800 | [121100, 121290, … 25977310] | [20844, 198210, … 60455] |
| 1000 | [11335730, 11335925, … 25972225] | [46643, 57592, … 95670] |
| 1400 | [280570, 280735, … 25993315] | [4634, 213798, … 104891] |
| 1600 | [899275, 930305, … 25941890] | [223933, 154424, … 104876] |
"
+ ],
+ "text/plain": [
+ "shape: (5, 3)\n",
+ "┌──────┬─────────────────────────────────┬────────────────────────────┐\n",
+ "│ uid ┆ timestamps ┆ item_ids │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ u32 ┆ list[u32] ┆ list[u32] │\n",
+ "╞══════╪═════════════════════════════════╪════════════════════════════╡\n",
+ "│ 600 ┆ [1329190, 1329405, … 25997540] ┆ [252026, 58171, … 201909] │\n",
+ "│ 800 ┆ [121100, 121290, … 25977310] ┆ [20844, 198210, … 60455] │\n",
+ "│ 1000 ┆ [11335730, 11335925, … 2597222… ┆ [46643, 57592, … 95670] │\n",
+ "│ 1400 ┆ [280570, 280735, … 25993315] ┆ [4634, 213798, … 104891] │\n",
+ "│ 1600 ┆ [899275, 930305, … 25941890] ┆ [223933, 154424, … 104876] │\n",
+ "└──────┴─────────────────────────────────┴────────────────────────────┘"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "901e7400",
+ "metadata": {},
+ "source": [
+ "# QUANTILE CUTOFF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "8c691891",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_quantile_cutoffs(df, num_parts=4, base_ratio=None):\n",
+ " \"\"\"\n",
+ " Считает cutoffs так, чтобы разбить данные на части.\n",
+ " \n",
+ " Args:\n",
+ " num_parts: На сколько частей делить \"хвост\" истории.\n",
+ " base_ratio: Какую долю данных отдать в Base (самую первую часть). \n",
+ " Если None, делит всё поровну.\n",
+ " \"\"\"\n",
+ " # Достаем все таймстемпы в один плоский массив\n",
+ " # Это может занять память, если данных очень много (>100M), но для Beauty (2M) это ок\n",
+ " all_ts = df.select(pl.col(\"timestamps\").explode()).to_series().sort()\n",
+ " total_events = len(all_ts)\n",
+ " \n",
+ " print(f\"Всего событий: {total_events}\")\n",
+ " \n",
+ " cutoffs = []\n",
+ " \n",
+ " if base_ratio:\n",
+ " # Base занимает X% (например 80%), а остаток делим поровну на 3 части (Valid, Gap, Test)\n",
+ " # Остаток = 1 - base_ratio\n",
+ " # Каждая малая часть = (1 - base_ratio) / num_parts_tail\n",
+ " \n",
+ " base_idx = int(total_events * base_ratio)\n",
+ " cutoffs.append(all_ts[base_idx]) # Первый cutoff отделяет Base\n",
+ " \n",
+ " remaining_events = total_events - base_idx\n",
+ " part_size = remaining_events // num_parts # Делим остаток на 3 части (P1, P2, P3)\n",
+ " \n",
+ " current_idx = base_idx\n",
+ " for _ in range(num_parts-1): # Нам нужно еще 2 границы, чтобы получить 3 части\n",
+ " current_idx += part_size\n",
+ " cutoffs.append(all_ts[current_idx])\n",
+ " \n",
+ " else:\n",
+ " # Сценарий: Просто делим всё на N равных частей\n",
+ " step = total_events // num_parts\n",
+ " for i in range(1, num_parts):\n",
+ " idx = i * step\n",
+ " cutoffs.append(all_ts[idx])\n",
+ " \n",
+ " return cutoffs\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "13c1466f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Всего событий: 7371990\n",
+ "\n",
+ "--- Новые Cutoffs (по количеству событий) ---\n",
+ "Cutoffs: [22138015, 23136375, 24137410, 25093085]\n",
+ "[0, 22138015, 23136375, 24137410, 25093085, None]\n"
+ ]
+ }
+ ],
+ "source": [
+ "equal_event_cutoffs = get_quantile_cutoffs(df, num_parts=4, base_ratio=0.8)\n",
+ "\n",
+ "print(\"\\n--- Новые Cutoffs (по количеству событий) ---\")\n",
+ "print(f\"Cutoffs: {equal_event_cutoffs}\")\n",
+ "\n",
+ "# Проверка распределения\n",
+ "intervals_eq = [0] + equal_event_cutoffs + [None]\n",
+ "print(intervals_eq)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "4e7f7b46",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "✓ Сохранено: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw/inter_new_[0_22138015).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw/inter_new_[22138015_24137410).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw/inter_new_[24137410_25093085).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw/inter_new_[25093085_inf).json\n",
+ "0 Base 3813 5897592 \n",
+ "1 Gap 3315 737198 \n",
+ "2 Valid 3120 368599 \n",
+ "3 Test 3154 368601 \n"
+ ]
+ }
+ ],
+ "source": [
+ "new_split_files = split_session_by_timestamps(\n",
+ " df, \n",
+ " [22138015, 24137410, 25093085], \n",
+ " output_dir=\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/raw\"\n",
+ ")\n",
+ "\n",
+ "names = [\"Base\", \"Gap\", \"Valid\", \"Test\"]\n",
+ "for i, d in enumerate(new_split_files):\n",
+ " num_users = len(d)\n",
+ " \n",
+ " num_events = sum(len(items) for items in d.values())\n",
+ " \n",
+ " print(f\"{i:<10} {names[i]:<10} {num_users:<10} {num_events:<10}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "82fd2bca",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Merging 2 files into exp_4_0.9_inter_tiger_train.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json (Users: 4016)\n",
+ "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json (Users: 4016)\n",
+ "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json (Users: 3813)\n",
+ "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json (Users: 4118)\n",
+ "Merging 1 files into test_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/test_set.json (Users: 3154)\n",
+ "Merging 1 files into valid_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/valid_set.json (Users: 3120)\n",
+ "Merging 4 files into all_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json (Users: 4138)\n",
+ "All done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "EXP_DIR = \"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps\"\n",
+ "\n",
+ "base_p, gap_p, valid_p, test_p = new_split_files[0], new_split_files[1], new_split_files[2], new_split_files[3]\n",
+ "\n",
+ "# Tiger: base + gap\n",
+ "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4_0.9_inter_tiger_train.json\")\n",
+ "\n",
+ "# 1. Exp 4.1 (Standard)\n",
+ "# Semantics: base + gap (Всё кроме валидации и теста)\n",
+ "merge_and_save([base_p, gap_p], EXP_DIR, \"exp_4-1_0.9_inter_semantics_train.json\")\n",
+ "\n",
+ "# 2. Exp 4.2 (Short Semantics)\n",
+ "# Semantics: base (Короче на пропуск, без gap)\n",
+ "merge_and_save([base_p], EXP_DIR, \"exp_4-2_0.8_inter_semantics_train.json\")\n",
+ "\n",
+ "# 3. Exp 4.3 (Leak)\n",
+ "# Semantics: base + gap + valid (Видит валидацию)\n",
+ "merge_and_save([base_p, gap_p, valid_p], EXP_DIR, \"exp_4-3_0.95_inter_semantics_train.json\")\n",
+ "\n",
+ "# 4. Test Set (тест всех моделей)\n",
+ "merge_and_save([test_p], EXP_DIR, \"test_set.json\")\n",
+ "\n",
+ "# 4. Valid Set (валидационный набор)\n",
+ "merge_and_save([valid_p], EXP_DIR, \"valid_set.json\")\n",
+ "\n",
+ "# 4. All Set (все данные)\n",
+ "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR, \"all_set.json\")\n",
+ "\n",
+ "print(\"All done!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "id": "d34b1c55",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2025-12-11T08:56:58.546300Z",
+ "start_time": "2025-12-11T08:56:58.343394Z"
+ }
+ },
+ "source": [
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as f:\n",
+ " old_inter_new = json.load(f)\n",
+ "\n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n",
+ " first_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n",
+ " second_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n",
+ " third_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n",
+ " tiger_sem = json.load(ff)\n",
+ "\n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/test_set.json\", 'r') as ff:\n",
+ " test_sem = json.load(ff)\n",
+ "\n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json\", 'r') as ff:\n",
+ " all_test_data = json.load(ff)\n",
+ "\n",
+ "def check_prefix_match(full_data, subset_data, check_suffix=False):\n",
+ " \"\"\"\n",
+ " check_suffix=True включит режим проверки суффиксов (для теста).\n",
+ " \"\"\"\n",
+ " mismatch_count = 0\n",
+ " full_match_count = 0\n",
+ "\n",
+ " num_events_full_data = sum(len(items) for items in full_data.values())\n",
+ " num_events_subset_data = sum(len(items) for items in subset_data.values())\n",
+ " print(f\"доля событий всего {(num_events_subset_data/num_events_full_data):.2f}:\")\n",
+ " \n",
+ " for user, sub_items in subset_data.items():\n",
+ " \n",
+ " if user not in full_data:\n",
+ " print(f\"⚠ Юзер {user} не найден в исходном файле!\")\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " full_items = full_data[user]\n",
+ " \n",
+ " if not check_suffix:\n",
+ " if len(sub_items) > len(full_items):\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " if full_items[:len(sub_items)] == sub_items:\n",
+ " if len(full_items) == len(sub_items):\n",
+ " full_match_count += 1\n",
+ " else:\n",
+ " mismatch_count += 1\n",
+ "\n",
+ " else:\n",
+ " if len(sub_items) > len(full_items):\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ "\n",
+ " if full_items[-len(sub_items):] == sub_items:\n",
+ " if len(full_items) == len(sub_items):\n",
+ " full_match_count += 1\n",
+ " else:\n",
+ " mismatch_count += 1\n",
+ "\n",
+ " mode = \"СУФФИКСЫ\" if check_suffix else \"ПРЕФИКСЫ\"\n",
+ " \n",
+ " if mismatch_count == 0:\n",
+ " print(f\"OK [{mode}] Все {len(subset_data)} массивов ОК. Полных совпадений: {full_match_count}\")\n",
+ " else:\n",
+ " print(f\"NOT OK [{mode}] Найдено {mismatch_count} ошибок.\")\n",
+ "\n",
+ "# --- Запуск проверок ---\n",
+ "print(\"Проверка Train сетов (должны быть префиксами):\")\n",
+ "check_prefix_match(old_inter_new, first_sem)\n",
+ "check_prefix_match(old_inter_new, second_sem)\n",
+ "check_prefix_match(old_inter_new, third_sem)\n",
+ "check_prefix_match(old_inter_new, tiger_sem)\n",
+ "\n",
+ "print(\"\\nПроверка Test сета (должен быть суффиксом):\")\n",
+ "check_prefix_match(old_inter_new, test_sem, check_suffix=True)\n",
+ "\n",
+ "print(\"\\n(Контроль) Проверка Test сета как префикса (должна упасть):\")\n",
+ "check_prefix_match(old_inter_new, test_sem, check_suffix=False)\n",
+ "\n",
+ "check_prefix_match(old_inter_new, all_test_data)\n"
+ ],
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: '/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json'",
+ "output_type": "error",
+ "traceback": [
+ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+ "\u001B[0;31mFileNotFoundError\u001B[0m Traceback (most recent call last)",
+ "Cell \u001B[0;32mIn[1], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28;43mopen\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43m/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mr\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m \u001B[38;5;28;01mas\u001B[39;00m f:\n\u001B[1;32m 2\u001B[0m old_inter_new \u001B[38;5;241m=\u001B[39m json\u001B[38;5;241m.\u001B[39mload(f)\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/exp_4-1_0.9_inter_semantics_train.json\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mr\u001B[39m\u001B[38;5;124m'\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m ff:\n",
+ "File \u001B[0;32m~/repositories/ucp-author-centric/ucp-env/lib/python3.9/site-packages/IPython/core/interactiveshell.py:310\u001B[0m, in \u001B[0;36m_modified_open\u001B[0;34m(file, *args, **kwargs)\u001B[0m\n\u001B[1;32m 303\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m file \u001B[38;5;129;01min\u001B[39;00m {\u001B[38;5;241m0\u001B[39m, \u001B[38;5;241m1\u001B[39m, \u001B[38;5;241m2\u001B[39m}:\n\u001B[1;32m 304\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[1;32m 305\u001B[0m \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mIPython won\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mt let you open fd=\u001B[39m\u001B[38;5;132;01m{\u001B[39;00mfile\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m by default \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 306\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 307\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124myou can use builtins\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m open.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 308\u001B[0m )\n\u001B[0;32m--> 310\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mio_open\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfile\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n",
+ "\u001B[0;31mFileNotFoundError\u001B[0m: [Errno 2] No such file or directory: '/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps/all_set.json'"
+ ]
+ }
+ ],
+ "execution_count": 1
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "c3a0adf2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "================================================================================\n",
+ "ПРОВЕРКА НА ПУСТЫЕ ЧАСТИ ИСТОРИЙ\n",
+ "================================================================================\n",
+ "\n",
+ "[exp_4-1_0.9] Анализ...\n",
+ " Юзеров в сплите: 4,016 / 4,138\n",
+ " ПУСТЫХ сессий: 15\n",
+ " ОБЩИХ ПРОБЛЕМ: 15\n",
+ "\n",
+ "[exp_4-2_0.8] Анализ...\n",
+ " Юзеров в сплите: 3,813 / 4,138\n",
+ " ПУСТЫХ сессий: 22\n",
+ " ОБЩИХ ПРОБЛЕМ: 22\n",
+ "\n",
+ "[exp_4-3_0.95] Анализ...\n",
+ " Юзеров в сплите: 4,118 / 4,138\n",
+ " ПУСТЫХ сессий: 7\n",
+ " ОБЩИХ ПРОБЛЕМ: 7\n",
+ "\n",
+ "[exp_4_0.9_tiger] Анализ...\n",
+ " Юзеров в сплите: 4,016 / 4,138\n",
+ " ПУСТЫХ сессий: 15\n",
+ " ОБЩИХ ПРОБЛЕМ: 15\n",
+ "\n",
+ "[test_set] Анализ...\n",
+ " Юзеров в сплите: 3,154 / 4,138\n",
+ " ПУСТЫХ сессий: 105\n",
+ " ОБЩИХ ПРОБЛЕМ: 105\n"
+ ]
+ }
+ ],
+ "source": [
+ "def check_non_empty_splits(full_data, splits_data, split_names, min_history_len=2):\n",
+ " \"\"\"\n",
+ " Проверяет, что ни одна часть истории пользователя НЕ пустая во всех разбиениях.\n",
+ " \"\"\"\n",
+ " print(\"\\n\" + \"=\"*80)\n",
+ " print(\"ПРОВЕРКА НА ПУСТЫЕ ЧАСТИ ИСТОРИЙ\")\n",
+ " print(\"=\"*80)\n",
+ " \n",
+ " all_users = set(full_data.keys())\n",
+ " total_issues = 0\n",
+ " \n",
+ " for i in range(len(split_names)):\n",
+ " split_name = split_names[i]\n",
+ " split_data = splits_data[i]\n",
+ " print(f\"\\n[{split_name}] Анализ...\")\n",
+ " \n",
+ " split_users = set(split_data.keys())\n",
+ " empty_sessions = []\n",
+ " \n",
+ " for user, items in split_data.items():\n",
+ " if not items or len(items) < min_history_len:\n",
+ " empty_sessions.append(user)\n",
+ " \n",
+ " issues_count = len(empty_sessions)\n",
+ " total_issues += issues_count\n",
+ " \n",
+ " print(f\" Юзеров в сплите: {len(split_users):,} / {len(all_users):,}\")\n",
+ " print(f\" ПУСТЫХ сессий: {len(empty_sessions)}\")\n",
+ " print(f\" ОБЩИХ ПРОБЛЕМ: {issues_count}\")\n",
+ " \n",
+ " if total_issues == 0:\n",
+ " print(\"\\nВСЕ РАЗБИЕНИЯ БЕЗ ПУСТЫХ СЕССИЙ\")\n",
+ "\n",
+ "split_names = ['exp_4-1_0.9', 'exp_4-2_0.8', 'exp_4-3_0.95', 'exp_4_0.9_tiger', 'test_set']\n",
+ "splits_list = [first_sem, second_sem, third_sem, tiger_sem, test_sem]\n",
+ "\n",
+ "check_non_empty_splits(old_inter_new, splits_list, split_names)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "43aa0142",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Merging 2 files into exp_4_0.9_inter_tiger_train.json (min len=2)...\n",
+ "Filtered 15 users with history < 2\n",
+ "Remaining: 4001 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4_0.9_inter_tiger_train.json (Users: 4001)\n",
+ "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json (min len=2)...\n",
+ "Filtered 15 users with history < 2\n",
+ "Remaining: 4001 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-1_0.9_inter_semantics_train.json (Users: 4001)\n",
+ "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json (min len=2)...\n",
+ "Filtered 22 users with history < 2\n",
+ "Remaining: 3791 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-2_0.8_inter_semantics_train.json (Users: 3791)\n",
+ "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json (min len=2)...\n",
+ "Filtered 7 users with history < 2\n",
+ "Remaining: 4111 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-3_0.95_inter_semantics_train.json (Users: 4111)\n",
+ "Merging 1 files into test_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/test_set.json (Users: 3154)\n",
+ "Merging 1 files into valid_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/valid_set.json (Users: 3120)\n",
+ "Merging 4 files into all_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/all_set.json (Users: 4138)\n",
+ "All done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "EXP_DIR_FILTERED = \"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered\"\n",
+ "\n",
+ "base_p, gap_p, valid_p, test_p = new_split_files[0], new_split_files[1], new_split_files[2], new_split_files[3]\n",
+ "\n",
+ "# Tiger: base + gap\n",
+ "merge_and_save_with_filter([base_p, gap_p], EXP_DIR_FILTERED, \"exp_4_0.9_inter_tiger_train.json\", min_history_len=2)\n",
+ "\n",
+ "# 1. Exp 4.1 (Standard)\n",
+ "# Semantics: base + gap (Всё кроме валидации и теста)\n",
+ "merge_and_save_with_filter([base_p, gap_p], EXP_DIR_FILTERED, \"exp_4-1_0.9_inter_semantics_train.json\", min_history_len=2)\n",
+ "\n",
+ "# 2. Exp 4.2 (Short Semantics)\n",
+ "# Semantics: base (Короче на пропуск, без gap)\n",
+ "merge_and_save_with_filter([base_p], EXP_DIR_FILTERED, \"exp_4-2_0.8_inter_semantics_train.json\", min_history_len=2)\n",
+ "\n",
+ "# 3. Exp 4.3 (Leak)\n",
+ "# Semantics: base + gap + valid (Видит валидацию)\n",
+ "merge_and_save_with_filter([base_p, gap_p, valid_p], EXP_DIR_FILTERED, \"exp_4-3_0.95_inter_semantics_train.json\", min_history_len=2)\n",
+ "\n",
+ "# 4. Test Set (тест всех моделей)\n",
+ "merge_and_save([test_p], EXP_DIR_FILTERED, \"test_set.json\")\n",
+ "\n",
+ "# 4. Valid Set (валидационный набор)\n",
+ "merge_and_save([valid_p], EXP_DIR_FILTERED, \"valid_set.json\")\n",
+ "\n",
+ "# 4. All Set (все данные)\n",
+ "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR_FILTERED, \"all_set.json\")\n",
+ "\n",
+ "print(\"All done!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9060beaa",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Проверка Train сетов (должны быть префиксами):\n",
+ "доля событий всего 0.90:\n",
+ "✅ [ПРЕФИКСЫ] Все 4001 массивов ОК. Полных совпадений: 564\n",
+ "доля событий всего 0.80:\n",
+ "✅ [ПРЕФИКСЫ] Все 3791 массивов ОК. Полных совпадений: 343\n",
+ "доля событий всего 0.95:\n",
+ "✅ [ПРЕФИКСЫ] Все 4111 массивов ОК. Полных совпадений: 984\n",
+ "доля событий всего 0.90:\n",
+ "✅ [ПРЕФИКСЫ] Все 4001 массивов ОК. Полных совпадений: 564\n",
+ "\n",
+ "Проверка Test сета (должен быть суффиксом):\n",
+ "доля событий всего 0.05:\n",
+ "✅ [СУФФИКСЫ] Все 3154 массивов ОК. Полных совпадений: 20\n",
+ "\n",
+ "(Контроль) Проверка Test сета как префикса (должна упасть):\n",
+ "доля событий всего 0.05:\n",
+ "❌ [ПРЕФИКСЫ] Найдено 3134 ошибок.\n",
+ "доля событий всего 1.00:\n",
+ "✅ [ПРЕФИКСЫ] Все 4138 массивов ОК. Полных совпадений: 4138\n",
+ "\n",
+ "================================================================================\n",
+ "ПРОВЕРКА НА ПУСТЫЕ ЧАСТИ ИСТОРИЙ\n",
+ "================================================================================\n",
+ "\n",
+ "[exp_4-1_0.9] Анализ...\n",
+ " Юзеров в сплите: 4,001 / 4,138\n",
+ " ПУСТЫХ сессий: 0\n",
+ " ОБЩИХ ПРОБЛЕМ: 0\n",
+ "\n",
+ "[exp_4-2_0.8] Анализ...\n",
+ " Юзеров в сплите: 3,791 / 4,138\n",
+ " ПУСТЫХ сессий: 0\n",
+ " ОБЩИХ ПРОБЛЕМ: 0\n",
+ "\n",
+ "[exp_4-3_0.95] Анализ...\n",
+ " Юзеров в сплите: 4,111 / 4,138\n",
+ " ПУСТЫХ сессий: 0\n",
+ " ОБЩИХ ПРОБЛЕМ: 0\n",
+ "\n",
+ "[exp_4_0.9_tiger] Анализ...\n",
+ " Юзеров в сплите: 4,001 / 4,138\n",
+ " ПУСТЫХ сессий: 0\n",
+ " ОБЩИХ ПРОБЛЕМ: 0\n",
+ "\n",
+ "ВСЕ РАЗБИЕНИЯ БЕЗ ПУСТЫХ СЕССИЙ\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_expsx/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n",
+ " filtered_first_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n",
+ " filtered_second_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n",
+ " filtered_third_sem = json.load(ff)\n",
+ " \n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n",
+ " filtered_tiger_sem = json.load(ff)\n",
+ "\n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/valid_set.json\", 'r') as ff:\n",
+ " fiiltered_valid_sem = json.load(ff)\n",
+ "\n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/test_set.json\", 'r') as ff:\n",
+ " fiiltered_test_sem = json.load(ff)\n",
+ "\n",
+ "with open(\"/home/jovyan/IRec/data/Yambda/updated_quantile_splits/merged_for_exps_filtered/all_set.json\", 'r') as ff:\n",
+ " filtered_all_test_data = json.load(ff)\n",
+ "\n",
+ "# --- Запуск проверок ---\n",
+ "print(\"Проверка Train сетов (должны быть префиксами):\")\n",
+ "check_prefix_match(filtered_all_test_data, filtered_first_sem)\n",
+ "check_prefix_match(filtered_all_test_data, filtered_second_sem)\n",
+ "check_prefix_match(filtered_all_test_data, filtered_third_sem)\n",
+ "check_prefix_match(filtered_all_test_data, filtered_tiger_sem)\n",
+ "\n",
+ "print(\"\\nПроверка Test сета (должен быть суффиксом):\")\n",
+ "check_prefix_match(filtered_all_test_data, test_sem, check_suffix=True)\n",
+ "\n",
+ "print(\"\\n(Контроль) Проверка Test сета как префикса (должна упасть):\")\n",
+ "check_prefix_match(filtered_all_test_data, test_sem, check_suffix=False)\n",
+ "\n",
+ "check_prefix_match(filtered_all_test_data, all_test_data)\n",
+ "\n",
+ "split_names = ['exp_4-1_0.9', 'exp_4-2_0.8', 'exp_4-3_0.95', 'exp_4_0.9_tiger']\n",
+ "splits_list_filtered = [filtered_first_sem, filtered_second_sem, filtered_third_sem, filtered_tiger_sem]\n",
+ "\n",
+ "check_non_empty_splits(filtered_all_test_data, splits_list_filtered, split_names, min_history_len = 2)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "c540c8d5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "для теста и валидации (может упасть и скорее всего упадет)\n",
+ "\n",
+ "================================================================================\n",
+ "ПРОВЕРКА НА ПУСТЫЕ ЧАСТИ ИСТОРИЙ\n",
+ "================================================================================\n",
+ "\n",
+ "[valid] Анализ...\n",
+ " Юзеров в сплите: 3,120 / 4,138\n",
+ " ПУСТЫХ сессий: 88\n",
+ " ОБЩИХ ПРОБЛЕМ: 88\n",
+ "\n",
+ "[test] Анализ...\n",
+ " Юзеров в сплите: 3,154 / 4,138\n",
+ " ПУСТЫХ сессий: 105\n",
+ " ОБЩИХ ПРОБЛЕМ: 105\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"для теста и валидации (может упасть и скорее всего упадет)\")\n",
+ "vt_split_names = ['valid', 'test']\n",
+ "vt_splits_list_filtered = [fiiltered_valid_sem, test_sem]\n",
+ "\n",
+ "check_non_empty_splits(filtered_all_test_data, vt_splits_list_filtered, vt_split_names, min_history_len = 2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "89efa96e",
+ "metadata": {},
+ "source": [
+ "# Разбиение YAMBDA по неделям"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "28e4ddc8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Cutoffs: [25740785, 25827185, 25913585]\n",
+ "✓ Сохранено: /home/jovyan/IRec/data/Yambda/day-splits/raw/inter_new_[0_25740785).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/data/Yambda/day-splits/raw/inter_new_[25740785_25827185).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/data/Yambda/day-splits/raw/inter_new_[25827185_25913585).json\n",
+ "✓ Сохранено: /home/jovyan/IRec/data/Yambda/day-splits/raw/inter_new_[25913585_inf).json\n",
+ "Part 0 [Base]: 4133 users\n",
+ "Part 1 [day -3]: 1381 users\n",
+ "Part 2 [day -2]: 1350 users\n",
+ "Part 3 [day -1]: 1403 users\n"
+ ]
+ }
+ ],
+ "source": [
+ "global_max_time = df.select(\n",
+ " pl.col(\"timestamps\").explode().max()\n",
+ ").item()\n",
+ "\n",
+ "# 3. Размер окна (неделя)\n",
+ "days_val = 1\n",
+ "window_sec = days_val * 24 * 3600 \n",
+ "\n",
+ "# 4. Три отсечки с конца\n",
+ "cutoff_test_start = global_max_time - window_sec # T - 1w\n",
+ "cutoff_val_start = global_max_time - 2 * window_sec # T - 2w\n",
+ "cutoff_gap_start = global_max_time - 3 * window_sec # T - 3w\n",
+ "\n",
+ "cutoffs = [\n",
+ " int(cutoff_gap_start), # Граница Part 0 | Part 1\n",
+ " int(cutoff_val_start), # Граница Part 1 | Part 2\n",
+ " int(cutoff_test_start) # Граница Part 2 | Part 3\n",
+ "]\n",
+ "\n",
+ "print(f\"Cutoffs: {cutoffs}\")\n",
+ "\n",
+ "split_files = split_session_by_timestamps(\n",
+ " df, \n",
+ " cutoffs, \n",
+ " output_dir=\"/home/jovyan/IRec/data/Yambda/day-splits/raw\"\n",
+ ")\n",
+ "\n",
+ "names = [\"Base\", \"day -3\", \"day -2\", \"day -1\"]\n",
+ "for i, d in enumerate(split_files):\n",
+ " print(f\"Part {i} [{names[i]}]: {len(d)} users\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "8d5b0c22",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Merging 2 files into exp_4_0.9_inter_tiger_train.json (min len=2)...\n",
+ "Filtered 3 users with history < 2\n",
+ "Remaining: 4133 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/exp_4_0.9_inter_tiger_train.json (Users: 4133)\n",
+ "Merging 2 files into exp_4-1_0.9_inter_semantics_train.json (min len=2)...\n",
+ "Filtered 3 users with history < 2\n",
+ "Remaining: 4133 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/exp_4-1_0.9_inter_semantics_train.json (Users: 4133)\n",
+ "Merging 1 files into exp_4-2_0.8_inter_semantics_train.json (min len=2)...\n",
+ "Filtered 3 users with history < 2\n",
+ "Remaining: 4130 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/exp_4-2_0.8_inter_semantics_train.json (Users: 4130)\n",
+ "Merging 3 files into exp_4-3_0.95_inter_semantics_train.json (min len=2)...\n",
+ "Filtered 3 users with history < 2\n",
+ "Remaining: 4133 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/exp_4-3_0.95_inter_semantics_train.json (Users: 4133)\n",
+ "Merging 1 files into test_set.json (min len=1)...\n",
+ "Filtered 0 users with history < 1\n",
+ "Remaining: 1403 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/test_set.json (Users: 1403)\n",
+ "Merging 1 files into valid_set.json (min len=1)...\n",
+ "Filtered 0 users with history < 1\n",
+ "Remaining: 1350 users\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/valid_set.json (Users: 1350)\n",
+ "Merging 4 files into all_set.json...\n",
+ "✓ Done: /home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered/all_set.json (Users: 4138)\n",
+ "All done!\n"
+ ]
+ }
+ ],
+ "source": [
+ "EXP_DIR_FILTERED = \"/home/jovyan/IRec/data/Yambda/day-splits/merged_for_exps_filtered\"\n",
+ "\n",
+ "base_p, gap_p, valid_p, test_p = split_files[0], split_files[1], split_files[2], split_files[3]\n",
+ "\n",
+ "# Tiger: base + gap\n",
+ "merge_and_save_with_filter([base_p, gap_p], EXP_DIR_FILTERED, \"exp_4_0.9_inter_tiger_train.json\", min_history_len=2)\n",
+ "\n",
+ "# 1. Exp 4.1 (Standard)\n",
+ "# Semantics: base + gap (Всё кроме валидации и теста)\n",
+ "merge_and_save_with_filter([base_p, gap_p], EXP_DIR_FILTERED, \"exp_4-1_0.9_inter_semantics_train.json\", min_history_len=2)\n",
+ "\n",
+ "# 2. Exp 4.2 (Short Semantics)\n",
+ "# Semantics: base (Короче на пропуск, без gap)\n",
+ "merge_and_save_with_filter([base_p], EXP_DIR_FILTERED, \"exp_4-2_0.8_inter_semantics_train.json\", min_history_len=2)\n",
+ "\n",
+ "# 3. Exp 4.3 (Leak)\n",
+ "# Semantics: base + gap + valid (Видит валидацию)\n",
+ "merge_and_save_with_filter([base_p, gap_p, valid_p], EXP_DIR_FILTERED, \"exp_4-3_0.95_inter_semantics_train.json\", min_history_len=2)\n",
+ "\n",
+ "# 4. Test Set (тест всех моделей)\n",
+ "merge_and_save_with_filter([test_p], EXP_DIR_FILTERED, \"test_set.json\", min_history_len=1)\n",
+ "\n",
+ "# 4. Valid Set (валидационный набор)\n",
+ "merge_and_save_with_filter([valid_p], EXP_DIR_FILTERED, \"valid_set.json\", min_history_len=1)\n",
+ "\n",
+ "# 4. All Set (все данные)\n",
+ "merge_and_save([base_p, gap_p, valid_p, test_p], EXP_DIR_FILTERED, \"all_set.json\")\n",
+ "\n",
+ "print(\"All done!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "c0b9b767",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def check_non_empty_splits(full_data, splits_data, split_names, min_history_len=2):\n",
+ " \"\"\"\n",
+ " Проверяет, что ни одна часть истории пользователя НЕ пустая во всех разбиениях.\n",
+ " \"\"\"\n",
+ " print(\"\\n\" + \"=\"*80)\n",
+ " print(\"ПРОВЕРКА НА ПУСТЫЕ ЧАСТИ ИСТОРИЙ\")\n",
+ " print(\"=\"*80)\n",
+ " \n",
+ " all_users = set(full_data.keys())\n",
+ " total_issues = 0\n",
+ " \n",
+ " for i in range(len(split_names)):\n",
+ " split_name = split_names[i]\n",
+ " split_data = splits_data[i]\n",
+ " print(f\"\\n[{split_name}] Анализ...\")\n",
+ " \n",
+ " split_users = set(split_data.keys())\n",
+ " empty_sessions = []\n",
+ " \n",
+ " for user, items in split_data.items():\n",
+ " if not items or len(items) < min_history_len:\n",
+ " empty_sessions.append(user)\n",
+ " \n",
+ " issues_count = len(empty_sessions)\n",
+ " total_issues += issues_count\n",
+ " \n",
+ " print(f\" Юзеров в сплите: {len(split_users):,} / {len(all_users):,}\")\n",
+ " print(f\" ПУСТЫХ сессий: {len(empty_sessions)}\")\n",
+ " print(f\" ОБЩИХ ПРОБЛЕМ: {issues_count}\")\n",
+ " \n",
+ " if total_issues == 0:\n",
+ " print(\"\\nВСЕ РАЗБИЕНИЯ БЕЗ ПУСТЫХ СЕССИЙ\")\n",
+ "\n",
+ "def check_prefix_match(full_data, subset_data, check_suffix=False):\n",
+ " \"\"\"\n",
+ " check_suffix=True включит режим проверки суффиксов (для теста).\n",
+ " \"\"\"\n",
+ " mismatch_count = 0\n",
+ " full_match_count = 0\n",
+ " \n",
+ " # Итерируемся по ключам сабсета, так как в full_data может быть больше юзеров\n",
+ " for user, sub_items in subset_data.items():\n",
+ " \n",
+ " # Проверяем есть ли такой юзер в исходнике\n",
+ " if user not in full_data:\n",
+ " print(f\"⚠ Юзер {user} не найден в исходном файле!\")\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " full_items = full_data[user]\n",
+ " \n",
+ " # Логика для проверки ПРЕФИКСА (начало совпадает)\n",
+ " if not check_suffix:\n",
+ " if len(sub_items) > len(full_items):\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " # Сравниваем начало full с sub\n",
+ " if full_items[:len(sub_items)] == sub_items:\n",
+ " if len(full_items) == len(sub_items):\n",
+ " full_match_count += 1\n",
+ " else:\n",
+ " mismatch_count += 1\n",
+ " \n",
+ " # Логика для проверки СУФФИКСА (конец совпадает - для теста)\n",
+ " else:\n",
+ " if len(sub_items) > len(full_items):\n",
+ " mismatch_count += 1\n",
+ " continue\n",
+ " \n",
+ " # Сравниваем конец full с sub\n",
+ " # Срез [-len:] берет последние N элементов\n",
+ " if full_items[-len(sub_items):] == sub_items:\n",
+ " if len(full_items) == len(sub_items):\n",
+ " full_match_count += 1\n",
+ " else:\n",
+ " mismatch_count += 1\n",
+ "\n",
+ " mode = \"СУФФИКСЫ\" if check_suffix else \"ПРЕФИКСЫ\"\n",
+ " \n",
+ " if mismatch_count == 0:\n",
+ " print(f\"✅ [{mode}] Все {len(subset_data)} массивов ОК. Полных совпадений: {full_match_count}\")\n",
+ " else:\n",
+ " print(f\"❌ [{mode}] Найдено {mismatch_count} ошибок.\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "36ac0115",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Проверка Train сетов (должны быть префиксами):\n",
+ "✅ [ПРЕФИКСЫ] Все 4133 массивов ОК. Полных совпадений: 2272\n",
+ "✅ [ПРЕФИКСЫ] Все 4130 массивов ОК. Полных совпадений: 1969\n",
+ "✅ [ПРЕФИКСЫ] Все 4133 массивов ОК. Полных совпадений: 2735\n",
+ "✅ [ПРЕФИКСЫ] Все 4133 массивов ОК. Полных совпадений: 2272\n",
+ "\n",
+ "Проверка Test сета (должен быть суффиксом):\n",
+ "✅ [СУФФИКСЫ] Все 1403 массивов ОК. Полных совпадений: 2\n",
+ "\n",
+ "(Контроль) Проверка Test сета как префикса (должна упасть):\n",
+ "❌ [ПРЕФИКСЫ] Найдено 1401 ошибок.\n",
+ "✅ [ПРЕФИКСЫ] Все 4138 массивов ОК. Полных совпадений: 4138\n"
+ ]
+ }
+ ],
+ "source": [
+ "with open(f\"{EXP_DIR_FILTERED}/exp_4-1_0.9_inter_semantics_train.json\", 'r') as ff:\n",
+ " filtered_first_sem = json.load(ff)\n",
+ " \n",
+ "with open(f\"{EXP_DIR_FILTERED}/exp_4-2_0.8_inter_semantics_train.json\", 'r') as ff:\n",
+ " filtered_second_sem = json.load(ff)\n",
+ " \n",
+ "with open(f\"{EXP_DIR_FILTERED}/exp_4-3_0.95_inter_semantics_train.json\", 'r') as ff:\n",
+ " filtered_third_sem = json.load(ff)\n",
+ " \n",
+ "with open(f\"{EXP_DIR_FILTERED}/exp_4_0.9_inter_tiger_train.json\", 'r') as ff:\n",
+ " filtered_tiger_sem = json.load(ff)\n",
+ "\n",
+ "with open(f\"{EXP_DIR_FILTERED}/valid_set.json\", 'r') as ff:\n",
+ " fiiltered_valid_sem = json.load(ff)\n",
+ "\n",
+ "with open(f\"{EXP_DIR_FILTERED}/test_set.json\", 'r') as ff:\n",
+ " filtered_test_sem = json.load(ff)\n",
+ "\n",
+ "with open(f\"{EXP_DIR_FILTERED}/all_set.json\", 'r') as ff:\n",
+ " filtered_all_test_data = json.load(ff)\n",
+ "\n",
+ "# --- Запуск проверок ---\n",
+ "print(\"Проверка Train сетов (должны быть префиксами):\")\n",
+ "check_prefix_match(filtered_all_test_data, filtered_first_sem)\n",
+ "check_prefix_match(filtered_all_test_data, filtered_second_sem)\n",
+ "check_prefix_match(filtered_all_test_data, filtered_third_sem)\n",
+ "check_prefix_match(filtered_all_test_data, filtered_tiger_sem)\n",
+ "\n",
+ "print(\"\\nПроверка Test сета (должен быть суффиксом):\")\n",
+ "check_prefix_match(filtered_all_test_data, filtered_test_sem, check_suffix=True)\n",
+ "\n",
+ "print(\"\\n(Контроль) Проверка Test сета как префикса (должна упасть):\")\n",
+ "check_prefix_match(filtered_all_test_data, filtered_test_sem, check_suffix=False)\n",
+ "\n",
+ "check_prefix_match(filtered_all_test_data, filtered_all_test_data)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "2c65331b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "================================================================================\n",
+ "ПРОВЕРКА НА ПУСТЫЕ ЧАСТИ ИСТОРИЙ\n",
+ "================================================================================\n",
+ "\n",
+ "[exp_4-1_0.9] Анализ...\n",
+ " Юзеров в сплите: 4,133 / 4,138\n",
+ " ПУСТЫХ сессий: 0\n",
+ " ОБЩИХ ПРОБЛЕМ: 0\n",
+ "\n",
+ "[exp_4-2_0.8] Анализ...\n",
+ " Юзеров в сплите: 4,130 / 4,138\n",
+ " ПУСТЫХ сессий: 0\n",
+ " ОБЩИХ ПРОБЛЕМ: 0\n",
+ "\n",
+ "[exp_4-3_0.95] Анализ...\n",
+ " Юзеров в сплите: 4,133 / 4,138\n",
+ " ПУСТЫХ сессий: 0\n",
+ " ОБЩИХ ПРОБЛЕМ: 0\n",
+ "\n",
+ "[exp_4_0.9_tiger] Анализ...\n",
+ " Юзеров в сплите: 4,133 / 4,138\n",
+ " ПУСТЫХ сессий: 0\n",
+ " ОБЩИХ ПРОБЛЕМ: 0\n",
+ "\n",
+ "ВСЕ РАЗБИЕНИЯ БЕЗ ПУСТЫХ СЕССИЙ\n"
+ ]
+ }
+ ],
+ "source": [
+ "split_names = ['exp_4-1_0.9', 'exp_4-2_0.8', 'exp_4-3_0.95', 'exp_4_0.9_tiger']\n",
+ "splits_list_filtered = [filtered_first_sem, filtered_second_sem, filtered_third_sem, filtered_tiger_sem]\n",
+ "\n",
+ "check_non_empty_splits(filtered_all_test_data, splits_list_filtered, split_names, min_history_len = 2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "f596db64",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 Base 4133 7264231 \n",
+ "1 day -3 1381 36676 \n",
+ "2 day -2 1350 35128 \n",
+ "3 day -1 1403 35955 \n"
+ ]
+ }
+ ],
+ "source": [
+ "filtered_all_test_data.keys()\n",
+ "for i, d in enumerate(split_files):\n",
+ " num_users = len(d)\n",
+ " \n",
+ " num_events = sum(len(items) for items in d.values())\n",
+ " \n",
+ " print(f\"{i:<10} {names[i]:<10} {num_users:<10} {num_events:<10}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/irec/callbacks/stopping.py b/src/irec/callbacks/stopping.py
index 3d1405f..bbe091f 100644
--- a/src/irec/callbacks/stopping.py
+++ b/src/irec/callbacks/stopping.py
@@ -44,14 +44,18 @@ def after_step(self, runner: Runner, context: RunnerContext):
metric = context.metrics[self._metric]
if self._best_metric is None:
self._best_metric = metric
- torch.save(runner.model.state_dict(), f'{self._model_path}_best_{round(self._best_metric, 4)}.pth')
+ save_path = f'{self._model_path}_best_{round(self._best_metric, 4)}.pth'
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
+ torch.save(runner.model.state_dict(), save_path)
else:
if (self._minimize and metric < self._best_metric) or (not self._minimize and metric > self._best_metric):
self._wait = 0
old_metric = self._best_metric
self._best_metric = metric
# Saving new model
- torch.save(runner.model.state_dict(), f'{self._model_path}_best_{round(self._best_metric, 4)}.pth')
+ save_path = f'{self._model_path}_best_{round(self._best_metric, 4)}.pth'
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
+ torch.save(runner.model.state_dict(), save_path)
# Deleting old model
if str(round(self._best_metric, 4)) != str(round(old_metric, 4)):
os.remove(f'{self._model_path}_best_{round(old_metric, 4)}.pth')