From 71e010cbb2863fed77de8369a00e866c388d3659 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Mon, 31 Oct 2022 16:57:17 -0700 Subject: [PATCH 01/19] create machinery for chunked bemb, pass chunking info --- bemb/model/__init__.py | 1 + bemb/model/bemb.py | 8 +- bemb/model/bemb_chunked.py | 1394 ++++++++++++++++++++++++++++++++++++ 3 files changed, 1399 insertions(+), 4 deletions(-) create mode 100644 bemb/model/bemb_chunked.py diff --git a/bemb/model/__init__.py b/bemb/model/__init__.py index 60c8b2e..d5f6d8a 100644 --- a/bemb/model/__init__.py +++ b/bemb/model/__init__.py @@ -1,2 +1,3 @@ from .bemb import * +from .bemb_chunked import * from .bemb_flex_lightning import * diff --git a/bemb/model/bemb.py b/bemb/model/bemb.py index 9958c2b..59eadd8 100644 --- a/bemb/model/bemb.py +++ b/bemb/model/bemb.py @@ -325,10 +325,10 @@ def __init__(self, if coef_name not in self.prior_mean.keys(): # the user may specify 'default' prior variance through the prior_variance dictionary. if 'default' in self.prior_mean.keys(): - warnings.warn(f"You provided a dictionary of prior mean, but coefficient {coef_name} is not a key in it. We found a key 'default' in the dictionary, so we use the value of 'default' as the prior mean for coefficient {coef_name}.") + # warnings.warn(f"You provided a dictionary of prior mean, but coefficient {coef_name} is not a key in it. We found a key 'default' in the dictionary, so we use the value of 'default' as the prior mean for coefficient {coef_name}.") self.prior_mean[coef_name] = self.prior_mean['default'] else: - warnings.warn(f"You provided a dictionary of prior mean, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the prior_mean dictionary to use that as default value (e.g., prior_mean['default'] = 0.1); now using mean=0.0 since this is not supplied.") + # warnings.warn(f"You provided a dictionary of prior mean, but coefficient {coef_name} is not a key in it. 
Supply a value for 'default' in the prior_mean dictionary to use that as default value (e.g., prior_mean['default'] = 0.1); now using mean=0.0 since this is not supplied.") self.prior_variance[coef_name] = 0.0 mean = self.prior_mean[coef_name] if isinstance( @@ -339,10 +339,10 @@ def __init__(self, if coef_name not in self.prior_variance.keys(): # the user may specify 'default' prior variance through the prior_variance dictionary. if 'default' in self.prior_variance.keys(): - warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. We found a key 'default' in the dictionary, so we use the value of 'default' as the prior variance for coefficient {coef_name}.") + # warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. We found a key 'default' in the dictionary, so we use the value of 'default' as the prior variance for coefficient {coef_name}.") self.prior_variance[coef_name] = self.prior_variance['default'] else: - warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the prior_variance dictionary to use that as default value (e.g., prior_variance['default'] = 0.3); now using variance=1.0 since this is not supplied.") + # warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the prior_variance dictionary to use that as default value (e.g., prior_variance['default'] = 0.3); now using variance=1.0 since this is not supplied.") self.prior_variance[coef_name] = 1.0 s2 = self.prior_variance[coef_name] if isinstance( diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py new file mode 100644 index 0000000..c5c9e1b --- /dev/null +++ b/bemb/model/bemb_chunked.py @@ -0,0 +1,1394 @@ +""" +The core class of the Bayesian EMBedding (BEMB) model. + +Author: Tianyu Du +Update: Apr. 
28, 2022 +""" +import warnings +from pprint import pprint +import warnings +from typing import Dict, List, Optional, Tuple, Union +from pprint import pprint +from typing import Dict, List, Optional, Union, Tuple + +import numpy as np +import torch +import torch.nn as nn +from torch_choice.data import ChoiceDataset +from torch_scatter import scatter_logsumexp, scatter_max +from torch_scatter.composite import scatter_log_softmax + +from bemb.model.bayesian_coefficient import BayesianCoefficient + +# ====================================================================================================================== +# helper functions. +# ====================================================================================================================== + +from bemb.model.bemb import PositiveInteger, parse_utility +positive_integer = PositiveInteger() +''' +class PositiveInteger(object): + # A helper wildcard class for shape matching. + def __eq__(self, other): + return isinstance(other, int) and other > 0 + +def parse_utility(utility_string: str) -> List[Dict[str, Union[List[str], None]]]: + """ + A helper function parse utility string into a list of additive terms. 
+ + Example: + utility_string = 'lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs' + output = [ + { + 'coefficient': ['lambda_item'], + 'observable': None + }, + { + 'coefficient': ['theta_user', 'alpha_item'], + 'observable': None + }, + { + 'coefficient': ['gamma_user', 'beta_item'], + 'observable': 'price_obs' + } + ] + """ + # split additive terms + coefficient_suffix = ('_item', '_user', '_constant', '_category') + observable_prefix = ('item_', 'user_', 'session_', 'price_', 'taste_') + + def is_coefficient(name: str) -> bool: + return any(name.endswith(suffix) for suffix in coefficient_suffix) + + def is_observable(name: str) -> bool: + return any(name.startswith(prefix) for prefix in observable_prefix) + + additive_terms = utility_string.split(' + ') + additive_decomposition = list() + for term in additive_terms: + atom = {'coefficient': [], 'observable': None} + # split multiplicative terms. + for x in term.split(' * '): + # Programmers can specify itemsession for price observables, this brings better intuition. + if x.startswith('itemsession_'): + # case 1: special observable name. + atom['observable'] = 'price_' + x[len('itemsession_'):] + elif is_observable(x): + # case 2: normal observable name. + atom['observable'] = x + elif is_coefficient(x): + # case 3: normal coefficient name. + atom['coefficient'].append(x) + else: + # case 4: special coefficient name. + # the _constant coefficient suffix is not intuitive enough, we allow none coefficient suffix for + # coefficient with constant value. For example, `lambda` is the same as `lambda_constant`. + warnings.warn(f'{x} term has no appropriate coefficient suffix or observable prefix, it is assumed to be a coefficient constant across all items, users, and sessions. If this is the desired behavior, you can also specify `{x}_constant` in the utility formula to avoid this warning message. 
The utility parser has replaced {x} term with `{x}_constant`.') + atom['coefficient'].append(f'{x}_constant') + + additive_decomposition.append(atom) + return additive_decomposition +''' + +# ====================================================================================================================== +# core class of the BEMB model. +# ====================================================================================================================== + + +class BEMBFlexChunked(nn.Module): + # ================================================================================================================== + # core function as a PyTorch module. + # ================================================================================================================== + def __init__(self, + utility_formula: str, + obs2prior_dict: Dict[str, bool], + coef_dim_dict: Dict[str, int], + num_items: int, + pred_item: bool, + num_classes: int = 2, + H_zero_mask_dict: Optional[Dict[str, torch.BoolTensor]] = None, + prior_mean: Union[float, Dict[str, float]] = 0.0, + prior_variance: Union[float, Dict[str, float]] = 1.0, + num_users: Optional[int] = None, + num_sessions: Optional[int] = None, + trace_log_q: bool = False, + category_to_item: Dict[int, List[int]] = None, + # number of observables. + num_user_obs: Optional[int] = None, + num_item_obs: Optional[int] = None, + num_session_obs: Optional[int] = None, + num_price_obs: Optional[int] = None, + num_taste_obs: Optional[int] = None, + # additional modules. + additional_modules: Optional[List[nn.Module]] = None, + deterministic_variational: bool = False, + chunk_info: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] = None, + ) -> None: + """ + Args: + utility_formula (str): a string representing the utility function U[user, item, session]. + See documentation for more details in the documentation for the format of formula. 
+ Examples: + lambda_item + lambda_item + theta_user * alpha_item + zeta_user * item_obs + lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs + See the doc-string of parse_utility for an example. + + obs2prior_dict (Dict[str, bool]): a dictionary maps coefficient name (e.g., 'lambda_item') + to a boolean indicating if observable (e.g., item_obs) enters the prior of the coefficient. + + coef_dim_dict (Dict[str, int]): a dictionary maps coefficient name (e.g., 'lambda_item') + to an integer indicating the dimension of coefficient. + For standalone coefficients like U = lambda_item, the dim should be 1. + For factorized coefficients like U = theta_user * alpha_item, the dim should be the + latent dimension of theta and alpha. + For coefficients multiplied with observables like U = zeta_user * item_obs, the dim + should be the number of observables in item_obs. + For factorized coefficient multiplied with observables like U = gamma_user * beta_item * price_obs, + the dim should be the latent dim multiplied by number of observables in price_obs. + + H_zero_mask_dict (Dict[str, torch.BoolTensor]): A dictionary maps coefficient names to a boolean tensor, + you should only specify the H_zero_mask for coefficients with obs2prior turned on. + Recall that with obs2prior on, the prior of coefficient looks like N(H*X_obs, sigma * I), the H_zero_mask + the mask for this coefficient should have the same shape as H, and H[H_zero_mask] will be set to zeros + and non-learnable during the training. + Defaults to None. + + num_items (int): number of items. + + pred_item (bool): there are two use cases of this model, suppose we have `user_index[i]` and `item_index[i]` + for the i-th observation in the dataset. + Case 1: which item among all items user `user_index[i]` is going to purchase, the prediction label + is therefore `item_index[i]`. Equivalently, we can ask what's the likelihood for user `user_index[i]` + to purchase `item_index[i]`. 
+ Case 2: what rating would user `user_index[i]` assign to item `item_index[i]`? In this case, the dataset + object needs to contain a separate label. + NOTE: for now, we only support binary labels. + + prior_mean (Union[float, Dict[str, float]]): the mean of prior + distribution for coefficients. If a float is provided, all prior + mean will be diagonal matrix with the provided value. If a + dictionary is provided, keys of prior_mean should be coefficient + names, and the mean of prior of coef_name would the provided + value Defaults to 0.0, which means all prior means are + initialized to 0.0 + + If a dictionary prior_mean is supplied, for coefficient names not in the prior_mean.keys(), the + user can add a `prior_mean['default']` value to specify the mean for those coefficients. + If no `prior_mean['default']` is provided, the default prior mean will be 0.0 for those coefficients + not in the prior_mean.keys(). + + Defaults to 0.0. + + prior_variance (Union[float, Dict[str, float]], Dict[str, torch. Tensor]): the variance of prior distribution + for coefficients. + If a float is provided, all priors will be diagonal matrix with prior_variance along the diagonal. + If a float-valued dictionary is provided, keys of prior_variance should be coefficient names, and the + variance of prior of coef_name would be a diagonal matrix with prior_variance[coef_name] along the diagonal. + If a tensor-valued dictionary is provided, keys of prior_variance should be coefficient names, and the + values need to be tensor with shape (num_classes, coef_dim_dict[coef_name]). For example, for `beta_user` in + `U = beta_user * item_obs`, the prior_variance should be a tensor with shape (num_classes, dimension_of_item_obs). + In this case, every single entry in the coefficient has its own prior variance. + Following the `beta_user` example, for every `i` and `j`, `beta_user[i, j]` is a scalar with prior variance + `prior_variance['beta_user'][i, j]`. 
Moreover, `beta_user[i, j]`'s are independent for different `i, j`. + + If a dictionary prior_variance is supplied, for coefficient names not in the prior_variance.keys(), the + user can add a `prior_variance['default']` value to specify the variance for those coefficients. + If no `prior_variance['default']` is provided, the default prior variance will be 1.0 for those coefficients + not in the prior_variance.keys(). + + Defaults to 1.0, which means all priors have identity matrix as the covariance matrix. + + num_users (int, optional): number of users, required only if coefficient or observable + depending on user is in utility. Defaults to None. + num_sessions (int, optional): number of sessions, required only if coefficient or + observable depending on session is in utility. Defaults to None. + + trace_log_q (bool, optional): whether to trace the derivative of variational likelihood logQ + with respect to variational parameters in the ELBO while conducting gradient update. + Defaults to False. + + category_to_item (Dict[str, List[int]], optional): a dictionary with category id or name + as keys, and category_to_item[C] contains the list of item ids belonging to category C. + If None is provided, all items are assumed to be in the same category. + Defaults to None. + + num_{user, item, session, price, taste}_obs (int, optional): number of observables of + each type of features, only required if observable enters prior. + NOTE: currently we only allow coefficient to depend on either user or item, thus only + user and item observables can enter the prior of coefficient. Hence session, price, + and taste observables are never required, we include it here for completeness. 
+ + deterministic_variational (bool, optional): if True, the variational posterior is equivalent to frequentist MLE estimates of parameters + + chunk_info (Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor, torch.LongTensor], optional): a tuple of four tensors + The first tensor specifies a chunk id for each user + The second tensor specifies a chunk id for each item + The third tensor specifies a chunk id for each category + The fourth tensor specifies a chunk id for each session + """ + super(BEMBFlexChunked, self).__init__() + self.utility_formula = utility_formula + self.obs2prior_dict = obs2prior_dict + self.coef_dim_dict = coef_dim_dict + if H_zero_mask_dict is not None: + self.H_zero_mask_dict = H_zero_mask_dict + else: + self.H_zero_mask_dict = dict() + self.prior_variance = prior_variance + self.prior_mean = prior_mean + self.pred_item = pred_item + if not self.pred_item: + assert isinstance(num_classes, int) and num_classes > 0, \ + f"With pred_item being False, the num_classes should be a positive integer, received {num_classes} instead." + self.num_classes = num_classes + if self.num_classes != 2: + raise NotImplementedError('Multi-class classification is not supported yet.') + # we don't set the num_classes attribute when pred_item == False to avoid calling it accidentally. + + self.num_items = num_items + self.num_users = num_users + self.num_sessions = num_sessions + self.deterministic_variational = deterministic_variational + + self.trace_log_q = trace_log_q + self.category_to_item = category_to_item + + # ============================================================================================================== + # Category ID to Item ID mapping. + # Category ID to Category Size mapping. + # Item ID to Category ID mapping. 
+ # ============================================================================================================== + if self.category_to_item is None: + if self.pred_item: + # assign all items to the same category if predicting items. + self.category_to_item = {0: list(np.arange(self.num_items))} + else: + # otherwise, for the j-th observation in the dataset, the label[j] + # only depends on user_index[j] and item_index[j], so we put each + # item to its own category. + self.category_to_item = {i: [i] for i in range(self.num_items)} + + self.num_categories = len(self.category_to_item) + + max_category_size = max(len(x) for x in self.category_to_item.values()) + category_to_item_tensor = torch.full( + (self.num_categories, max_category_size), -1) + category_to_size_tensor = torch.empty(self.num_categories) + + for c, item_in_c in self.category_to_item.items(): + category_to_item_tensor[c, :len( + item_in_c)] = torch.LongTensor(item_in_c) + category_to_size_tensor[c] = torch.scalar_tensor(len(item_in_c)) + + self.register_buffer('category_to_item_tensor', + category_to_item_tensor.long()) + self.register_buffer('category_to_size_tensor', + category_to_size_tensor.long()) + + item_to_category_tensor = torch.zeros(self.num_items) + for c, items_in_c in self.category_to_item.items(): + item_to_category_tensor[items_in_c] = c + self.register_buffer('item_to_category_tensor', + item_to_category_tensor.long()) + + # ============================================================================================================== + # Chunk Information + self.num_user_chunks = chunk_info[0].max().item() + 1 + self.num_item_chunks = chunk_info[1].max().item() + 1 + self.num_category_chunks = chunk_info[2].max().item() + 1 + self.num_session_chunks = chunk_info[3].max().item() + 1 + self.register_buffer('user_chunks', chunk_info[0]) + self.register_buffer('item_chunks', chunk_info[1]) + self.register_buffer('category_chunks', chunk_info[2]) + self.register_buffer('session_chunks', 
chunk_info[3]) + # ============================================================================================================== + # Create Bayesian Coefficient Objects + # ============================================================================================================== + # model configuration. + self.formula = parse_utility(utility_formula) + print('BEMB: utility formula parsed:') + pprint(self.formula) + self.raw_formula = utility_formula + self.obs2prior_dict = obs2prior_dict + + # dimension of each observable, this one is used only for obs2prior. + self.num_obs_dict = { + 'user': num_user_obs, + 'item': num_item_obs, + 'category' : 0, + 'session': num_session_obs, + 'price': num_price_obs, + 'taste': num_taste_obs, + 'constant': 1 # not really used, for dummy variables. + } + + # how many classes for the variational distribution. + # for example, beta_item would be `num_items` 10-dimensional gaussian if latent dim = 10. + variation_to_num_classes = { + 'user': self.num_users, + 'item': self.num_items, + 'constant': 1, + 'category' : self.num_categories, + } + + coef_dict = dict() + for additive_term in self.formula: + for coef_name in additive_term['coefficient']: + variation = coef_name.split('_')[-1] + if isinstance(self.prior_mean, dict): + # the user didn't specify prior mean for this coefficient. + if coef_name not in self.prior_mean.keys(): + # the user may specify 'default' prior variance through the prior_variance dictionary. + if 'default' in self.prior_mean.keys(): + # warnings.warn(f"You provided a dictionary of prior mean, but coefficient {coef_name} is not a key in it. We found a key 'default' in the dictionary, so we use the value of 'default' as the prior mean for coefficient {coef_name}.") + self.prior_mean[coef_name] = self.prior_mean['default'] + else: + # warnings.warn(f"You provided a dictionary of prior mean, but coefficient {coef_name} is not a key in it. 
Supply a value for 'default' in the prior_mean dictionary to use that as default value (e.g., prior_mean['default'] = 0.1); now using mean=0.0 since this is not supplied.") self.prior_mean[coef_name] = 0.0 + + mean = self.prior_mean[coef_name] if isinstance( + self.prior_mean, dict) else self.prior_mean + + if isinstance(self.prior_variance, dict): + # the user didn't specify prior variance for this coefficient. + if coef_name not in self.prior_variance.keys(): + # the user may specify 'default' prior variance through the prior_variance dictionary. + if 'default' in self.prior_variance.keys(): + # warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. We found a key 'default' in the dictionary, so we use the value of 'default' as the prior variance for coefficient {coef_name}.") + self.prior_variance[coef_name] = self.prior_variance['default'] + else: + # warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it.
Supply a value for 'default' in the prior_variance dictionary to use that as default value (e.g., prior_variance['default'] = 0.3); now using variance=1.0 since this is not supplied.") + self.prior_variance[coef_name] = 1.0 + + s2 = self.prior_variance[coef_name] if isinstance( + self.prior_variance, dict) else self.prior_variance + + if coef_name in self.H_zero_mask_dict.keys(): + H_zero_mask = self.H_zero_mask_dict[coef_name] + else: + H_zero_mask = None + + if (not self.obs2prior_dict[coef_name]) and (H_zero_mask is not None): + raise ValueError(f'You specified H_zero_mask for {coef_name}, but obs2prior is False for this coefficient.') + + coef_dict[coef_name] = BayesianCoefficient(variation=variation, + num_classes=variation_to_num_classes[variation], + obs2prior=self.obs2prior_dict[coef_name], + num_obs=self.num_obs_dict[variation], + dim=self.coef_dim_dict[coef_name], + prior_mean=mean, + prior_variance=s2, + H_zero_mask=H_zero_mask, + is_H=False) + self.coef_dict = nn.ModuleDict(coef_dict) + + # ============================================================================================================== + # Optional: register additional modules. + # ============================================================================================================== + if additional_modules is None: + self.additional_modules = [] + else: + raise NotImplementedError( + 'Additional modules are temporarily disabled for further development.') + self.additional_modules = nn.ModuleList(additional_modules) + + def __str__(self): + return f'Bayesian EMBedding Model with U[user, item, session] = {self.raw_formula}\n' \ + + f'Total number of parameters: {self.num_params}.\n' \ + + 'With the following coefficients:\n' \ + + str(self.coef_dict) + '\n' \ + + str(self.additional_modules) + + def posterior_mean(self, coef_name: str) -> torch.Tensor: + """Returns the mean of estimated posterior distribution of coefficient `coef_name`. 
+ + Args: + coef_name (str): name of the coefficient to query. + + Returns: + torch.Tensor: mean of the estimated posterior distribution of `coef_name`. + """ + if coef_name in self.coef_dict.keys(): + return self.coef_dict[coef_name].variational_mean + else: + raise KeyError(f'{coef_name} is not a valid coefficient name in {self.utility_formula}.') + + def posterior_distribution(self, coef_name: str) -> torch.distributions.lowrank_multivariate_normal.LowRankMultivariateNormal: + """Returns the posterior distribution of coefficient `coef_name`. + + Args: + coef_name (str): name of the coefficient to query. + + Returns: + torch.Tensor: variance of the estimated posterior distribution of `coef_name`. + """ + if coef_name in self.coef_dict.keys(): + return self.coef_dict[coef_name].variational_distribution + else: + raise KeyError(f'{coef_name} is not a valid coefficient name in {self.utility_formula}.') + + def ivs(self, batch) -> torch.Tensor: + """The combined method of computing utilities and log probability. + + Args: + batch (dict): a batch of data. + + Returns: + torch.Tensor: the combined utility and log probability. + """ + # Use the means of variational distributions as the sole MC sample. + sample_dict = dict() + for coef_name, coef in self.coef_dict.items(): + sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + + # there is 1 random seed in this case. 
+ # (num_seeds=1, len(batch), num_items) + out = self.log_likelihood_all_items(batch, return_logit=True, sample_dict=sample_dict) + out = out.squeeze(0) + # import pdb; pdb.set_trace() + ivs = scatter_logsumexp(out, self.item_to_category_tensor, dim=-1) + return ivs # (len(batch), num_categories) + + def sample_choices(self, batch:ChoiceDataset, debug: bool = False, num_seeds: int = 1, **kwargs) -> Tuple[torch.Tensor]: + """Samples choices given model paramaters and trips + + Args: + batch(ChoiceDataset): batch data containing trip information; item choice information is discarded + debug(bool): whether to print debug information + + Returns: + Tuple[torch.Tensor]: sampled choices; shape: (batch_size, num_categories) + """ + # Use the means of variational distributions as the sole MC sample. + sample_dict = dict() + for coef_name, coef in self.coef_dict.items(): + sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + # sample_dict = self.sample_coefficient_dictionary(num_seeds) + maxes, out = self.sample_log_likelihoods(batch, sample_dict) + return maxes.squeeze(), out.squeeze() + + def sample_log_likelihoods(self, batch:ChoiceDataset, sample_dict: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + """Samples log likelihoods given model parameters and trips + + Args: + batch(ChoiceDataset): batch data containing trip information; item choice information is discarded + sample_dict(Dict[str, torch.Tensor]): sampled coefficient values + + Returns: + Tuple[torch.Tensor]: sampled log likelihoods; shape: (batch_size, num_categories) + """ + # get the log likelihoods for all items for all categories + utility = self.log_likelihood_all_items(batch, return_logit=True, sample_dict=sample_dict) + mu_gumbel = 0.0 + beta_gumbel = 1.0 + EUL_MAS_CONST = 0.5772156649 + mean_gumbel = torch.tensor([mu_gumbel + beta_gumbel * EUL_MAS_CONST], device=self.device) + m = torch.distributions.gumbel.Gumbel(torch.tensor([0.0], 
device=self.device), torch.tensor([1.0], device=self.device)) + # m = torch.distributions.gumbel.Gumbel(0.0, 1.0) + gumbel_samples = m.sample(utility.shape).squeeze(-1) + gumbel_samples -= mean_gumbel + utility += gumbel_samples + max_by_category, argmax_by_category = scatter_max(utility, self.item_to_category_tensor, dim=-1) + return max_by_category, argmax_by_category + log_likelihoods = self.sample_log_likelihoods_per_category(batch, sample_dict) + + # sum over all categories. + log_likelihoods = log_likelihoods.sum(dim=1) + + return log_likelihoods, log_likelihoods + + @torch.no_grad() + def predict_proba(self, batch: ChoiceDataset) -> torch.Tensor: + """ + Draw prediction on a given batch of dataset. + + Args: + batch (ChoiceDataset): the dataset to draw inference on. + + Returns: + torch.Tensor: the predicted probabilities for each class, the behavior varies by self.pred_item. + (1: pred_item == True) While predicting items, the return tensor has shape (len(batch), num_items), out[i, j] is the predicted probability for choosing item j AMONG ALL ITEMS IN ITS CATEGORY in observation i. Please note that since probabilities are computed from within-category normalization, hence out.sum(dim=0) can be greater than 1 if there are multiple categories. + (2: pred_item == False) While predicting external labels for each observations, out[i, 0] is the predicted probability for label == 0 on the i-th observation, out[i, 1] is the predicted probability for label == 1 on the i-th observation. Generally, out[i, 0] + out[i, 1] = 1.0. However, this could be false if under-flowing/over-flowing issue is encountered. + + We highly recommend users to get log-probs as those are less prone to overflow/underflow; those can be accessed using the forward() function. 
+ """ + if self.pred_item: + # (len(batch), num_items) + log_p = self.forward(batch, return_type='log_prob', return_scope='all_items', deterministic=True) + p = log_p.exp() + else: + # (len(batch), num_items) + # probability of getting label = 1. + p_1 = torch.nn.functional.sigmoid(self.forward(batch, return_type='utility', return_scope='all_items', deterministic=True)) + # (len(batch), 1) + p_1 = p_1[torch.arange(len(batch)), batch.item_index].view(len(batch), 1) + p_0 = 1 - p_1 + # (len(batch), 2) + p = torch.cat([p_0, p_1], dim=1) + + if self.pred_item: + assert p.shape == (len(batch), self.num_items) + else: + assert p.shape == (len(batch), self.num_classes) + + return p + + def forward(self, batch: ChoiceDataset, + return_type: str, + return_scope: str, + deterministic: bool = True, + sample_dict: Optional[Dict[str, torch.Tensor]] = None, + num_seeds: Optional[int] = None + ) -> torch.Tensor: + """A combined method for inference with the model. + + Args: + batch (ChoiceDataset): batch data containing choice information. + return_type (str): either 'log_prob' or 'utility'. + 'log_prob': return the log-probability (by within-category log-softmax) for items + 'utility': return the utility value of items. + return_scope (str): either 'item_index' or 'all_items'. + 'item_index': for each observation i, return log-prob/utility for the chosen item batch.item_index[i] only. + 'all_items': for each observation i, return log-prob/utility for all items. + deterministic (bool, optional): + True: expectations of parameter variational distributions are used for inference. + False: the user needs to supply a dictionary of sampled parameters for inference. + Defaults to True. + sample_dict (Optional[Dict[str, torch.Tensor]], optional): sampled parameters for inference task. + This is not needed when `deterministic` is True. + When `deterministic` is False, the user can supply a `sample_dict`. If `sample_dict` is not provided, + this method will create `num_seeds` samples. 
+ Defaults to None. + num_seeds (Optional[int]): the number of random samples of parameters to construct. This is only required + if `deterministic` is False (i.e., stochastic mode) and `sample_dict` is not provided. + Defaults to None. + Returns: + torch.Tensor: a tensor of log-probabilities or utilities, depending on `return_type`. + The shape of the returned tensor depends on `return_scope` and `deterministic`. + ------------------------------------------------------------------------- + | `return_scope` | `deterministic` | Output shape | + ------------------------------------------------------------------------- + | 'item_index` | True | (len(batch),) | + ------------------------------------------------------------------------- + | 'all_items' | True | (len(batch), num_items) | + ------------------------------------------------------------------------- + | 'item_index' | False | (num_seeds, len(batch)) | + ------------------------------------------------------------------------- + | 'all_items' | False | (num_seeds, len(batch), num_items) | + ------------------------------------------------------------------------- + """ + # ============================================================================================================== + # check arguments. + # ============================================================================================================== + assert return_type in [ + 'log_prob', 'utility'], "return_type must be either 'log_prob' or 'utility'." + assert return_scope in [ + 'item_index', 'all_items'], "return_scope must be either 'item_index' or 'all_items'." + assert deterministic in [True, False] + if (not deterministic) and (sample_dict is None): + assert num_seeds >= 1, "A positive interger `num_seeds` is required if `deterministic` is False and no `sample_dict` is provided." + + # when pred_item is true, the model is predicting which item is bought (specified by item_index). 
+ if self.pred_item: + batch.label = batch.item_index + + # ============================================================================================================== + # get sample_dict ready. + # ============================================================================================================== + if deterministic: + num_seeds = 1 + # Use the means of variational distributions as the sole deterministic MC sample. + # NOTE: here we don't need to sample the obs2prior weight H since we only compute the log-likelihood. + # TODO: is this correct? + sample_dict = dict() + for coef_name, coef in self.coef_dict.items(): + sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze( + dim=0) # (1, num_*, dim) + else: + if sample_dict is None: + # sample stochastic parameters. + sample_dict = self.sample_coefficient_dictionary(num_seeds) + else: + # use the provided sample_dict. + num_seeds = list(sample_dict.values())[0].shape[0] + + # ============================================================================================================== + # call the sampling method of additional modules. + # ============================================================================================================== + for module in self.additional_modules: + # deterministic sample. + if deterministic: + module.dsample() + else: + module.rsample(num_seeds=num_seeds) + + # if utility is requested, don't run log-softmax, simply return logit. + return_logit = (return_type == 'utility') + if return_scope == 'all_items': + # (num_seeds, len(batch), num_items) + out = self.log_likelihood_all_items( + batch=batch, sample_dict=sample_dict, return_logit=return_logit) + elif return_scope == 'item_index': + # (num_seeds, len(batch)) + out = self.log_likelihood_item_index( + batch=batch, sample_dict=sample_dict, return_logit=return_logit) + + if deterministic: + # drop the first dimension, which has size of `num_seeds` (equals 1 in the deterministic case). 
+ # (len(batch), num_items) or (len(batch),) + return out.squeeze(dim=0) + + return out + + @property + def num_params(self) -> int: + return sum([p.numel() for p in self.parameters()]) + + @property + def device(self) -> torch.device: + for coef in self.coef_dict.values(): + return coef.device + + # ================================================================================================================== + # helper functions. + # ================================================================================================================== + def sample_coefficient_dictionary(self, num_seeds: int) -> Dict[str, torch.Tensor]: + """A helper function to sample parameters from coefficients. + + Args: + num_seeds (int): number of random samples. + + Returns: + Dict[str, torch.Tensor]: a dictionary maps coefficient names to tensor of sampled coefficient parameters, + where the first dimension of the sampled tensor has size `num_seeds`. + Each sample tensor has shape (num_seeds, num_classes, dim). + """ + sample_dict = dict() + for coef_name, coef in self.coef_dict.items(): + s = coef.rsample(num_seeds) + if coef.obs2prior: + # sample both obs2prior weight and realization of variable. + assert isinstance(s, tuple) and len(s) == 2 + sample_dict[coef_name] = s[0] + sample_dict[coef_name + '.H'] = s[1] + else: + # only sample the realization of variable. + assert torch.is_tensor(s) + sample_dict[coef_name] = s + return sample_dict + + @torch.no_grad() + def get_within_category_accuracy(self, log_p_all_items: torch.Tensor, label: torch.LongTensor) -> Dict[str, float]: + """A helper function for computing prediction accuracy (i.e., all non-differential metrics) + within category. + In particular, this method calculates the accuracy, precision, recall and F1 score. + + + This method has the same functionality as the following peusodcode: + for C in categories: + # get sessions in which item in category C was purchased. 
+ T <- (t for t in {0,1,..., len(label)-1} if label[t] is in C) + Y <- label[T] + + predictions = list() + for t in T: + # get the prediction within category for this session. + y_pred = argmax_{items in C} log prob computed before. + predictions.append(y_pred) + + accuracy = mean(Y == predictions) + + Similarly, this function computes precision, recall and f1score as well. + + Args: + log_p_all_items (torch.Tensor): shape (num_sessions, num_items) the log probability of + choosing each item in each session. + label (torch.LongTensor): shape (num_sessions,), the IDs of items purchased in each session. + + Returns: + [Dict[str, float]]: A dictionary containing performance metrics. + """ + # argmax: (num_sessions, num_categories), within category argmax. + # item IDs are consecutive, thus argmax is the same as IDs of the item with highest P. + _, argmax_by_category = scatter_max( + log_p_all_items, self.item_to_category_tensor, dim=-1) + + # category_purchased[t] = the category of item label[t]. + # (num_sessions,) + category_purchased = self.item_to_category_tensor[label] + + # pred[t] = the item with highest utility from the category item label[t] belongs to. + # (num_sessions,) + pred_from_category = argmax_by_category[torch.arange( + len(label)), category_purchased] + + within_category_accuracy = ( + pred_from_category == label).float().mean().item() + + # precision + precision = list() + + recall = list() + for i in range(self.num_items): + correct_i = torch.sum( + (torch.logical_and(pred_from_category == i, label == i)).float()) + precision_i = correct_i / \ + torch.sum((pred_from_category == i).float()) + recall_i = correct_i / torch.sum((label == i).float()) + + # do not add if divided by zero. 
+ if torch.any(pred_from_category == i): + precision.append(precision_i.cpu().item()) + if torch.any(label == i): + recall.append(recall_i.cpu().item()) + + precision = float(np.mean(precision)) + recall = float(np.mean(recall)) + + if precision == recall == 0: + f1 = 0 + else: + f1 = 2 * precision * recall / (precision + recall) + + return {'accuracy': within_category_accuracy, + 'precision': precision, + 'recall': recall, + 'f1score': f1} + + # ================================================================================================================== + # Methods for terms in the ELBO: prior, likelihood, and variational. + # ================================================================================================================== + def log_likelihood_all_items(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor]) -> torch.Tensor: + """ + NOTE to developers: + NOTE (akanodia to tianyudu): Is this really slow; even with log_likelihood you need log_prob which depends on logits of all items? + This method computes utilities for all items available, which is a relatively slow operation. For + training the model, you only need the utility/log-prob for the chosen/relevant item (i.e., item_index[i] for each i-th observation). + Use this method for inference only. + Use self.log_likelihood_item_index() for training instead. + + Computes the log probability of choosing `each` item in each session based on current model parameters. + NOTE (akanodiadu to tianyudu): What does the next line mean? I think it just says its allowing for samples instead of posterior mean. + This method allows for specifying {user, item}_latent_value for Monte Carlo estimation in ELBO. + For actual prediction tasks, use the forward() function, which will use means of variational + distributions for user and item latents. + + Args: + batch (ChoiceDataset): a ChoiceDataset object containing relevant information. 
+ return_logit(bool): if set to True, return the log-probability, otherwise return the logit/utility. + sample_dict(Dict[str, torch.Tensor]): Monte Carlo samples for model coefficients + (i.e., those Greek letters). + sample_dict.keys() should be the same as keys of self.obs2prior_dict, i.e., those + greek letters actually enter the functional form of utility. + The value of sample_dict should be tensors of shape (num_seeds, num_classes, dim) + where num_classes in {num_users, num_items, 1} + and dim in {latent_dim(K), num_item_obs, num_user_obs, 1}. + + Returns: + torch.Tensor: a tensor of shape (num_seeds, len(batch), self.num_items), where + out[x, y, z] is the probability of choosing item z in session y conditioned on + latents to be the x-th Monte Carlo sample. + """ + num_seeds = next(iter(sample_dict.values())).shape[0] + + # avoid repeated work when user purchased several items in the same session. + user_session_index = torch.stack( + [batch.user_index, batch.session_index]) + assert user_session_index.shape == (2, len(batch)) + unique_user_sess, inverse_indices = torch.unique( + user_session_index, dim=1, return_inverse=True) + + user_index = unique_user_sess[0, :] + session_index = unique_user_sess[1, :] + assert len(user_index) == len(session_index) + + # short-hands for easier shape check. + R = num_seeds + # P = len(batch) # num_purchases. + P = unique_user_sess.shape[1] + S = self.num_sessions + U = self.num_users + I = self.num_items + NC = self.num_categories + + # ============================================================================================================== + # Helper Functions for Reshaping. 
+ # ============================================================================================================== + def reshape_user_coef_sample(C): + # input shape (R, U, *) + C = C.view(R, U, 1, -1).expand(-1, -1, I, -1) # (R, U, I, *) + C = C[:, user_index, :, :] + assert C.shape == (R, P, I, positive_integer) + return C + + def reshape_item_coef_sample(C): + # input shape (R, I, *) + C = C.view(R, 1, I, -1).expand(-1, P, -1, -1) + assert C.shape == (R, P, I, positive_integer) + return C + + def reshape_category_coef_sample(C): + # input shape (R, NC, *) + C = torch.repeat_interleave(C, self.category_to_size_tensor, dim=1) + # input shape (R, I, *) + C = C.view(R, 1, I, -1).expand(-1, P, -1, -1) + assert C.shape == (R, P, I, positive_integer) + return C + + def reshape_constant_coef_sample(C): + # input shape (R, *) + C = C.view(R, 1, 1, -1).expand(-1, P, I, -1) + assert C.shape == (R, P, I, positive_integer) + return C + + def reshape_coef_sample(sample, name): + # reshape the monte carlo sample of coefficients to (R, P, I, *). + if name.endswith('_user'): + # (R, U, *) --> (R, P, I, *) + return reshape_user_coef_sample(sample) + elif name.endswith('_item'): + # (R, I, *) --> (R, P, I, *) + return reshape_item_coef_sample(sample) + elif name.endswith('_category'): + # (R, NC, *) --> (R, P, NC, *) + return reshape_category_coef_sample(sample) + elif name.endswith('_constant'): + # (R, *) --> (R, P, I, *) + return reshape_constant_coef_sample(sample) + else: + raise ValueError + + def reshape_observable(obs, name): + # reshape observable to (R, P, I, *) so that it can be multiplied with monte carlo + # samples of coefficients. + O = obs.shape[-1] # number of observables. 
+ assert O == positive_integer + if name.startswith('item_'): + assert obs.shape == (I, O) + obs = obs.view(1, 1, I, O).expand(R, P, -1, -1) + elif name.startswith('user_'): + assert obs.shape == (U, O) + obs = obs[user_index, :] # (P, O) + obs = obs.view(1, P, 1, O).expand(R, -1, I, -1) + elif name.startswith('session_'): + assert obs.shape == (S, O) + obs = obs[session_index, :] # (P, O) + return obs.view(1, P, 1, O).expand(R, -1, I, -1) + elif name.startswith('price_'): + assert obs.shape == (S, I, O) + obs = obs[session_index, :, :] # (P, I, O) + return obs.view(1, P, I, O).expand(R, -1, -1, -1) + elif name.startswith('taste_'): + assert obs.shape == (U, I, O) + obs = obs[user_index, :, :] # (P, I, O) + return obs.view(1, P, I, O).expand(R, -1, -1, -1) + else: + raise ValueError + assert obs.shape == (R, P, I, O) + return obs + + # ============================================================================================================== + # Compute the Utility Term by Term. + # ============================================================================================================== + # P is the number of unique (user, session) pairs. + # (random_seeds, P, num_items). + utility = torch.zeros(R, P, I, device=self.device) + + # loop over additive term to utility + for term in self.formula: + # Type I: single coefficient, e.g., lambda_item or lambda_user. + if len(term['coefficient']) == 1 and term['observable'] is None: + # E.g., lambda_item or lambda_user + coef_name = term['coefficient'][0] + coef_sample = reshape_coef_sample( + sample_dict[coef_name], coef_name) + assert coef_sample.shape == (R, P, I, 1) + additive_term = coef_sample.view(R, P, I) + + # Type II: factorized coefficient, e.g., . 
+ elif len(term['coefficient']) == 2 and term['observable'] is None: + coef_name_0 = term['coefficient'][0] + coef_name_1 = term['coefficient'][1] + + coef_sample_0 = reshape_coef_sample( + sample_dict[coef_name_0], coef_name_0) + coef_sample_1 = reshape_coef_sample( + sample_dict[coef_name_1], coef_name_1) + + assert coef_sample_0.shape == coef_sample_1.shape == ( + R, P, I, positive_integer) + + additive_term = (coef_sample_0 * coef_sample_1).sum(dim=-1) + + # Type III: single coefficient multiplied by observable, e.g., theta_user * x_obs_item. + elif len(term['coefficient']) == 1 and term['observable'] is not None: + coef_name = term['coefficient'][0] + coef_sample = reshape_coef_sample( + sample_dict[coef_name], coef_name) + assert coef_sample.shape == (R, P, I, positive_integer) + + obs_name = term['observable'] + obs = reshape_observable(getattr(batch, obs_name), obs_name) + assert obs.shape == (R, P, I, positive_integer) + + additive_term = (coef_sample * obs).sum(dim=-1) + + # Type IV: factorized coefficient multiplied by observable. + # e.g., gamma_user * beta_item * price_obs. + elif len(term['coefficient']) == 2 and term['observable'] is not None: + coef_name_0, coef_name_1 = term['coefficient'][0], term['coefficient'][1] + + coef_sample_0 = reshape_coef_sample( + sample_dict[coef_name_0], coef_name_0) + coef_sample_1 = reshape_coef_sample( + sample_dict[coef_name_1], coef_name_1) + assert coef_sample_0.shape == coef_sample_1.shape == ( + R, P, I, positive_integer) + num_obs_times_latent_dim = coef_sample_0.shape[-1] + + obs_name = term['observable'] + obs = reshape_observable(getattr(batch, obs_name), obs_name) + assert obs.shape == (R, P, I, positive_integer) + num_obs = obs.shape[-1] # number of observables. 
+ + assert (num_obs_times_latent_dim % num_obs) == 0 + latent_dim = num_obs_times_latent_dim // num_obs + + coef_sample_0 = coef_sample_0.view( + R, P, I, num_obs, latent_dim) + coef_sample_1 = coef_sample_1.view( + R, P, I, num_obs, latent_dim) + # compute the factorized coefficient with shape (R, P, I, O). + coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) + + additive_term = (coef * obs).sum(dim=-1) + + else: + raise ValueError(f'Undefined term type: {term}') + + assert additive_term.shape == (R, P, I) + utility += additive_term + + # ============================================================================================================== + # Mask Out Unavailable Items in Each Session. + # ============================================================================================================== + + if batch.item_availability is not None: + # expand to the Monte Carlo sample dimension. + # (S, I) -> (P, I) -> (1, P, I) -> (R, P, I) + A = batch.item_availability[session_index, :].unsqueeze( + dim=0).expand(R, -1, -1) + utility[~A] = - (torch.finfo(utility.dtype).max / 2) + + utility = utility[:, inverse_indices, :] + assert utility.shape == (R, len(batch), I) + + for module in self.additional_modules: + additive_term = module(batch) + assert additive_term.shape == (R, len(batch), 1) + utility += additive_term.expand(-1, -1, I) + + if return_logit: + # output shape: (num_seeds, len(batch), num_items) + assert utility.shape == (num_seeds, len(batch), self.num_items) + return utility + else: + # compute log likelihood log p(choosing item i | user, item latents) + # compute log softmax separately within each category. 
+ if self.pred_item: + # output shape: (num_seeds, len(batch), num_items) + log_p = scatter_log_softmax(utility, self.item_to_category_tensor, dim=-1) + else: + label_expanded = batch.label.to(torch.float32).view(1, len(batch), 1).expand(num_seeds, -1, self.num_items) + assert label_expanded.shape == (num_seeds, len(batch), self.num_items) + bce = nn.BCELoss(reduction='none') + log_p = - bce(torch.sigmoid(utility), label_expanded) + assert log_p.shape == (num_seeds, len(batch), self.num_items) + return log_p + + def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor]) -> torch.Tensor: + """ + NOTE for developers: + This method is more efficient and only computes log-likelihood/logit(utility) for item in item_index[i] for each + i-th observation. + Developers should use use `log_likelihood_all_items` for inference purpose and to computes log-likelihoods/utilities + for ALL items for the i-th observation. + + Computes the log probability of choosing item_index[i] in each session based on current model parameters. + This method allows for specifying {user, item}_latent_value for Monte Carlo estimation in ELBO. + For actual prediction tasks, use the forward() function, which will use means of variational + distributions for user and item latents. + + Args: + batch (ChoiceDataset): a ChoiceDataset object containing relevant information. + return_logit(bool): if set to True, return the logit/utility, otherwise return the log-probability. + sample_dict(Dict[str, torch.Tensor]): Monte Carlo samples for model coefficients + (i.e., those Greek letters). + sample_dict.keys() should be the same as keys of self.obs2prior_dict, i.e., those + greek letters actually enter the functional form of utility. + The value of sample_dict should be tensors of shape (num_seeds, num_classes, dim) + where num_classes in {num_users, num_items, 1} + and dim in {latent_dim(K), num_item_obs, num_user_obs, 1}. 
+ + Returns: + torch.Tensor: a tensor of shape (num_seeds, len(batch)), where + out[x, y] is the probabilities of choosing item batch.item[y] in session y + conditioned on latents to be the x-th Monte Carlo sample. + """ + num_seeds = next(iter(sample_dict.values())).shape[0] + + # get category id of the item bought in each row of batch. + cate_index = self.item_to_category_tensor[batch.item_index] + + # get item ids of all items from the same category of each item bought. + relevant_item_index = self.category_to_item_tensor[cate_index, :] + relevant_item_index = relevant_item_index.view(-1,) + # index were padded with -1's, drop those dummy entries. + relevant_item_index = relevant_item_index[relevant_item_index != -1] + + # the first repeats[0] entries in relevant_item_index are for the category of item_index[0] + repeats = self.category_to_size_tensor[cate_index] + # argwhere(reverse_indices == k) are positions in relevant_item_index for the category of item_index[k]. + reverse_indices = torch.repeat_interleave( + torch.arange(len(batch), device=self.device), repeats) + # expand the user_index and session_index. + user_index = torch.repeat_interleave(batch.user_index, repeats) + repeat_category_index = torch.repeat_interleave(cate_index, repeats) + session_index = torch.repeat_interleave(batch.session_index, repeats) + # duplicate the item focused to match. + item_index_expanded = torch.repeat_interleave( + batch.item_index, repeats) + + # short-hands for easier shape check. + R = num_seeds + # total number of relevant items. + total_computation = len(session_index) + S = self.num_sessions + U = self.num_users + I = self.num_items + NC = self.num_categories + # ========================================================================================== + # Helper Functions for Reshaping. 
+ # ========================================================================================== + + def reshape_coef_sample(sample, name): + # reshape the monte carlo sample of coefficients to (R, P, I, *). + if name.endswith('_user'): + # (R, U, *) --> (R, total_computation, *) + return sample[:, user_index, :] + elif name.endswith('_item'): + # (R, I, *) --> (R, total_computation, *) + return sample[:, relevant_item_index, :] + elif name.endswith('_category'): + # (R, NC, *) --> (R, total_computation, *) + return sample[:, repeat_category_index, :] + elif name.endswith('_constant'): + # (R, *) --> (R, total_computation, *) + return sample.view(R, 1, -1).expand(-1, total_computation, -1) + else: + raise ValueError + + def reshape_observable(obs, name): + # reshape observable to (R, P, I, *) so that it can be multiplied with monte carlo + # samples of coefficients. + O = obs.shape[-1] # number of observables. + assert O == positive_integer + if name.startswith('item_'): + assert obs.shape == (I, O) + obs = obs[relevant_item_index, :] + elif name.startswith('user_'): + assert obs.shape == (U, O) + obs = obs[user_index, :] + elif name.startswith('session_'): + assert obs.shape == (S, O) + obs = obs[session_index, :] + elif name.startswith('price_'): + assert obs.shape == (S, I, O) + obs = obs[session_index, relevant_item_index, :] + elif name.startswith('taste_'): + assert obs.shape == (U, I, O) + obs = obs[user_index, relevant_item_index, :] + else: + raise ValueError + assert obs.shape == (total_computation, O) + return obs.unsqueeze(dim=0).expand(R, -1, -1) + + # ========================================================================================== + # Compute Components related to users and items only. 
+ # ========================================================================================== + utility = torch.zeros(R, total_computation, device=self.device) + + # loop over additive term to utility + for term in self.formula: + # Type I: single coefficient, e.g., lambda_item or lambda_user. + if len(term['coefficient']) == 1 and term['observable'] is None: + # E.g., lambda_item or lambda_user + coef_name = term['coefficient'][0] + coef_sample = reshape_coef_sample( + sample_dict[coef_name], coef_name) + assert coef_sample.shape == (R, total_computation, 1) + additive_term = coef_sample.view(R, total_computation) + + # Type II: factorized coefficient, e.g., . + elif len(term['coefficient']) == 2 and term['observable'] is None: + coef_name_0 = term['coefficient'][0] + coef_name_1 = term['coefficient'][1] + + coef_sample_0 = reshape_coef_sample( + sample_dict[coef_name_0], coef_name_0) + coef_sample_1 = reshape_coef_sample( + sample_dict[coef_name_1], coef_name_1) + + assert coef_sample_0.shape == coef_sample_1.shape == ( + R, total_computation, positive_integer) + + additive_term = (coef_sample_0 * coef_sample_1).sum(dim=-1) + + # Type III: single coefficient multiplied by observable, e.g., theta_user * x_obs_item. + elif len(term['coefficient']) == 1 and term['observable'] is not None: + coef_name = term['coefficient'][0] + coef_sample = reshape_coef_sample( + sample_dict[coef_name], coef_name) + assert coef_sample.shape == ( + R, total_computation, positive_integer) + + obs_name = term['observable'] + obs = reshape_observable(getattr(batch, obs_name), obs_name) + assert obs.shape == (R, total_computation, positive_integer) + + additive_term = (coef_sample * obs).sum(dim=-1) + + # Type IV: factorized coefficient multiplied by observable. + # e.g., gamma_user * beta_item * price_obs. 
+ elif len(term['coefficient']) == 2 and term['observable'] is not None: + coef_name_0, coef_name_1 = term['coefficient'][0], term['coefficient'][1] + coef_sample_0 = reshape_coef_sample( + sample_dict[coef_name_0], coef_name_0) + coef_sample_1 = reshape_coef_sample( + sample_dict[coef_name_1], coef_name_1) + assert coef_sample_0.shape == coef_sample_1.shape == ( + R, total_computation, positive_integer) + num_obs_times_latent_dim = coef_sample_0.shape[-1] + + obs_name = term['observable'] + obs = reshape_observable(getattr(batch, obs_name), obs_name) + assert obs.shape == (R, total_computation, positive_integer) + num_obs = obs.shape[-1] # number of observables. + + assert (num_obs_times_latent_dim % num_obs) == 0 + latent_dim = num_obs_times_latent_dim // num_obs + + coef_sample_0 = coef_sample_0.view( + R, total_computation, num_obs, latent_dim) + coef_sample_1 = coef_sample_1.view( + R, total_computation, num_obs, latent_dim) + # compute the factorized coefficient with shape (R, P, I, O). + coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) + + additive_term = (coef * obs).sum(dim=-1) + + else: + raise ValueError(f'Undefined term type: {term}') + + assert additive_term.shape == (R, total_computation) + utility += additive_term + + # ========================================================================================== + # Mask Out Unavailable Items in Each Session. + # ========================================================================================== + + if batch.item_availability is not None: + # expand to the Monte Carlo sample dimension. 
+ A = batch.item_availability[session_index, relevant_item_index].unsqueeze( + dim=0).expand(R, -1) + utility[~A] = - (torch.finfo(utility.dtype).max / 2) + + for module in self.additional_modules: + # current utility shape: (R, total_computation) + additive_term = module(batch) + assert additive_term.shape == ( + R, len(batch)) or additive_term.shape == (R, len(batch), 1) + if additive_term.shape == (R, len(batch), 1): + # TODO: need to make this consistent with log_likelihood_all. + # be tolerant for some customized module with BayesianLinear that returns (R, len(batch), 1). + additive_term = additive_term.view(R, len(batch)) + # expand to total number of computation, query by reverse_indices. + # reverse_indices has length total_computation, and reverse_indices[i] correspond to the row-id that this + # computation is responsible for. + additive_term = additive_term[:, reverse_indices] + assert additive_term.shape == (R, total_computation) + + if return_logit: + # (num_seeds, len(batch)) + u = utility[:, item_index_expanded == relevant_item_index] + assert u.shape == (R, len(batch)) + return u + + if self.pred_item: + # compute log likelihood log p(choosing item i | user, item latents) + # compute the log probability from logits/utilities. + # output shape: (num_seeds, len(batch), num_items) + log_p = scatter_log_softmax(utility, reverse_indices, dim=-1) + # select the log-P of the item actually bought. 
+ log_p = log_p[:, item_index_expanded == relevant_item_index] + assert log_p.shape == (R, len(batch)) + return log_p + else: + # This is the binomial choice situation in which case we just report sigmoid log likelihood + utility = utility[:, item_index_expanded == relevant_item_index] + assert utility.shape == (R, len(batch)) + bce = nn.BCELoss(reduction='none') + # make num_seeds copies of the label, expand to (R, len(batch)) + label_expanded = batch.label.to(torch.float32).view(1, len(batch)).expand(R, -1) + assert label_expanded.shape == (R, len(batch)) + log_p = - bce(torch.sigmoid(utility), label_expanded) + assert log_p.shape == (R, len(batch)) + return log_p + + def log_prior(self, batch: ChoiceDataset, sample_dict: Dict[str, torch.Tensor]) -> torch.Tensor: + """Calculates the log-likelihood of Monte Carlo samples of Bayesian coefficients under their + prior distribution. This method assume coefficients are statistically independent. + + Args: + batch (ChoiceDataset): a dataset object contains observables for computing the prior distribution + if obs2prior is True. + sample_dict (Dict[str, torch.Tensor]): a dictionary coefficient names to Monte Carlo samples. + + Raises: + ValueError: [description] + + Returns: + torch.scalar_tensor: a tensor with shape (num_seeds,) of [ log P_{prior_distribution}(param[i]) ], + where param[i] is the i-th Monte Carlo sample. 
+ """ + # assert sample_dict.keys() == self.coef_dict.keys() + num_seeds = next(iter(sample_dict.values())).shape[0] + + total = torch.zeros(num_seeds, device=self.device) + + for coef_name, coef in self.coef_dict.items(): + if self.obs2prior_dict[coef_name]: + if coef_name.endswith('_item'): + x_obs = batch.item_obs + elif coef_name.endswith('_user'): + x_obs = batch.user_obs + else: + raise ValueError( + f'No observable found to support obs2prior for {coef_name}.') + + total += coef.log_prior(sample=sample_dict[coef_name], + H_sample=sample_dict[coef_name + '.H'], + x_obs=x_obs).sum(dim=-1) + else: + # log_prob outputs (num_seeds, num_{items, users}), sum to (num_seeds). + total += coef.log_prior( + sample=sample_dict[coef_name], H_sample=None, x_obs=None).sum(dim=-1) + + for module in self.additional_modules: + raise NotImplementedError() + total += module.log_prior() + + return total + + def log_variational(self, sample_dict: Dict[str, torch.Tensor]) -> torch.Tensor: + """Calculate the log-likelihood of samples in sample_dict under the current variational + distribution. + + Args: + sample_dict (Dict[str, torch.Tensor]): a dictionary coefficient names to Monte Carlo + samples. + + Returns: + torch.Tensor: a tensor of shape (num_seeds) of [ log P_{variational_distribution}(param[i]) ], + where param[i] is the i-th Monte Carlo sample. + """ + num_seeds = list(sample_dict.values())[0].shape[0] + total = torch.zeros(num_seeds, device=self.device) + + for coef_name, coef in self.coef_dict.items(): + # log_prob outputs (num_seeds, num_{items, users}), sum to (num_seeds). 
+ total += coef.log_variational(sample_dict[coef_name]).sum(dim=-1) + + for module in self.additional_modules: + raise NotImplementedError() + # with shape (num_seeds,) + total += module.log_variational().sum() + + return total + + def elbo(self, batch: ChoiceDataset, num_seeds: int = 1) -> torch.Tensor: + """A combined method to computes the current ELBO given a batch, this method is used for training the model. + + Args: + batch (ChoiceDataset): a ChoiceDataset containing necessary information. + num_seeds (int, optional): the number of Monte Carlo samples from variational distributions + to evaluate the expectation in ELBO. + Defaults to 1. + + Returns: + torch.Tensor: a scalar tensor of the ELBO estimated from num_seeds Monte Carlo samples. + """ + # ============================================================================================================== + # 1. sample latent variables from their variational distributions. + # ============================================================================================================== + if self.deterministic_variational: + num_seeds = 1 + # Use the means of variational distributions as the sole deterministic MC sample. + # NOTE: here we don't need to sample the obs2prior weight H since we only compute the log-likelihood. + # TODO: is this correct? + sample_dict = dict() + for coef_name, coef in self.coef_dict.items(): + sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze( + dim=0) # (1, num_*, dim) + else: + sample_dict = self.sample_coefficient_dictionary(num_seeds) + + # ============================================================================================================== + # 2. compute log p(latent) prior. + # (num_seeds,) --mean--> scalar. 
+ elbo = self.log_prior(batch, sample_dict).mean(dim=0) + # ============================================================================================================== + + # ============================================================================================================== + # 3. compute the log likelihood log p(obs|latent). + # sum over independent purchase decision for individual observations, mean over MC seeds. + # the forward() function calls module.rsample(num_seeds) for module in self.additional_modules. + # ============================================================================================================== + if self.pred_item: + # the prediction target is item_index. + elbo_expanded = self.forward(batch, + return_type='log_prob', + return_scope='item_index', + deterministic=self.deterministic_variational, + sample_dict=sample_dict) + if self.deterministic_variational: + elbo_expanded = elbo_expanded.unsqueeze(dim=0) + elbo += elbo_expanded.sum(dim=1).mean(dim=0) # (num_seeds, len(batch)) --> scalar. + else: + # the prediction target is binary. + # TODO: update the prediction function. + utility = self.forward(batch, + return_type='utility', + return_scope='item_index', + deterministic=self.deterministic_variational, + sample_dict=sample_dict) # (num_seeds, len(batch)) + + # compute the log-likelihood for binary label. + # (num_seeds, len(batch)) + y_stacked = torch.stack([batch.label] * num_seeds).float() + assert y_stacked.shape == utility.shape + bce = nn.BCELoss(reduction='none') + # scalar. + ll = - bce(torch.sigmoid(utility), + y_stacked).sum(dim=1).mean(dim=0) + elbo += ll + + # ============================================================================================================== + # 4. optionally add log likelihood under variational distributions q(latent). 
+ # ============================================================================================================== + if self.trace_log_q: + assert not self.deterministic_variational, "deterministic_variational is not compatible with trace_log_q." + elbo -= self.log_variational(sample_dict).mean(dim=0) + + return elbo From b57e5c8e4b06df2d2fbc4f2ba93c15b77d7d3222 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Tue, 1 Nov 2022 19:58:35 -0700 Subject: [PATCH 02/19] first cut working bemb chunked TODO 1. check rehshaping code 2. add reshaping for other coeffs 3. fix printing code 4. fix code for log variational (although this probably never needs to be done as we never need to use `trace_log_q`) --- bemb/model/bemb_chunked.py | 292 +++++++++++++++++++++++-------------- 1 file changed, 179 insertions(+), 113 deletions(-) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index c5c9e1b..be40e65 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -26,69 +26,6 @@ from bemb.model.bemb import PositiveInteger, parse_utility positive_integer = PositiveInteger() -''' -class PositiveInteger(object): - # A helper wildcard class for shape matching. - def __eq__(self, other): - return isinstance(other, int) and other > 0 - -def parse_utility(utility_string: str) -> List[Dict[str, Union[List[str], None]]]: - """ - A helper function parse utility string into a list of additive terms. 
- - Example: - utility_string = 'lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs' - output = [ - { - 'coefficient': ['lambda_item'], - 'observable': None - }, - { - 'coefficient': ['theta_user', 'alpha_item'], - 'observable': None - }, - { - 'coefficient': ['gamma_user', 'beta_item'], - 'observable': 'price_obs' - } - ] - """ - # split additive terms - coefficient_suffix = ('_item', '_user', '_constant', '_category') - observable_prefix = ('item_', 'user_', 'session_', 'price_', 'taste_') - - def is_coefficient(name: str) -> bool: - return any(name.endswith(suffix) for suffix in coefficient_suffix) - - def is_observable(name: str) -> bool: - return any(name.startswith(prefix) for prefix in observable_prefix) - - additive_terms = utility_string.split(' + ') - additive_decomposition = list() - for term in additive_terms: - atom = {'coefficient': [], 'observable': None} - # split multiplicative terms. - for x in term.split(' * '): - # Programmers can specify itemsession for price observables, this brings better intuition. - if x.startswith('itemsession_'): - # case 1: special observable name. - atom['observable'] = 'price_' + x[len('itemsession_'):] - elif is_observable(x): - # case 2: normal observable name. - atom['observable'] = x - elif is_coefficient(x): - # case 3: normal coefficient name. - atom['coefficient'].append(x) - else: - # case 4: special coefficient name. - # the _constant coefficient suffix is not intuitive enough, we allow none coefficient suffix for - # coefficient with constant value. For example, `lambda` is the same as `lambda_constant`. - warnings.warn(f'{x} term has no appropriate coefficient suffix or observable prefix, it is assumed to be a coefficient constant across all items, users, and sessions. If this is the desired behavior, you can also specify `{x}_constant` in the utility formula to avoid this warning message. 
The utility parser has replaced {x} term with `{x}_constant`.') - atom['coefficient'].append(f'{x}_constant') - - additive_decomposition.append(atom) - return additive_decomposition -''' # ====================================================================================================================== # core class of the BEMB model. @@ -298,10 +235,10 @@ def __init__(self, self.num_item_chunks = chunk_info[1].max().item() + 1 self.num_category_chunks = chunk_info[2].max().item() + 1 self.num_session_chunks = chunk_info[3].max().item() + 1 - self.register_buffer('user_chunks', chunk_info[0]) - self.register_buffer('item_chunks', chunk_info[1]) - self.register_buffer('category_chunks', chunk_info[2]) - self.register_buffer('session_chunks', chunk_info[3]) + self.register_buffer('user_chunk_ids', chunk_info[0]) + self.register_buffer('item_chunk_ids', chunk_info[1]) + self.register_buffer('category_chunk_ids', chunk_info[2]) + self.register_buffer('session_chunk_ids', chunk_info[3]) # ============================================================================================================== # Create Bayesian Coefficient Objects # ============================================================================================================== @@ -332,6 +269,14 @@ def __init__(self, 'category' : self.num_categories, } + variation_to_num_chunks = { + 'user':(self.num_category_chunks, self.num_session_chunks), + 'item':(self.num_session_chunks, self.num_user_chunks), + 'category':(self.num_session_chunks, self.num_user_chunks), + 'session':(self.num_user_chunks, self.num_category_chunks), + 'constant': (1, 1), + } + coef_dict = dict() for additive_term in self.formula: for coef_name in additive_term['coefficient']: @@ -372,15 +317,26 @@ def __init__(self, if (not self.obs2prior_dict[coef_name]) and (H_zero_mask is not None): raise ValueError(f'You specified H_zero_mask for {coef_name}, but obs2prior is False for this coefficient.') - coef_dict[coef_name] = 
BayesianCoefficient(variation=variation, - num_classes=variation_to_num_classes[variation], - obs2prior=self.obs2prior_dict[coef_name], - num_obs=self.num_obs_dict[variation], - dim=self.coef_dim_dict[coef_name], - prior_mean=mean, - prior_variance=s2, - H_zero_mask=H_zero_mask, - is_H=False) + chunk_sizes = variation_to_num_chunks[variation] + bayesian_coefs = [] * chunk_sizes[0] + for ii in range(chunk_sizes[0]): + bayesian_coefs_inner = [] + for jj in range(chunk_sizes[1]): + bayesian_coefs_inner.append(BayesianCoefficient(variation=variation, + num_classes=variation_to_num_classes[variation], + obs2prior=self.obs2prior_dict[coef_name], + num_obs=self.num_obs_dict[variation], + dim=self.coef_dim_dict[coef_name], + prior_mean=mean, + prior_variance=s2, + H_zero_mask=H_zero_mask, + is_H=False + ) + ) + bayesian_coefs_inner = nn.ModuleList(bayesian_coefs_inner) + bayesian_coefs.append(bayesian_coefs_inner) + coef_dict[coef_name] = nn.ModuleList(bayesian_coefs) + self.coef_dict = nn.ModuleDict(coef_dict) # ============================================================================================================== @@ -437,6 +393,9 @@ def ivs(self, batch) -> torch.Tensor: Returns: torch.Tensor: the combined utility and log probability. """ + # TODO(akanodia): disallow this for now + raise NotImplementedError() + # Use the means of variational distributions as the sole MC sample. 
sample_dict = dict() for coef_name, coef in self.coef_dict.items(): @@ -478,6 +437,8 @@ def sample_log_likelihoods(self, batch:ChoiceDataset, sample_dict: Dict[str, tor Returns: Tuple[torch.Tensor]: sampled log likelihoods; shape: (batch_size, num_categories) """ + # TODO(akanodia): disallow this for now + raise NotImplementedError() # get the log likelihoods for all items for all categories utility = self.log_likelihood_all_items(batch, return_logit=True, sample_dict=sample_dict) mu_gumbel = 0.0 @@ -602,9 +563,19 @@ def forward(self, batch: ChoiceDataset, # NOTE: here we don't need to sample the obs2prior weight H since we only compute the log-likelihood. # TODO: is this correct? sample_dict = dict() - for coef_name, coef in self.coef_dict.items(): - sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze( - dim=0) # (1, num_*, dim) + for coef_name, bayesian_coeffs in self.coef_dict.items(): + num_classes = bayesian_coeffs[0][0].num_classes + dim = bayesian_coeffs[0][0].dim + this_sample = torch.FloatTensor(num_seeds, num_classes, dim, len(bayesian_coeffs), len(bayesian_coeffs[0])).to(self.device) + # outer_list = [] + for ii, bayesian_coeffs_inner in enumerate(bayesian_coeffs): + # inner_list = [] + for jj, coef in enumerate(bayesian_coeffs_inner): + this_sample[:, :, :, ii, jj] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) + # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) + # outer_list.append(inner_list) + sample_dict[coef_name] = this_sample else: if sample_dict is None: # sample stochastic parameters. @@ -627,6 +598,8 @@ def forward(self, batch: ChoiceDataset, return_logit = (return_type == 'utility') if return_scope == 'all_items': # (num_seeds, len(batch), num_items) + # TODO: (akanodia) disallow this for now. 
+ raise NotImplementedError() out = self.log_likelihood_all_items( batch=batch, sample_dict=sample_dict, return_logit=return_logit) elif return_scope == 'item_index': @@ -648,7 +621,7 @@ def num_params(self) -> int: @property def device(self) -> torch.device: for coef in self.coef_dict.values(): - return coef.device + return coef[0][0].device # ================================================================================================================== # helper functions. @@ -665,17 +638,43 @@ def sample_coefficient_dictionary(self, num_seeds: int) -> Dict[str, torch.Tenso Each sample tensor has shape (num_seeds, num_classes, dim). """ sample_dict = dict() - for coef_name, coef in self.coef_dict.items(): - s = coef.rsample(num_seeds) - if coef.obs2prior: - # sample both obs2prior weight and realization of variable. - assert isinstance(s, tuple) and len(s) == 2 - sample_dict[coef_name] = s[0] - sample_dict[coef_name + '.H'] = s[1] - else: - # only sample the realization of variable. - assert torch.is_tensor(s) - sample_dict[coef_name] = s + for coef_name, bayesian_coeffs in self.coef_dict.items(): + # outer_list = [] + num_classes = bayesian_coeffs[0][0].num_classes + dim = bayesian_coeffs[0][0].dim + this_sample = torch.FloatTensor(num_seeds, num_classes, dim, len(bayesian_coeffs), len(bayesian_coeffs[0])).to(self.device) + obs2prior = self.obs2prior_dict[coef_name] + if obs2prior: + num_obs = bayesian_coeffs[0][0].num_obs + this_sample_H = torch.FloatTensor(num_seeds, dim, num_obs, len(bayesian_coeffs), len(bayesian_coeffs[0])).to(self.device) + # outer_list_H = [] + for ii, bayesian_coeffs_inner in enumerate(bayesian_coeffs): + # inner_list = [] + # if obs2prior: + # inner_list_H = [] + for jj, coef in enumerate(bayesian_coeffs_inner): + s = coef.rsample(num_seeds) + if coef.obs2prior: + # sample both obs2prior weight and realization of variable. 
+ assert isinstance(s, tuple) and len(s) == 2 + this_sample[:, :, :, ii, jj] = s[0] + this_sample_H[:, :, :, ii, jj] = s[1] + # inner_list.append(s[0]) + # inner_list_H.append(s[1]) + else: + # only sample the realization of variable. + assert torch.is_tensor(s) + this_sample[:, :, :, ii, jj] = s + # inner_list.append(s) + # outer_list.append(inner_list) + # if obs2prior: + # outer_list_H.append(inner_list_H) + sample_dict[coef_name] = this_sample + # sample_dict[coef_name] = outer_list + if obs2prior: + sample_dict[coef_name + '.H'] = this_sample_H + # sample_dict[coef_name + '.H'] = outer_list_H + return sample_dict @torch.no_grad() @@ -790,6 +789,8 @@ def log_likelihood_all_items(self, batch: ChoiceDataset, return_logit: bool, sam out[x, y, z] is the probability of choosing item z in session y conditioned on latents to be the x-th Monte Carlo sample. """ + # TODO: (akanodia) disallow this for now. + raise NotImplementedError() num_seeds = next(iter(sample_dict.values())).shape[0] # avoid repeated work when user purchased several items in the same session. @@ -1036,7 +1037,7 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa out[x, y] is the probabilities of choosing item batch.item[y] in session y conditioned on latents to be the x-th Monte Carlo sample. """ - num_seeds = next(iter(sample_dict.values())).shape[0] + num_seeds = list(sample_dict.values())[0].shape[0] # get category id of the item bought in each row of batch. 
cate_index = self.item_to_category_tensor[batch.item_index] @@ -1068,6 +1069,12 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa U = self.num_users I = self.num_items NC = self.num_categories + + user_chunk_ids = torch.repeat_interleave(self.user_chunk_ids[batch.user_index], repeats) + item_chunk_ids = torch.repeat_interleave(self.item_chunk_ids[batch.item_index], repeats) + session_chunk_ids = torch.repeat_interleave(self.session_chunk_ids[batch.session_index], repeats) + category_chunk_ids = torch.repeat_interleave(self.category_chunk_ids[cate_index], repeats) + # ========================================================================================== # Helper Functions for Reshaping. # ========================================================================================== @@ -1076,16 +1083,32 @@ def reshape_coef_sample(sample, name): # reshape the monte carlo sample of coefficients to (R, P, I, *). if name.endswith('_user'): # (R, U, *) --> (R, total_computation, *) - return sample[:, user_index, :] + temp = sample[:, user_index, :, :, :] + stemp = session_chunk_ids.reshape(1, -1, 1, 1, 1) + stemp = stemp.repeat(1, 1, temp.shape[2], temp.shape[3], 1) + ctemp = category_chunk_ids.reshape(1, -1, 1, 1) + ctemp = ctemp.repeat(1, 1, temp.shape[2], 1) + gathered1 = torch.gather(temp, 4, stemp).squeeze(4) + gathered2 = torch.gather(gathered1, 3, ctemp).squeeze(3) + return gathered2 + # return sample[:, user_index, :, category_chunk_ids, session_chunk_ids] elif name.endswith('_item'): # (R, I, *) --> (R, total_computation, *) - return sample[:, relevant_item_index, :] + temp = sample[:, relevant_item_index, :, :, :] + utemp = user_chunk_ids.reshape(1, -1, 1, 1, 1) + utemp = utemp.repeat(1, 1, temp.shape[2], temp.shape[3], 1) + stemp = session_chunk_ids.reshape(1, -1, 1, 1) + stemp = stemp.repeat(1, 1, temp.shape[2], 1) + gathered1 = torch.gather(temp, 4, utemp).squeeze(4) + gathered2 = torch.gather(gathered1, 3, stemp).squeeze(3) 
+ return gathered2 + # return sample[:, relevant_item_index, :, session_chunk_ids, user_chunk_ids] elif name.endswith('_category'): # (R, NC, *) --> (R, total_computation, *) - return sample[:, repeat_category_index, :] + return sample[:, repeat_category_index, :, session_chunk_ids, user_chunk_ids] elif name.endswith('_constant'): # (R, *) --> (R, total_computation, *) - return sample.view(R, 1, -1).expand(-1, total_computation, -1) + return sample[:, 0, 0].view(R, 1, -1).expand(-1, total_computation, -1) else: raise ValueError @@ -1263,27 +1286,68 @@ def log_prior(self, batch: ChoiceDataset, sample_dict: Dict[str, torch.Tensor]) where param[i] is the i-th Monte Carlo sample. """ # assert sample_dict.keys() == self.coef_dict.keys() - num_seeds = next(iter(sample_dict.values())).shape[0] + num_seeds = list(sample_dict.values())[0].shape[0] + cate_index = self.item_to_category_tensor[batch.item_index] + user_chunk_ids = self.user_chunk_ids[batch.user_index] + item_chunk_ids = self.item_chunk_ids[batch.item_index] + session_chunk_ids = self.session_chunk_ids[batch.session_index] + category_chunk_ids = self.category_chunk_ids[cate_index] total = torch.zeros(num_seeds, device=self.device) - for coef_name, coef in self.coef_dict.items(): - if self.obs2prior_dict[coef_name]: - if coef_name.endswith('_item'): - x_obs = batch.item_obs - elif coef_name.endswith('_user'): - x_obs = batch.user_obs - else: - raise ValueError( - f'No observable found to support obs2prior for {coef_name}.') - - total += coef.log_prior(sample=sample_dict[coef_name], - H_sample=sample_dict[coef_name + '.H'], - x_obs=x_obs).sum(dim=-1) + def reshape_coef_sample(sample, name): + # reshape the monte carlo sample of coefficients to (R, P, I, *). 
+ if name.endswith('_user'): + # (R, U, *) --> (R, total_computation, *) + temp = sample[:, :, :, :, :] + stemp = session_chunk_ids.reshape(1, -1, 1, 1, 1) + stemp = stemp.repeat(1, 1, temp.shape[2], temp.shape[3], 1) + ctemp = category_chunk_ids.reshape(1, -1, 1, 1) + ctemp = ctemp.repeat(1, 1, temp.shape[2], 1) + gathered1 = torch.gather(temp, 4, stemp).squeeze(4) + gathered2 = torch.gather(gathered1, 3, ctemp).squeeze(3) + return gathered2 + # return sample[:, user_index, :, category_chunk_ids, session_chunk_ids] + elif name.endswith('_item'): + # (R, I, *) --> (R, total_computation, *) + temp = sample[:, :, :, :, :] + utemp = user_chunk_ids.reshape(1, -1, 1, 1, 1) + utemp = utemp.repeat(1, 1, temp.shape[2], temp.shape[3], 1) + stemp = session_chunk_ids.reshape(1, -1, 1, 1) + stemp = stemp.repeat(1, 1, temp.shape[2], 1) + gathered1 = torch.gather(temp, 4, utemp).squeeze(4) + gathered2 = torch.gather(gathered1, 3, stemp).squeeze(3) + return gathered2 + # return sample[:, relevant_item_index, :, session_chunk_ids, user_chunk_ids] + elif name.endswith('_category'): + # (R, NC, *) --> (R, total_computation, *) + return sample[:, repeat_category_index, :, session_chunk_ids, user_chunk_ids] + elif name.endswith('_constant'): + # (R, *) --> (R, total_computation, *) + return sample[:, 0, 0].view(R, 1, -1).expand(-1, total_computation, -1) else: - # log_prob outputs (num_seeds, num_{items, users}), sum to (num_seeds). 
- total += coef.log_prior( - sample=sample_dict[coef_name], H_sample=None, x_obs=None).sum(dim=-1) + raise ValueError + + # for coef_name, coef in self.coef_dict.items(): + for coef_name, bayesian_coeffs in self.coef_dict.items(): + for ii, bayesian_coeffs_inner in enumerate(bayesian_coeffs): + for jj, coef in enumerate(bayesian_coeffs_inner): + if self.obs2prior_dict[coef_name]: + if coef_name.endswith('_item'): + x_obs = batch.item_obs + elif coef_name.endswith('_user'): + x_obs = batch.user_obs + else: + raise ValueError( + f'No observable found to support obs2prior for {coef_name}.') + + total += coef.log_prior(sample=sample_dict[coef_name][:, :, :, ii, jj], + H_sample=sample_dict[coef_name + '.H'][:, :, :, ii, jj], + x_obs=x_obs).sum(dim=-1) + else: + # log_prob outputs (num_seeds, num_{items, users}), sum to (num_seeds). + total += coef.log_prior( + sample=sample_dict[coef_name][:, :, :, ii, jj], H_sample=None, x_obs=None).sum(dim=-1) for module in self.additional_modules: raise NotImplementedError() @@ -1388,6 +1452,8 @@ def elbo(self, batch: ChoiceDataset, num_seeds: int = 1) -> torch.Tensor: # 4. optionally add log likelihood under variational distributions q(latent). # ============================================================================================================== if self.trace_log_q: + #TODO(akanodia): do not allow at this time + raise NotImplementedError() assert not self.deterministic_variational, "deterministic_variational is not compatible with trace_log_q." 
elbo -= self.log_variational(sample_dict).mean(dim=0) From edc2044693b8e4fc905516fae937ac0695135586 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Wed, 2 Nov 2022 02:59:04 -0700 Subject: [PATCH 03/19] clean up reshape code, make it correct for num_seeds --- bemb/model/bemb_chunked.py | 68 ++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index be40e65..8c6e6b9 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -1082,30 +1082,53 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa def reshape_coef_sample(sample, name): # reshape the monte carlo sample of coefficients to (R, P, I, *). if name.endswith('_user'): - # (R, U, *) --> (R, total_computation, *) - temp = sample[:, user_index, :, :, :] - stemp = session_chunk_ids.reshape(1, -1, 1, 1, 1) - stemp = stemp.repeat(1, 1, temp.shape[2], temp.shape[3], 1) - ctemp = category_chunk_ids.reshape(1, -1, 1, 1) - ctemp = ctemp.repeat(1, 1, temp.shape[2], 1) - gathered1 = torch.gather(temp, 4, stemp).squeeze(4) - gathered2 = torch.gather(gathered1, 3, ctemp).squeeze(3) - return gathered2 - # return sample[:, user_index, :, category_chunk_ids, session_chunk_ids] + # (R, total_computation, dim, chunk_size_1, chunk_size_2) + all_chunks_sample = sample[:, user_index, :, :, :] + # (total_computation) --> (1, total_computation, 1, 1, 1) + second_chunk_index = session_chunk_ids.reshape(1, -1, 1, 1, 1) + # (1, total_computation, 1, 1, 1) --> (R, total_computation, dim, chunk_size_1, 1) + second_chunk_index = second_chunk_index.repeat(R, 1, all_chunks_sample.shape[2], all_chunks_sample.shape[3], 1) + # (total_computation) --> (1, total_computation, 1, 1) + first_chunk_index = category_chunk_ids.reshape(1, -1, 1, 1) + # (1, total_computation, 1, 1) --> (R, total_computation, dim, 1) + first_chunk_index = first_chunk_index.repeat(R, 1, all_chunks_sample.shape[2], 1) + # 
select the first chunk. + second_chunk_selected = torch.gather(all_chunks_sample, -1, second_chunk_index).squeeze(-1) + # select the second chunk. + first_chunk_selected = torch.gather(second_chunk_selected, -1, first_chunk_index).squeeze(-1) + return first_chunk_selected elif name.endswith('_item'): - # (R, I, *) --> (R, total_computation, *) - temp = sample[:, relevant_item_index, :, :, :] - utemp = user_chunk_ids.reshape(1, -1, 1, 1, 1) - utemp = utemp.repeat(1, 1, temp.shape[2], temp.shape[3], 1) - stemp = session_chunk_ids.reshape(1, -1, 1, 1) - stemp = stemp.repeat(1, 1, temp.shape[2], 1) - gathered1 = torch.gather(temp, 4, utemp).squeeze(4) - gathered2 = torch.gather(gathered1, 3, stemp).squeeze(3) - return gathered2 - # return sample[:, relevant_item_index, :, session_chunk_ids, user_chunk_ids] + # (R, total_computation, dim, chunk_size_1, chunk_size_2) + all_chunks_sample = sample[:, relevant_item_index, :, :, :] + # (total_computation) --> (1, total_computation, 1, 1, 1) + second_chunk_index = user_chunk_ids.reshape(1, -1, 1, 1, 1) + # (1, total_computation, 1, 1, 1) --> (R, total_computation, dim, chunk_size_1, 1) + second_chunk_index = second_chunk_index.repeat(R, 1, all_chunks_sample.shape[2], all_chunks_sample.shape[3], 1) + # (total_computation) --> (1, total_computation, 1, 1) + first_chunk_index = session_chunk_ids.reshape(1, -1, 1, 1) + # (1, total_computation, 1, 1) --> (R, total_computation, dim, 1) + first_chunk_index = first_chunk_index.repeat(R, 1, all_chunks_sample.shape[2], 1) + # select the first chunk. + second_chunk_selected = torch.gather(all_chunks_sample, -1, second_chunk_index).squeeze(-1) + # select the second chunk. 
+ first_chunk_selected = torch.gather(second_chunk_selected, -1, first_chunk_index).squeeze(-1) + return first_chunk_selected elif name.endswith('_category'): - # (R, NC, *) --> (R, total_computation, *) - return sample[:, repeat_category_index, :, session_chunk_ids, user_chunk_ids] + # (R, total_computation, dim, chunk_size_1, chunk_size_2) + all_chunks_sample = sample[:, repeat_category_index, :, :, :] + # (total_computation) --> (1, total_computation, 1, 1, 1) + second_chunk_index = user_chunk_ids.reshape(1, -1, 1, 1, 1) + # (1, total_computation, 1, 1, 1) --> (R, total_computation, dim, chunk_size_1, 1) + second_chunk_index = second_chunk_index.repeat(R, 1, all_chunks_sample.shape[2], all_chunks_sample.shape[3], 1) + # (total_computation) --> (1, total_computation, 1, 1) + first_chunk_index = session_chunk_ids.reshape(1, -1, 1, 1) + # (1, total_computation, 1, 1) --> (R, total_computation, dim, 1) + first_chunk_index = first_chunk_index.repeat(R, 1, all_chunks_sample.shape[2], 1) + # select the first chunk. + second_chunk_selected = torch.gather(all_chunks_sample, -1, second_chunk_index).squeeze(-1) + # select the second chunk. 
+ first_chunk_selected = torch.gather(second_chunk_selected, -1, first_chunk_index).squeeze(-1) + return first_chunk_selected elif name.endswith('_constant'): # (R, *) --> (R, total_computation, *) return sample[:, 0, 0].view(R, 1, -1).expand(-1, total_computation, -1) @@ -1173,6 +1196,7 @@ def reshape_observable(obs, name): coef_name = term['coefficient'][0] coef_sample = reshape_coef_sample( sample_dict[coef_name], coef_name) + # breakpoint() assert coef_sample.shape == ( R, total_computation, positive_integer) From 8688765c03031dea6da5f9761d134f7ea6ea8dc5 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Sun, 4 Dec 2022 23:56:13 -0800 Subject: [PATCH 04/19] before bemb ivs --- bemb/model/bemb_chunked.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index 8c6e6b9..e8c2cac 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -1,8 +1,11 @@ """ -The core class of the Bayesian EMBedding (BEMB) model. +A chunked version of BEMB. -Author: Tianyu Du -Update: Apr. 28, 2022 +We divide users, items(categories) and sessions into u, i and s chunks. +Then for each user, there are i*s parameters, for each item there are u*s parameters and for each session there are u*i parameters. + +Author: Ayush Kanodia +Update: Dec 04, 2022 """ import warnings from pprint import pprint @@ -25,6 +28,7 @@ # ====================================================================================================================== from bemb.model.bemb import PositiveInteger, parse_utility + positive_integer = PositiveInteger() # ====================================================================================================================== @@ -1372,6 +1376,8 @@ def reshape_coef_sample(sample, name): # log_prob outputs (num_seeds, num_{items, users}), sum to (num_seeds). 
total += coef.log_prior( sample=sample_dict[coef_name][:, :, :, ii, jj], H_sample=None, x_obs=None).sum(dim=-1) + # break + # break for module in self.additional_modules: raise NotImplementedError() @@ -1435,7 +1441,10 @@ def elbo(self, batch: ChoiceDataset, num_seeds: int = 1) -> torch.Tensor: # ============================================================================================================== # 2. compute log p(latent) prior. # (num_seeds,) --mean--> scalar. - elbo = self.log_prior(batch, sample_dict).mean(dim=0) + # with torch.no_grad(): + # while True: + # elbo = self.log_prior(batch, sample_dict).mean(dim=0) + elbo = torch.tensor(0.0, device=self.device) # ============================================================================================================== # ============================================================================================================== From 04583f85e065df494961039aefa12729c39140cf Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Mon, 5 Dec 2022 03:37:08 -0800 Subject: [PATCH 05/19] ivs done; ll_all_items done with call to ll_item_index --- bemb/model/bemb_chunked.py | 158 ++++++++++++++++++++++++------------- 1 file changed, 103 insertions(+), 55 deletions(-) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index e8c2cac..f437fdd 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -397,19 +397,14 @@ def ivs(self, batch) -> torch.Tensor: Returns: torch.Tensor: the combined utility and log probability. """ - # TODO(akanodia): disallow this for now - raise NotImplementedError() - # Use the means of variational distributions as the sole MC sample. - sample_dict = dict() - for coef_name, coef in self.coef_dict.items(): - sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) - + sample_dict = self.sample_coefficient_dictionary(1, deterministic=True) # there is 1 random seed in this case. 
# (num_seeds=1, len(batch), num_items) out = self.log_likelihood_all_items(batch, return_logit=True, sample_dict=sample_dict) out = out.squeeze(0) # import pdb; pdb.set_trace() + out = out.view(-1, self.num_items) ivs = scatter_logsumexp(out, self.item_to_category_tensor, dim=-1) return ivs # (len(batch), num_categories) @@ -562,6 +557,8 @@ def forward(self, batch: ChoiceDataset, # get sample_dict ready. # ============================================================================================================== if deterministic: + sample_dict = self.sample_coefficient_dictionary(num_seeds, deterministic=True) + ''' num_seeds = 1 # Use the means of variational distributions as the sole deterministic MC sample. # NOTE: here we don't need to sample the obs2prior weight H since we only compute the log-likelihood. @@ -580,6 +577,7 @@ def forward(self, batch: ChoiceDataset, # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) # outer_list.append(inner_list) sample_dict[coef_name] = this_sample + ''' else: if sample_dict is None: # sample stochastic parameters. @@ -630,7 +628,7 @@ def device(self) -> torch.device: # ================================================================================================================== # helper functions. # ================================================================================================================== - def sample_coefficient_dictionary(self, num_seeds: int) -> Dict[str, torch.Tensor]: + def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool=False) -> Dict[str, torch.Tensor]: """A helper function to sample parameters from coefficients. Args: @@ -642,42 +640,62 @@ def sample_coefficient_dictionary(self, num_seeds: int) -> Dict[str, torch.Tenso Each sample tensor has shape (num_seeds, num_classes, dim). 
""" sample_dict = dict() - for coef_name, bayesian_coeffs in self.coef_dict.items(): - # outer_list = [] - num_classes = bayesian_coeffs[0][0].num_classes - dim = bayesian_coeffs[0][0].dim - this_sample = torch.FloatTensor(num_seeds, num_classes, dim, len(bayesian_coeffs), len(bayesian_coeffs[0])).to(self.device) - obs2prior = self.obs2prior_dict[coef_name] - if obs2prior: - num_obs = bayesian_coeffs[0][0].num_obs - this_sample_H = torch.FloatTensor(num_seeds, dim, num_obs, len(bayesian_coeffs), len(bayesian_coeffs[0])).to(self.device) - # outer_list_H = [] - for ii, bayesian_coeffs_inner in enumerate(bayesian_coeffs): - # inner_list = [] - # if obs2prior: - # inner_list_H = [] - for jj, coef in enumerate(bayesian_coeffs_inner): - s = coef.rsample(num_seeds) - if coef.obs2prior: - # sample both obs2prior weight and realization of variable. - assert isinstance(s, tuple) and len(s) == 2 - this_sample[:, :, :, ii, jj] = s[0] - this_sample_H[:, :, :, ii, jj] = s[1] - # inner_list.append(s[0]) - # inner_list_H.append(s[1]) - else: - # only sample the realization of variable. - assert torch.is_tensor(s) - this_sample[:, :, :, ii, jj] = s - # inner_list.append(s) - # outer_list.append(inner_list) - # if obs2prior: - # outer_list_H.append(inner_list_H) - sample_dict[coef_name] = this_sample - # sample_dict[coef_name] = outer_list - if obs2prior: - sample_dict[coef_name + '.H'] = this_sample_H - # sample_dict[coef_name + '.H'] = outer_list_H + if deterministic: + num_seeds = 1 + # Use the means of variational distributions as the sole deterministic MC sample. + # NOTE: here we don't need to sample the obs2prior weight H since we only compute the log-likelihood. + # TODO: is this correct? 
+ sample_dict = dict() + for coef_name, bayesian_coeffs in self.coef_dict.items(): + num_classes = bayesian_coeffs[0][0].num_classes + dim = bayesian_coeffs[0][0].dim + this_sample = torch.FloatTensor(num_seeds, num_classes, dim, len(bayesian_coeffs), len(bayesian_coeffs[0])).to(self.device) + # outer_list = [] + for ii, bayesian_coeffs_inner in enumerate(bayesian_coeffs): + # inner_list = [] + for jj, coef in enumerate(bayesian_coeffs_inner): + this_sample[:, :, :, ii, jj] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) + # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) + # outer_list.append(inner_list) + sample_dict[coef_name] = this_sample + else: + for coef_name, bayesian_coeffs in self.coef_dict.items(): + # outer_list = [] + num_classes = bayesian_coeffs[0][0].num_classes + dim = bayesian_coeffs[0][0].dim + this_sample = torch.FloatTensor(num_seeds, num_classes, dim, len(bayesian_coeffs), len(bayesian_coeffs[0])).to(self.device) + obs2prior = self.obs2prior_dict[coef_name] + if obs2prior: + num_obs = bayesian_coeffs[0][0].num_obs + this_sample_H = torch.FloatTensor(num_seeds, dim, num_obs, len(bayesian_coeffs), len(bayesian_coeffs[0])).to(self.device) + # outer_list_H = [] + for ii, bayesian_coeffs_inner in enumerate(bayesian_coeffs): + # inner_list = [] + # if obs2prior: + # inner_list_H = [] + for jj, coef in enumerate(bayesian_coeffs_inner): + s = coef.rsample(num_seeds) + if coef.obs2prior: + # sample both obs2prior weight and realization of variable. + assert isinstance(s, tuple) and len(s) == 2 + this_sample[:, :, :, ii, jj] = s[0] + this_sample_H[:, :, :, ii, jj] = s[1] + # inner_list.append(s[0]) + # inner_list_H.append(s[1]) + else: + # only sample the realization of variable. 
+ assert torch.is_tensor(s) + this_sample[:, :, :, ii, jj] = s + # inner_list.append(s) + # outer_list.append(inner_list) + # if obs2prior: + # outer_list_H.append(inner_list_H) + sample_dict[coef_name] = this_sample + # sample_dict[coef_name] = outer_list + if obs2prior: + sample_dict[coef_name + '.H'] = this_sample_H + # sample_dict[coef_name + '.H'] = outer_list_H return sample_dict @@ -793,9 +811,23 @@ def log_likelihood_all_items(self, batch: ChoiceDataset, return_logit: bool, sam out[x, y, z] is the probability of choosing item z in session y conditioned on latents to be the x-th Monte Carlo sample. """ - # TODO: (akanodia) disallow this for now. - raise NotImplementedError() - num_seeds = next(iter(sample_dict.values())).shape[0] + batch.item_index = torch.arange(self.num_items, device=batch.device) + batch.item_index = batch.item_index.repeat(batch.user_index.shape[0]) + batch.user_index = batch.user_index.repeat_interleave(self.num_items) + batch.session_index = batch.session_index.repeat_interleave(self.num_items) + # batch.user_obs = batch.user_obs.repeat_interleave(self.num_items) + # batch.item_obs = batch.user_obs.repeat_interleave(self.num_items) + # batch.user_obs = batch.user_obs.repeat_interleave(self.num_items) + # user_obs=user_obs, + # item_obs=item_obs, + # price_obs=price_obs, + # session_obs_d=session_obs_d, + # session_obs_s=session_obs_s, + # session_obs_w=session_obs_w, + # session_week_id=session_week_id, + return self.log_likelihood_item_index(batch, return_logit, sample_dict, all_items=True) + # num_seeds = next(iter(sample_dict.values())).shape[0] + num_seeds = list(sample_dict.values())[0].shape[0] # avoid repeated work when user purchased several items in the same session. 
user_session_index = torch.stack( @@ -817,6 +849,10 @@ def log_likelihood_all_items(self, batch: ChoiceDataset, return_logit: bool, sam I = self.num_items NC = self.num_categories + user_chunk_ids = torch.repeat_interleave(self.user_chunk_ids[batch.user_index], repeats) + item_chunk_ids = torch.repeat_interleave(self.item_chunk_ids[batch.item_index], repeats) + session_chunk_ids = torch.repeat_interleave(self.session_chunk_ids[batch.session_index], repeats) + category_chunk_ids = torch.repeat_interleave(self.category_chunk_ids[cate_index], repeats) # ============================================================================================================== # Helper Functions for Reshaping. # ============================================================================================================== @@ -885,6 +921,7 @@ def reshape_observable(obs, name): obs = obs[session_index, :, :] # (P, I, O) return obs.view(1, P, I, O).expand(R, -1, -1, -1) elif name.startswith('taste_'): + raise NotImplementedError assert obs.shape == (U, I, O) obs = obs[user_index, :, :] # (P, I, O) return obs.view(1, P, I, O).expand(R, -1, -1, -1) @@ -1012,7 +1049,7 @@ def reshape_observable(obs, name): assert log_p.shape == (num_seeds, len(batch), self.num_items) return log_p - def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor]) -> torch.Tensor: + def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor], all_items: bool=False) -> torch.Tensor: """ NOTE for developers: This method is more efficient and only computes log-likelihood/logit(utility) for item in item_index[i] for each @@ -1042,9 +1079,18 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa conditioned on latents to be the x-th Monte Carlo sample. 
""" num_seeds = list(sample_dict.values())[0].shape[0] + # if all_items: + # buser_index = batch.user_index.repeat_interleave(self.num_items) + # bsession_index = batch.session_index.repeat_interleave(self.num_items) + # bitem_index = torch.arange(self.num_items, device=batch.device) + # bitem_index = bitem_index.repeat(batch.user_index.shape[0]) + # else: + bitem_index = batch.item_index + buser_index = batch.user_index + bsession_index = batch.session_index # get category id of the item bought in each row of batch. - cate_index = self.item_to_category_tensor[batch.item_index] + cate_index = self.item_to_category_tensor[bitem_index] # get item ids of all items from the same category of each item bought. relevant_item_index = self.category_to_item_tensor[cate_index, :] @@ -1058,12 +1104,14 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa reverse_indices = torch.repeat_interleave( torch.arange(len(batch), device=self.device), repeats) # expand the user_index and session_index. - user_index = torch.repeat_interleave(batch.user_index, repeats) + # if all_items: + # breakpoint() + user_index = torch.repeat_interleave(buser_index, repeats) repeat_category_index = torch.repeat_interleave(cate_index, repeats) - session_index = torch.repeat_interleave(batch.session_index, repeats) + session_index = torch.repeat_interleave(bsession_index, repeats) # duplicate the item focused to match. item_index_expanded = torch.repeat_interleave( - batch.item_index, repeats) + bitem_index, repeats) # short-hands for easier shape check. 
R = num_seeds @@ -1074,9 +1122,9 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa I = self.num_items NC = self.num_categories - user_chunk_ids = torch.repeat_interleave(self.user_chunk_ids[batch.user_index], repeats) - item_chunk_ids = torch.repeat_interleave(self.item_chunk_ids[batch.item_index], repeats) - session_chunk_ids = torch.repeat_interleave(self.session_chunk_ids[batch.session_index], repeats) + user_chunk_ids = torch.repeat_interleave(self.user_chunk_ids[buser_index], repeats) + item_chunk_ids = torch.repeat_interleave(self.item_chunk_ids[bitem_index], repeats) + session_chunk_ids = torch.repeat_interleave(self.session_chunk_ids[bsession_index], repeats) category_chunk_ids = torch.repeat_interleave(self.category_chunk_ids[cate_index], repeats) # ========================================================================================== From c3676e65a8674a669ab1f87208363790105340e2 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Mon, 5 Dec 2022 03:42:29 -0800 Subject: [PATCH 06/19] nit --- bemb/model/bemb_chunked.py | 255 +------------------------------------ 1 file changed, 7 insertions(+), 248 deletions(-) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index f437fdd..68ccb55 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -815,239 +815,7 @@ def log_likelihood_all_items(self, batch: ChoiceDataset, return_logit: bool, sam batch.item_index = batch.item_index.repeat(batch.user_index.shape[0]) batch.user_index = batch.user_index.repeat_interleave(self.num_items) batch.session_index = batch.session_index.repeat_interleave(self.num_items) - # batch.user_obs = batch.user_obs.repeat_interleave(self.num_items) - # batch.item_obs = batch.user_obs.repeat_interleave(self.num_items) - # batch.user_obs = batch.user_obs.repeat_interleave(self.num_items) - # user_obs=user_obs, - # item_obs=item_obs, - # price_obs=price_obs, - # session_obs_d=session_obs_d, - # 
session_obs_s=session_obs_s, - # session_obs_w=session_obs_w, - # session_week_id=session_week_id, return self.log_likelihood_item_index(batch, return_logit, sample_dict, all_items=True) - # num_seeds = next(iter(sample_dict.values())).shape[0] - num_seeds = list(sample_dict.values())[0].shape[0] - - # avoid repeated work when user purchased several items in the same session. - user_session_index = torch.stack( - [batch.user_index, batch.session_index]) - assert user_session_index.shape == (2, len(batch)) - unique_user_sess, inverse_indices = torch.unique( - user_session_index, dim=1, return_inverse=True) - - user_index = unique_user_sess[0, :] - session_index = unique_user_sess[1, :] - assert len(user_index) == len(session_index) - - # short-hands for easier shape check. - R = num_seeds - # P = len(batch) # num_purchases. - P = unique_user_sess.shape[1] - S = self.num_sessions - U = self.num_users - I = self.num_items - NC = self.num_categories - - user_chunk_ids = torch.repeat_interleave(self.user_chunk_ids[batch.user_index], repeats) - item_chunk_ids = torch.repeat_interleave(self.item_chunk_ids[batch.item_index], repeats) - session_chunk_ids = torch.repeat_interleave(self.session_chunk_ids[batch.session_index], repeats) - category_chunk_ids = torch.repeat_interleave(self.category_chunk_ids[cate_index], repeats) - # ============================================================================================================== - # Helper Functions for Reshaping. 
- # ============================================================================================================== - def reshape_user_coef_sample(C): - # input shape (R, U, *) - C = C.view(R, U, 1, -1).expand(-1, -1, I, -1) # (R, U, I, *) - C = C[:, user_index, :, :] - assert C.shape == (R, P, I, positive_integer) - return C - - def reshape_item_coef_sample(C): - # input shape (R, I, *) - C = C.view(R, 1, I, -1).expand(-1, P, -1, -1) - assert C.shape == (R, P, I, positive_integer) - return C - - def reshape_category_coef_sample(C): - # input shape (R, NC, *) - C = torch.repeat_interleave(C, self.category_to_size_tensor, dim=1) - # input shape (R, I, *) - C = C.view(R, 1, I, -1).expand(-1, P, -1, -1) - assert C.shape == (R, P, I, positive_integer) - return C - - def reshape_constant_coef_sample(C): - # input shape (R, *) - C = C.view(R, 1, 1, -1).expand(-1, P, I, -1) - assert C.shape == (R, P, I, positive_integer) - return C - - def reshape_coef_sample(sample, name): - # reshape the monte carlo sample of coefficients to (R, P, I, *). - if name.endswith('_user'): - # (R, U, *) --> (R, P, I, *) - return reshape_user_coef_sample(sample) - elif name.endswith('_item'): - # (R, I, *) --> (R, P, I, *) - return reshape_item_coef_sample(sample) - elif name.endswith('_category'): - # (R, NC, *) --> (R, P, NC, *) - return reshape_category_coef_sample(sample) - elif name.endswith('_constant'): - # (R, *) --> (R, P, I, *) - return reshape_constant_coef_sample(sample) - else: - raise ValueError - - def reshape_observable(obs, name): - # reshape observable to (R, P, I, *) so that it can be multiplied with monte carlo - # samples of coefficients. - O = obs.shape[-1] # number of observables. 
- assert O == positive_integer - if name.startswith('item_'): - assert obs.shape == (I, O) - obs = obs.view(1, 1, I, O).expand(R, P, -1, -1) - elif name.startswith('user_'): - assert obs.shape == (U, O) - obs = obs[user_index, :] # (P, O) - obs = obs.view(1, P, 1, O).expand(R, -1, I, -1) - elif name.startswith('session_'): - assert obs.shape == (S, O) - obs = obs[session_index, :] # (P, O) - return obs.view(1, P, 1, O).expand(R, -1, I, -1) - elif name.startswith('price_'): - assert obs.shape == (S, I, O) - obs = obs[session_index, :, :] # (P, I, O) - return obs.view(1, P, I, O).expand(R, -1, -1, -1) - elif name.startswith('taste_'): - raise NotImplementedError - assert obs.shape == (U, I, O) - obs = obs[user_index, :, :] # (P, I, O) - return obs.view(1, P, I, O).expand(R, -1, -1, -1) - else: - raise ValueError - assert obs.shape == (R, P, I, O) - return obs - - # ============================================================================================================== - # Compute the Utility Term by Term. - # ============================================================================================================== - # P is the number of unique (user, session) pairs. - # (random_seeds, P, num_items). - utility = torch.zeros(R, P, I, device=self.device) - - # loop over additive term to utility - for term in self.formula: - # Type I: single coefficient, e.g., lambda_item or lambda_user. - if len(term['coefficient']) == 1 and term['observable'] is None: - # E.g., lambda_item or lambda_user - coef_name = term['coefficient'][0] - coef_sample = reshape_coef_sample( - sample_dict[coef_name], coef_name) - assert coef_sample.shape == (R, P, I, 1) - additive_term = coef_sample.view(R, P, I) - - # Type II: factorized coefficient, e.g., . 
- elif len(term['coefficient']) == 2 and term['observable'] is None: - coef_name_0 = term['coefficient'][0] - coef_name_1 = term['coefficient'][1] - - coef_sample_0 = reshape_coef_sample( - sample_dict[coef_name_0], coef_name_0) - coef_sample_1 = reshape_coef_sample( - sample_dict[coef_name_1], coef_name_1) - - assert coef_sample_0.shape == coef_sample_1.shape == ( - R, P, I, positive_integer) - - additive_term = (coef_sample_0 * coef_sample_1).sum(dim=-1) - - # Type III: single coefficient multiplied by observable, e.g., theta_user * x_obs_item. - elif len(term['coefficient']) == 1 and term['observable'] is not None: - coef_name = term['coefficient'][0] - coef_sample = reshape_coef_sample( - sample_dict[coef_name], coef_name) - assert coef_sample.shape == (R, P, I, positive_integer) - - obs_name = term['observable'] - obs = reshape_observable(getattr(batch, obs_name), obs_name) - assert obs.shape == (R, P, I, positive_integer) - - additive_term = (coef_sample * obs).sum(dim=-1) - - # Type IV: factorized coefficient multiplied by observable. - # e.g., gamma_user * beta_item * price_obs. - elif len(term['coefficient']) == 2 and term['observable'] is not None: - coef_name_0, coef_name_1 = term['coefficient'][0], term['coefficient'][1] - - coef_sample_0 = reshape_coef_sample( - sample_dict[coef_name_0], coef_name_0) - coef_sample_1 = reshape_coef_sample( - sample_dict[coef_name_1], coef_name_1) - assert coef_sample_0.shape == coef_sample_1.shape == ( - R, P, I, positive_integer) - num_obs_times_latent_dim = coef_sample_0.shape[-1] - - obs_name = term['observable'] - obs = reshape_observable(getattr(batch, obs_name), obs_name) - assert obs.shape == (R, P, I, positive_integer) - num_obs = obs.shape[-1] # number of observables. 
- - assert (num_obs_times_latent_dim % num_obs) == 0 - latent_dim = num_obs_times_latent_dim // num_obs - - coef_sample_0 = coef_sample_0.view( - R, P, I, num_obs, latent_dim) - coef_sample_1 = coef_sample_1.view( - R, P, I, num_obs, latent_dim) - # compute the factorized coefficient with shape (R, P, I, O). - coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) - - additive_term = (coef * obs).sum(dim=-1) - - else: - raise ValueError(f'Undefined term type: {term}') - - assert additive_term.shape == (R, P, I) - utility += additive_term - - # ============================================================================================================== - # Mask Out Unavailable Items in Each Session. - # ============================================================================================================== - - if batch.item_availability is not None: - # expand to the Monte Carlo sample dimension. - # (S, I) -> (P, I) -> (1, P, I) -> (R, P, I) - A = batch.item_availability[session_index, :].unsqueeze( - dim=0).expand(R, -1, -1) - utility[~A] = - (torch.finfo(utility.dtype).max / 2) - - utility = utility[:, inverse_indices, :] - assert utility.shape == (R, len(batch), I) - - for module in self.additional_modules: - additive_term = module(batch) - assert additive_term.shape == (R, len(batch), 1) - utility += additive_term.expand(-1, -1, I) - - if return_logit: - # output shape: (num_seeds, len(batch), num_items) - assert utility.shape == (num_seeds, len(batch), self.num_items) - return utility - else: - # compute log likelihood log p(choosing item i | user, item latents) - # compute log softmax separately within each category. 
- if self.pred_item: - # output shape: (num_seeds, len(batch), num_items) - log_p = scatter_log_softmax(utility, self.item_to_category_tensor, dim=-1) - else: - label_expanded = batch.label.to(torch.float32).view(1, len(batch), 1).expand(num_seeds, -1, self.num_items) - assert label_expanded.shape == (num_seeds, len(batch), self.num_items) - bce = nn.BCELoss(reduction='none') - log_p = - bce(torch.sigmoid(utility), label_expanded) - assert log_p.shape == (num_seeds, len(batch), self.num_items) - return log_p def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor], all_items: bool=False) -> torch.Tensor: """ @@ -1079,18 +847,9 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa conditioned on latents to be the x-th Monte Carlo sample. """ num_seeds = list(sample_dict.values())[0].shape[0] - # if all_items: - # buser_index = batch.user_index.repeat_interleave(self.num_items) - # bsession_index = batch.session_index.repeat_interleave(self.num_items) - # bitem_index = torch.arange(self.num_items, device=batch.device) - # bitem_index = bitem_index.repeat(batch.user_index.shape[0]) - # else: - bitem_index = batch.item_index - buser_index = batch.user_index - bsession_index = batch.session_index # get category id of the item bought in each row of batch. - cate_index = self.item_to_category_tensor[bitem_index] + cate_index = self.item_to_category_tensor[batch.item_index] # get item ids of all items from the same category of each item bought. relevant_item_index = self.category_to_item_tensor[cate_index, :] @@ -1106,12 +865,12 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa # expand the user_index and session_index. 
# if all_items: # breakpoint() - user_index = torch.repeat_interleave(buser_index, repeats) + user_index = torch.repeat_interleave(batch.user_index, repeats) repeat_category_index = torch.repeat_interleave(cate_index, repeats) - session_index = torch.repeat_interleave(bsession_index, repeats) + session_index = torch.repeat_interleave(batch.session_index, repeats) # duplicate the item focused to match. item_index_expanded = torch.repeat_interleave( - bitem_index, repeats) + batch.item_index, repeats) # short-hands for easier shape check. R = num_seeds @@ -1122,9 +881,9 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa I = self.num_items NC = self.num_categories - user_chunk_ids = torch.repeat_interleave(self.user_chunk_ids[buser_index], repeats) - item_chunk_ids = torch.repeat_interleave(self.item_chunk_ids[bitem_index], repeats) - session_chunk_ids = torch.repeat_interleave(self.session_chunk_ids[bsession_index], repeats) + user_chunk_ids = torch.repeat_interleave(self.user_chunk_ids[batch.user_index], repeats) + item_chunk_ids = torch.repeat_interleave(self.item_chunk_ids[batch.item_index], repeats) + session_chunk_ids = torch.repeat_interleave(self.session_chunk_ids[batch.session_index], repeats) category_chunk_ids = torch.repeat_interleave(self.category_chunk_ids[cate_index], repeats) # ========================================================================================== From a50a24598af54725b6435eebf5851e94743ea4d2 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Fri, 28 Apr 2023 05:27:03 -0700 Subject: [PATCH 07/19] turn on log prior, minor bug --- bemb/model/bemb_chunked.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index 68ccb55..06b9396 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -294,7 +294,7 @@ def __init__(self, self.prior_mean[coef_name] = self.prior_mean['default'] else: # warnings.warn(f"You 
provided a dictionary of prior mean, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the prior_mean dictionary to use that as default value (e.g., prior_mean['default'] = 0.1); now using mean=0.0 since this is not supplied.") - self.prior_variance[coef_name] = 0.0 + self.prior_mean[coef_name] = 0.0 mean = self.prior_mean[coef_name] if isinstance( self.prior_mean, dict) else self.prior_mean @@ -1251,7 +1251,8 @@ def elbo(self, batch: ChoiceDataset, num_seeds: int = 1) -> torch.Tensor: # with torch.no_grad(): # while True: # elbo = self.log_prior(batch, sample_dict).mean(dim=0) - elbo = torch.tensor(0.0, device=self.device) + # elbo = torch.tensor(0.0, device=self.device) + elbo = self.log_prior(batch, sample_dict).mean(dim=0) # ============================================================================================================== # ============================================================================================================== From 0431ad49ac41030a30ce648617b9544c6eb3cb9e Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Mon, 31 Jul 2023 21:53:09 -0700 Subject: [PATCH 08/19] debug --- bemb/model/bemb_chunked.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index 06b9396..c2f563c 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -499,7 +499,8 @@ def forward(self, batch: ChoiceDataset, return_scope: str, deterministic: bool = True, sample_dict: Optional[Dict[str, torch.Tensor]] = None, - num_seeds: Optional[int] = None + num_seeds: Optional[int] = None, + debug=False, ) -> torch.Tensor: """A combined method for inference with the model. 
@@ -607,7 +608,7 @@ def forward(self, batch: ChoiceDataset, elif return_scope == 'item_index': # (num_seeds, len(batch)) out = self.log_likelihood_item_index( - batch=batch, sample_dict=sample_dict, return_logit=return_logit) + batch=batch, sample_dict=sample_dict, return_logit=return_logit, debug=debug) if deterministic: # drop the first dimension, which has size of `num_seeds` (equals 1 in the deterministic case). @@ -817,7 +818,7 @@ def log_likelihood_all_items(self, batch: ChoiceDataset, return_logit: bool, sam batch.session_index = batch.session_index.repeat_interleave(self.num_items) return self.log_likelihood_item_index(batch, return_logit, sample_dict, all_items=True) - def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor], all_items: bool=False) -> torch.Tensor: + def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor], all_items: bool=False, debug=False) -> torch.Tensor: """ NOTE for developers: This method is more efficient and only computes log-likelihood/logit(utility) for item in item_index[i] for each @@ -978,6 +979,8 @@ def reshape_observable(obs, name): # loop over additive term to utility for term in self.formula: + if debug: + breakpoint() # Type I: single coefficient, e.g., lambda_item or lambda_user. if len(term['coefficient']) == 1 and term['observable'] is None: # E.g., lambda_item or lambda_user @@ -1087,6 +1090,7 @@ def reshape_observable(obs, name): # compute log likelihood log p(choosing item i | user, item latents) # compute the log probability from logits/utilities. # output shape: (num_seeds, len(batch), num_items) + # breakpoint() log_p = scatter_log_softmax(utility, reverse_indices, dim=-1) # select the log-P of the item actually bought. log_p = log_p[:, item_index_expanded == relevant_item_index] @@ -1234,6 +1238,8 @@ def elbo(self, batch: ChoiceDataset, num_seeds: int = 1) -> torch.Tensor: # 1. 
sample latent variables from their variational distributions. # ============================================================================================================== if self.deterministic_variational: + sample_dict = self.sample_coefficient_dictionary(num_seeds) + ''' num_seeds = 1 # Use the means of variational distributions as the sole deterministic MC sample. # NOTE: here we don't need to sample the obs2prior weight H since we only compute the log-likelihood. @@ -1242,6 +1248,7 @@ def elbo(self, batch: ChoiceDataset, num_seeds: int = 1) -> torch.Tensor: for coef_name, coef in self.coef_dict.items(): sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze( dim=0) # (1, num_*, dim) + ''' else: sample_dict = self.sample_coefficient_dictionary(num_seeds) From 38abd25ec818fdefc8a2ae838f96cc1956cc0a9f Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Thu, 30 Nov 2023 11:08:59 -0800 Subject: [PATCH 09/19] add logit_components, price_coeff --- bemb/model/bemb_chunked.py | 50 ++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index c2f563c..64eb511 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -818,7 +818,7 @@ def log_likelihood_all_items(self, batch: ChoiceDataset, return_logit: bool, sam batch.session_index = batch.session_index.repeat_interleave(self.num_items) return self.log_likelihood_item_index(batch, return_logit, sample_dict, all_items=True) - def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor], all_items: bool=False, debug=False) -> torch.Tensor: + def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sample_dict: Dict[str, torch.Tensor], all_items: bool=False, debug=False, return_logit_components=False, return_price_coeff=False) -> torch.Tensor: """ NOTE for developers: This method is more efficient and only computes 
log-likelihood/logit(utility) for item in item_index[i] for each @@ -841,6 +841,10 @@ def log_likelihood_item_index(self, batch: ChoiceDataset, return_logit: bool, sa The value of sample_dict should be tensors of shape (num_seeds, num_classes, dim) where num_classes in {num_users, num_items, 1} and dim in {latent_dim(K), num_item_obs, num_user_obs, 1}. + all_items: if True, compute the result for every item rather than only the chosen item_index. + debug: development flag; when True, drops into a debugger breakpoint inside the utility-term loop (semantics may change). + return_logit_components: return a tensor of size (len(batch), num_terms) where num_terms is the number of additive terms in self.formula. If this is set to True, return_logit must be set to True as well. + Returns: torch.Tensor: a tensor of shape (num_seeds, len(batch)), where @@ -976,9 +980,20 @@ def reshape_observable(obs, name): # ========================================================================================== # Compute Components related to users and items only. # ========================================================================================== utility = torch.zeros(R, total_computation, device=self.device) + if return_logit_components: + assert R == 1, 'return_logit_components is not supported for R > 1' + assert return_logit, "return_logit_components requires return_logit" + utility_components = torch.zeros(len(self.formula), R, total_computation, device=self.device) + + if return_price_coeff: + # 'price_obs' must appear exactly once in self.utility_formula for this option. + assert R == 1, 'return_price_coeff is not supported for R > 1' + assert self.utility_formula.count('price_obs') == 1, "price_obs needs to be seen in self.formula exactly once for return_price_coeff" + price_coeffs = torch.zeros(R, total_computation, device=self.device) # loop over additive term to utility - for term in self.formula: + for ii, term in enumerate(self.formula): + obs_coeff = None if debug: breakpoint() # Type I: single coefficient, e.g., lambda_item or lambda_user. 
@@ -1018,6 +1033,10 @@ def reshape_observable(obs, name): obs = reshape_observable(getattr(batch, obs_name), obs_name) assert obs.shape == (R, total_computation, positive_integer) + if return_price_coeff and term['observable'] is not None and term['observable'].startswith('price_'): + obs_coeff = coef_sample.sum(dim=-1) + price_coeffs = obs_coeff + additive_term = (coef_sample * obs).sum(dim=-1) # Type IV: factorized coefficient multiplied by observable. @@ -1047,6 +1066,10 @@ def reshape_observable(obs, name): # compute the factorized coefficient with shape (R, P, I, O). coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) + if return_price_coeff and term['observable'] is not None and term['observable'].startswith('price_'): + obs_coeff = coef.sum(dim=-1) + price_coeffs = obs_coeff + additive_term = (coef * obs).sum(dim=-1) else: @@ -1054,6 +1077,8 @@ def reshape_observable(obs, name): assert additive_term.shape == (R, total_computation) utility += additive_term + if return_logit_components: + utility_components[ii] = additive_term # ========================================================================================== # Mask Out Unavailable Items in Each Session. 
@@ -1064,9 +1089,14 @@ def reshape_observable(obs, name): A = batch.item_availability[session_index, relevant_item_index].unsqueeze( dim=0).expand(R, -1) utility[~A] = - (torch.finfo(utility.dtype).max / 2) + if return_logit_components: + utility_components[:, ~A] = - (torch.finfo(utility.dtype).max / 2) + if return_price_coeff: + price_coeffs[~A] = 0 for module in self.additional_modules: # current utility shape: (R, total_computation) + assert False, "additional modules not supported for bemb_chunked" additive_term = module(batch) assert additive_term.shape == ( R, len(batch)) or additive_term.shape == (R, len(batch), 1) @@ -1082,9 +1112,19 @@ def reshape_observable(obs, name): if return_logit: # (num_seeds, len(batch)) - u = utility[:, item_index_expanded == relevant_item_index] - assert u.shape == (R, len(batch)) - return u + if return_logit_components: + u = utility_components[:, :, item_index_expanded == relevant_item_index] + assert u.shape == (len(self.formula), R, len(batch)) + return u + else: + u = utility[:, item_index_expanded == relevant_item_index] + assert u.shape == (R, len(batch)) + if return_price_coeff: + price_coeffs = price_coeffs[ :, item_index_expanded == relevant_item_index].squeeze(dim=0) + assert price_coeffs.shape[0] == len(batch) + return u, price_coeffs + else: + return u if self.pred_item: # compute log likelihood log p(choosing item i | user, item latents) From 09d3e78570abb89258c7b5c33f6b10e212890b90 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Fri, 1 Dec 2023 18:27:44 -0800 Subject: [PATCH 10/19] first cut gamma coefficient - working, results poor --- bemb/model/bayesian_coefficient.py | 116 ++++++++++++++++++++++++++--- bemb/model/bemb.py | 30 +++++++- bemb/model/bemb_flex_lightning.py | 8 ++ tutorials/supermarket/configs.yaml | 26 ++++--- tutorials/supermarket/main.py | 7 ++ 5 files changed, 166 insertions(+), 21 deletions(-) diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py index 
790c4fe..7763764 100644 --- a/bemb/model/bayesian_coefficient.py +++ b/bemb/model/bayesian_coefficient.py @@ -9,6 +9,7 @@ import torch import torch.nn as nn from torch.distributions.lowrank_multivariate_normal import LowRankMultivariateNormal +from torch.distributions.gamma import Gamma class BayesianCoefficient(nn.Module): @@ -21,7 +22,8 @@ def __init__(self, num_obs: Optional[int] = None, dim: int = 1, prior_mean: float = 0.0, - prior_variance: Union[float, torch.Tensor] = 1.0 + prior_variance: Union[float, torch.Tensor] = 1.0, + distribution: str = 'gaussian' ) -> None: """The Bayesian coefficient object represents a learnable tensor mu_i in R^k, where i is from a family (e.g., user, item) so there are num_classes * num_obs learnable weights in total. @@ -63,12 +65,27 @@ def __init__(self, If a tensor with shape (num_classes, dim) is supplied, supplying a (num_classes, dim) tensor is amount to specifying a different prior variance for each entry in the coefficient. Defaults to 1.0. + distribution (str, optional): the distribution of the coefficient. Currently we support 'gaussian' and 'gamma'. + Defaults to 'gaussian'. """ super(BayesianCoefficient, self).__init__() # do we use this at all? TODO: drop self.variation. assert variation in ['item', 'user', 'constant', 'category'] self.variation = variation + + assert distribution in ['gaussian', 'gamma'], f'Unsupported distribution {distribution}' + if distribution == 'gamma': + assert not obs2prior, 'Gamma distribution is not supported for obs2prior at present.' 
+ mean = 1 + variance = 10 + assert mean > 0, 'Gamma distribution requires mean > 0' + assert variance > 0, 'Gamma distribution requires variance > 0' + prior_mean = mean**2 / variance + prior_variance = mean / variance + + self.distribution = distribution + self.obs2prior = obs2prior if variation == 'constant' or variation == 'category': if obs2prior: @@ -95,7 +112,8 @@ def __init__(self, dim=num_obs, prior_variance=1.0, H_zero_mask=self.H_zero_mask, - is_H=True) # this is a distribution responsible for the obs2prior H term. + is_H=True, + distribution=self.distribution) # this is a distribution responsible for the obs2prior H term. else: self.register_buffer( @@ -117,6 +135,14 @@ def __init__(self, self.variational_mean_flexible = nn.Parameter( torch.randn(num_classes, dim), requires_grad=True) + # TOOD(kanodiaayush): initialize the gamma distribution variational mean in a more principled way. + ''' + if self.distribution == 'gamma': + # take absolute value of the variational mean. + self.variational_mean_flexible.data = torch.abs( + self.variational_mean_flexible.data) + ''' + if self.is_H and self.H_zero_mask is not None: assert self.H_zero_mask.shape == self.variational_mean_flexible.shape, \ f"The H_zero_mask should have exactly the shape as the H variable, `H_zero_mask`.shape is {self.H_zero_mask.shape}, `H`.shape is {self.variational_mean_flexible.shape} " @@ -163,6 +189,9 @@ def variational_mean(self) -> torch.Tensor: else: M = self.variational_mean_fixed + self.variational_mean_flexible + if self.distribution == 'gamma': + M = torch.pow(M, 2) + 0.000001 + if self.is_H and (self.H_zero_mask is not None): # a H-variable with zero-entry restriction. # multiply zeros to entries with H_zero_mask[i, j] = 1. @@ -196,7 +225,11 @@ def log_prior(self, Returns: torch.Tensor: the log prior of the variable with shape (num_seeds, num_classes). 
""" - # p(sample) + # DEBUG_MARKER + ''' + print(sample) + print('log_prior') + ''' num_seeds, num_classes, dim = sample.shape # shape (num_seeds, num_classes) if self.obs2prior: @@ -211,9 +244,46 @@ def log_prior(self, else: mu = self.prior_zero_mean - out = LowRankMultivariateNormal(loc=mu, - cov_factor=self.prior_cov_factor, - cov_diag=self.prior_cov_diag).log_prob(sample) + + if self.distribution == 'gaussian': + # DEBUG_MARKER + ''' + print('sample.shape', sample.shape) + print('gaussian') + print("mu.shape, self.prior_cov_diag.shape") + print(mu.shape, self.prior_cov_diag.shape) + ''' + out = LowRankMultivariateNormal(loc=mu, + cov_factor=self.prior_cov_factor, + cov_diag=self.prior_cov_diag).log_prob(sample) + elif self.distribution == 'gamma': + concentration = torch.pow(mu, 2)/self.prior_cov_diag + rate = mu/self.prior_cov_diag + # DEBUG_MARKER + ''' + print('sample.shape', sample.shape) + print('gamma') + print("mu.shape, self.prior_cov_diag.shape") + print(mu.shape, self.prior_cov_diag.shape) + print("concentration.shape, rate.shape") + print(concentration.shape, rate.shape) + ''' + out = Gamma(concentration=concentration, + rate=rate).log_prob(sample) + # drop the last dim, take the first element over the last dim + out = out[:, :, 0] + + + # DEBUG_MARKER + ''' + print("sample.shape") + print(sample.shape) + print("out.shape") + print(out.shape) + print("num_seeds, num_classes") + print(num_seeds, num_classes) + breakpoint() + ''' assert out.shape == (num_seeds, num_classes) return out @@ -250,6 +320,15 @@ def rsample(self, num_seeds: int = 1) -> Union[torch.Tensor, Tuple[torch.Tensor] """ value_sample = self.variational_distribution.rsample( torch.Size([num_seeds])) + # DEBUG_MARKER + ''' + print("rsample") + print(self.distribution) + print("value_sample.shape") + print(value_sample.shape) + breakpoint() + ''' + # DEBUG_MARKER if self.obs2prior: # sample obs2prior H as well. 
H_sample = self.prior_H.rsample(num_seeds=num_seeds) @@ -258,12 +337,29 @@ def rsample(self, num_seeds: int = 1) -> Union[torch.Tensor, Tuple[torch.Tensor] return value_sample @property - def variational_distribution(self) -> LowRankMultivariateNormal: + def variational_distribution(self) -> Union[LowRankMultivariateNormal, Gamma]: """Constructs the current variational distribution of the coefficient from current variational mean and covariance. """ - return LowRankMultivariateNormal(loc=self.variational_mean, - cov_factor=self.variational_cov_factor, - cov_diag=torch.exp(self.variational_logstd)) + if self.distribution == 'gaussian': + return LowRankMultivariateNormal(loc=self.variational_mean, + cov_factor=self.variational_cov_factor, + cov_diag=torch.exp(self.variational_logstd)) + elif self.distribution == 'gamma': + # concentration is mean**2 / var (std**2) + concentration = torch.pow(self.variational_mean, 2)/torch.pow(torch.exp(self.variational_logstd), 2) + # rate is mean / var (std**2) + rate = self.variational_mean/torch.pow(torch.exp(self.variational_logstd), 2) + # DEBUG_MARKER + ''' + print("self.variational_mean, self.variational_logstd") + print(self.variational_mean, self.variational_logstd) + print("concentration, rate") + print(concentration, rate) + ''' + # DEBUG_MARKER + return Gamma(concentration=concentration, rate=rate) + else: + raise NotImplementedError("Unknown variational distribution type.") @property def device(self) -> torch.device: diff --git a/bemb/model/bemb.py b/bemb/model/bemb.py index be704db..bcf0cc4 100644 --- a/bemb/model/bemb.py +++ b/bemb/model/bemb.py @@ -113,6 +113,7 @@ def __init__(self, num_items: int, pred_item: bool, num_classes: int = 2, + coef_dist_dict: Dict[str, str] = {'default' : 'gaussian'}, H_zero_mask_dict: Optional[Dict[str, torch.BoolTensor]] = None, prior_mean: Union[float, Dict[str, float]] = 0.0, prior_variance: Union[float, Dict[str, float]] = 1.0, @@ -140,6 +141,13 @@ def __init__(self, lambda_item + 
theta_user * alpha_item + gamma_user * beta_item * price_obs See the doc-string of parse_utility for an example. + coef_dist_dict (Dict[str, str]): a dictionary mapping coefficient name to coefficient distribution name. + The coefficient distribution name can be one of the following: + 1. 'gaussian' + 2. 'gamma' - obs2prior is not supported for gamma coefficients + If a coefficient does not appear in the dictionary, it will be assigned the distribution specified + by the 'default' key. By default, the default distribution is 'gaussian'. + obs2prior_dict (Dict[str, bool]): a dictionary maps coefficient name (e.g., 'lambda_item') to a boolean indicating if observable (e.g., item_obs) enters the prior of the coefficient. @@ -233,6 +241,7 @@ def __init__(self, self.utility_formula = utility_formula self.obs2prior_dict = obs2prior_dict self.coef_dim_dict = coef_dim_dict + self.coef_dist_dict = coef_dist_dict if H_zero_mask_dict is not None: self.H_zero_mask_dict = H_zero_mask_dict else: @@ -348,6 +357,13 @@ def __init__(self, warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the prior_variance dictionary to use that as default value (e.g., prior_variance['default'] = 0.3); now using variance=1.0 since this is not supplied.") self.prior_variance[coef_name] = 1.0 + if coef_name not in self.coef_dist_dict.keys(): + if 'default' in self.coef_dist_dict.keys(): + self.coef_dist_dict[coef_name] = self.coef_dist_dict['default'] + else: + warnings.warn(f"You provided a dictionary of coef_dist_dict, but coefficient {coef_name} is not a key in it. 
Supply a value for 'default' in the coef_dist_dict dictionary to use that as default value (e.g., coef_dist_dict['default'] = 'gaussian'); now using distribution='gaussian' since this is not supplied.") + self.coef_dist_dict[coef_name] = 'gaussian' + s2 = self.prior_variance[coef_name] if isinstance( self.prior_variance, dict) else self.prior_variance @@ -367,7 +383,8 @@ def __init__(self, prior_mean=mean, prior_variance=s2, H_zero_mask=H_zero_mask, - is_H=False) + is_H=False, + distribution=self.coef_dist_dict[coef_name]) self.coef_dict = nn.ModuleDict(coef_dict) # ============================================================================================================== @@ -653,6 +670,9 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool = Fa """ sample_dict = dict() for coef_name, coef in self.coef_dict.items(): + ''' + print(coef_name) + ''' if deterministic: sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) if coef.obs2prior: @@ -935,6 +955,8 @@ def reshape_observable(obs, name): assert obs.shape == (R, P, I, positive_integer) additive_term = (coef_sample * obs).sum(dim=-1) + if obs_name == 'price_obs': + additive_term *= -1.0 # Type IV: factorized coefficient multiplied by observable. # e.g., gamma_user * beta_item * price_obs. @@ -965,6 +987,8 @@ def reshape_observable(obs, name): coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) additive_term = (coef * obs).sum(dim=-1) + if obs_name == 'price_obs': + additive_term *= -1.0 else: raise ValueError(f'Undefined term type: {term}') @@ -1167,6 +1191,8 @@ def reshape_observable(obs, name): assert obs.shape == (R, total_computation, positive_integer) additive_term = (coef_sample * obs).sum(dim=-1) + if obs_name == 'price_obs': + additive_term *= -1.0 # Type IV: factorized coefficient multiplied by observable. # e.g., gamma_user * beta_item * price_obs. 
@@ -1196,6 +1222,8 @@ def reshape_observable(obs, name): coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) additive_term = (coef * obs).sum(dim=-1) + if obs_name == 'price_obs': + additive_term *= -1.0 else: raise ValueError(f'Undefined term type: {term}') diff --git a/bemb/model/bemb_flex_lightning.py b/bemb/model/bemb_flex_lightning.py index 7873aa5..0c7dbee 100644 --- a/bemb/model/bemb_flex_lightning.py +++ b/bemb/model/bemb_flex_lightning.py @@ -79,6 +79,14 @@ def training_step(self, batch, batch_idx): loss = - elbo return loss + # DEBUG_MARKER + ''' + def on_train_batch_end(self, outputs, batch, batch_idx, dataloader_idx): + print(f"Epoch {self.current_epoch} has ended") + breakpoint() + ''' + # DEBUG_MARKER + def _get_performance_dict(self, batch): if self.model.pred_item: log_p = self.model(batch, return_type='log_prob', diff --git a/tutorials/supermarket/configs.yaml b/tutorials/supermarket/configs.yaml index ce6b672..7ecc4f7 100644 --- a/tutorials/supermarket/configs.yaml +++ b/tutorials/supermarket/configs.yaml @@ -1,30 +1,36 @@ device: cuda # data_dir: /home/tianyudu/Data/MoreSupermarket/tsv/ -data_dir: /home/tianyudu/Data/MoreSupermarket/20180101-20191231_13/tsv/ +# data_dir: /home/tianyudu/Data/MoreSupermarket/20180101-20191231_13/tsv/ +data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/1904/20180101-20191231_44/tsv # utility: lambda_item # utility: lambda_item + theta_user * alpha_item # utility: lambda_item + theta_user * alpha_item + zeta_user * item_obs -utility: lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs +# utility: lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs +utility: lambda_item + theta_user * alpha_item + gamma_user * price_obs +# utility: alpha_item * gamma_user * price_obs out_dir: ./output/ # model configuration. 
+coef_dist_dict: + default: 'gaussian' + gamma_user: 'gamma' obs2prior_dict: - lambda_item: True - theta_user: True - alpha_item: True + lambda_item: False + theta_user: False + alpha_item: False zeta_user: True lota_item: True - gamma_user: True + gamma_user: False beta_item: True coef_dim_dict: lambda_item: 1 theta_user: 10 alpha_item: 10 - gamma_user: 10 - beta_item: 10 + gamma_user: 1 + beta_item: 1 #### optimization. trace_log_q: False shuffle: False batch_size: 100000 -num_epochs: 3 +num_epochs: 100 learning_rate: 0.03 -num_mc_seeds: 1 +num_mc_seeds: 2 diff --git a/tutorials/supermarket/main.py b/tutorials/supermarket/main.py index 7bbb1e4..5cd5479 100644 --- a/tutorials/supermarket/main.py +++ b/tutorials/supermarket/main.py @@ -197,6 +197,7 @@ def load_tsv(file_name, data_dir): bemb = LitBEMBFlex( # trainings args. pred_item = configs.pred_item, + coef_dist_dict=configs.coef_dist_dict, learning_rate=configs.learning_rate, num_seeds=configs.num_mc_seeds, # model args, will be passed to BEMB constructor. 
@@ -217,6 +218,12 @@ def load_tsv(file_name, data_dir): bemb = bemb.to(configs.device) bemb = run(bemb, dataset_list, batch_size=configs.batch_size, num_epochs=configs.num_epochs) + coeffs = bemb.model.coef_dict['gamma_user'].variational_mean.detach().cpu().numpy() + coeffs = coeffs**2 + # give distribution statistics + print('Coefficients statistics:') + print(pd.DataFrame(coeffs).describe()) + # ============================================================================================== # inference example # ============================================================================================== From f6e757020039f3c8762a0d7b5ee40ce4d1195e02 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Sun, 3 Dec 2023 23:40:03 -0800 Subject: [PATCH 11/19] sum over the gamma distribution in the log_prior over last dim --- bemb/model/bayesian_coefficient.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py index 7763764..cb5cbf3 100644 --- a/bemb/model/bayesian_coefficient.py +++ b/bemb/model/bayesian_coefficient.py @@ -270,8 +270,8 @@ def log_prior(self, ''' out = Gamma(concentration=concentration, rate=rate).log_prob(sample) - # drop the last dim, take the first element over the last dim - out = out[:, :, 0] + # sum over the last dimension + out = torch.sum(out, dim=-1) # DEBUG_MARKER From 51ba2f1bf9a531040f270c6fe9694108bb38f607 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Mon, 4 Dec 2023 21:25:36 -0800 Subject: [PATCH 12/19] update main files --- bemb/utils/run_helper.py | 8 ++-- tutorials/supermarket/main.py | 79 +++++++++++++++++++++-------------- 2 files changed, 52 insertions(+), 35 deletions(-) diff --git a/bemb/utils/run_helper.py b/bemb/utils/run_helper.py index 410745d..0d5637a 100644 --- a/bemb/utils/run_helper.py +++ b/bemb/utils/run_helper.py @@ -17,7 +17,7 @@ def section_print(input_text): print('=' * 20, input_text, '=' * 20) -def run(model: LitBEMBFlex, 
dataset_list: List[ChoiceDataset], batch_size: int=-1, num_epochs: int=10, num_workers: int=8, **kwargs) -> LitBEMBFlex: +def run(model: LitBEMBFlex, dataset_list: List[ChoiceDataset], batch_size: int=-1, num_epochs: int=10, num_workers: int=8, run_test=True, **kwargs) -> LitBEMBFlex: """A standard pipeline of model training and evaluation. Args: @@ -25,6 +25,7 @@ def run(model: LitBEMBFlex, dataset_list: List[ChoiceDataset], batch_size: int=- dataset_list (List[ChoiceDataset]): train_dataset, validation_test, and test_dataset in a list of length 3. batch_size (int, optional): batch_size for training and evaluation. Defaults to -1, which indicates full-batch training. num_epochs (int, optional): number of epochs for training. Defaults to 10. + run_test (bool, optional): whether to run evaluation on test set. Defaults to True. **kwargs: additional keyword argument for the pytorch-lightning Trainer. Returns: @@ -57,6 +58,7 @@ def run(model: LitBEMBFlex, dataset_list: List[ChoiceDataset], batch_size: int=- trainer.fit(model, train_dataloaders=train, val_dataloaders=validation) print(f'time taken: {time.time() - start_time}') - section_print('test performance') - trainer.test(model, dataloaders=test) + if run_test: + section_print('test performance') + trainer.test(model, dataloaders=test) return model diff --git a/tutorials/supermarket/main.py b/tutorials/supermarket/main.py index 5cd5479..4982014 100644 --- a/tutorials/supermarket/main.py +++ b/tutorials/supermarket/main.py @@ -10,7 +10,8 @@ from termcolor import cprint from example_customized_module import ExampleCustomizedModule from torch_choice.data import ChoiceDataset -from bemb.model import LitBEMBFlex +# from bemb.model import LitBEMBFlex +from bemb.model.bemb_supermarket_lightning import LitBEMBFlex from bemb.utils.run_helper import run @@ -70,28 +71,30 @@ def load_tsv(file_name, data_dir): # ============================================================================================== # user observables # 
============================================================================================== - user_obs = pd.read_csv(os.path.join(configs.data_dir, 'obsUser.tsv'), - sep='\t', - index_col=0, - header=None) - # TODO(Tianyu): there could be duplicate information for each user. - # do we need to catch it in some check process? - user_obs = user_obs.groupby(user_obs.index).first().sort_index() - user_obs = torch.Tensor(user_obs.values) - configs.num_user_obs = user_obs.shape[1] - configs.coef_dim_dict['obsuser_item'] = configs.num_user_obs + if configs.obs_user: + user_obs = pd.read_csv(os.path.join(configs.data_dir, 'obsUser.tsv'), + sep='\t', + index_col=0, + header=None) + # TODO(Tianyu): there could be duplicate information for each user. + # do we need to catch it in some check process? + user_obs = user_obs.groupby(user_obs.index).first().sort_index() + user_obs = torch.Tensor(user_obs.values) + configs.num_user_obs = user_obs.shape[1] + configs.coef_dim_dict['obsuser_item'] = configs.num_user_obs # ============================================================================================== # item observables # ============================================================================================== - item_obs = pd.read_csv(os.path.join(configs.data_dir, 'obsItem.tsv'), - sep='\t', - index_col=0, - header=None) - item_obs = item_obs.groupby(item_obs.index).first().sort_index() - item_obs = torch.Tensor(item_obs.values) - configs.num_item_obs = item_obs.shape[1] - configs.coef_dim_dict['obsitem_user'] = configs.num_item_obs + if configs.obs_item: + item_obs = pd.read_csv(os.path.join(configs.data_dir, 'obsItem.tsv'), + sep='\t', + index_col=0, + header=None) + item_obs = item_obs.groupby(item_obs.index).first().sort_index() + item_obs = torch.Tensor(item_obs.values) + configs.num_item_obs = item_obs.shape[1] + configs.coef_dim_dict['obsitem_user'] = configs.num_item_obs # 
============================================================================================== # item availability @@ -152,14 +155,22 @@ def load_tsv(file_name, data_dir): # example day of week, random example. session_day_of_week = torch.LongTensor(np.random.randint(0, 7, configs.num_sessions)) - choice_dataset = ChoiceDataset(item_index=label, - user_index=user_index, - session_index=session_index, - item_availability=item_availability, - user_obs=user_obs, - item_obs=item_obs, - price_obs=price_obs, - session_day_of_week=session_day_of_week) + choice_dataset_args = { + "item_index": label, + "user_index": user_index, + "session_index": session_index, + "item_availability": item_availability, + "price_obs": price_obs, + "session_day_of_week": session_day_of_week + } + + if configs.obs_user: + choice_dataset_args["user_obs"] = user_obs + + if configs.obs_item: + choice_dataset_args["item_obs"] = item_obs + + choice_dataset = ChoiceDataset(**choice_dataset_args) dataset_list.append(choice_dataset) @@ -209,20 +220,24 @@ def load_tsv(file_name, data_dir): coef_dim_dict=configs.coef_dim_dict, trace_log_q=configs.trace_log_q, category_to_item=category_to_item, - num_user_obs=configs.num_user_obs, - num_item_obs=configs.num_item_obs, - # num_price_obs=configs.num_price_obs, + num_user_obs=configs.num_user_obs if configs.obs_user else None, + num_item_obs=configs.num_item_obs if configs.obs_item else None, + prior_variance=configs.prior_variance, + num_price_obs=configs.num_price_obs, + preprocess=False, # additional_modules=[ExampleCustomizedModule()] ) bemb = bemb.to(configs.device) - bemb = run(bemb, dataset_list, batch_size=configs.batch_size, num_epochs=configs.num_epochs) + bemb = run(bemb, dataset_list, batch_size=configs.batch_size, num_epochs=configs.num_epochs, run_test=False) + # ''' coeffs = bemb.model.coef_dict['gamma_user'].variational_mean.detach().cpu().numpy() - coeffs = coeffs**2 + # coeffs = coeffs**2 # give distribution statistics print('Coefficients 
statistics:') print(pd.DataFrame(coeffs).describe()) + # ''' # ============================================================================================== # inference example From 4b47bc2bb547a1d43958245d3805a657914abf5d Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Tue, 5 Dec 2023 17:38:41 -0800 Subject: [PATCH 13/19] gamma coeff implemented; unclean right now --- bemb/model/bayesian_coefficient.py | 87 +++++++++--------------------- bemb/model/bemb.py | 33 ++++++++---- tutorials/supermarket/main.py | 11 ++-- 3 files changed, 54 insertions(+), 77 deletions(-) diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py index cb5cbf3..5da279a 100644 --- a/bemb/model/bayesian_coefficient.py +++ b/bemb/model/bayesian_coefficient.py @@ -77,12 +77,14 @@ def __init__(self, assert distribution in ['gaussian', 'gamma'], f'Unsupported distribution {distribution}' if distribution == 'gamma': assert not obs2prior, 'Gamma distribution is not supported for obs2prior at present.' - mean = 1 - variance = 10 + mean = 1.0 + variance = 10.0 assert mean > 0, 'Gamma distribution requires mean > 0' assert variance > 0, 'Gamma distribution requires variance > 0' - prior_mean = mean**2 / variance - prior_variance = mean / variance + shape = mean ** 2 / variance + rate = mean / variance + prior_mean = shape + prior_variance = rate self.distribution = distribution @@ -132,21 +134,21 @@ def __init__(self, num_classes, dim) * self.prior_variance) # create variational distribution. - self.variational_mean_flexible = nn.Parameter( - torch.randn(num_classes, dim), requires_grad=True) - + if self.distribution == 'gaussian': + self.variational_mean_flexible = nn.Parameter( + torch.randn(num_classes, dim), requires_grad=True) # TOOD(kanodiaayush): initialize the gamma distribution variational mean in a more principled way. - ''' - if self.distribution == 'gamma': - # take absolute value of the variational mean. 
- self.variational_mean_flexible.data = torch.abs( - self.variational_mean_flexible.data) - ''' + elif self.distribution == 'gamma': + # initialize using uniform distribution between 0.5 and 1.5 + # for a gamma distribution, we store the concentration as log(concentration) = variational_mean_flexible + self.variational_mean_flexible = nn.Parameter( + torch.rand(num_classes, dim) + 0.5, requires_grad=True) if self.is_H and self.H_zero_mask is not None: assert self.H_zero_mask.shape == self.variational_mean_flexible.shape, \ f"The H_zero_mask should have exactly the shape as the H variable, `H_zero_mask`.shape is {self.H_zero_mask.shape}, `H`.shape is {self.variational_mean_flexible.shape} " + # for gamma distribution, we store the rate as log(rate) = variational_logstd self.variational_logstd = nn.Parameter( torch.randn(num_classes, dim), requires_grad=True) @@ -190,7 +192,8 @@ def variational_mean(self) -> torch.Tensor: M = self.variational_mean_fixed + self.variational_mean_flexible if self.distribution == 'gamma': - M = torch.pow(M, 2) + 0.000001 + # M = torch.pow(M, 2) + 0.000001 + M = M.exp() / self.variational_logstd.exp() if self.is_H and (self.H_zero_mask is not None): # a H-variable with zero-entry restriction. 
@@ -246,44 +249,17 @@ def log_prior(self, mu = self.prior_zero_mean if self.distribution == 'gaussian': - # DEBUG_MARKER - ''' - print('sample.shape', sample.shape) - print('gaussian') - print("mu.shape, self.prior_cov_diag.shape") - print(mu.shape, self.prior_cov_diag.shape) - ''' out = LowRankMultivariateNormal(loc=mu, cov_factor=self.prior_cov_factor, cov_diag=self.prior_cov_diag).log_prob(sample) elif self.distribution == 'gamma': - concentration = torch.pow(mu, 2)/self.prior_cov_diag - rate = mu/self.prior_cov_diag - # DEBUG_MARKER - ''' - print('sample.shape', sample.shape) - print('gamma') - print("mu.shape, self.prior_cov_diag.shape") - print(mu.shape, self.prior_cov_diag.shape) - print("concentration.shape, rate.shape") - print(concentration.shape, rate.shape) - ''' + concentration = mu + rate = self.prior_variance out = Gamma(concentration=concentration, rate=rate).log_prob(sample) # sum over the last dimension out = torch.sum(out, dim=-1) - - # DEBUG_MARKER - ''' - print("sample.shape") - print(sample.shape) - print("out.shape") - print(out.shape) - print("num_seeds, num_classes") - print(num_seeds, num_classes) - breakpoint() - ''' assert out.shape == (num_seeds, num_classes) return out @@ -321,14 +297,6 @@ def rsample(self, num_seeds: int = 1) -> Union[torch.Tensor, Tuple[torch.Tensor] value_sample = self.variational_distribution.rsample( torch.Size([num_seeds])) # DEBUG_MARKER - ''' - print("rsample") - print(self.distribution) - print("value_sample.shape") - print(value_sample.shape) - breakpoint() - ''' - # DEBUG_MARKER if self.obs2prior: # sample obs2prior H as well. 
H_sample = self.prior_H.rsample(num_seeds=num_seeds) @@ -345,18 +313,11 @@ def variational_distribution(self) -> Union[LowRankMultivariateNormal, Gamma]: cov_factor=self.variational_cov_factor, cov_diag=torch.exp(self.variational_logstd)) elif self.distribution == 'gamma': - # concentration is mean**2 / var (std**2) - concentration = torch.pow(self.variational_mean, 2)/torch.pow(torch.exp(self.variational_logstd), 2) - # rate is mean / var (std**2) - rate = self.variational_mean/torch.pow(torch.exp(self.variational_logstd), 2) - # DEBUG_MARKER - ''' - print("self.variational_mean, self.variational_logstd") - print(self.variational_mean, self.variational_logstd) - print("concentration, rate") - print(concentration, rate) - ''' - # DEBUG_MARKER + # for a gamma distribution, we store the concentration as log(concentration) = variational_mean_flexible + assert self.variational_mean_fixed == None, 'Gamma distribution does not support fixed mean' + concentration = self.variational_mean_flexible.exp() + # for gamma distribution, we store the rate as log(rate) = variational_logstd + rate = torch.exp(self.variational_logstd) return Gamma(concentration=concentration, rate=rate) else: raise NotImplementedError("Unknown variational distribution type.") diff --git a/bemb/model/bemb.py b/bemb/model/bemb.py index bcf0cc4..31dbdf9 100644 --- a/bemb/model/bemb.py +++ b/bemb/model/bemb.py @@ -39,21 +39,26 @@ def parse_utility(utility_string: str) -> List[Dict[str, Union[List[str], None]] A helper function parse utility string into a list of additive terms. 
Example: - utility_string = 'lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs' + utility_string = 'lambda_item + theta_user * alpha_item - gamma_user * beta_item * price_obs' output = [ { 'coefficient': ['lambda_item'], - 'observable': None + 'observable': None, + 'sign': 1.0, + }, { 'coefficient': ['theta_user', 'alpha_item'], 'observable': None + 'sign': 1.0, }, { 'coefficient': ['gamma_user', 'beta_item'], 'observable': 'price_obs' + 'sign': -1.0, } ] + Note that 'minus' is allowed in the utility string. If the first term is negative, the minus should be without a space. """ # split additive terms coefficient_suffix = ('_item', '_user', '_constant', '_category') @@ -76,10 +81,16 @@ def is_coefficient(name: str) -> bool: def is_observable(name: str) -> bool: return any(name.startswith(prefix) for prefix in observable_prefix) + utility_string = utility_string.replace(' - ', ' + -') additive_terms = utility_string.split(' + ') additive_decomposition = list() for term in additive_terms: - atom = {'coefficient': [], 'observable': None} + if term.startswith('-'): + sign = -1.0 + term = term[1:] + else: + sign = 1.0 + atom = {'coefficient': [], 'observable': None, 'sign': sign} # split multiplicative terms. for x in term.split(' * '): assert not (is_observable(x) and is_coefficient(x)), f"The element {x} is ambiguous, it follows naming convention of both an observable and a coefficient." @@ -927,6 +938,7 @@ def reshape_observable(obs, name): sample_dict[coef_name], coef_name) assert coef_sample.shape == (R, P, I, 1) additive_term = coef_sample.view(R, P, I) + additive_term *= term['sign'] # Type II: factorized coefficient, e.g., . 
elif len(term['coefficient']) == 2 and term['observable'] is None: @@ -942,6 +954,7 @@ def reshape_observable(obs, name): R, P, I, positive_integer) additive_term = (coef_sample_0 * coef_sample_1).sum(dim=-1) + additive_term *= term['sign'] # Type III: single coefficient multiplied by observable, e.g., theta_user * x_obs_item. elif len(term['coefficient']) == 1 and term['observable'] is not None: @@ -955,8 +968,7 @@ def reshape_observable(obs, name): assert obs.shape == (R, P, I, positive_integer) additive_term = (coef_sample * obs).sum(dim=-1) - if obs_name == 'price_obs': - additive_term *= -1.0 + additive_term *= term['sign'] # Type IV: factorized coefficient multiplied by observable. # e.g., gamma_user * beta_item * price_obs. @@ -987,8 +999,7 @@ def reshape_observable(obs, name): coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) additive_term = (coef * obs).sum(dim=-1) - if obs_name == 'price_obs': - additive_term *= -1.0 + additive_term *= term['sign'] else: raise ValueError(f'Undefined term type: {term}') @@ -1162,6 +1173,7 @@ def reshape_observable(obs, name): sample_dict[coef_name], coef_name) assert coef_sample.shape == (R, total_computation, 1) additive_term = coef_sample.view(R, total_computation) + additive_term *= term['sign'] # Type II: factorized coefficient, e.g., . elif len(term['coefficient']) == 2 and term['observable'] is None: @@ -1177,6 +1189,7 @@ def reshape_observable(obs, name): R, total_computation, positive_integer) additive_term = (coef_sample_0 * coef_sample_1).sum(dim=-1) + additive_term *= term['sign'] # Type III: single coefficient multiplied by observable, e.g., theta_user * x_obs_item. 
elif len(term['coefficient']) == 1 and term['observable'] is not None: @@ -1191,8 +1204,7 @@ def reshape_observable(obs, name): assert obs.shape == (R, total_computation, positive_integer) additive_term = (coef_sample * obs).sum(dim=-1) - if obs_name == 'price_obs': - additive_term *= -1.0 + additive_term *= term['sign'] # Type IV: factorized coefficient multiplied by observable. # e.g., gamma_user * beta_item * price_obs. @@ -1222,8 +1234,7 @@ def reshape_observable(obs, name): coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) additive_term = (coef * obs).sum(dim=-1) - if obs_name == 'price_obs': - additive_term *= -1.0 + additive_term *= term['sign'] else: raise ValueError(f'Undefined term type: {term}') diff --git a/tutorials/supermarket/main.py b/tutorials/supermarket/main.py index 4982014..37c3093 100644 --- a/tutorials/supermarket/main.py +++ b/tutorials/supermarket/main.py @@ -232,11 +232,16 @@ def load_tsv(file_name, data_dir): bemb = run(bemb, dataset_list, batch_size=configs.batch_size, num_epochs=configs.num_epochs, run_test=False) # ''' - coeffs = bemb.model.coef_dict['gamma_user'].variational_mean.detach().cpu().numpy() # coeffs = coeffs**2 # give distribution statistics - print('Coefficients statistics:') - print(pd.DataFrame(coeffs).describe()) + if 'gamma_user' in configs.utility: + coeffs_gamma = bemb.model.coef_dict['gamma_user'].variational_mean.detach().cpu().numpy() + print('Coefficients statistics Gamma:') + print(pd.DataFrame(coeffs_gamma).describe()) + if 'nfact_category' in configs.utility: + coeffs_nfact = bemb.model.coef_dict['nfact_category'].variational_mean.detach().cpu().numpy() + print('Coefficients statistics nfact_category:') + print(pd.DataFrame(coeffs_nfact).describe()) # ''' # ============================================================================================== From 189b1dcab0940a81ac051997e81847d0b19057da Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Thu, 7 Dec 2023 22:45:15 -0800 Subject: [PATCH 14/19] Gamma 
with Bayesian obs2prior working --- bemb/model/bayesian_coefficient.py | 17 ++++++++++++----- bemb/model/bemb.py | 30 ++++++++++++++++++++---------- tutorials/supermarket/main.py | 12 +++++++++++- 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py index 5da279a..464c95a 100644 --- a/bemb/model/bayesian_coefficient.py +++ b/bemb/model/bayesian_coefficient.py @@ -6,6 +6,7 @@ """ from typing import Optional, Tuple, Union +import numpy as np import torch import torch.nn as nn from torch.distributions.lowrank_multivariate_normal import LowRankMultivariateNormal @@ -76,15 +77,20 @@ def __init__(self, assert distribution in ['gaussian', 'gamma'], f'Unsupported distribution {distribution}' if distribution == 'gamma': + ''' assert not obs2prior, 'Gamma distribution is not supported for obs2prior at present.' mean = 1.0 variance = 10.0 assert mean > 0, 'Gamma distribution requires mean > 0' assert variance > 0, 'Gamma distribution requires variance > 0' - shape = mean ** 2 / variance - rate = mean / variance - prior_mean = shape + # shape (concentration) is mean^2/variance, rate is variance/mean for Gamma distribution. + shape = prior_mean ** 2 / prior_variance + rate = prior_mean / prior_variance + prior_mean = np.log(shape) prior_variance = rate + ''' + prior_mean = np.log(prior_mean) + prior_variance = prior_variance self.distribution = distribution @@ -108,6 +114,7 @@ def __init__(self, if self.obs2prior: # the mean of prior distribution depends on observables. # initiate a Bayesian Coefficient with shape (dim, num_obs) standard Gaussian. + prior_H_dist = 'gaussian' self.prior_H = BayesianCoefficient(variation='constant', num_classes=dim, obs2prior=False, @@ -115,7 +122,7 @@ def __init__(self, prior_variance=1.0, H_zero_mask=self.H_zero_mask, is_H=True, - distribution=self.distribution) # this is a distribution responsible for the obs2prior H term. 
+ distribution=prior_H_dist) # this is a distribution responsible for the obs2prior H term. else: self.register_buffer( @@ -253,7 +260,7 @@ def log_prior(self, cov_factor=self.prior_cov_factor, cov_diag=self.prior_cov_diag).log_prob(sample) elif self.distribution == 'gamma': - concentration = mu + concentration = torch.exp(mu) rate = self.prior_variance out = Gamma(concentration=concentration, rate=rate).log_prob(sample) diff --git a/bemb/model/bemb.py b/bemb/model/bemb.py index 31dbdf9..8c2ba56 100644 --- a/bemb/model/bemb.py +++ b/bemb/model/bemb.py @@ -158,6 +158,7 @@ def __init__(self, 2. 'gamma' - obs2prior is not supported for gamma coefficients If a coefficient does not appear in the dictionary, it will be assigned the distribution specified by the 'default' key. By default, the default distribution is 'gaussian'. + For coefficients which have gamma distributions, prior mean and variance MUST be specified in the prior_mean and prior_variance arguments if obs2prior is False for this coefficient. If obs2prior is True, prior_variance is still required obs2prior_dict (Dict[str, bool]): a dictionary maps coefficient name (e.g., 'lambda_item') to a boolean indicating if observable (e.g., item_obs) enters the prior of the coefficient. @@ -203,6 +204,8 @@ def __init__(self, If no `prior_mean['default']` is provided, the default prior mean will be 0.0 for those coefficients not in the prior_mean.keys(). + For coefficients with gamma distributions, prior_mean specifies the shape parameter of the gamma prior. + Defaults to 0.0. prior_variance (Union[float, Dict[str, float]], Dict[str, torch. Tensor]): the variance of prior distribution @@ -222,6 +225,8 @@ def __init__(self, If no `prior_variance['default']` is provided, the default prior variance will be 1.0 for those coefficients not in the prior_variance.keys(). + For coefficients with gamma distributions, prior_variance specifies the concentration parameter of the gamma prior. 
+ Defaults to 1.0, which means all priors have identity matrix as the covariance matrix. num_users (int, optional): number of users, required only if coefficient or observable @@ -345,6 +350,21 @@ def __init__(self, for additive_term in self.formula: for coef_name in additive_term['coefficient']: variation = coef_name.split('_')[-1] + + if coef_name not in self.coef_dist_dict.keys(): + if 'default' in self.coef_dist_dict.keys(): + self.coef_dist_dict[coef_name] = self.coef_dist_dict['default'] + else: + warnings.warn(f"You provided a dictionary of coef_dist_dict, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the coef_dist_dict dictionary to use that as default value (e.g., coef_dist_dict['default'] = 'gaussian'); now using distribution='gaussian' since this is not supplied.") + self.coef_dist_dict[coef_name] = 'gaussian' + + elif self.coef_dist_dict[coef_name] == 'gamma': + if not self.obs2prior_dict[coef_name]: + assert isinstance(self.prior_mean, dict) and coef_name in self.prior_mean.keys(), \ + f"Prior mean for {coef_name} needs to be provided because its posterior is estimated as a gamma distribution." + assert isinstance(self.prior_variance, dict) and coef_name in self.prior_variance.keys(), \ + f"Prior variance for {coef_name} needs to be provided because its posterior is estimated as a gamma distribution." + if isinstance(self.prior_mean, dict): # the user didn't specify prior mean for this coefficient. if coef_name not in self.prior_mean.keys(): @@ -368,13 +388,6 @@ def __init__(self, warnings.warn(f"You provided a dictionary of prior variance, but coefficient {coef_name} is not a key in it. 
Supply a value for 'default' in the prior_variance dictionary to use that as default value (e.g., prior_variance['default'] = 0.3); now using variance=1.0 since this is not supplied.") self.prior_variance[coef_name] = 1.0 - if coef_name not in self.coef_dist_dict.keys(): - if 'default' in self.coef_dist_dict.keys(): - self.coef_dist_dict[coef_name] = self.coef_dist_dict['default'] - else: - warnings.warn(f"You provided a dictionary of coef_dist_dict, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the coef_dist_dict dictionary to use that as default value (e.g., coef_dist_dict['default'] = 'gaussian'); now using distribution='gaussian' since this is not supplied.") - self.coef_dist_dict[coef_name] = 'gaussian' - s2 = self.prior_variance[coef_name] if isinstance( self.prior_variance, dict) else self.prior_variance @@ -681,9 +694,6 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool = Fa """ sample_dict = dict() for coef_name, coef in self.coef_dict.items(): - ''' - print(coef_name) - ''' if deterministic: sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) if coef.obs2prior: diff --git a/tutorials/supermarket/main.py b/tutorials/supermarket/main.py index 37c3093..b7c9548 100644 --- a/tutorials/supermarket/main.py +++ b/tutorials/supermarket/main.py @@ -205,6 +205,15 @@ def load_tsv(file_name, data_dir): # ============================================================================================== # pytorch-lightning training # ============================================================================================== + # if prior_mean is in the configs namespace set it to the prior mean + if hasattr(configs, 'prior_mean'): + prior_mean = configs.prior_mean + else: + prior_mean = 0.0 + if hasattr(configs, 'prior_variance'): + prior_variance = configs.prior_variance + else: + prior_variance = 1.0 bemb = LitBEMBFlex( # trainings args. 
pred_item = configs.pred_item, @@ -222,7 +231,8 @@ def load_tsv(file_name, data_dir): category_to_item=category_to_item, num_user_obs=configs.num_user_obs if configs.obs_user else None, num_item_obs=configs.num_item_obs if configs.obs_item else None, - prior_variance=configs.prior_variance, + prior_mean = prior_mean, + prior_variance= prior_variance, num_price_obs=configs.num_price_obs, preprocess=False, # additional_modules=[ExampleCustomizedModule()] From f1e98d546b649b99d175cbbc0d7b636205d17d23 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Wed, 13 Dec 2023 16:29:19 -0800 Subject: [PATCH 15/19] gamma and lognormal --- bemb/model/bayesian_coefficient.py | 31 ++-- bemb/model/bemb.py | 16 +- bemb/model/bemb_chunked.py | 52 +++++- bemb/model/bemb_supermarket_lightning.py | 195 +++++++++++++++++++++++ bemb/utils/run_helper.py | 4 +- tutorials/supermarket/main.py | 67 +++++--- 6 files changed, 325 insertions(+), 40 deletions(-) create mode 100644 bemb/model/bemb_supermarket_lightning.py diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py index 464c95a..98ab638 100644 --- a/bemb/model/bayesian_coefficient.py +++ b/bemb/model/bayesian_coefficient.py @@ -66,7 +66,7 @@ def __init__(self, If a tensor with shape (num_classes, dim) is supplied, supplying a (num_classes, dim) tensor is amount to specifying a different prior variance for each entry in the coefficient. Defaults to 1.0. - distribution (str, optional): the distribution of the coefficient. Currently we support 'gaussian' and 'gamma'. + distribution (str, optional): the distribution of the coefficient. Currently we support 'gaussian', 'gamma' and 'lognormal'. Defaults to 'gaussian'. 
""" super(BayesianCoefficient, self).__init__() @@ -75,7 +75,7 @@ def __init__(self, self.variation = variation - assert distribution in ['gaussian', 'gamma'], f'Unsupported distribution {distribution}' + assert distribution in ['gaussian', 'gamma', 'lognormal'], f'Unsupported distribution {distribution}' if distribution == 'gamma': ''' assert not obs2prior, 'Gamma distribution is not supported for obs2prior at present.' @@ -84,13 +84,15 @@ def __init__(self, assert mean > 0, 'Gamma distribution requires mean > 0' assert variance > 0, 'Gamma distribution requires variance > 0' # shape (concentration) is mean^2/variance, rate is variance/mean for Gamma distribution. + ''' shape = prior_mean ** 2 / prior_variance rate = prior_mean / prior_variance + # prior_mean stores ln(shape) for gamma prior_mean = np.log(shape) + # prior_variance stores rate for gamma prior_variance = rate - ''' - prior_mean = np.log(prior_mean) - prior_variance = prior_variance + # prior_mean = np.log(prior_mean) + # prior_variance = prior_variance self.distribution = distribution @@ -141,13 +143,13 @@ def __init__(self, num_classes, dim) * self.prior_variance) # create variational distribution. - if self.distribution == 'gaussian': + if self.distribution == 'gaussian' or self.distribution == 'lognormal': self.variational_mean_flexible = nn.Parameter( torch.randn(num_classes, dim), requires_grad=True) # TOOD(kanodiaayush): initialize the gamma distribution variational mean in a more principled way. 
elif self.distribution == 'gamma': # initialize using uniform distribution between 0.5 and 1.5 - # for a gamma distribution, we store the concentration as log(concentration) = variational_mean_flexible + # for a gamma distribution, we store the concentration (shape) as log(concentration) = variational_mean_flexible self.variational_mean_flexible = nn.Parameter( torch.rand(num_classes, dim) + 0.5, requires_grad=True) @@ -200,7 +202,7 @@ def variational_mean(self) -> torch.Tensor: if self.distribution == 'gamma': # M = torch.pow(M, 2) + 0.000001 - M = M.exp() / self.variational_logstd.exp() + M = (torch.minimum((M.exp() + 0.000001), torch.tensor(1e3))) / self.variational_logstd.exp() if self.is_H and (self.H_zero_mask is not None): # a H-variable with zero-entry restriction. @@ -255,13 +257,19 @@ def log_prior(self, else: mu = self.prior_zero_mean - if self.distribution == 'gaussian': + if self.distribution == 'gaussian' or self.distribution == 'lognormal': out = LowRankMultivariateNormal(loc=mu, cov_factor=self.prior_cov_factor, cov_diag=self.prior_cov_diag).log_prob(sample) elif self.distribution == 'gamma': concentration = torch.exp(mu) rate = self.prior_variance + ''' + print(concentration) + print('concentration') + print(rate) + print('rate') + ''' out = Gamma(concentration=concentration, rate=rate).log_prob(sample) # sum over the last dimension @@ -315,14 +323,15 @@ def rsample(self, num_seeds: int = 1) -> Union[torch.Tensor, Tuple[torch.Tensor] def variational_distribution(self) -> Union[LowRankMultivariateNormal, Gamma]: """Constructs the current variational distribution of the coefficient from current variational mean and covariance. 
""" - if self.distribution == 'gaussian': + if self.distribution == 'gaussian' or self.distribution == 'lognormal': return LowRankMultivariateNormal(loc=self.variational_mean, cov_factor=self.variational_cov_factor, cov_diag=torch.exp(self.variational_logstd)) elif self.distribution == 'gamma': # for a gamma distribution, we store the concentration as log(concentration) = variational_mean_flexible assert self.variational_mean_fixed == None, 'Gamma distribution does not support fixed mean' - concentration = self.variational_mean_flexible.exp() + concentration = self.variational_mean_flexible.exp() + 0.000001 + concentration = torch.minimum(concentration, torch.tensor(1e3)) # for gamma distribution, we store the rate as log(rate) = variational_logstd rate = torch.exp(self.variational_logstd) return Gamma(concentration=concentration, rate=rate) diff --git a/bemb/model/bemb.py b/bemb/model/bemb.py index fd7fbe3..8c30232 100644 --- a/bemb/model/bemb.py +++ b/bemb/model/bemb.py @@ -399,6 +399,7 @@ def __init__(self, if (not self.obs2prior_dict[coef_name]) and (H_zero_mask is not None): raise ValueError(f'You specified H_zero_mask for {coef_name}, but obs2prior is False for this coefficient.') + print(coef_name) coef_dict[coef_name] = BayesianCoefficient(variation=variation, num_classes=variation_to_num_classes[variation], obs2prior=self.obs2prior_dict[coef_name], @@ -695,7 +696,10 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool = Fa sample_dict = dict() for coef_name, coef in self.coef_dict.items(): if deterministic: - sample_dict[coef_name] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + s = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + if coef.distribution == 'lognormal': + s = torch.exp(s) + sample_dict[coef_name] = s if coef.obs2prior: sample_dict[coef_name + '.H'] = coef.prior_H.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) else: @@ -703,11 +707,17 @@ def 
sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool = Fa if coef.obs2prior: # sample both obs2prior weight and realization of variable. assert isinstance(s, tuple) and len(s) == 2 - sample_dict[coef_name] = s[0] + if coef.distribution == 'lognormal': + ss = torch.exp(s[0]) + else: + ss = s[0] + sample_dict[coef_name] = ss sample_dict[coef_name + '.H'] = s[1] else: # only sample the realization of variable. assert torch.is_tensor(s) + if coef.distribution == 'lognormal': + s = torch.exp(s) sample_dict[coef_name] = s return sample_dict @@ -1005,6 +1015,8 @@ def reshape_observable(obs, name): R, P, I, num_obs, latent_dim) coef_sample_1 = coef_sample_1.view( R, P, I, num_obs, latent_dim) + # coef_sample_0 = torch.exp(coef_sample_0) + # coef_sample_1 = torch.exp(coef_sample_1) # compute the factorized coefficient with shape (R, P, I, O). coef = (coef_sample_0 * coef_sample_1).sum(dim=-1) diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index 64eb511..fa57bd2 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -47,6 +47,7 @@ def __init__(self, num_items: int, pred_item: bool, num_classes: int = 2, + coef_dist_dict: Dict[str, str] = {'default' : 'gaussian'}, H_zero_mask_dict: Optional[Dict[str, torch.BoolTensor]] = None, prior_mean: Union[float, Dict[str, float]] = 0.0, prior_variance: Union[float, Dict[str, float]] = 1.0, @@ -75,6 +76,14 @@ def __init__(self, lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs See the doc-string of parse_utility for an example. + coef_dist_dict (Dict[str, str]): a dictionary mapping coefficient name to coefficient distribution name. + The coefficient distribution name can be one of the following: + 1. 'gaussian' + 2. 'gamma' - obs2prior is not supported for gamma coefficients + If a coefficient does not appear in the dictionary, it will be assigned the distribution specified + by the 'default' key. By default, the default distribution is 'gaussian'. 
+ For coefficients which have gamma distributions, prior mean and variance MUST be specified in the prior_mean and prior_variance arguments if obs2prior is False for this coefficient. If obs2prior is True, prior_variance is still required + obs2prior_dict (Dict[str, bool]): a dictionary maps coefficient name (e.g., 'lambda_item') to a boolean indicating if observable (e.g., item_obs) enters the prior of the coefficient. @@ -119,6 +128,8 @@ def __init__(self, If no `prior_mean['default']` is provided, the default prior mean will be 0.0 for those coefficients not in the prior_mean.keys(). + For coefficients with gamma distributions, prior_mean specifies the shape parameter of the gamma prior. + Defaults to 0.0. prior_variance (Union[float, Dict[str, float]], Dict[str, torch. Tensor]): the variance of prior distribution @@ -138,6 +149,8 @@ def __init__(self, If no `prior_variance['default']` is provided, the default prior variance will be 1.0 for those coefficients not in the prior_variance.keys(). + For coefficients with gamma distributions, prior_variance specifies the concentration parameter of the gamma prior. + Defaults to 1.0, which means all priors have identity matrix as the covariance matrix. 
num_users (int, optional): number of users, required only if coefficient or observable @@ -172,6 +185,7 @@ def __init__(self, self.utility_formula = utility_formula self.obs2prior_dict = obs2prior_dict self.coef_dim_dict = coef_dim_dict + self.coef_dist_dict = coef_dist_dict if H_zero_mask_dict is not None: self.H_zero_mask_dict = H_zero_mask_dict else: @@ -285,6 +299,23 @@ def __init__(self, for additive_term in self.formula: for coef_name in additive_term['coefficient']: variation = coef_name.split('_')[-1] + + if coef_name not in self.coef_dist_dict.keys(): + if 'default' in self.coef_dist_dict.keys(): + self.coef_dist_dict[coef_name] = self.coef_dist_dict['default'] + else: + warnings.warn(f"You provided a dictionary of coef_dist_dict, but coefficient {coef_name} is not a key in it. Supply a value for 'default' in the coef_dist_dict dictionary to use that as default value (e.g., coef_dist_dict['default'] = 'gaussian'); now using distribution='gaussian' since this is not supplied.") + self.coef_dist_dict[coef_name] = 'gaussian' + + ''' + elif self.coef_dist_dict[coef_name] == 'gamma': + if not self.obs2prior_dict[coef_name]: + assert isinstance(self.prior_mean, dict) and coef_name in self.prior_mean.keys(), \ + f"Prior mean for {coef_name} needs to be provided because it's posterior is estimated as a gamma distribution." + assert isinstance(self.prior_variance, dict) and coef_name in self.prior_variance.keys(), \ + f"Prior variance for {coef_name} needs to be provided because it's posterior is estimated as a gamma distribution." + ''' + if isinstance(self.prior_mean, dict): # the user didn't specify prior mean for this coefficient. 
if coef_name not in self.prior_mean.keys(): @@ -326,6 +357,8 @@ def __init__(self, for ii in range(chunk_sizes[0]): bayesian_coefs_inner = [] for jj in range(chunk_sizes[1]): + if self.coef_dist_dict[coef_name] == 'gamma' and not self.obs2prior_dict[coef_name]: + assert mean > 0, 'shape of gamma distribution specified as prior_mean needs to be > 0' bayesian_coefs_inner.append(BayesianCoefficient(variation=variation, num_classes=variation_to_num_classes[variation], obs2prior=self.obs2prior_dict[coef_name], @@ -334,8 +367,8 @@ def __init__(self, prior_mean=mean, prior_variance=s2, H_zero_mask=H_zero_mask, - is_H=False - ) + is_H=False, + distribution=self.coef_dist_dict[coef_name]), ) bayesian_coefs_inner = nn.ModuleList(bayesian_coefs_inner) bayesian_coefs.append(bayesian_coefs_inner) @@ -655,7 +688,10 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool=Fals for ii, bayesian_coeffs_inner in enumerate(bayesian_coeffs): # inner_list = [] for jj, coef in enumerate(bayesian_coeffs_inner): - this_sample[:, :, :, ii, jj] = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + s = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + if coef.distribution == 'lognormal': + s = s.exp() + this_sample[:, :, :, ii, jj] = s # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) # outer_list.append(inner_list) @@ -680,6 +716,10 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool=Fals if coef.obs2prior: # sample both obs2prior weight and realization of variable. 
assert isinstance(s, tuple) and len(s) == 2 + if coef.distribution == 'lognormal': + ss = torch.exp(s[0]) + else: + ss = s[0] this_sample[:, :, :, ii, jj] = s[0] this_sample_H[:, :, :, ii, jj] = s[1] # inner_list.append(s[0]) @@ -687,6 +727,8 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool=Fals else: # only sample the realization of variable. assert torch.is_tensor(s) + if coef.distribution == 'lognormal': + s = torch.exp(s) this_sample[:, :, :, ii, jj] = s # inner_list.append(s) # outer_list.append(inner_list) @@ -1004,6 +1046,7 @@ def reshape_observable(obs, name): sample_dict[coef_name], coef_name) assert coef_sample.shape == (R, total_computation, 1) additive_term = coef_sample.view(R, total_computation) + additive_term *= term['sign'] # Type II: factorized coefficient, e.g., . elif len(term['coefficient']) == 2 and term['observable'] is None: @@ -1019,6 +1062,7 @@ def reshape_observable(obs, name): R, total_computation, positive_integer) additive_term = (coef_sample_0 * coef_sample_1).sum(dim=-1) + additive_term *= term['sign'] # Type III: single coefficient multiplied by observable, e.g., theta_user * x_obs_item. elif len(term['coefficient']) == 1 and term['observable'] is not None: @@ -1038,6 +1082,7 @@ def reshape_observable(obs, name): price_coeffs = obs_coeff additive_term = (coef_sample * obs).sum(dim=-1) + additive_term *= term['sign'] # Type IV: factorized coefficient multiplied by observable. # e.g., gamma_user * beta_item * price_obs. @@ -1071,6 +1116,7 @@ def reshape_observable(obs, name): price_coeffs = obs_coeff additive_term = (coef * obs).sum(dim=-1) + additive_term *= term['sign'] else: raise ValueError(f'Undefined term type: {term}') diff --git a/bemb/model/bemb_supermarket_lightning.py b/bemb/model/bemb_supermarket_lightning.py new file mode 100644 index 0000000..a7b40cb --- /dev/null +++ b/bemb/model/bemb_supermarket_lightning.py @@ -0,0 +1,195 @@ +""" +BEMB Flex model adopted to PyTorch Lightning. 
+""" +import os + +import pandas as pd +import torch +from torch import nn +from torch.nn import functional as F +from torch.utils.data import DataLoader, random_split +from torchvision.datasets import MNIST +from torchvision import transforms +import pytorch_lightning as pl +# from pytorch_lightning.metrics.functional import accuracy +from torchmetrics.functional import accuracy +from bemb.model import BEMBFlex +from bemb.model.bemb import parse_utility +from torch_choice.data.utils import create_data_loader + + +class WeekTrendPreprocessor(nn.Module): + def __init__(self, num_weeks: int, latent_dim: int): + super().__init__() + self.emb = nn.Embedding(num_weeks, latent_dim) + + def forward(self, batch): + # batch.session_obs_w expected, (num_session, 1) + # convert to batch.session_delta, (num_session, num_latent) + # session_delta will be considered as a session-specific observable by BEMB. + batch.session_week = self.emb(batch.session_week_id.long()) + return batch + +class LitBEMBFlex(pl.LightningModule): + + def __init__(self, configs=None, user_encoder=None, item_encoder=None, session_encoder=None, category_encoder=None, obs_dict=None, print_info=None, train_data=None, validation_data=None, batch_size: int = -1, num_workers: int = 8, learning_rate: float = 0.3, num_seeds: int=1, num_weeks=0, num_week_trend_latents=0, test_data=None, preprocess=True, lr_decay_type='multi_step_lr', lr_milestones=[], lr_decay=1.0, check_val_every_n_epoch=5, **kwargs): + # use kwargs to pass parameter to BEMB Torch. 
+ super().__init__() + # import pdb; pdb.pdb.set_trace() + self.save_hyperparameters(ignore=['train_data', 'validation_data', 'test_data']) + self.train_data = train_data + self.validation_data = validation_data + self.test_data = test_data + bemb_params = {key : self.hparams[key] for key in kwargs.keys()} + self.model = BEMBFlex(**bemb_params) + self.configs = self.hparams.configs + # self.configs = configs + self.num_seeds = self.hparams.num_seeds + self.batch_size = self.hparams.batch_size + self.num_workers = self.hparams.num_workers + self.learning_rate = self.hparams.learning_rate + self.preprocess = self.hparams.preprocess + self.lr_decay_type = self.hparams.lr_decay_type + self.lr_milestones = self.hparams.lr_milestones + self.lr_decay = self.hparams.lr_decay + self.check_val_every_n_epoch = self.hparams.check_val_every_n_epoch + if self.preprocess: + self.batch_preprocess = WeekTrendPreprocessor(num_weeks=self.hparams.num_weeks, latent_dim=self.hparams.num_week_trend_latents) + + def __str__(self) -> str: + return str(self.model) + + def training_step(self, batch, batch_idx): + if self.preprocess: + batch = self.batch_preprocess(batch) + elbo = self.model.elbo(batch, num_seeds=self.num_seeds) + self.log('train_elbo', elbo) + loss = - elbo + return loss + + def validation_step(self, batch, batch_idx): + if self.preprocess: + batch = self.batch_preprocess(batch) + LL = self.model.forward(batch, return_type='log_prob', return_scope='item_index').mean() + # self.log('val_log_likelihood', LL, prog_bar=True) + return {'LL':LL} + + # def training_epoch_end(self, outputs): + # sch = self.lr_schedulers() + # print("Learning Rate %f" % sch.get_lr) + + def validation_epoch_end(self, outputs): + sch = self.lr_schedulers() + # print("Learning Rate %r" % sch.get_lr()) + avg_LL = torch.stack([x["LL"] for x in outputs]).mean() + # self.log("lr", sch.get_last_lr()[0], prog_bar=True) + self.log("lr", self.hparams.learning_rate, prog_bar=True) + 
self.log("val_log_likelihood", avg_LL, prog_bar=True) + + # If the selected scheduler is a ReduceLROnPlateau scheduler. + if isinstance(sch, torch.optim.lr_scheduler.ReduceLROnPlateau): + sch.step(self.trainer.callback_metrics["val_log_likelihood"]) + # avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean() + # self.log("ptl/val_loss", avg_loss) + # self.log("ptl/val_accuracy", avg_acc) + + # def test_step(self, batch, batch_idx): + # LL = self.model.forward(batch, return_logit=False, all_items=False).mean() + # self.log('test_log_likelihood', LL) + + # pred = self.model(batch) + # performance = self.model.get_within_category_accuracy(pred, batch.label) + # for key, val in performance.items(): + # self.log('test_' + key, val, prog_bar=True) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate) + # return optimizer + # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[400], gamma=0.99) + if self.lr_decay_type == 'reduce_on_plateau': + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=self.lr_decay) + lr_scheduler = { + 'scheduler': scheduler, + 'name': 'lr_log', + 'monitor' : 'val_log_likelihood', + 'frequency' : self.check_val_every_n_epoch, + } + elif self.lr_decay_type == 'multi_step_lr': + scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.lr_milestones, gamma=self.lr_decay) + lr_scheduler = scheduler + else: + raise ValueError + + # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1.00, total_steps=200)# steps_per_epoch=len(data_loader), epochs=10) + # lr_scheduler = scheduler + return [optimizer], [lr_scheduler] + # return optimizer + + def train_dataloader(self): + train_dataloader = create_data_loader(self.train_data, + batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers) + return train_dataloader + + def val_dataloader(self): + validation_dataloader = 
create_data_loader(self.validation_data, + # batch_size=-1, + batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers) + return validation_dataloader + + def test_dataloader(self): + if self.test_data is None: + return None + test_dataloader = create_data_loader(self.test_data, + # batch_size=-1, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers) + return test_dataloader + + + def write_bemb_cpp_format(self): + model = self.model + configs = self.hparams.configs + print_info = self.hparams.print_info + users = self.hparams.user_encoder.classes_ + items = self.hparams.item_encoder.classes_ + categories = self.hparams.category_encoder.classes_ + sessions = self.hparams.session_encoder.classes_ + user_df = pd.DataFrame(users, columns=['user_id']) + item_df = pd.DataFrame(items, columns=['item_id']) + category_df = pd.DataFrame(categories, columns=['category_id']) + formula = parse_utility(model.utility_formula) + variations = {'user', 'item', 'category', 'constant', 'session'} + variations_dfs = { + 'user' : user_df, + 'item' : item_df, + 'category' : category_df, + } + variations_ids = { + 'user' : 'user_id', + 'item' : 'item_id', + 'category' : 'category_id', + } + for additive_term in formula: + for coef_name in additive_term['coefficient']: + variation = coef_name.split('_')[-1] + params_mean = model.coef_dict[coef_name].variational_mean + params_std = torch.exp(model.coef_dict[coef_name].variational_logstd) + for params, moment in zip((params_mean, params_std), ('mean', 'std')): + params_df = pd.DataFrame(params.detach().cpu().numpy()) + if variation != 'constant': + params_df = pd.concat((variations_dfs[variation], params_df), axis=1) + params_df.to_csv('%s/param_%s_%s.tsv' % (configs.out_dir, coef_name, moment), sep='\t') + + if 'lambda_item' in model.coef_dict: + print_info['store_dummies'].to_csv('%s/store_dummies.tsv' % (configs.out_dir), sep='\t') + if 'delta_item' in model.coef_dict: + 
print_info['store_weekday_dummies'].to_csv('%s/store_weekday_dummies.tsv' % (configs.out_dir), sep='\t') + if 'mu_item' in model.coef_dict: + week_trend_latents = self.batch_preprocess.emb.weight + week_trend_latents_df = pd.DataFrame(week_trend_latents.detach().cpu().numpy()) + week_trend_latents_df.to_csv('%s/param_weektrends_mean.tsv' % (configs.out_dir), sep='\t') diff --git a/bemb/utils/run_helper.py b/bemb/utils/run_helper.py index 0d5637a..33fd544 100644 --- a/bemb/utils/run_helper.py +++ b/bemb/utils/run_helper.py @@ -6,6 +6,7 @@ import time import pytorch_lightning as pl +from pytorch_lightning.callbacks.early_stopping import EarlyStopping from torch_choice.data.utils import create_data_loader from typing import List from torch_choice.data import ChoiceDataset @@ -17,7 +18,7 @@ def section_print(input_text): print('=' * 20, input_text, '=' * 20) -def run(model: LitBEMBFlex, dataset_list: List[ChoiceDataset], batch_size: int=-1, num_epochs: int=10, num_workers: int=8, run_test=True, **kwargs) -> LitBEMBFlex: +def run(model: LitBEMBFlex, dataset_list: List[ChoiceDataset], batch_size: int=-1, num_epochs: int=10, num_workers: int=8, run_test=True, patience=100, **kwargs) -> LitBEMBFlex: """A standard pipeline of model training and evaluation. Args: @@ -53,6 +54,7 @@ def run(model: LitBEMBFlex, dataset_list: List[ChoiceDataset], batch_size: int=- max_epochs=num_epochs, check_val_every_n_epoch=1, log_every_n_steps=1, + callbacks=[EarlyStopping(monitor='val_log_likelihood', patience=patience, mode='max')], **kwargs) start_time = time.time() trainer.fit(model, train_dataloaders=train, val_dataloaders=validation) diff --git a/tutorials/supermarket/main.py b/tutorials/supermarket/main.py index b7c9548..59d10c1 100644 --- a/tutorials/supermarket/main.py +++ b/tutorials/supermarket/main.py @@ -101,26 +101,6 @@ def load_tsv(file_name, data_dir): # ============================================================================================== # parse item availability. 
# Try and catch? Optionally specify full availability? - a_tsv = pd.read_csv(os.path.join(configs.data_dir, 'availabilityList.tsv'), - sep='\t', - index_col=None, - header=None, - names=['session_id', 'item_id']) - - # availability ties session as well. - session_encoder = LabelEncoder().fit(a_tsv['session_id'].values) - configs.num_sessions = len(session_encoder.classes_) - assert is_sorted(session_encoder.classes_) - # this loop could be slow, depends on # sessions. - item_availability = torch.zeros(configs.num_sessions, configs.num_items).bool() - - a_tsv['item_id'] = item_encoder.transform(a_tsv['item_id'].values) - a_tsv['session_id'] = session_encoder.transform(a_tsv['session_id'].values) - - for session_id, df_group in a_tsv.groupby('session_id'): - # get IDs of items available at this date. - a_item_ids = df_group['item_id'].unique() # this unique is not necessary if the dataset is well-prepared. - item_availability[session_id, a_item_ids] = True # ============================================================================================== # price observables @@ -128,6 +108,10 @@ def load_tsv(file_name, data_dir): df_price = pd.read_csv(os.path.join(configs.data_dir, 'item_sess_price.tsv'), sep='\t', names=['item_id', 'session_id', 'price']) + # availability ties session as well. + session_encoder = LabelEncoder().fit(df_price['session_id'].values) + configs.num_sessions = len(session_encoder.classes_) + assert is_sorted(session_encoder.classes_) # only keep prices of relevant items. 
mask = df_price['item_id'].isin(item_encoder.classes_) @@ -141,6 +125,26 @@ def load_tsv(file_name, data_dir): price_obs = torch.Tensor(df_price.values).view(configs.num_sessions, configs.num_items, 1) configs.num_price_obs = 1 + if not(hasattr(configs, 'complete_availability') and configs.complete_availability): + a_tsv = pd.read_csv(os.path.join(configs.data_dir, 'availabilityList.tsv'), + sep='\t', + index_col=None, + header=None, + names=['session_id', 'item_id']) + + item_availability = torch.zeros(configs.num_sessions, configs.num_items).bool() + + a_tsv['item_id'] = item_encoder.transform(a_tsv['item_id'].values) + a_tsv['session_id'] = session_encoder.transform(a_tsv['session_id'].values) + + # this loop could be slow, depends on # sessions. + for session_id, df_group in a_tsv.groupby('session_id'): + # get IDs of items available at this date. + a_item_ids = df_group['item_id'].unique() # this unique is not necessary if the dataset is well-prepared. + item_availability[session_id, a_item_ids] = True + else: + item_availability = torch.ones(configs.num_sessions, configs.num_items).bool() + # ============================================================================================== # create datasets # ============================================================================================== @@ -198,8 +202,8 @@ def load_tsv(file_name, data_dir): item_groups['category_id'] = category_encoder.transform( item_groups['category_id'].values) - print('Category sizes:') - print(item_groups.groupby('category_id').size().describe()) + # print('Category sizes:') + # print(item_groups.groupby('category_id').size().describe()) item_groups = item_groups.groupby('category_id')['item_id'].apply(list) category_to_item = dict(zip(item_groups.index, item_groups.values)) # ============================================================================================== @@ -238,20 +242,37 @@ def load_tsv(file_name, data_dir): # additional_modules=[ExampleCustomizedModule()] ) 
+ if hasattr(configs, 'patience'): + patience = configs.patience + else: + patience = 100 bemb = bemb.to(configs.device) - bemb = run(bemb, dataset_list, batch_size=configs.batch_size, num_epochs=configs.num_epochs, run_test=False) + bemb = run(bemb, dataset_list, batch_size=configs.batch_size, num_epochs=configs.num_epochs, run_test=False, patience=patience) # ''' # coeffs = coeffs**2 # give distribution statistics + out_dir = './output' + if not os.path.exists(out_dir): + os.makedirs(out_dir) if 'gamma_user' in configs.utility: coeffs_gamma = bemb.model.coef_dict['gamma_user'].variational_mean.detach().cpu().numpy() + if configs.coef_dist_dict['gamma_user'] == 'lognormal': + coeffs_gamma = np.exp(coeffs_gamma) print('Coefficients statistics Gamma:') print(pd.DataFrame(coeffs_gamma).describe()) + # write to file + print("writing") + np.savetxt(f'{out_dir}/gammas_{sys.argv[1]}.txt', coeffs_gamma, delimiter=',') if 'nfact_category' in configs.utility: coeffs_nfact = bemb.model.coef_dict['nfact_category'].variational_mean.detach().cpu().numpy() + if configs.coef_dist_dict['nfact_category'] == 'lognormal': + coeffs_nfact = np.exp(coeffs_nfact) print('Coefficients statistics nfact_category:') print(pd.DataFrame(coeffs_nfact).describe()) + # write to file + print("writing") + np.savetxt(f'{out_dir}/nfacts_{sys.argv[1]}.txt', coeffs_nfact, delimiter=',') # ''' # ============================================================================================== From 671548a7e903d5f6114b54116198b77cb40fd47c Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Wed, 13 Dec 2023 18:25:41 -0800 Subject: [PATCH 16/19] lognormal in bayesian coeff (not working) run python main.py configs3_1.yaml in tutorials/supermarket to test removed lognormal from bemb.py, will remove from bemb_chunked.py too --- bemb/model/bayesian_coefficient.py | 41 +++++++++++++++++++++++++----- bemb/model/bemb.py | 12 +++++---- bemb/model/bemb_chunked.py | 11 ++++---- tutorials/supermarket/main.py | 8 +++--- 4 
files changed, 51 insertions(+), 21 deletions(-) diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py index 98ab638..a12f490 100644 --- a/bemb/model/bayesian_coefficient.py +++ b/bemb/model/bayesian_coefficient.py @@ -11,6 +11,7 @@ import torch.nn as nn from torch.distributions.lowrank_multivariate_normal import LowRankMultivariateNormal from torch.distributions.gamma import Gamma +from torch.distributions.log_normal import LogNormal class BayesianCoefficient(nn.Module): @@ -195,15 +196,21 @@ def variational_mean(self) -> torch.Tensor: Returns: torch.Tensor: the current mean of the variational distribution with shape (num_classes, dim). """ - if self.variational_mean_fixed is None: - M = self.variational_mean_flexible - else: - M = self.variational_mean_fixed + self.variational_mean_flexible + assert self.variational_mean_fixed is None, "not supported" + + # if self.variational_mean_fixed is None: + # else: + # M = self.variational_mean_fixed + self.variational_mean_flexible + + M = self.variational_mean_flexible if self.distribution == 'gamma': # M = torch.pow(M, 2) + 0.000001 M = (torch.minimum((M.exp() + 0.000001), torch.tensor(1e3))) / self.variational_logstd.exp() + elif self.distribution == 'lognormal': + M = (torch.minimum((M.exp() + 0.000001), torch.tensor(1e3))) + if self.is_H and (self.H_zero_mask is not None): # a H-variable with zero-entry restriction. # multiply zeros to entries with H_zero_mask[i, j] = 1. 
@@ -257,10 +264,14 @@ def log_prior(self, else: mu = self.prior_zero_mean - if self.distribution == 'gaussian' or self.distribution == 'lognormal': + if self.distribution == 'gaussian':# or self.distribution == 'lognormal': out = LowRankMultivariateNormal(loc=mu, cov_factor=self.prior_cov_factor, cov_diag=self.prior_cov_diag).log_prob(sample) + elif self.distribution == 'lognormal': + # out = LogNormal(loc=mu, scale=np.sqrt(self.prior_variance)).log_prob(sample) + # out = torch.sum(out, dim=-1) + out = torch.zeros((num_seeds, num_classes), device=sample.device) elif self.distribution == 'gamma': concentration = torch.exp(mu) rate = self.prior_variance @@ -275,6 +286,7 @@ def log_prior(self, # sum over the last dimension out = torch.sum(out, dim=-1) + out = torch.zeros((num_seeds, num_classes), device=sample.device) assert out.shape == (num_seeds, num_classes) return out @@ -311,6 +323,9 @@ def rsample(self, num_seeds: int = 1) -> Union[torch.Tensor, Tuple[torch.Tensor] """ value_sample = self.variational_distribution.rsample( torch.Size([num_seeds])) + if self.distribution == 'lognormal': + print(torch.min(value_sample)) + print(torch.max(value_sample)) # DEBUG_MARKER if self.obs2prior: # sample obs2prior H as well. @@ -323,10 +338,22 @@ def rsample(self, num_seeds: int = 1) -> Union[torch.Tensor, Tuple[torch.Tensor] def variational_distribution(self) -> Union[LowRankMultivariateNormal, Gamma]: """Constructs the current variational distribution of the coefficient from current variational mean and covariance. 
""" - if self.distribution == 'gaussian' or self.distribution == 'lognormal': - return LowRankMultivariateNormal(loc=self.variational_mean, + if self.distribution == 'gaussian':# or self.distribution == 'lognormal': + # print(torch.max(self.variational_mean), torch.min(self.variational_mean)) + # print(torch.max(self.variational_logstd), torch.min(self.variational_logstd)) + return LowRankMultivariateNormal(loc=self.variational_mean_flexible, cov_factor=self.variational_cov_factor, cov_diag=torch.exp(self.variational_logstd)) + elif self.distribution == 'lognormal': + # print(self.variational_mean_flexible) + # print(self.variational_logstd) + print(torch.max(self.variational_logstd), torch.min(self.variational_logstd)) + print(torch.max(self.variational_mean_flexible), torch.min(self.variational_mean_flexible)) + print(self.variational_mean_flexible.shape, self.variational_logstd.shape) + # return LowRankMultivariateNormal(loc=self.variational_mean_flexible, + # cov_factor=self.variational_cov_factor, + # cov_diag=torch.exp(self.variational_logstd)) + return LogNormal(loc=self.variational_mean_flexible, scale=torch.exp(self.variational_logstd)) elif self.distribution == 'gamma': # for a gamma distribution, we store the concentration as log(concentration) = variational_mean_flexible assert self.variational_mean_fixed == None, 'Gamma distribution does not support fixed mean' diff --git a/bemb/model/bemb.py b/bemb/model/bemb.py index 8c30232..d875d32 100644 --- a/bemb/model/bemb.py +++ b/bemb/model/bemb.py @@ -697,17 +697,19 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool = Fa for coef_name, coef in self.coef_dict.items(): if deterministic: s = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) - if coef.distribution == 'lognormal': - s = torch.exp(s) + # if coef.distribution == 'lognormal': + # s = torch.exp(s) sample_dict[coef_name] = s if coef.obs2prior: sample_dict[coef_name + '.H'] = 
coef.prior_H.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) else: + print(coef_name) s = coef.rsample(num_seeds) if coef.obs2prior: # sample both obs2prior weight and realization of variable. assert isinstance(s, tuple) and len(s) == 2 - if coef.distribution == 'lognormal': + # if coef.distribution == 'lognormal': + if False: ss = torch.exp(s[0]) else: ss = s[0] @@ -716,8 +718,8 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool = Fa else: # only sample the realization of variable. assert torch.is_tensor(s) - if coef.distribution == 'lognormal': - s = torch.exp(s) + # if coef.distribution == 'lognormal': + # s = torch.exp(s) sample_dict[coef_name] = s return sample_dict diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index fa57bd2..5a8c5dc 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -689,8 +689,8 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool=Fals # inner_list = [] for jj, coef in enumerate(bayesian_coeffs_inner): s = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) - if coef.distribution == 'lognormal': - s = s.exp() + # if coef.distribution == 'lognormal': + # s = s.exp() this_sample[:, :, :, ii, jj] = s # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) # inner_list.append(coef.variational_distribution.mean.unsqueeze(dim=0)) # (1, num_*, dim) @@ -716,7 +716,8 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool=Fals if coef.obs2prior: # sample both obs2prior weight and realization of variable. assert isinstance(s, tuple) and len(s) == 2 - if coef.distribution == 'lognormal': + # if coef.distribution == 'lognormal': + if False: ss = torch.exp(s[0]) else: ss = s[0] @@ -727,8 +728,8 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool=Fals else: # only sample the realization of variable. 
assert torch.is_tensor(s) - if coef.distribution == 'lognormal': - s = torch.exp(s) + # if coef.distribution == 'lognormal': + # s = torch.exp(s) this_sample[:, :, :, ii, jj] = s # inner_list.append(s) # outer_list.append(inner_list) diff --git a/tutorials/supermarket/main.py b/tutorials/supermarket/main.py index 59d10c1..1fe1f85 100644 --- a/tutorials/supermarket/main.py +++ b/tutorials/supermarket/main.py @@ -257,8 +257,8 @@ def load_tsv(file_name, data_dir): os.makedirs(out_dir) if 'gamma_user' in configs.utility: coeffs_gamma = bemb.model.coef_dict['gamma_user'].variational_mean.detach().cpu().numpy() - if configs.coef_dist_dict['gamma_user'] == 'lognormal': - coeffs_gamma = np.exp(coeffs_gamma) + # if configs.coef_dist_dict['gamma_user'] == 'lognormal': + # coeffs_gamma = np.exp(coeffs_gamma) print('Coefficients statistics Gamma:') print(pd.DataFrame(coeffs_gamma).describe()) # write to file @@ -266,8 +266,8 @@ def load_tsv(file_name, data_dir): np.savetxt(f'{out_dir}/gammas_{sys.argv[1]}.txt', coeffs_gamma, delimiter=',') if 'nfact_category' in configs.utility: coeffs_nfact = bemb.model.coef_dict['nfact_category'].variational_mean.detach().cpu().numpy() - if configs.coef_dist_dict['nfact_category'] == 'lognormal': - coeffs_nfact = np.exp(coeffs_nfact) + # if configs.coef_dist_dict['nfact_category'] == 'lognormal': + # coeffs_nfact = np.exp(coeffs_nfact) print('Coefficients statistics nfact_category:') print(pd.DataFrame(coeffs_nfact).describe()) # write to file From 8236f92ac0704a4f0117810a98884fab551bd042 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Wed, 13 Dec 2023 18:27:03 -0800 Subject: [PATCH 17/19] add yaml --- tutorials/supermarket/configs3_1.yaml | 69 +++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tutorials/supermarket/configs3_1.yaml diff --git a/tutorials/supermarket/configs3_1.yaml b/tutorials/supermarket/configs3_1.yaml new file mode 100644 index 0000000..5a3971d --- /dev/null +++ 
b/tutorials/supermarket/configs3_1.yaml @@ -0,0 +1,69 @@ +device: cuda +# data_dir: /home/tianyudu/Data/MoreSupermarket/tsv/ +# data_dir: /home/tianyudu/Data/MoreSupermarket/20180101-20191231_13/tsv/ +# data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_9999/20180101-20191231_42/tsv +# data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_9999/2 +data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_1904-1887-1974-2012-1992/20180101-20191231_44/tsv/ +# data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_1887/chunked_1 +# data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/sims34/1 +# utility: lambda_item +# utility: lambda_item + theta_user * alpha_item +# utility: lambda_item + theta_user * alpha_item + zeta_user * item_obs +# utility: lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs +# utility: lambda_item + theta_user * alpha_item + gamma_user * price_obs +# utility: lambda_item + theta_user * alpha_item - nfact_category * gamma_user * price_obs +utility: theta_user * alpha_item - nfact_category * gamma_user * price_obs +# utility: -nfact_category * gamma_user * price_obs +# utility: -nfact_category * gamma_user * price_obs +# utility: -gamma_user * price_obs +# utility: lambda_item +out_dir: ./output/ +# model configuration. 
+obs2prior_dict: + lambda_item: False + theta_user: False + alpha_item: False + zeta_user: True + lota_item: True + gamma_user: False + nfact_category: False + beta_item: True +coef_dim_dict: + lambda_item: 1 + theta_user: 50 + alpha_item: 50 + gamma_user: 1 + nfact_category: 1 + beta_item: 10 +coef_dist_dict: + default: 'gaussian' + # gamma_user: 'gamma' + # nfact_category: 'gamma' + gamma_user: 'gaussian' + nfact_category: 'lognormal' +prior_mean: + default: 0.0 + # mean is shape for gamma variable + # shape is mean^sq / var + # we want mean 1, var 100 for these two vars + # gamma_user: 0.01 + gamma_user: 1.0 + nfact_category: 1.0 +prior_variance: + default: 100000.0 + # variance is rate for gamma variable + # shape is mean / var + gamma_user: 10.0 + nfact_category: 10.0 +#### optimization. +trace_log_q: False +shuffle: False +batch_size: 50000 +num_epochs: 200 +learning_rate: 0.03 +num_mc_seeds: 1 +num_price_obs: 1 +obs_user: True +obs_item: False +patience: 10 +complete_availability: True From b07d90ae0264ed882b4657ae30fded35117226d3 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Mon, 29 Jan 2024 18:29:54 -0800 Subject: [PATCH 18/19] gamma and lognormal coeffs --- bemb/model/bayesian_coefficient.py | 116 +++++++++++++++++++---- bemb/model/bemb.py | 7 +- bemb/model/bemb_chunked.py | 11 ++- bemb/model/bemb_supermarket_lightning.py | 6 ++ 4 files changed, 121 insertions(+), 19 deletions(-) diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py index a12f490..0de89cb 100644 --- a/bemb/model/bayesian_coefficient.py +++ b/bemb/model/bayesian_coefficient.py @@ -86,8 +86,12 @@ def __init__(self, assert variance > 0, 'Gamma distribution requires variance > 0' # shape (concentration) is mean^2/variance, rate is variance/mean for Gamma distribution. 
''' + self.mean_clamp = (np.log(0.1), np.log(100.0)) + self.logstd_clamp = (np.log(0.1), np.log(10000.0)) shape = prior_mean ** 2 / prior_variance + assert shape > np.exp(self.mean_clamp[0])**2 and shape < np.exp(self.mean_clamp[1])**2, f'Gamma shape {shape} is out of range, should be in ({np.exp(self.mean_clamp[0])**2}, {np.exp(self.mean_clamp[1])**2})' rate = prior_mean / prior_variance + assert rate > np.exp(self.logstd_clamp[0]) and rate < np.exp(self.logstd_clamp[1]), f'Gamma rate {rate} is out of range, should be in ({np.exp(self.logstd_clamp[0])}, {np.exp(self.logstd_clamp[1])})' # prior_mean stores ln(shape) for gamma prior_mean = np.log(shape) # prior_variance stores rate for gamma @@ -95,6 +99,20 @@ def __init__(self, # prior_mean = np.log(prior_mean) # prior_variance = prior_variance + elif distribution == 'lognormal': + # mean is exp(mu + sigma^2/2), variance is (exp(sigma^2) - 1) * exp(2*mu + sigma^2) + # prior_mean in -2, exp(3) + self.mean_clamp = (-2.0, np.exp(1.5)) + # sigma sq clamp exp(-20), exp(1.5) + # therefore sigma in (exp(-10), exp(0.75)) + # therefore log sigma in (-10, 0.75) + self.logstd_clamp = (-10.0, 0.75) + assert prior_mean > self.mean_clamp[0] and prior_mean < self.mean_clamp[1], f'Lognormal distribution requires prior_mean in {self.mean_clamp}, given {prior_mean}' + assert np.sqrt(prior_variance) > np.exp(self.logstd_clamp[0]) and np.sqrt(prior_variance) < np.exp(self.logstd_clamp[1]), f'Lognormal distribution requires prior_variance in {self.logstd_clamp}, given {prior_variance}' + # assert prior_mean > np.exp(-100.0) and prior_mean < np.exp(10.0), f'Lognormal distribution requires shape in (exp(-100), exp(10)), given {prior_mean}' + # assert prior_variance > np.exp(-100.0) and prior_variance < np.exp(2.0), f'Lognormal distribution requires rate in (exp(-100), exp(2)), given {prior_variance}' + + self.distribution = distribution self.obs2prior = obs2prior @@ -144,23 +162,49 @@ def __init__(self, num_classes, dim) * 
self.prior_variance) # create variational distribution. - if self.distribution == 'gaussian' or self.distribution == 'lognormal': + if self.distribution == 'gaussian': + if self.is_H: + self.variational_mean_flexible = nn.Parameter( + torch.randn(num_classes, dim), requires_grad=True) + # multiply by 0.0001 to avoid numerical issues. + self.variational_mean_flexible.data *= 0.0001 + + else: + self.variational_mean_flexible = nn.Parameter( + torch.randn(num_classes, dim), requires_grad=True) + elif self.distribution == 'lognormal': self.variational_mean_flexible = nn.Parameter( torch.randn(num_classes, dim), requires_grad=True) + self.variational_mean_flexible.data = torch.clamp( + self.variational_mean_flexible.data, min=self.mean_clamp[0], max=self.mean_clamp[1]) # TOOD(kanodiaayush): initialize the gamma distribution variational mean in a more principled way. elif self.distribution == 'gamma': # initialize using uniform distribution between 0.5 and 1.5 # for a gamma distribution, we store the concentration (shape) as log(concentration) = variational_mean_flexible self.variational_mean_flexible = nn.Parameter( torch.rand(num_classes, dim) + 0.5, requires_grad=True) + self.variational_mean_flexible.data = torch.clamp( + self.variational_mean_flexible.data, min=self.mean_clamp[0], max=self.mean_clamp[1]) if self.is_H and self.H_zero_mask is not None: assert self.H_zero_mask.shape == self.variational_mean_flexible.shape, \ f"The H_zero_mask should have exactly the shape as the H variable, `H_zero_mask`.shape is {self.H_zero_mask.shape}, `H`.shape is {self.variational_mean_flexible.shape} " # for gamma distribution, we store the rate as log(rate) = variational_logstd - self.variational_logstd = nn.Parameter( - torch.randn(num_classes, dim), requires_grad=True) + if self.distribution == 'gaussian': + self.variational_logstd = nn.Parameter( + torch.randn(num_classes, dim), requires_grad=True) + elif self.distribution == 'lognormal': + # uniform -1 to 1 + 
self.variational_logstd = nn.Parameter( + torch.rand(num_classes, dim) * 2 - 1, requires_grad=True) + self.variational_logstd.data = torch.clamp( + self.variational_logstd.data, min=self.logstd_clamp[0], max=self.logstd_clamp[1]) + elif self.distribution == 'gamma': + self.variational_logstd = nn.Parameter( + torch.randn(num_classes, dim), requires_grad=True) + self.variational_logstd.data = torch.clamp( + self.variational_logstd.data, min=self.logstd_clamp[0], max=self.logstd_clamp[1]) self.register_buffer('variational_cov_factor', torch.zeros(num_classes, dim, 1)) @@ -269,10 +313,13 @@ def log_prior(self, cov_factor=self.prior_cov_factor, cov_diag=self.prior_cov_diag).log_prob(sample) elif self.distribution == 'lognormal': - # out = LogNormal(loc=mu, scale=np.sqrt(self.prior_variance)).log_prob(sample) - # out = torch.sum(out, dim=-1) - out = torch.zeros((num_seeds, num_classes), device=sample.device) + mu = torch.clamp(mu, min=-100.0, max=10.0) + out = LogNormal(loc=mu, scale=np.sqrt(self.prior_variance)).log_prob(sample) + out = torch.sum(out, dim=-1) + # out = torch.zeros((num_seeds, num_classes), device=sample.device) + elif self.distribution == 'gamma': + mu = torch.clamp(mu, min=-100.0, max=4.0) concentration = torch.exp(mu) rate = self.prior_variance ''' @@ -286,7 +333,7 @@ def log_prior(self, # sum over the last dimension out = torch.sum(out, dim=-1) - out = torch.zeros((num_seeds, num_classes), device=sample.device) + # out = torch.zeros((num_seeds, num_classes), device=sample.device) assert out.shape == (num_seeds, num_classes) return out @@ -323,9 +370,10 @@ def rsample(self, num_seeds: int = 1) -> Union[torch.Tensor, Tuple[torch.Tensor] """ value_sample = self.variational_distribution.rsample( torch.Size([num_seeds])) - if self.distribution == 'lognormal': - print(torch.min(value_sample)) - print(torch.max(value_sample)) + # if self.distribution == 'lognormal': + # print(torch.min(value_sample)) + # print(torch.max(value_sample)) + # breakpoint() # 
DEBUG_MARKER if self.obs2prior: # sample obs2prior H as well. @@ -347,20 +395,31 @@ def variational_distribution(self) -> Union[LowRankMultivariateNormal, Gamma]: elif self.distribution == 'lognormal': # print(self.variational_mean_flexible) # print(self.variational_logstd) - print(torch.max(self.variational_logstd), torch.min(self.variational_logstd)) - print(torch.max(self.variational_mean_flexible), torch.min(self.variational_mean_flexible)) - print(self.variational_mean_flexible.shape, self.variational_logstd.shape) + # print(torch.max(self.variational_logstd), torch.min(self.variational_logstd)) + # print(torch.max(self.variational_mean_flexible), torch.min(self.variational_mean_flexible)) + # print(self.variational_mean_flexible.shape, self.variational_logstd.shape) # return LowRankMultivariateNormal(loc=self.variational_mean_flexible, # cov_factor=self.variational_cov_factor, # cov_diag=torch.exp(self.variational_logstd)) - return LogNormal(loc=self.variational_mean_flexible, scale=torch.exp(self.variational_logstd)) + # variational_mean_flexible = torch.clamp(self.variational_mean_flexible, min=-10, max=10) + # variational_logstd = torch.clamp(self.variational_logstd, min=-4, max=3) + loc = self.variational_mean_flexible + scale = torch.exp(self.variational_logstd) + return LogNormal(loc=loc, scale=scale) elif self.distribution == 'gamma': # for a gamma distribution, we store the concentration as log(concentration) = variational_mean_flexible - assert self.variational_mean_fixed == None, 'Gamma distribution does not support fixed mean' - concentration = self.variational_mean_flexible.exp() + 0.000001 - concentration = torch.minimum(concentration, torch.tensor(1e3)) + # assert self.variational_mean_fixed == None, 'Gamma distribution does not support fixed mean' + concentration = torch.exp(self.variational_mean_flexible) + # assert that all concentration should be between exp -4 and exp 4 + # assert torch.all(concentration > 0.1353 - 0.0001), 'concentration 
should be greater than exp -2' + # assert torch.all(concentration < 54.5981 + 0.0001), 'concentration should be less than exp 4' + # concentration = self.variational_mean_flexible.exp() + 0.000001 + # concentration = torch.clamp(concentration, min=1e-2, max=1e2) + # concentration = torch.minimum(concentration, torch.tensor(1e3)) # for gamma distribution, we store the rate as log(rate) = variational_logstd rate = torch.exp(self.variational_logstd) + # print(concentration, rate) + # rate = torch.clamp(rate, min=1e-2, max=1e2) return Gamma(concentration=concentration, rate=rate) else: raise NotImplementedError("Unknown variational distribution type.") @@ -369,3 +428,26 @@ def variational_distribution(self) -> Union[LowRankMultivariateNormal, Gamma]: def device(self) -> torch.device: """Returns the device of tensors contained in this module.""" return self.variational_mean.device + + def clamp_params(self) -> None: + """Clamps the parameters of the variational distribution to be within a reasonable range. 
+ """ + if self.distribution == 'gaussian': + # do nothing + pass + # self.variational_mean_flexible.data = torch.clamp( + # self.variational_mean_flexible.data, min=-10, max=10) + # self.variational_logstd.data = torch.clamp( + # self.variational_logstd.data, min=-4, max=3) + elif self.distribution in ['lognormal', 'gamma']: + self.variational_mean_flexible.data = torch.clamp( + self.variational_mean_flexible.data, min=self.mean_clamp[0], max=self.mean_clamp[1]) + self.variational_logstd.data = torch.clamp( + self.variational_logstd.data, min=self.mean_clamp[0], max=self.logstd_clamp[1]) + # elif self.distribution == 'gamma': + # self.variational_mean_flexible.data = torch.clamp( + # self.variational_mean_flexible.data, min=-2.0, max=4) + # self.variational_logstd.data = torch.clamp( + # self.variational_logstd.data, min=-100.0, max=2.0) + else: + raise NotImplementedError("Unknown variational distribution type.") diff --git a/bemb/model/bemb.py b/bemb/model/bemb.py index d875d32..f6df9d9 100644 --- a/bemb/model/bemb.py +++ b/bemb/model/bemb.py @@ -422,6 +422,10 @@ def __init__(self, 'Additional modules are temporarily disabled for further development.') self.additional_modules = nn.ModuleList(additional_modules) + def clamp_coefs(self): + for coef_name in self.coef_dict.keys(): + self.coef_dict[coef_name].clamp_params() + def __str__(self): return f'Bayesian EMBedding Model with U[user, item, session] = {self.raw_formula}\n' \ + f'Total number of parameters: {self.num_params}.\n' \ @@ -697,13 +701,14 @@ def sample_coefficient_dictionary(self, num_seeds: int, deterministic: bool = Fa for coef_name, coef in self.coef_dict.items(): if deterministic: s = coef.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) + # print(torch.min(s), torch.max(s)) + # breakpoint() # if coef.distribution == 'lognormal': # s = torch.exp(s) sample_dict[coef_name] = s if coef.obs2prior: sample_dict[coef_name + '.H'] = 
coef.prior_H.variational_distribution.mean.unsqueeze(dim=0) # (1, num_*, dim) else: - print(coef_name) s = coef.rsample(num_seeds) if coef.obs2prior: # sample both obs2prior weight and realization of variable. diff --git a/bemb/model/bemb_chunked.py b/bemb/model/bemb_chunked.py index 5a8c5dc..b1572c8 100644 --- a/bemb/model/bemb_chunked.py +++ b/bemb/model/bemb_chunked.py @@ -358,7 +358,7 @@ def __init__(self, bayesian_coefs_inner = [] for jj in range(chunk_sizes[1]): if self.coef_dist_dict[coef_name] == 'gamma' and not self.obs2prior_dict[coef_name]: - assert mean > 0, 'shape of gamma distribution specifieid as prior_mean needs to be > 0' + assert mean > 0, 'shape of gamma distribution specified as prior_mean needs to be > 0' bayesian_coefs_inner.append(BayesianCoefficient(variation=variation, num_classes=variation_to_num_classes[variation], obs2prior=self.obs2prior_dict[coef_name], @@ -386,6 +386,13 @@ def __init__(self, 'Additional modules are temporarily disabled for further development.') self.additional_modules = nn.ModuleList(additional_modules) + + def clamp_coefs(self): + for coef_name in self.coef_dict.keys(): + for ii in range(len(self.coef_dict[coef_name])): + for jj in range(len(self.coef_dict[coef_name][ii])): + self.coef_dict[coef_name][ii][jj].clamp_params() + def __str__(self): return f'Bayesian EMBedding Model with U[user, item, session] = {self.raw_formula}\n' \ + f'Total number of parameters: {self.num_params}.\n' \ @@ -1081,6 +1088,7 @@ def reshape_observable(obs, name): if return_price_coeff and term['observable'] is not None and term['observable'].startswith('price_'): obs_coeff = coef_sample.sum(dim=-1) price_coeffs = obs_coeff + price_coeffs *= term['sign'] additive_term = (coef_sample * obs).sum(dim=-1) additive_term *= term['sign'] @@ -1115,6 +1123,7 @@ def reshape_observable(obs, name): if return_price_coeff and term['observable'] is not None and term['observable'].startswith('price_'): obs_coeff = coef.sum(dim=-1) price_coeffs = 
obs_coeff + price_coeffs *= term['sign'] additive_term = (coef * obs).sum(dim=-1) additive_term *= term['sign'] diff --git a/bemb/model/bemb_supermarket_lightning.py b/bemb/model/bemb_supermarket_lightning.py index a7b40cb..a26dc4e 100644 --- a/bemb/model/bemb_supermarket_lightning.py +++ b/bemb/model/bemb_supermarket_lightning.py @@ -150,6 +150,12 @@ def test_dataloader(self): num_workers=self.num_workers) return test_dataloader + # def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure, ): + def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs): + optimizer.step(closure=optimizer_closure) + with torch.no_grad(): + self.model.clamp_coefs() + def write_bemb_cpp_format(self): model = self.model From 0e034dea91e91c1d2cfe7be1ddb864b4e8089cf1 Mon Sep 17 00:00:00 2001 From: Ayush Kanodia Date: Mon, 17 Jun 2024 13:23:19 -0700 Subject: [PATCH 19/19] Update prior handling with positive variationals 1. the hyperprior is gaussian 2. the prior is gaussian 3. the lower bound of the lognormal is lowered 4. 
handling is changed to make it clearer --- bemb/model/bayesian_coefficient.py | 54 ++++++++++++++------------- tutorials/supermarket/configs3_1.yaml | 19 +++++----- 2 files changed, 38 insertions(+), 35 deletions(-) diff --git a/bemb/model/bayesian_coefficient.py b/bemb/model/bayesian_coefficient.py index 0de89cb..0a6460f 100644 --- a/bemb/model/bayesian_coefficient.py +++ b/bemb/model/bayesian_coefficient.py @@ -89,26 +89,28 @@ def __init__(self, self.mean_clamp = (np.log(0.1), np.log(100.0)) self.logstd_clamp = (np.log(0.1), np.log(10000.0)) shape = prior_mean ** 2 / prior_variance - assert shape > np.exp(self.mean_clamp[0])**2 and shape < np.exp(self.mean_clamp[1])**2, f'Gamma shape {shape} is out of range, should be in ({np.exp(self.mean_clamp[0])**2}, {np.exp(self.mean_clamp[1])**2})' rate = prior_mean / prior_variance - assert rate > np.exp(self.logstd_clamp[0]) and rate < np.exp(self.logstd_clamp[1]), f'Gamma rate {rate} is out of range, should be in ({np.exp(self.logstd_clamp[0])}, {np.exp(self.logstd_clamp[1])})' - # prior_mean stores ln(shape) for gamma - prior_mean = np.log(shape) - # prior_variance stores rate for gamma - prior_variance = rate - # prior_mean = np.log(prior_mean) - # prior_variance = prior_variance + if not obs2prior: + assert shape > np.exp(self.mean_clamp[0])**2 and shape < np.exp(self.mean_clamp[1])**2, f'Gamma shape {shape} is out of range, should be in ({np.exp(self.mean_clamp[0])**2}, {np.exp(self.mean_clamp[1])**2})' + assert rate > np.exp(self.logstd_clamp[0]) and rate < np.exp(self.logstd_clamp[1]), f'Gamma rate {rate} is out of range, should be in ({np.exp(self.logstd_clamp[0])}, {np.exp(self.logstd_clamp[1])})' + # prior_mean stores ln(shape) for gamma + prior_mean = np.log(shape) + # prior_variance stores rate for gamma + prior_variance = rate + # prior_mean = np.log(prior_mean) + # prior_variance = prior_variance elif distribution == 'lognormal': # mean is exp(mu + sigma^2/2), variance is (exp(sigma^2) - 1) * exp(2*mu + 
sigma^2) # prior_mean in -2, exp(3) - self.mean_clamp = (-2.0, np.exp(1.5)) + self.mean_clamp = (-10.0, np.exp(1.5)) # sigma sq clamp exp(-20), exp(1.5) # therefore sigma in (exp(-10), exp(0.75)) # therefore log sigma in (-10, 0.75) self.logstd_clamp = (-10.0, 0.75) - assert prior_mean > self.mean_clamp[0] and prior_mean < self.mean_clamp[1], f'Lognormal distribution requires prior_mean in {self.mean_clamp}, given {prior_mean}' - assert np.sqrt(prior_variance) > np.exp(self.logstd_clamp[0]) and np.sqrt(prior_variance) < np.exp(self.logstd_clamp[1]), f'Lognormal distribution requires prior_variance in {self.logstd_clamp}, given {prior_variance}' + if not obs2prior: + assert prior_mean > self.mean_clamp[0] and prior_mean < self.mean_clamp[1], f'Lognormal distribution requires prior_mean in {self.mean_clamp}, given {prior_mean}' + assert np.sqrt(prior_variance) > np.exp(self.logstd_clamp[0]) and np.sqrt(prior_variance) < np.exp(self.logstd_clamp[1]), f'Lognormal distribution requires prior_variance in {self.logstd_clamp}, given {prior_variance}' # assert prior_mean > np.exp(-100.0) and prior_mean < np.exp(10.0), f'Lognormal distribution requires shape in (exp(-100), exp(10)), given {prior_mean}' # assert prior_variance > np.exp(-100.0) and prior_variance < np.exp(2.0), f'Lognormal distribution requires rate in (exp(-100), exp(2)), given {prior_variance}' @@ -254,6 +256,7 @@ def variational_mean(self) -> torch.Tensor: elif self.distribution == 'lognormal': M = (torch.minimum((M.exp() + 0.000001), torch.tensor(1e3))) + # M = torch.minimum(M + 0.000001, torch.tensor(1e3)) if self.is_H and (self.H_zero_mask is not None): # a H-variable with zero-entry restriction. 
@@ -314,24 +317,23 @@ def log_prior(self, cov_diag=self.prior_cov_diag).log_prob(sample) elif self.distribution == 'lognormal': mu = torch.clamp(mu, min=-100.0, max=10.0) - out = LogNormal(loc=mu, scale=np.sqrt(self.prior_variance)).log_prob(sample) - out = torch.sum(out, dim=-1) + out = LowRankMultivariateNormal(loc=mu, + cov_factor=self.prior_cov_factor, + cov_diag=self.prior_cov_diag).log_prob(sample) + # out = LogNormal(loc=mu, scale=np.sqrt(self.prior_variance)).log_prob(sample) + # out = torch.sum(out, dim=-1) # out = torch.zeros((num_seeds, num_classes), device=sample.device) elif self.distribution == 'gamma': mu = torch.clamp(mu, min=-100.0, max=4.0) - concentration = torch.exp(mu) - rate = self.prior_variance - ''' - print(concentration) - print('concentration') - print(rate) - print('rate') - ''' - out = Gamma(concentration=concentration, - rate=rate).log_prob(sample) - # sum over the last dimension - out = torch.sum(out, dim=-1) + out = LowRankMultivariateNormal(loc=mu, + cov_factor=self.prior_cov_factor, + cov_diag=self.prior_cov_diag).log_prob(sample) + # concentration = torch.exp(mu) + # rate = self.prior_variance + # out = Gamma(concentration=concentration, + # rate=rate).log_prob(sample) + # out = torch.sum(out, dim=-1) # out = torch.zeros((num_seeds, num_classes), device=sample.device) assert out.shape == (num_seeds, num_classes) @@ -443,7 +445,7 @@ def clamp_params(self) -> None: self.variational_mean_flexible.data = torch.clamp( self.variational_mean_flexible.data, min=self.mean_clamp[0], max=self.mean_clamp[1]) self.variational_logstd.data = torch.clamp( - self.variational_logstd.data, min=self.mean_clamp[0], max=self.logstd_clamp[1]) + self.variational_logstd.data, min=self.logstd_clamp[0], max=self.logstd_clamp[1]) # elif self.distribution == 'gamma': # self.variational_mean_flexible.data = torch.clamp( # self.variational_mean_flexible.data, min=-2.0, max=4) diff --git a/tutorials/supermarket/configs3_1.yaml 
b/tutorials/supermarket/configs3_1.yaml index 5a3971d..c2bbb7e 100644 --- a/tutorials/supermarket/configs3_1.yaml +++ b/tutorials/supermarket/configs3_1.yaml @@ -3,19 +3,19 @@ device: cuda # data_dir: /home/tianyudu/Data/MoreSupermarket/20180101-20191231_13/tsv/ # data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_9999/20180101-20191231_42/tsv # data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_9999/2 -data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_1904-1887-1974-2012-1992/20180101-20191231_44/tsv/ +# data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_1904-1887-1974-2012-1992/20180101-20191231_44/tsv/ # data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/m_1887/chunked_1 -# data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/sims34/1 +data_dir: /oak/stanford/groups/athey/MoreSupermarkets/csv/new_data/nf_runs/sims34/1 # utility: lambda_item # utility: lambda_item + theta_user * alpha_item # utility: lambda_item + theta_user * alpha_item + zeta_user * item_obs # utility: lambda_item + theta_user * alpha_item + gamma_user * beta_item * price_obs # utility: lambda_item + theta_user * alpha_item + gamma_user * price_obs # utility: lambda_item + theta_user * alpha_item - nfact_category * gamma_user * price_obs -utility: theta_user * alpha_item - nfact_category * gamma_user * price_obs +# utility: theta_user * alpha_item + nfact_category * gamma_user * price_obs # utility: -nfact_category * gamma_user * price_obs # utility: -nfact_category * gamma_user * price_obs -# utility: -gamma_user * price_obs +utility: -gamma_user * price_obs # utility: lambda_item out_dir: ./output/ # model configuration. 
@@ -39,8 +39,9 @@ coef_dist_dict: default: 'gaussian' # gamma_user: 'gamma' # nfact_category: 'gamma' - gamma_user: 'gaussian' - nfact_category: 'lognormal' + # gamma_user: 'gamma' + gamma_user: 'lognormal' + # nfact_category: 'gamma' prior_mean: default: 0.0 # mean is shape for gamma variable @@ -53,8 +54,8 @@ prior_variance: default: 100000.0 # variance is rate for gamma variable # shape is mean / var - gamma_user: 10.0 - nfact_category: 10.0 + gamma_user: 2.0 + nfact_category: 2.0 #### optimization. trace_log_q: False shuffle: False @@ -63,7 +64,7 @@ num_epochs: 200 learning_rate: 0.03 num_mc_seeds: 1 num_price_obs: 1 -obs_user: True +obs_user: False obs_item: False patience: 10 complete_availability: True