diff --git a/.gitignore b/.gitignore index 01d251d..e046c42 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,13 @@ outputs/ scripts/ data/cache/ data/raw/ +abcd/data/cache/ +abcd/data/raw/ + results/ .DS_Store __pycache__/ *.py[cod] *$py.class +*.egg-info \ No newline at end of file diff --git a/README.md b/README.md index 82811c3..1879e6e 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,9 @@ Paper link: https://arxiv.org/abs/2104.00783 Blog link: https://www.asapp.com/blog/action-based-conversations-dataset/ -![Agent Dashboard](/data/images/agent_dashboard.png) +![Agent Dashboard](/images/agent_dashboard.png) -![Customer Site](/data/images/customer_site.png) +![Customer Site](/images/customer_site.png) ## Usage All code is run by executing the corresponding command within the shell script `run.sh`, which will kick off the data preparation and training within `main.py`. To use, first unzip the file found in `data/abcd_v1.1.json.gz` using the `gunzip` command (or similar). Then comment or uncomment the appropriate lines in the shell script to get desired behavior. Finally, enter `sh run.sh` into the command line to get started. Use the `--help` option of argparse for flag details or read through the file located within `utils/arguments.py`. @@ -49,7 +49,7 @@ Each scene dict contains details about the customer setup along with the underly - _Flow_ and _Subflow_: these represent the ground truth user intent. They are used to generate the prompt, but are not shown directly to the customer. The job of the agent is to infer this (latent) intent and then match against the Agent Guidelines to resolve the customer issue. ### Guidelines -The agent guidelines are offered in their original form within [Agent Guidelines for ABCD](https://docs.google.com/document/d/1_SZit-iUAzNCICJ6qahULoMhqVOJCspQF37QiEJzHLc). This has been transformed into a formatted document for parsing by a model within `data/guidelines.json`. The intents with their button actions about found within `kb.json`. Lastly, the breakdown of all flows, subflows, and actions are found within `ontology.json`. +The agent guidelines are offered in their original form within [Agent Guidelines for ABCD](https://docs.google.com/document/d/1_SZit-iUAzNCICJ6qahULoMhqVOJCspQF37QiEJzHLc). This has been transformed into a formatted document for parsing by a model within `abcd/data/guidelines.json`. The intents with their button actions are found within `kb.json`. Lastly, the breakdown of all flows, subflows, and actions are found within `ontology.json`. ### Conversation Each conversation is made up of a list of turns.
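For orientation, a minimal sketch of loading the data (an added example, with key names assumed from the turn description below; check them against `abcd/data/abcd_sample.json` before relying on them):

```python
import json

# abcd_sample.json ships unzipped, unlike abcd_v1.1.json.gz. The sample is assumed
# to be a list of conversation dicts; the full file is keyed by train/dev/test splits.
with open("abcd/data/abcd_sample.json") as f:
    data = json.load(f)

convo = data[0] if isinstance(data, list) else data["train"][0]
for turn in convo["delexed"]:  # assumed key for the delexicalized turn list
    print(turn["speaker"], "->", turn["text"])
```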
Each turn is a dict with five parts: diff --git a/components/__init__.py b/abcd/components/__init__.py similarity index 100% rename from components/__init__.py rename to abcd/components/__init__.py diff --git a/abcd/components/datasets.py b/abcd/components/datasets.py new file mode 100644 index 0000000..f3e7e71 --- /dev/null +++ b/abcd/components/datasets.py @@ -0,0 +1,216 @@ +import numpy as np +import random +import torch +from torch.utils.data import Dataset +from abcd.utils.arguments import Config +from typing import List, Optional, Dict, Any, Sequence, Union + +from typing import TypeVar +from abc import ABC, abstractmethod +from .feature_dataclasses import ( + BaseFeature, + FeatureType, + ActionFeature, + CascadeFeature, + CompletionFeature, +) +from dataclasses import fields + + +class _BaseFeature(object): + """A single set of features of data.""" + + def __init__( + self, + input_ids: List[int], + segment_ids: List[int], + input_mask: List[int], + label_id: Optional[int], + position_ids: Optional[List[int]] = None, + ): + self.input_id = input_ids + self.segment_id = segment_ids + self.mask_id = input_mask + self.label_id = label_id + self.position_id = position_ids + + +class _ActionFeature(_BaseFeature): + """ A single set of features with precomputed context token ids""" + + def __init__( + self, + input_ids: List[int], + segment_ids: List[int], + input_mask: List[int], + label_ids: Dict, + context: Dict, + ): + super().__init__(input_ids, segment_ids, input_mask, label_ids["value"]) + # token_ids is a batch_size length list, where each item is 100 ids + self.context_token: List[List[int]] = context["token_ids"] + self.context_segment = context["segment_ids"] + self.context_mask = context["mask_ids"] + self.action_id = label_ids["action"] + + +class _CompletionFeature(_BaseFeature): + """ A single set of completion features with precomputed context token ids""" + + def __init__( + self, + input_ids: List[int], + segment_ids: List[int], + input_mask: List[int], + label_ids: Dict, + context: Dict, + candidates: Any, + ): + super().__init__(input_ids, segment_ids, input_mask, None) + self.candidates = candidates + self.context_token = context["token_ids"] + self.context_segment = context["segment_ids"] + self.context_mask = context["mask_ids"] + + self.intent_id = label_ids["intent"] + self.nextstep_id = label_ids["nextstep"] + self.action_id = label_ids["action"] + self.value_id = label_ids["value"] + self.utt_id = label_ids["utterance"] + + +CompletionFeatureType = TypeVar( + "CompletionFeatureType", bound=CompletionFeature, covariant=True +) + + +class _CascadeFeature(_CompletionFeature): + """ A single set of completion features with precomputed context token ids""" + + def __init__( + self, + input_ids: List[int], + segment_ids: List[int], + input_mask: List[int], + label_ids: Dict, + context: Dict, + candidates: Any, + ): + super().__init__( + input_ids, segment_ids, input_mask, label_ids, context, candidates + ) + self.convo_id = label_ids["convo"] + self.turn_count = label_ids["turn"] + + +class BaseDataset(Dataset[FeatureType], ABC): + def __init__(self, args: Config, features: List[FeatureType]): + self.data: List[FeatureType] = features + self.model_type = args.model_type + self.num_examples = len(features) + + def __len__(self) -> int: + return len(self.data) + + def __getitem__(self, idx: int) -> FeatureType: + return self.data[idx] + + @abstractmethod + def collate_func(self, features: Sequence[FeatureType]) -> FeatureType: + # def collate_func(self, args, split, 
raw_data): + raise NotImplementedError() + + +class ActionDataset(BaseDataset[ActionFeature]): + def collate_func(self, features: Sequence[ActionFeature]) -> ActionFeature: + return ActionFeature.stack(features) + + # input_ids = torch.tensor([f.input_id for f in features], dtype=torch.long) + # segment_ids = torch.tensor([f.segment_id for f in features], dtype=torch.long) + # mask_ids = torch.tensor([f.mask_id for f in features], dtype=torch.long) + # context_tokens = torch.tensor( + # [f.context_tokens for f in features], dtype=torch.long + # ) + # context_segments = torch.tensor( + # [f.context_segments for f in features], dtype=torch.long + # ) + # context_masks = torch.tensor( + # [f.context_masks for f in features], dtype=torch.long + # ) + # action_ids = torch.tensor([f.action_id for f in features], dtype=torch.long) + # value_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) + + # return ActionFeature( + # input_id=input_ids, + # segment_id=segment_ids, + # input_mask=mask_ids, + # context_token=context_tokens, + # context_segment=context_segments, + # context_mask=context_masks, + # action_id=action_ids, + # value_id=value_ids, + # ) + # return ( + # input_ids, + # segment_ids, + # mask_ids, + # context_tokens, + # context_segments, + # context_masks, + # action_ids, + # value_ids, + # ) + + +class CompletionDataset(BaseDataset[CompletionFeatureType]): + def collate_func(self, features: Sequence[CompletionFeatureType]) -> CompletionFeatureType: + return CompletionFeature.stack(features) + + # input_ids = torch.tensor([f.input_id for f in features], dtype=torch.long) + # segment_ids = torch.tensor([f.segment_id for f in features], dtype=torch.long) + # mask_ids = torch.tensor([f.mask_id for f in features], dtype=torch.long) + # context_tokens = torch.tensor( + # [f.context_token for f in features], dtype=torch.long + # ) + # context_segments = torch.tensor( + # [f.context_segment for f in features], dtype=torch.long + # ) + # context_masks = torch.tensor( + # [f.context_mask for f in features], dtype=torch.long + # ) + + # intent_ids = torch.tensor([f.intent_id for f in features], dtype=torch.long) + # nextstep_ids = torch.tensor([f.nextstep_id for f in features], dtype=torch.long) + # action_ids = torch.tensor([f.action_id for f in features], dtype=torch.long) + # value_ids = torch.tensor([f.value_id for f in features], dtype=torch.long) + # utterance_ids = torch.tensor([f.utt_id for f in features], dtype=torch.long) + # all_candidates = torch.tensor( + # [f.candidates for f in features], dtype=torch.long + # ) + + # return ( + # input_ids, + # segment_ids, + # mask_ids, + # context_tokens, + # context_segments, + # context_masks, + # intent_ids, + # nextstep_ids, + # action_ids, + # value_ids, + # utterance_ids, + # all_candidates, + # ) + + +class CascadeDataset(CompletionDataset[CascadeFeature]): + def collate_func(self, features: Sequence[CascadeFeature]) -> CascadeFeature: + return CascadeFeature.stack(features) + + # collated_batch = super().collate_func(features) + # convo_ids = torch.tensor([f.convo_id for f in features], dtype=torch.long) + # turn_counts = torch.tensor([f.turn_count for f in features], dtype=torch.long) + # cascade_batch = (convo_ids, turn_counts) + + # return collated_batch + cascade_batch diff --git a/abcd/components/feature_dataclasses.py b/abcd/components/feature_dataclasses.py new file mode 100644 index 0000000..1a867a5 --- /dev/null +++ b/abcd/components/feature_dataclasses.py @@ -0,0 +1,207 @@ +from dataclasses import dataclass, 
field, InitVar, astuple, asdict, fields +from typing import ( + Dict, + List, + Any, + Optional, + Union, + Tuple, + Iterator, + Iterable, + Sequence, + TypeVar, +) +from simple_parsing.helpers.serialization import JsonSerializable +from torch import Tensor +import torch +import numpy as np + + +@dataclass +class BaseFeature(JsonSerializable): + input_ids: List[int] + segment_ids: List[int] + input_mask: List[int] + + @property + def mask_id(self) -> List[int]: + return self.input_mask + + def keys(self): + return tuple(k for k, _ in self.items()) + + def values(self): + return tuple(v for _, v in self.items()) + + def items(self) -> Iterable[Tuple[str, Any]]: + for field in fields(self): + yield field.name, getattr(self, field.name) + + def _as_dict(self) -> Dict: + return asdict(self) + + def _as_tuple(self) -> Tuple: + return astuple(self) + + def __iter__(self) -> Iterator[Tuple[Tensor]]: + yield from self._as_tuple() + + def to(self, device: Union[str, torch.device]): + return type(self)(**{key: value.to(device) for key, value in self.items()}) + + @classmethod + def stack(cls, features: Sequence["FeatureType"]) -> "FeatureType": + kwargs = { + field.name: torch.tensor( + [getattr(feat, field.name) for feat in features], dtype=torch.long + ) + for field in fields(cls) + } + stacked = cls(**kwargs) + # assert False, [v.shape for v in stacked.values()] + return stacked + + +FeatureType = TypeVar("FeatureType", bound=BaseFeature, covariant=True) + + +@dataclass +class ActionFeature(BaseFeature): + + context: InitVar[Dict] = {} + context_tokens: List[int] = field(default_factory=list) # = context["token_ids"] + context_segments: List[int] = field( + default_factory=list + ) # = context["segment_ids"] + context_masks: List[int] = field(default_factory=list) # = context["mask_ids"] + + label_ids: InitVar[Dict[str, int]] = {} + action_id: Optional[int] = field(default=None) # = label_ids["action"] + label_id: Optional[int] = field(default=None) # = label_ids["value"] + + def __post_init__(self, context: Dict, label_ids: Dict[str, int]): + if context or label_ids: + if "token_ids" in label_ids: + # BUG: These get switched for some reason? 
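+ # (assumption, extending the BUG note above): callers appear to pass context and label_ids positionally, so a "token_ids" key showing up under label_ids means the two dicts arrived swapped; the swap below restores the intended order instead of raising a KeyError.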
+ context, label_ids = label_ids, context + assert "token_ids" in context, (context, label_ids) + self.context_tokens = context["token_ids"] + self.context_segments = context["segment_ids"] + self.context_masks = context["mask_ids"] + self.action_id = label_ids["action"] + self.label_id = label_ids["value"] + + +@dataclass +class CompletionFeature(BaseFeature): + candidates: List + + context: InitVar[Dict] = {} + label_ids: InitVar[Dict] = {} + + context_token: List[List[int]] = field(default_factory=list) + context_segment: List[int] = field(default_factory=list) + context_mask: List[int] = field(default_factory=list) + + intent_id: Union[int, List[int]] = field(default_factory=list) # type: ignore + nextstep_id: Union[int, List[int]] = field(default_factory=list) # type: ignore + action_id: Union[int, List[int]] = field(default_factory=list) # type: ignore + value_id: Union[int, List[int]] = field(default_factory=list) # type: ignore + utt_id: Union[int, List[int]] = field(default_factory=list) # type: ignore + + def __post_init__(self, context: Dict, label_ids: Dict): + if context or label_ids: + self.context_token = context["token_ids"] + self.context_segment = context["segment_ids"] + self.context_mask = context["mask_ids"] + + self.intent_id = label_ids["intent"] + self.nextstep_id = label_ids["nextstep"] + self.action_id = label_ids["action"] + self.value_id = label_ids["value"] + self.utt_id = label_ids["utterance"] + + +@dataclass +class CascadeFeature(CompletionFeature): + """ A single set of cascade features with precomputed context token ids""" + convo_id: Union[int, List[int]] = field(default_factory=list) # type: ignore + turn_count: Union[int, List[int]] = field(default_factory=list) # type: ignore + + def __post_init__(self, context: Dict, label_ids: Dict): + super().__post_init__(context=context, label_ids=label_ids) + if context or label_ids: + self.convo_id = label_ids["convo"] + self.turn_count = label_ids["turn"] + + +### Examples from `features.py`. + + +@dataclass +class InputExample(JsonSerializable): + """A single training/test example for simple sequence classification.""" + + # Unique id for the example. + guid: int + # list of strings. The untokenized text of the conversation so far. + input_context: List[str] + # The label of the example. This should be specified for train and dev examples, but + # not for test examples. + target_label: Optional[str] + # list of candidates to choose from for utterance ranking + candidates: Optional[List[str]] = field(init=False, default=None) + + @property + def context(self) -> List[str]: + return self.input_context + + @property + def label(self) -> Optional[str]: + return self.target_label + + +@dataclass +class ActionExample(InputExample): + """A single training/test example for slot value filling. """ + + tokens: List[int] + action: int + + @property + def context_tokens(self): + return self.tokens + + +@dataclass +class CompleteExample(InputExample): + """A single training/test example for task completion. 
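The targets InitVar below is unpacked positionally as (intent, nextstep, action, value_index, utt_index) in __post_init__.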
""" + + tokens: List[int] + + targets: InitVar[Tuple[Any, Any, Any, Any, Any]] + + def __post_init__(self, targets: Tuple[Any, Any, Any, Any, Any]): + intent, nextstep, action, value_index, utt_index = targets + self.intent_label = intent + self.nextstep_label = nextstep + self.action_label = action + self.value_label = value_index + self.utt_label = utt_index + + @property + def context_tokens(self): + return self.tokens + + +class CascadingExample(CompleteExample): + """A single training/test example for task completion. """ + + convo_id: int = field(init=False) + turn_count: int = field(init=False) + + def __post_init__(self, targets): + super().__post_init__(targets=targets) + + self.convo_id = convo_id + self.turn_count = turn_count diff --git a/abcd/components/features.py b/abcd/components/features.py new file mode 100644 index 0000000..185bf32 --- /dev/null +++ b/abcd/components/features.py @@ -0,0 +1,176 @@ +from typing import List, Optional + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__( + self, + guid: int, + input_context: List[str], + target_label: Optional[str], + candidates: List[str] = None, + ): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + context: list of strings. The untokenized text of the converation so far. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + candidates: list of candidates to choose from for utterance ranking + """ + self.guid = guid + self.context = input_context + self.label = target_label + self.candidates = candidates + + +class ActionExample(InputExample): + """A single training/test example for slot value filling. """ + + def __init__(self, guid, input_context, target_label, tokens, action): + super().__init__(guid, input_context, target_label) + self.context_tokens = tokens + self.action = action + + +class CompleteExample(InputExample): + """A single training/test example for task completion. """ + + def __init__(self, guid, input_context, targets, tokens, candidates): + super().__init__(guid, input_context, None, candidates) + self.context_tokens = tokens + + intent, nextstep, action, value_index, utt_index = targets + self.intent_label = intent + self.nextstep_label = nextstep + self.action_label = action + self.value_label = value_index + self.utt_label = utt_index + + +class CascadingExample(InputExample): + """A single training/test example for task completion. 
""" + + def __init__( + self, guid, input_context, targets, tokens, candidates, convo_id, turn_count + ): + super().__init__(guid, input_context, None, candidates) + self.context_tokens = tokens + + intent, nextstep, action, value_index, utt_index = targets + self.intent_label = intent + self.nextstep_label = nextstep + self.action_label = action + self.value_label = value_index + self.utt_label = utt_index + + self.convo_id = convo_id + self.turn_count = turn_count + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, segment_ids, input_mask, label_id, position_ids=None): + self.input_id = input_ids + self.segment_id = segment_ids + self.mask_id = input_mask + self.label_id = label_id + self.position_id = position_ids + + +class CandidateFeatures(InputFeatures): + """ A single set of features with precomputed candidates """ + + def __init__( + self, + input_ids, + segment_ids, + input_mask, + label_id, + candidates, + position_ids=None, + ): + super().__init__(input_ids, segment_ids, input_mask, label_id, position_ids) + # candidates is a (batch_size x num_candidates x hidden_dim) tensor + self.candidates = candidates + + +class ActionFeatures(InputFeatures): + """ A single set of features with precomputed context token ids""" + + def __init__( + self, + input_ids, + segment_ids, + input_mask, + label_id, + context, + action_id, + position_ids=None, + ): + super().__init__(input_ids, segment_ids, input_mask, label_id, position_ids) + # token_ids is a batch_size length list, where each item is 100 ids + self.context_token = context["token_ids"] + self.context_segment = context["segment_ids"] + self.context_mask = context["mask_ids"] + self.action_id = action_id + + +class CompletionFeatures(InputFeatures): + """ A single set of completion features with precomputed context token ids""" + + def __init__( + self, input_ids, segment_ids, input_mask, label_ids, candidates, context + ): + super().__init__(input_ids, segment_ids, input_mask, None) + self.candidates = candidates + self.context_token = context["token_ids"] + self.context_segment = context["segment_ids"] + self.context_mask = context["mask_ids"] + + self.intent_id = label_ids["intent"] + self.nextstep_id = label_ids["nextstep"] + self.action_id = label_ids["action"] + self.value_id = label_ids["value"] + self.utt_id = label_ids["utterance"] + + self.action_mask = int(label_ids["nextstep"] == 1) + self.value_mask = int(label_ids["value"] >= 0) + self.utt_mask = int(label_ids["nextstep"] == 0) + + +class CascadingFeatures(InputFeatures): + """ A single set of completion features with precomputed context token ids""" + + def __init__( + self, + input_ids, + segment_ids, + input_mask, + label_ids, + candidates, + context, + convo_id, + turn_count, + ): + super().__init__(input_ids, segment_ids, input_mask, None) + self.candidates = candidates + self.context_token = context["token_ids"] + self.context_segment = context["segment_ids"] + self.context_mask = context["mask_ids"] + + self.intent_id = label_ids["intent"] + self.nextstep_id = label_ids["nextstep"] + self.action_id = label_ids["action"] + self.value_id = label_ids["value"] + self.utt_id = label_ids["utterance"] + + self.action_mask = int(label_ids["nextstep"] == 1) + self.value_mask = int(label_ids["value"] >= 0) + self.utt_mask = int(label_ids["nextstep"] == 0) + self.convo_id = convo_id + self.turn_count = turn_count + diff --git a/abcd/components/models.py b/abcd/components/models.py new file mode 100644 index 
0000000..c49de7f --- /dev/null +++ b/abcd/components/models.py @@ -0,0 +1,217 @@ +import os, sys, pdb +import json +import math +import numpy as np +import GPUtil + +import torch +from torch import nn +from torch import optim +from torch.nn import functional as F + +from transformers import BertModel, RobertaModel, AlbertModel +from transformers.file_utils import WEIGHTS_NAME + +from abcd.utils.arguments import Config +from abcd.utils.help import ModelInputDict + + +class CoreModel(nn.Module): + def __init__(self, args: Config, checkpoint_dir): + super().__init__() + if args.model_type == "bert": + self.encoder = BertModel.from_pretrained("bert-base-uncased") + elif args.model_type == "roberta": + self.encoder = RobertaModel.from_pretrained("roberta-base") + elif args.model_type == "albert": + self.encoder = AlbertModel.from_pretrained("albert-base-v2") + + self.outputs = ["intent", "nextstep", "action", "value", "utt"] + self.checkpoint_dir = checkpoint_dir + self.use_intent = args.use_intent + + def forward(self, full_history: ModelInputDict, context_tokens: ModelInputDict): + raise NotImplementedError + + def save_pretrained(self, filepath=None): + if filepath is None: + filepath = os.path.join(self.checkpoint_dir, "pytorch_model.pt") + torch.save(self.state_dict(), filepath) + print(f"Model weights saved in {filepath}") + + @classmethod + def from_pretrained(cls, args, mappers, checkpoint_dir, filepath=None): + # Instantiate model, matching the (args, mappers, checkpoint_dir) signature of the subclasses. + model = cls(args, mappers, checkpoint_dir) + # Load weights and fill them inside the model + if filepath is None: + filepath = os.path.join(checkpoint_dir, "pytorch_model.pt") + model.load_state_dict(torch.load(filepath)) + model.eval() + print(f"Model loaded from {filepath}") + return model + + +class ActionStateTracking(CoreModel): + """ An AST model should output predictions for buttons, slots and values. There are multiple ways + to accomplish this goal: + a. Predict all 3 parts separately and join the results afterwards + b. Predict the 231 possible button-slots together and then just the values + c. First predict the 30 available buttons alone and then the slot-values together + d. First predict the 30 available buttons and then just the values, leaving the slots as implied + Option D is reasonable because each value only belongs to a certain slot, so selecting the correct + value implies that the slot has also been correctly selected. This final option is implemented below. + + To perform value-filling, the task is further decomposed into copying unique tokens from the context + for non-enumerable values (copy_score) or selecting from the ontology for enumerable values (enum_score).
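+ + A rough shape walkthrough for a batch of size B (a sketch, assuming hidden_dim=768 with 30 buttons and 126 enumerable values as in the comments below): pooled_history (B, 768) -> action_score (B, 30) and enum_prob (B, 126); pooled_context (B, 768) -> copy_prob (B, 100) and gate (B, 1); value_score = concat(gate * enum_prob, (1 - gate) * copy_prob) -> (B, 226).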
+ """ + + def __init__(self, args, mappers, checkpoint_dir): + super().__init__(args, checkpoint_dir) + self.outputs = ["action", "value"] + self.mappings = mappers + + self.action_projection = nn.Linear(args.hidden_dim, len(mappers["action"])) + self.enum_projection = nn.Linear(args.hidden_dim, len(mappers["value"])) + self.copy_projection = nn.Linear( + args.hidden_dim, 100 + ) # hardcode limit of 100 context tokens + self.gating_mechanism = nn.Linear( + args.hidden_dim + 100, 1 + ) # shrink down to scalar + + self.softmax = nn.Softmax(dim=1) + self.sigmoid = nn.Sigmoid() + + def forward(self, full_history: ModelInputDict, context_tokens: ModelInputDict): + # NOTE: (@lebrice): + # - Action depends ONLY on full history + # - value depends on BOTH the full_history and the context_tokens + + history_outputs = self.encoder(**full_history) + pooled_history = history_outputs.pooler_output # batch_size x 768 + action_score = self.softmax(self.action_projection(pooled_history)) + enum_prob = self.softmax(self.enum_projection(pooled_history)) + + context_outputs = self.encoder(**context_tokens) + pooled_context = context_outputs.pooler_output # batch_size x hidden + copy_prob = self.softmax( + self.copy_projection(pooled_context) + ) # batch_size x 100 + reverse_copy_proj = self.copy_projection.weight.t() # hidden x 100 + copy_context = torch.matmul( + pooled_context, reverse_copy_proj + ) # batch_size x 100 + joined = torch.cat( + [pooled_context, copy_context], dim=1 + ) # batch_size x 768+100 + gate = self.sigmoid(self.gating_mechanism(joined)) # batch_size x 1 + + enum_score = gate * enum_prob # batch_size x 126 + copy_score = (1 - gate) * copy_prob # batch_size x 100 + value_score = torch.cat([enum_score, copy_score], dim=1) # batch_size x 226 + + return action_score, value_score + +from typing import NamedTuple +from torch import Tensor + + +class CDSModelOutput(NamedTuple): + intent_scores: Tensor + nextstep_scores: Tensor + action_scores: Tensor + value_scores: Tensor + utt_scores: Tensor + + +class CascadeDialogSuccess(CoreModel): + """ Unlike the BaseModel, will output 5 predictions, one for each component """ + + def __init__(self, args, mappers, checkpoint_dir): + super().__init__(args, checkpoint_dir) + self.outputs = ["intent", "nextstep", "action", "value", "utterance"] + self.mappings = mappers + + self.intent_projection = nn.Linear(args.hidden_dim, len(mappers["intent"])) + self.nextstep_projection = nn.Linear(args.hidden_dim, len(mappers["nextstep"])) + self.action_projection = nn.Linear(args.hidden_dim, len(mappers["action"])) + + self.candidate_linear = nn.Linear(args.hidden_dim, 128) + self.context_linear = nn.Linear(args.hidden_dim, 128) + + self.enum_projection = nn.Linear(args.hidden_dim, len(mappers["value"])) + self.copy_projection = nn.Linear( + args.hidden_dim, 100 + ) # hardcode limit of 100 context tokens + self.gating_mechanism = nn.Linear( + args.hidden_dim + 100, 1 + ) # shrink down to scalar + + self.softmax = nn.Softmax(dim=1) + self.sigmoid = nn.Sigmoid() + + def add_candidate_data(self, utt_texts, utt_vectors): + self.utt_texts = utt_texts + self.utt_vectors = utt_vectors + + def forward(self, full_history, context_tokens, tools) -> CDSModelOutput: + if self.use_intent: + all_candidates, device, _ = tools + else: + all_candidates, device = tools + + history_outputs = self.encoder(**full_history) # batch_size x 768 + pooled_history = history_outputs.pooler_output + intent_score = self.softmax(self.intent_projection(pooled_history)) + nextstep_score = 
self.softmax(self.nextstep_projection(pooled_history)) + action_score = self.softmax(self.action_projection(pooled_history)) + enum_prob = self.softmax(self.enum_projection(pooled_history)) + + encoded_history = pooled_history.unsqueeze(1) # (batch_size, 1, hidden_dim) + projected_history = self.context_linear(encoded_history) # (batch_size, 1, 128) + + batch_cands = [] + for row in all_candidates: # each row includes 100 positions + vectors = [self.utt_vectors[position] for position in row] + batch_cands.append(torch.stack(vectors)) + + candidates = torch.stack(batch_cands).to( + device + ) # batch_size, num_candidates, hidden_dim + candidates = self.candidate_linear( + candidates + ) # (batch_size, num_candidates, 128) + candidates = candidates.transpose(1, 2) # (batch_size, 128, num_candidates) + + utt_score = torch.bmm(projected_history, candidates) + utt_score = utt_score.squeeze(1) # (batch_size, num_candidates) + utt_score = self.softmax(utt_score) # normalize into probabilities + + context_outputs = self.encoder(**context_tokens) + pooled_context = context_outputs.pooler_output + copy_prob = self.softmax( + self.copy_projection(pooled_context) + ) # batch_size x 100 + reverse_copy_proj = self.copy_projection.weight.t() + copy_context = torch.matmul( + pooled_context, reverse_copy_proj + ) # batch_size x 100 + joined = torch.cat( + [pooled_context, copy_context], dim=1 + ) # batch_size x 768+100 + gate = self.sigmoid(self.gating_mechanism(joined)) # batch_size x 1 + + enum_score = gate * enum_prob # batch_size x 125 + copy_score = (1 - gate) * copy_prob # batch_size x 100 + value_score = torch.cat([enum_score, copy_score], dim=1) # batch_size x 225 + + return CDSModelOutput( + intent_scores=intent_score, + nextstep_scores=nextstep_score, + action_scores=action_score, + value_scores=value_score, + utt_scores=utt_score, + ) diff --git a/abcd/components/systems.py b/abcd/components/systems.py new file mode 100644 index 0000000..4bf394e --- /dev/null +++ b/abcd/components/systems.py @@ -0,0 +1,209 @@ +import os, sys, pdb +import random +import numpy as np +import json +import pandas as pd +from abcd.utils.arguments import Config + + +class Application(object): + def __init__(self, args: Config, model, processor): + self.task = args.task + self.utt_vectors = model.utt_vectors + self.utt_texts = model.utt_texts + self.device = model.device + + tokenizer = processor.tokenizer + cls_token_segment_id = 0 + sequence_a_segment_id = 0 if args.model_type in ["roberta", "large"] else 1 + processor.special = { + "tokens": [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token], + "ids": [cls_token_segment_id, sequence_a_segment_id, 0], + "maximum": [97, 100], + } + + self.processor = processor + self.tokenizer = tokenizer + + self.intent_list = processor.intent_labels + self.action_list = processor.action_labels + self.value_list = processor.value_labels + self.enumerable_size = len(self.value_list) + + self.scenario_df = pd.read_csv("data/scenarios_0525.csv") + ontology = json.load(open("abcd/data/ontology.json", "r")) + self.non_enumerable = ontology["values"]["non_enumerable"] + self.so_far = [] # hold a list of context utterances + self.action_taken = False + + kb = json.load(open("abcd/data/kb.json", "r")) + action_mask_map, intent_mask_map = Application.prepare_masks(kb, ontology) + self.action_mask_map = action_mask_map + self.intent_mask_map = intent_mask_map + + @staticmethod + def prepare_masks(kb, ont): + # record the range that needs to be masked out + val_group_to_range = {} + 
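# NOTE (added, a reading of the loop below): "credit card" appears in more than one enumerable value group but is stored only once in the flattened value list, so before_cc skips the duplicate when computing offsets and keeps each (start, stop) range aligned with that list. +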
current_idx = 0 + before_cc = True + num_enumerable_vals = 0 + # print(val_group_to_range) + for val_group, values in ont["values"]["enumerable"].items(): + start = current_idx + + size = len(values) + if "credit card" in values: + if before_cc: + before_cc = False + else: + size -= 1 + + num_enumerable_vals += size + stop = start + size + val_group_to_range[val_group] = (start, stop) + current_idx = stop + + # build out the action to values mapping + action_mask_map = {} + for category, acts in ont["actions"].items(): + for action, values in acts.items(): + mask = np.zeros(100 + num_enumerable_vals) + mask[num_enumerable_vals:] = 1.0 + + for val_group in values: + if val_group in val_group_to_range: + start, stop = val_group_to_range[val_group] + mask[start:stop] = 1.0 + + action_mask_map[action] = mask + + # recreate the exact breakdown order from the loader + options = [] + for section, buttons in ont["actions"].items(): + actions = buttons.keys() + options.extend(actions) + # double check that all actions in the kb are valid + match, error = 0, 0 + for intent, actions in kb.items(): + for action in actions: + if action in options: + pass + else: + print(action) + pdb.set_trace() + # assert(action in options) + # make the reverse lookup for the id that needs to be masked out + action_to_idx = {action: index for index, action in enumerate(options)} + # create the actual intent to action mapping + intent_mask_map = {} + for flow, subflows in ont["intents"]["subflows"].items(): + for intent in subflows: + mask = np.zeros(30) + + valid_options = kb[intent] + for action in valid_options: + mask[action_to_idx[action]] = 1.0 + + intent_mask_map[intent] = mask + + return action_mask_map, intent_mask_map + + def delexicalize_text(self, scene, conversation): + """ Given all the utterances within a conversation and the scenario, delexicalize the + non-enumerable entities. 
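e.g. "my zip is 94103" -> "my zip is <zip_code>" (illustrative) +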
Inputs: + - scene: a dict with detail, personal_info and order info + - conversation: a list of utterance tuples where each tuple is (speaker, text, action, pred) + Returns: + - delex: a list of utterances where the text has been delexicalized + """ + non_enumerable = [] + + for slot in self.non_enumerable["personal"]: + if slot in scene: + non_enumerable.append((slot, scene[slot])) + + for slot, value in scene.items(): + string_val = str(value) + if string_val.startswith("$"): + non_enumerable.append(("amount", string_val[1:])) + if slot == "order_id": + non_enumerable.append((slot, scene[slot])) + + address = scene["address"] + address_tokens = address.split() + address_halves = address.split(",") + non_enumerable.append(("street_address", address_halves[0])) + non_enumerable.append(("full_address", address)) + non_enumerable.append(("zip_code", address_tokens[-1])) # zip code is the last token of the composed address + + delexed = [] + for utt in conversation: + text = utt.replace("|", "and").replace("_", " ").lower() + # must be in this order to prevent clash + for slot, value in non_enumerable: + if str(value) in text: + text = text.replace(str(value), f"<{slot}>") + + delexed.append(text) + return delexed + + def sample_scenario(self): + scenario = self.scenario_df.sample() + flow_detail = json.loads(scenario["Detail"].item()) + scene = json.loads( + scenario["Personal"].item() + ) # default scene to the personal info + + order = json.loads(scenario["Order"].item()) + street_address = order["address"] + scene[ + "address" + ] = f"{street_address} {order['city']}, {order['state']} {order['zip_code']}" + + for key, value in order.items(): + if key == "products": + for product in order["products"]: + product_name = product["brand"] + " " + product["product_type"] + scene[product_name] = "$" + str(product["amount"]) + if key not in ["address", "city", "status", "zip_code", "products"]: + scene[key] = value + self.scene = scene + + issue = flow_detail["issue"] + reason = flow_detail["reason"] + solution = flow_detail["solution"] + prefix = flow_detail.get("prefix", "Y") + suffix = flow_detail.get("suffix", "") + prompt = ( + f"{prefix}ou {issue} because {reason}. Explain your problem to the agent, provide any information that is requested and attempt to {solution}. 
{suffix}" + "" + ) + + return scene, prompt + + def take_action(self, intent_pred, action_pred, value_pred, context_tokens): + top_intent = np.argmax(intent_pred) + intent_name = self.intent_list[top_intent] + + # each intent mask should be size of 30 long + intent_mask = self.intent_mask_map[intent_name] + # now, all non valid actions should go to zero + action_pred *= np.array(intent_mask) + top_action = np.argmax(action_pred) + action_name = self.action_list[top_action] + + # each action mask should be size of 223 long + action_mask = self.action_mask_map[action_name] + # now, all non valid values should go to zero + value_pred *= np.array(action_mask) + top_value = np.argmax(value_pred) + if top_value < self.enumerable_size: # part of enumerable + value_name = self.value_list[top_value] + else: # copy from context + top_value -= self.enumerable_size + while top_value > len(context_tokens): + top_value -= len(context_tokens) + value_name = context_tokens[top_value] + + return {"Intent": intent_name, "Action": action_name, "Value": value_name} diff --git a/components/tools.py b/abcd/components/tools.py similarity index 52% rename from components/tools.py rename to abcd/components/tools.py index f771e43..e2fd5be 100644 --- a/components/tools.py +++ b/abcd/components/tools.py @@ -3,34 +3,38 @@ import numpy as np from tensorboardX import SummaryWriter -from torch.optim.optimizer import Optimizer, required +from torch.optim.optimizer import Optimizer from transformers import AdamW, get_linear_schedule_with_warmup + class ExperienceLogger(object): - def __init__(self, args, checkpoint_dir): self.args = args logging.basicConfig(level=logging.INFO) self.logger = logging.getLogger(__name__) - self.logger.addHandler(logging.FileHandler(args.output_dir + '/exp.log')) + self.logger.addHandler(logging.FileHandler(args.output_dir + "/exp.log")) self.epoch = 0 self.global_step = 0 self.eval_step = 0 self.log_interval = args.log_interval - self.best_score = float('-inf') + self.best_score = float("-inf") self.task = args.task self.mtype = args.model_type self.verbose = args.verbose self.output_dir = args.output_dir - self.filepath = os.path.join(checkpoint_dir, 'pytorch_model.pt') + self.filepath = os.path.join(checkpoint_dir, "pytorch_model.pt") def start_train(self, num_examples, total_step): self.logger.info("***** Running training *****") - self.logger.info(f" Train examples: {num_examples}, Batch size: {self.args.batch_size}") - self.logger.info(f" Num epochs: {self.args.epochs}, Optimization steps: {total_step}") + self.logger.info( + f" Train examples: {num_examples}, Batch size: {self.args.batch_size}" + ) + self.logger.info( + f" Num epochs: {self.args.epochs}, Optimization steps: {total_step}" + ) self.logger.info(f" Running experiment for {self.task} {self.args.filename}") def start_eval(self, num_examples, kind): @@ -39,9 +43,9 @@ def start_eval(self, num_examples, kind): self.batch_steps = 0 if self.verbose: - epoch_msg = f"epoch {self.epoch} evaluation" - self.logger.info(f"***** Running {epoch_msg} for {kind} {self.mtype} *****") - self.logger.info(f" Num evaluation examples: {num_examples}") + epoch_msg = f"epoch {self.epoch} evaluation" + self.logger.info(f"***** Running {epoch_msg} for {kind} {self.mtype} *****") + self.logger.info(f" Num evaluation examples: {num_examples}") def end_eval(self, result, kind): self.logger.info("***** Eval results for {} *****".format(kind)) @@ -50,13 +54,13 @@ def end_eval(self, result, kind): def log_train(self, step, loss, result, metric): if 
self.log_interval > 0 and self.global_step % self.log_interval == 0: - log_str = 'Step {:>6d} | Loss {:5.4f}'.format(step, loss) - self.add_scalar('train', 'loss', loss, self.global_step) + log_str = "Step {:>6d} | Loss {:5.4f}".format(step, loss) + self.add_scalar("train", "loss", loss, self.global_step) if self.verbose: value = round(result[metric], 3) log_str += f" | {metric} {value}" - self.add_scalar('train', metric.lower(), value, self.global_step) + self.add_scalar("train", metric.lower(), value, self.global_step) self.logger.info(log_str) self.global_step += 1 @@ -65,26 +69,34 @@ def log_dev(self, step, metric, value): self.eval_step += 1 avg_eval_loss = round(self.eval_loss / self.batch_steps, 4) - log_str = 'Eval {:3d} | Loss {} | {} {}'.format(step, avg_eval_loss, metric, value) + log_str = "Eval {:3d} | Loss {} | {} {}".format( + step, avg_eval_loss, metric, value + ) self.logger.info(log_str) - self.add_scalar('dev', 'loss', avg_eval_loss, self.global_step) - self.add_scalar('dev', metric.lower(), value, self.global_step) - + self.add_scalar("dev", "loss", avg_eval_loss, self.global_step) + self.add_scalar("dev", metric.lower(), value, self.global_step) def init_tb_writers(self): - self.train_writer = SummaryWriter(log_dir=self.output_dir + '/train') - self.dev_writer = SummaryWriter(log_dir=self.output_dir + '/dev') + self.train_writer = SummaryWriter(log_dir=self.output_dir + "/train") + self.dev_writer = SummaryWriter(log_dir=self.output_dir + "/dev") def add_scalar(self, mode, name, value, step): - if mode == 'train': + if mode == "train": self.train_writer.add_scalar(name, value, step) - elif mode == 'dev': + elif mode == "dev": self.dev_writer.add_scalar(name, value, step) class RAdam(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + degenerated_to_sgd=True, + ): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: @@ -93,13 +105,25 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - + self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + if ( + isinstance(params, (list, tuple)) + and len(params) > 0 + and isinstance(params[0], dict) + ): for param in params: - if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): - param['buffer'] = [[None, None, None] for _ in range(10)] - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) + if "betas" in param and ( + param["betas"][0] != betas[0] or param["betas"][1] != betas[1] + ): + param["buffer"] = [[None, None, None] for _ in range(10)] + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + buffer=[[None, None, None] for _ in range(10)], + ) super(RAdam, self).__init__(params, defaults) def __setstate__(self, state): @@ -113,62 +137,74 @@ def step(self, closure=None): for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad.data.float() if grad.is_sparse: - raise RuntimeError('RAdam does not support sparse gradients') + 
raise RuntimeError("RAdam does not support sparse gradients") p_data_fp32 = p.data.float() state = self.state[p] if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p_data_fp32) + state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32) + state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32) - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) exp_avg.mul_(beta1).add_(1 - beta1, grad) - state['step'] += 1 - buffered = group['buffer'][int(state['step'] % 10)] - if state['step'] == buffered[0]: + state["step"] += 1 + buffered = group["buffer"][int(state["step"] % 10)] + if state["step"] == buffered[0]: N_sma, step_size = buffered[1], buffered[2] else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] + buffered[0] = state["step"] + beta2_t = beta2 ** state["step"] N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t) buffered[1] = N_sma # more conservative since it's an approximated value if N_sma >= 5: - step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + step_size = math.sqrt( + (1 - beta2_t) + * (N_sma - 4) + / (N_sma_max - 4) + * (N_sma - 2) + / N_sma + * N_sma_max + / (N_sma_max - 2) + ) / (1 - beta1 ** state["step"]) elif self.degenerated_to_sgd: - step_size = 1.0 / (1 - beta1 ** state['step']) + step_size = 1.0 / (1 - beta1 ** state["step"]) else: step_size = -1 buffered[2] = step_size # more conservative since it's an approximated value if N_sma >= 5: - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) + if group["weight_decay"] != 0: + p_data_fp32.add_( + -group["weight_decay"] * group["lr"], p_data_fp32 + ) + denom = exp_avg_sq.sqrt().add_(group["eps"]) + p_data_fp32.addcdiv_(-step_size * group["lr"], exp_avg, denom) p.data.copy_(p_data_fp32) elif step_size > 0: - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) - p_data_fp32.add_(-step_size * group['lr'], exp_avg) + if group["weight_decay"] != 0: + p_data_fp32.add_( + -group["weight_decay"] * group["lr"], p_data_fp32 + ) + p_data_fp32.add_(-step_size * group["lr"], exp_avg) p.data.copy_(p_data_fp32) return loss diff --git a/data/abcd_sample.json b/abcd/data/abcd_sample.json similarity index 100% rename from data/abcd_sample.json rename to abcd/data/abcd_sample.json diff --git a/data/abcd_v1.1.json.gz b/abcd/data/abcd_v1.1.json.gz similarity index 100% rename from data/abcd_v1.1.json.gz rename to abcd/data/abcd_v1.1.json.gz diff --git a/data/guidelines.json b/abcd/data/guidelines.json similarity index 100% rename from data/guidelines.json rename to abcd/data/guidelines.json diff --git a/data/kb.json b/abcd/data/kb.json similarity index 100% rename from data/kb.json rename to 
abcd/data/kb.json diff --git a/data/ontology.json b/abcd/data/ontology.json similarity index 100% rename from data/ontology.json rename to abcd/data/ontology.json diff --git a/data/utterances.json b/abcd/data/utterances.json similarity index 100% rename from data/utterances.json rename to abcd/data/utterances.json diff --git a/utils/__init__.py b/abcd/utils/__init__.py similarity index 100% rename from utils/__init__.py rename to abcd/utils/__init__.py diff --git a/abcd/utils/arguments.py b/abcd/utils/arguments.py new file mode 100644 index 0000000..be07d52 --- /dev/null +++ b/abcd/utils/arguments.py @@ -0,0 +1,288 @@ +from simple_parsing import ArgumentParser +from simple_parsing.helpers import choice, field +from dataclasses import dataclass +import torch +from typing import Optional + + +@dataclass +class BaseConfig: + # Random seed + seed: int = 14 + # Which type of encoder and tokenizer to use + model_type: str = choice("roberta", "bert", "dialogpt", "albert", default="bert") + # Choose which of the two major tasks to train the model + task: str = choice("ast", "cds", default="ast") + # Whether or not to go into debug mode, which is faster + debug: bool = False + # whether or not to have verbose prints + verbose: bool = False + + +@dataclass +class DirectoryAndSavingConfig: + """ ------ DIRECTORY AND SAVING -------- """ + output_dir: str = "outputs/" + input_dir: str = "abcd/data" + # distinguish the trial run, often a MM/DD date + prefix: str = "0524" + # name of the model if saving, or loading from saved + filename: str = "" + # distinguish the saved data, often a version number + suffix: str = "v1" + # Filter for just errors during evaluation + filter: bool = False + + +@dataclass +class TrainingAndEvaluationConfig: + """ ------ TRAINING AND EVALUATION -------- """ + # load the best saved model and run evaluation, qualify or quantify flags must be on + do_eval: bool = False + log_interval: int = 100 + # examine the qualitative outputs of the model in natural language + qualify: bool = False + # examine the quantitative outputs of the model in reports + quantify: bool = False + + +@dataclass +class MajorModelOptions: + """ ------- MAJOR MODEL OPTIONS -------- """ + # use cascading evaluation rather than turn level + cascade: bool = False + # use an oracle intent classification module + use_intent: bool = False + # take advantage of KB guidelines to limit action and value options + use_kb: bool = False + + +@dataclass +class DatasetCreation: + """ ------ DATASET CREATION -------- """ + # which version of the dataset is being used + # v1.0 was used initially, but v1.1 is released as a significantly cleaner benchmark + version: float = 1.1 + # whether to build new vocabulary of Glove vectors + build_vocab: bool = False + # Maximum number of tokens to truncate each utterance + max_seq_len: int = 512 + + +@dataclass +class ParameterOptimizationConfig: + """ hyperparameters """ + # use RAdam optimizer rather than default AdamW + radam: bool = False + # Learning rate alpha for weight updates + learning_rate: float = field(default=3e-5, alias="-lr") + # Number of hidden units, size of hidden dimension + hidden_dim: int = 768 + # probability of dropping a node, opposite of keep prob + drop_prob: float = 0.2 + # Number of steps for gradient accumulation + grad_accum_steps: int = 1 + # weight_decay to regularize the weights + weight_decay: float = field(default=0.003, alias="-reg") + # batch size for training and evaluation + batch_size: int = 50 + # Number of epochs or episodes to train + 
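# (added note: with add_option_string_dash_variants enabled in solicit_params below, this should be reachable as either --epochs 20 or -e 20) +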
epochs: int = field(default=14, alias="-e") + # Number of GPUs to use. + n_gpu: int = torch.cuda.device_count() if torch.cuda.is_available() else 0 + + +@dataclass +class Config( + BaseConfig, + DirectoryAndSavingConfig, + TrainingAndEvaluationConfig, + MajorModelOptions, + DatasetCreation, + ParameterOptimizationConfig, +): + """ Parameters for the main.py script. """ + pass + + +def solicit_params() -> Config: + parser = ArgumentParser(add_option_string_dash_variants=True) + parser.add_arguments(Config, "config") + args = parser.parse_args() + config: Config = args.config + return args.config + + + # parser.add_argument("--seed", help="Random seed", type=int, default=14) + # parser.add_argument( + # "--model-type", + # choices=["roberta", "bert", "dialogpt", "albert"], + # help="Which type of encoder and tokenizer to use", + # default="bert", + # ) + # parser.add_argument( + # "--task", + # default="ast", + # type=str, + # choices=["ast", "cds"], + # help="choose which of the two major tasks to train the model", + # ) + # parser.add_argument( + # "--debug", + # default=False, + # action="store_true", + # help="whether or not to go into debug mode, which is faster", + # ) + # parser.add_argument( + # "-v", + # "--verbose", + # default=False, + # action="store_true", + # help="whether or not to have verbose prints", + # ) + + # # ------ DIRECTORY AND SAVING -------- + # parser.add_argument("--output-dir", default="outputs/", type=str) + # parser.add_argument("--input-dir", default="data/", type=str) + # parser.add_argument( + # "--prefix", + # type=str, + # default="0524", + # help="distinguish the trial run, often a MM/DD date", + # ) + # parser.add_argument( + # "--filename", + # type=str, + # help="name of the model if saving, or loading from saved", + # ) + # parser.add_argument( + # "--suffix", + # type=str, + # default="v1", + # help="distinguish the saved data, often a version number", + # ) + # parser.add_argument( + # "--filter", + # default=False, + # action="store_true", + # help="Filter for just errors during evaluation", + # ) + + # # ------ TRAINING AND EVALUATION -------- + # parser.add_argument( + # "--do-eval", + # default=False, + # action="store_true", + # help="load the best saved model and run evaluation, qualify or quantify flags must be on", + # ) + # parser.add_argument("--log-interval", default=100, type=int) + # parser.add_argument( + # "--qualify", + # default=False, + # action="store_true", + # help="examine the qualitative outputs of the model in natural language", + # ) + # parser.add_argument( + # "--quantify", + # default=False, + # action="store_true", + # help="examine the quantitative outputs of the model in reports", + # ) + + # # ------- MAJOR MODEL OPTIONS -------- + # parser.add_argument( + # "--cascade", + # default=False, + # action="store_true", + # help="use cascading evaluation rather than turn level", + # ) + # parser.add_argument( + # "--use-intent", + # default=False, + # action="store_true", + # help="use an oracle intent classification module", + # ) + # parser.add_argument( + # "--use-kb", + # default=False, + # action="store_true", + # help="take advantage of KB guidelines to limit action and value options", + # ) + + # # ------ DATASET CREATION -------- + # parser.add_argument( + # "--version", + # type=float, + # default=1.1, + # help="which version of the dataset is being used", + # ) + # # v1.0 was used initially, but v1.1 is released as a significantly cleaner benchmark + # parser.add_argument( + # "--build-vocab", + # default=False, 
+ # action="store_true", + # help="whether to build new vocabulary of Glove vectors", + # ) + # parser.add_argument( + # "--max-seq-len", + # default=512, + # type=int, + # help="Maximum number of tokens to truncate each utterance", + # ) + + # # ------ PARAMETER OPTIMIZATION -------- + # param_group = parser.add_argument_group(title="hyperparameters") + # parser.add_argument( + # "--radam", + # default=False, + # action="store_true", + # help="use RAdam optimizer rather than default AdamW", + # ) + # param_group.add_argument( + # "-lr", + # "--learning-rate", + # default=3e-5, + # type=float, + # help="Learning rate alpha for weight updates", + # ) + # param_group.add_argument( + # "--hidden-dim", + # default=768, + # type=int, + # help="Number of hidden units, size of hidden dimension", + # ) + # param_group.add_argument( + # "--drop-prob", + # default=0.2, + # type=float, + # help="probability of dropping a node, opposite of keep prob", + # ) + # param_group.add_argument( + # "--grad-accum-steps", + # default=1, + # type=int, + # help="Number of steps for gradient accumulation", + # ) + # param_group.add_argument( + # "-reg", + # "--weight-decay", + # default=0.003, + # type=float, + # help="weight_decay to regularize the weights", + # ) + # param_group.add_argument( + # "--batch-size", + # default=50, + # type=int, + # help="batch size for training and evaluation", + # ) + # param_group.add_argument( + # "-e", + # "--epochs", + # default=14, + # type=int, + # help="Number of epochs or episodes to train", + # ) + + args = parser.parse_args() + return args diff --git a/abcd/utils/embed.py b/abcd/utils/embed.py new file mode 100644 index 0000000..168dbbb --- /dev/null +++ b/abcd/utils/embed.py @@ -0,0 +1,32 @@ +import json +import random +import torch +from typing import List +from torch import Tensor +from tqdm import tqdm as progress_bar +from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel + +def main(): + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertModel.from_pretrained("bert-base-uncased") + + print("Loading data ...") + utt_texts = json.load(open(f"data/utterances.json", "r")) + num_cands = len(utt_texts) + utt_vectors: List[Tensor] = [] + + # NOTE: Unused: + # cand_embeds, cand_segments, cand_masks = [], [], [] + for cand_text in progress_bar(utt_texts, total=num_cands): + cand_inputs = tokenizer(cand_text, return_tensors="pt") + with torch.no_grad(): + cand_outputs = model(**cand_inputs) + utt_vectors.append(cand_outputs.pooler_output) + + utt_vectors_tensor = torch.cat(utt_vectors) + print("utt_vectors: {}".format(utt_vectors_tensor.shape)) + torch.save(utt_vectors, "data/utt_vectors.pt") + + +if __name__ == "__main__": + main() diff --git a/abcd/utils/evaluate.py b/abcd/utils/evaluate.py new file mode 100644 index 0000000..c40e2c8 --- /dev/null +++ b/abcd/utils/evaluate.py @@ -0,0 +1,414 @@ +import json +import os +import pdb +import random +import sys +import time as tm +from collections import Counter, OrderedDict, defaultdict +from typing import Dict, List, Optional, Union + +import numpy as np +import pandas as pd +import torch +from sklearn.metrics import accuracy_score +from torch import Tensor +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler + +from abcd.components.systems import Application +from abcd.utils.arguments import Config +from abcd.utils.help import prepare_inputs +from abcd.utils.load import load_guidelines + + +def ast_report(predictions, labels): + action_preds, 
value_preds = predictions + action_labels, value_labels = labels + + size = len(action_preds) + assert size == len(value_labels) + + top_action_preds = np.argmax(action_preds, axis=1) + action_match = action_labels == top_action_preds # array of booleans + action_acc = sum(action_match) / float(size) + + top_value_preds = np.argmax(value_preds, axis=1) + value_match = value_labels == top_value_preds + value_acc = sum(value_match) / float(size) + + joint_match = action_match & value_match + joint_acc = sum(joint_match) / float(size) + + full_result = { + "Action_Accuracy": round(action_acc, 4), + "Value_Accuracy": round(value_acc, 4), + "Joint_Accuracy": round(joint_acc, 4), + } + + return full_result, "Joint_Accuracy" + + +def ranking_report(predictions, labels, use_match=False): + full_result = {} + utt_match = [] + + for rank in [1, 5, 10]: + level = -rank # select the top 5 rather than bottom 5 + num_correct, num_possible = 0, 0 + # vectorized version possible, but a lot less readable + for pred, label in zip(predictions, labels): + top_k_indexes = np.argpartition(pred, kth=level)[level:] + if label in top_k_indexes: + num_correct += 1 + if rank == 1: + utt_match.append(True) + else: + if rank == 1: + utt_match.append(False) + + if label >= 0: # -1 means the turn was take-action or end-of-convo + num_possible += 1 + + rank_name = f"Recall_at_{rank}" + full_result[rank_name] = num_correct / num_possible + + if use_match: + return full_result, utt_match + else: + return full_result, "Recall_at_5" + + +def cds_report(predictions, labels, ci_and_tc, kb_labels=None): + """ Calculated in the form of cascaded evaluation + where each agent example or utterance is a scored example""" + intent_pred, nextstep_pred, action_pred, value_pred, utterance_rank = predictions + intent_label, nextstep_label, action_label, value_label, utterance_label = labels + convo_ids = ci_and_tc[0].detach().cpu().numpy() + turn_counts = ci_and_tc[1].detach().cpu().numpy() + + if kb_labels is None: + use_kb = False + else: + use_kb = True + intent_list = kb_labels["intent"] + action_list = kb_labels["action"] + guidelines = load_guidelines() + action_mask_map, intent_mask_map = Application.prepare_masks(*guidelines) + + num_turns = len(nextstep_pred) + assert num_turns == len(convo_ids) + + top_intent_preds = np.argmax(intent_pred, axis=1) + intent_match = intent_label == top_intent_preds # array of booleans + intent_acc = sum(intent_match) / float(num_turns) + + top_nextstep_preds = np.argmax(nextstep_pred, axis=1) + nextstep_match = nextstep_label == top_nextstep_preds # array of booleans + nextstep_acc = sum(nextstep_match) / float(num_turns) + + if use_kb: + intent_masks = [] + for top_intent in top_intent_preds: + intent_name = intent_list[top_intent] + # each intent mask should be size of 30 long + intent_mask = intent_mask_map[intent_name] + intent_masks.append(intent_mask) + # now, all non valid actions should go to zero + action_pred *= np.array(intent_masks) + + top_action_preds = np.argmax(action_pred, axis=1) + action_match = action_label == top_action_preds # array of booleans + num_turns_include_action = sum(action_label >= 0) + action_acc = sum(action_match) / float(num_turns_include_action) + + if use_kb: + action_masks = [] + for top_action in top_action_preds: + action_name = action_list[top_action] + # each action mask should be size of 223 long + action_mask = action_mask_map[action_name] + action_masks.append(action_mask) + # now, all non valid values should go to zero + value_pred *=
np.array(action_masks) + + top_value_preds = np.argmax(value_pred, axis=1) + value_match = value_label == top_value_preds + num_turns_include_value = sum(value_label >= 0) + value_acc = sum(value_match) / float(num_turns_include_value) + + joint_match = action_match & value_match + joint_acc = sum(joint_match) / float(num_turns_include_action) + + recall, utt_match = {}, [] + for rank in [1, 5, 10]: + level = -rank # select the top 5 rather than bottom 5 + num_correct, num_possible = 0, 0 + for pred, label in zip(utterance_rank, utterance_label): + top_k_indexes = np.argpartition(pred, kth=level)[level:] + if label in top_k_indexes: + num_correct += 1 + if rank == 1: + utt_match.append(True) + else: + if rank == 1: + utt_match.append(False) + + if label >= 0: + num_possible += 1 + recall[str(rank)] = num_correct / num_possible + + # group by convo_ids + unique_convo_ids = list(set(convo_ids)) + conversations = {} + for uci in unique_convo_ids: + turns, correctness = [], [] + row_id = 0 + for convo_id, turn_count in zip(convo_ids, turn_counts): + if convo_id == uci: + turns.append(turn_count) + + correct = False + intent_right = intent_match[row_id] + nextstep_right = nextstep_match[row_id] + + if nextstep_label[row_id] == 0: + if intent_right and nextstep_right and utt_match[row_id]: + correct = True + elif nextstep_label[row_id] == 1: + if intent_right and nextstep_right and joint_match[row_id]: + correct = True + elif nextstep_label[row_id] == 2: + if intent_right and nextstep_right: + correct = True + + correctness.append(correct) + row_id += 1 + + # sort by turn_counts + ordered = [ + cor for _, cor in sorted(zip(turns, correctness), key=lambda tc: tc[0]) + ] + conversations[uci] = ordered + + # count how many correct + turn_score, turn_correct = 0, 0 + for convo_id, convo_correctness in conversations.items(): + convo_length = len(convo_correctness) + # we use turn_id rather than the true turn_count since turn counts will skip numbers + # when looping through the conversation due to skipping over customer utterances + for turn_id in range(convo_length): + num_remaining = convo_length - turn_id + + num_correct = 0 + # count up how many were predicted correctly + while turn_id < convo_length and convo_correctness[turn_id]: + num_correct += 1 + turn_id += 1 + + if num_correct > 0: + turn_correct += 1 + # normalize by the number of turns remaining + turn_score += num_correct / num_remaining + + # normalize by total number of turns possible + turn_acc = turn_correct / float(num_turns) + final_score = turn_score / float(num_turns) + + full_result = { + "Intent_Accuracy": round(intent_acc, 4), + "Nextstep_Accuracy": round(nextstep_acc, 4), + "Action_Accuracy": round(action_acc, 4), + "Value_Accuracy": round(value_acc, 4), + "Joint_Accuracy": round(joint_acc, 4), + "Recall_at_1": round(recall["1"], 4), + "Recall_at_5": round(recall["5"], 4), + "Recall_at_10": round(recall["10"], 4), + "Turn_Accuracy": round(turn_acc, 4), + "Cascading_Score": round(final_score, 4), + } + + return full_result, "Cascading_Score" + + +def task_completion_report(predictions, labels, kb_labels=None): + intent_pred, nextstep_pred, action_pred, value_pred, utterance_rank = predictions + intent_label, nextstep_label, action_label, value_label, utterance_label = labels + num_turns = len(nextstep_pred) + + if kb_labels is None: + use_kb = False + else: + use_kb = True + intent_list = kb_labels["intent"] + action_list = kb_labels["action"] + guidelines = load_guidelines() + action_mask_map, intent_mask_map = 
Application.prepare_masks(*guidelines) + + top_intent_preds = np.argmax(intent_pred, axis=1) + intent_match = intent_label == top_intent_preds # array of booleans + intent_acc = sum(intent_match) / float(num_turns) + + top_nextstep_preds = np.argmax(nextstep_pred, axis=1) + nextstep_match = nextstep_label == top_nextstep_preds # array of booleans + nextstep_acc = sum(nextstep_match) / float(num_turns) + + if use_kb: + intent_masks = [] + for top_intent in top_intent_preds: + intent_name = intent_list[top_intent] + # each intent mask should be size of 30 long + intent_mask = intent_mask_map[intent_name] + intent_masks.append(intent_mask) + # now, all non valid actions should go to zero + action_pred *= np.array(intent_masks) + + top_action_preds = np.argmax(action_pred, axis=1) + action_match = action_label == top_action_preds # array of booleans + num_turns_include_action = sum(action_label >= 0) + action_acc = sum(action_match) / float(num_turns_include_action) + + if use_kb: + action_masks = [] + for top_action in top_action_preds: + action_name = action_list[top_action] + # each action mask should be size of 223 long + action_mask = action_mask_map[action_name] + action_masks.append(action_mask) + # now, all non valid values should go to zero + value_pred *= np.array(action_masks) + + top_value_preds = np.argmax(value_pred, axis=1) + value_match = value_label == top_value_preds + num_turns_include_value = sum(value_label >= 0) + value_acc = sum(value_match) / float(num_turns_include_value) + + joint_match = action_match & value_match + joint_acc = sum(joint_match) / float(num_turns_include_action) + + recall, utt_match = ranking_report(utterance_rank, utterance_label, use_match=True) + + assert num_turns == len(value_label) + assert len(intent_pred) == len(nextstep_label) + assert len(utt_match) == num_turns + assert len(action_match) == len(top_value_preds) + + turn_correct = 0 + for turn in range(num_turns): + if intent_match[turn] and nextstep_match[turn]: + pass + else: + continue + + if nextstep_label[turn] == 0 and utt_match[turn]: + turn_correct += 1 + elif nextstep_label[turn] == 1 and joint_match[turn]: + turn_correct += 1 + elif nextstep_label[turn] == 2: # end_conversation + turn_correct += 1 + turn_acc = turn_correct / float(num_turns) + + full_result = { + "Intent_Accuracy": round(intent_acc, 4), + "Nextstep_Accuracy": round(nextstep_acc, 4), + "Action_Accuracy": round(action_acc, 4), + "Value_Accuracy": round(value_acc, 4), + "Joint_Accuracy": round(joint_acc, 4), + "Recall_at_1": round(recall["Recall_at_1"], 4), + "Recall_at_5": round(recall["Recall_at_5"], 4), + "Recall_at_10": round(recall["Recall_at_10"], 4), + "Turn_Accuracy": round(turn_acc, 4), + } + + return full_result, "Turn_Accuracy" + + +def qualify(args, ids, tokenizer, target_maps, scores, targets): + history_ids, context_ids = ids + action_mapper, value_mapper = target_maps + num_values = len(value_mapper) + pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token) + + action_score, value_score = scores + action_target, value_target = targets + top_action_ids = np.argmax(action_score.detach().cpu().numpy(), axis=1) + top_value_ids = np.argmax(value_score.detach().cpu().numpy(), axis=1) + + for index, (history, context) in enumerate(zip(history_ids, context_ids)): + stripped_history = [x for x in history if x != pad_id] + history_tokens = tokenizer.convert_ids_to_tokens(stripped_history) + history_symbols = " ".join(history_tokens).replace(" ##", "") + history_text = ( + history_symbols.replace("Ġ", 
"").replace("", "//").replace(" âĢ Ļ", "'") + ) + action_pred = action_mapper[top_action_ids[index]] + action_actual = action_mapper[action_target[index].cpu()] + + if args.filter and (action_pred == action_actual): + print("--- Skipping since model is correct ---") + continue + + context_tokens = tokenizer.convert_ids_to_tokens(context) + tvii = top_value_ids[index] + if tvii >= num_values: + tvii -= num_values + value_pred = context_tokens[tvii] + else: + value_pred = value_mapper[tvii] + + vtic = value_target[index].cpu() + if vtic >= num_values: + vtic -= num_values + value_actual = context_tokens[vtic] + else: + value_actual = value_mapper[vtic] + print(index, history_text) + print(f"Predicted Action: {action_pred}, Actual: {action_actual}") + print(f"Predicted Value: {value_pred}, Actual: {value_actual}") + + pdb.set_trace() + + +def quantify( + args: Config, + predictions: List[Tensor], + labels: List[Tensor], + utils: Union[str, Dict] = None, +): + assert len(predictions) == len(labels) + + if utils == "train" and not args.verbose: + return predictions, labels + + if args.task == "ast": + predictions = [pred.detach().cpu().numpy() for pred in predictions] + labels = [label.detach().cpu().numpy() for label in labels] + report, res_name = ast_report(predictions, labels) + + elif args.task == "cds": + predictions = [pred.detach().cpu().numpy() for pred in predictions] + labels = [label.detach().cpu().numpy() for label in labels] + # kb_labels = utils["kb_labels"] if args.use_kb else None + kb_labels: Optional[dict] = None + if args.use_kb: + assert isinstance(utils, dict) + kb_labels = utils["kb_labels"] + + if args.cascade: + assert isinstance(utils, dict) + ci_and_tc = utils["ci_and_tc"] + result = cds_report(predictions, labels, ci_and_tc, kb_labels) + report, res_name = result + else: + report, res_name = task_completion_report(predictions, labels, kb_labels) + + return report, res_name + + +if __name__ == "__main__": + + class MyModel: + def __init__(self): + self.utt_vectors = [] + self.utt_texts = [] + + args = {} + run_interaction(args, MyModel()) diff --git a/abcd/utils/help.py b/abcd/utils/help.py new file mode 100644 index 0000000..63c3a10 --- /dev/null +++ b/abcd/utils/help.py @@ -0,0 +1,217 @@ +import math +import os +import pdb +import random +import sys +from typing import Any, Dict, NamedTuple, Optional, Tuple, Union, overload + +import numpy as np +import torch +from abcd.components.feature_dataclasses import ( + ActionFeature, + BaseFeature, + CascadeFeature, +) +from abcd.utils.arguments import Config +from torch import Tensor + +try: + from typing import Literal, TypedDict +except ImportError: + from typing_extensions import Literal, TypedDict # type: ignore + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def set_seed(args: Config): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + +def setup_gpus(args: Config) -> Config: + n_gpu = 0 # set the default to 0 + if torch.cuda.is_available(): + n_gpu = torch.cuda.device_count() + # NOTE: Replacing this next line with the property on the Config dataclass: + # args.n_gpu = n_gpu + # Therefore we just check that the value would have been the same anyway. 
+    assert args.n_gpu == n_gpu
+
+    if n_gpu > 0:  # this is not an 'else' statement and cannot be combined
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
+
+    if args.debug:
+        args.epochs = 3
+    return args
+
+
+def check_cache(args: Config, cache_dir: str) -> Tuple[str, bool]:
+    cache_filename = f"{args.model_type}_{args.task}"
+    if args.cascade:
+        cache_filename += "_cascade"
+    if args.use_intent:
+        cache_filename += "_intent"
+    cache_path = os.path.join(cache_dir, cache_filename)
+
+    if os.path.exists(cache_path):
+        print(f"Loading features from cached file {cache_path}")
+        return cache_path, True
+    else:
+        print("Loading raw data and preparing new features")
+        return cache_path, False
+
+
+def check_directories(args: Config):
+    cache_dir = os.path.join(args.input_dir, "cache")
+    checkpoint_folder = f"{args.prefix}_{args.filename}_{args.model_type}_{args.suffix}"
+    ckpt_dir = os.path.join(args.output_dir, args.task, checkpoint_folder)
+    directories = [args.input_dir, cache_dir, args.output_dir, ckpt_dir]
+
+    for directory in directories:
+        if os.path.exists(directory):
+            if directory == ckpt_dir:
+                print(f"Warning: {directory} exists and files may be overwritten")
+        else:
+            print(f"Creating {directory} directory ...")
+            os.makedirs(directory)
+
+    cache_results = check_cache(args, cache_dir)
+    return ckpt_dir, cache_results
+
+
+# Define some types for the return value of the `prepare_inputs` function below:
+
+
+class ModelInputDict(TypedDict):
+    """ TypedDict for the formatted input to the model. """
+
+    input_ids: Tensor
+    token_type_ids: Tensor
+    attention_mask: Tensor
+
+
+class ASTTargetsTuple(NamedTuple):
+    """ NamedTuple for the 'targets'/'labels' of a model in the AST task.
+
+    NOTE: AST: Action State Tracking: Classify the action and the value taken at a
+    given step.
+    """
+
+    action_id: Tensor
+    value_id: Tensor
+
+
+class CDSTargetsTuple(NamedTuple):
+    """ NamedTuple for the 'targets'/labels for a model in the CDS task.
+
+    NOTE: CDS: Cascading Dialogue Success: Given the context, predict the rest of the
+    conversation. (TODO: @lebrice not 100% sure about this)
+    """
+
+    # intent  nextstep  action  value  utterance
+    # targets = [batch[6], batch[7], batch[8], batch[9], batch[10]]
+    intent_id: Tensor
+    nextstep_id: Tensor
+    action_id: Tensor
+    value_id: Tensor
+    utt_id: Tensor
+    convo_id: Optional[Union[int, Tensor]] = None
+    turn_count: Optional[Union[int, Tensor]] = None
+
+
+@overload
+def prepare_inputs(
+    args: Config, batch: CascadeFeature, speaker_turn: bool
+) -> Tuple[ModelInputDict, CDSTargetsTuple, ModelInputDict, Any]:
+    ...
+
+
+# (BUG: This is fine, but mypy doesn't like it, so putting a type: ignore for now).
+@overload
+def prepare_inputs(  # type: ignore
+    args: Config, batch: ActionFeature, speaker_turn: bool
+) -> Tuple[ModelInputDict, ASTTargetsTuple, ModelInputDict, Any]:
+    ...
+
+
+def prepare_inputs(
+    args: Config,
+    batch: Union[ActionFeature, CascadeFeature],
+    speaker_turn: bool = False,
+) -> Tuple[ModelInputDict, Union[ASTTargetsTuple, CDSTargetsTuple], ModelInputDict, Any]:
+    """
+    Convert the `Feature` object into what the transformer models expect as an input for
+    the given task.
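+
+    A rough usage sketch (hypothetical `args`, `batch` and `model`, assuming the
+    AST task):
+
+        full_history, targets, context_tokens, _ = prepare_inputs(args, batch)
+        action_score, value_score = model(full_history, context_tokens)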
+ """ + if args.task == "ast": + assert isinstance(batch, ActionFeature) + full_history: ModelInputDict = { + # "input_ids": batch[0], + # "token_type_ids": batch[1], + # "attention_mask": batch[2], + "input_ids": batch.input_ids, + "token_type_ids": batch.segment_ids, + "attention_mask": batch.input_mask, + } + context_tokens: ModelInputDict = { + # "input_ids": batch[3], + # "token_type_ids": batch[4], + # "attention_mask": batch[5], + "input_ids": batch.context_tokens, + "token_type_ids": batch.context_segments, + "attention_mask": batch.context_masks, + } + # targets = [batch[6], batch[7]] # actions and values + ast_targets = ASTTargetsTuple( + action_id=batch.action_id, value_id=batch.label_id + ) + tools: Any = device + return full_history, ast_targets, context_tokens, tools + + assert isinstance(batch, CascadeFeature) + full_history = { + "input_ids": batch.input_ids, + "token_type_ids": batch.segment_ids, + "attention_mask": batch.input_mask, + # "input_ids": batch[0], + # "token_type_ids": batch[1], + # "attention_mask": batch[2], + } + context_tokens = { + "input_ids": batch.context_token, + "token_type_ids": batch.context_segment, + "attention_mask": batch.context_mask, + # "input_ids": batch[3], + # "token_type_ids": batch[4], + # "attention_mask": batch[5], + } + + # candidates = batch[11] + candidates = batch.candidates + + # intent nextstep action value utterance + # targets = [batch[6], batch[7], batch[8], batch[9], batch[10]] + cds_targets = CDSTargetsTuple( + intent_id=batch.intent_id, + nextstep_id=batch.nextstep_id, + action_id=batch.action_id, + value_id=batch.value_id, + utt_id=batch.utt_id, + convo_id=batch.convo_id if args.cascade else None, + turn_count=batch.turn_count if args.cascade else None, + ) + + # NOTE: @lebrice: Will use a 'targets' tuple with None values when `args.cascade` is + # False, rather than use a tuple with more items when it is. + # Will need to check that there isn't a switch somewhere that is based on the length + # of the tuple though. + # if args.cascade: + # # targets.append(batch[15]) # convo_ids + # # targets.append(batch[16]) # turn_counts + # targets.append(batch.convo_id) # convo_ids + # targets.append(batch.turn_count) # turn_counts + if args.use_intent: + tools = candidates, device, batch.intent_id + # tools = candidates, device, batch[6] + else: + tools = candidates, device + + return full_history, cds_targets, context_tokens, tools diff --git a/abcd/utils/load.py b/abcd/utils/load.py new file mode 100644 index 0000000..f868545 --- /dev/null +++ b/abcd/utils/load.py @@ -0,0 +1,103 @@ +import os, sys, pdb +import csv +import json +import random +import math +import torch +import numpy as np +from torch import nn +from torch.optim.optimizer import Optimizer +from typing import Dict, Union, List, Tuple + +from transformers import BertTokenizer, RobertaTokenizer, AlbertTokenizer +from abcd.components.tools import RAdam, AdamW, get_linear_schedule_with_warmup +from abcd.utils.arguments import Config +from pathlib import Path +from abcd.utils.objects.data import Split, Conversation + + +def load_data(args: Config, already_cached: bool) -> Dict[Split, List[Conversation]]: + if already_cached: + return {} # no need to load raw_data since we already have a feature cache + else: + # doesnt exist! 
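+        # (The raw file is expected at `{input_dir}/abcd_v{version}.json`, or as the
+        # `.json.gz` archive the dataset ships as; the parsed dict is keyed by split.)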
+ data_path = Path(args.input_dir) / f"abcd_v{args.version}.json" + if not data_path.exists(): + gzip_file_path = data_path.with_suffix(".json.gz") + assert gzip_file_path.exists(), gzip_file_path + import gzip + # TODO: Could probably call something like `guzip` programmatically from python if the + # json file doesnt exist! + f = gzip.open(gzip_file_path, mode="r") + else: + f = open(data_path, "r") + raw_data: Dict[Split, List[Conversation]] = json.load(f) + return raw_data + + +def load_guidelines(): + kb = json.load(open("data/kb.json", "r")) + ont = json.load(open("data/ontology.json", "r")) + return kb, ont + + +def load_candidates(args: Config): + # The raw agent utterances that are used as candidates when performing utterance ranking + utt_texts = json.load(open(f"{args.input_dir}/utterances.json", "r")) + # Vectors already been embedded by BERT. To embed in some other fashion, use the utt_texts instead + utt_vectors = torch.load(f"{args.input_dir}/utt_vectors.pt") + return utt_texts, utt_vectors + + +def load_tokenizer(args: Config): + ontology = json.load(open(f"{args.input_dir}/ontology.json", "r")) + non_enumerable = ontology["values"]["non_enumerable"] + special = [ + f"<{slot}>" for category, slots in non_enumerable.items() for slot in slots + ] + + if args.model_type == "bert": + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + elif args.model_type == "roberta": + tokenizer = RobertaTokenizer.from_pretrained("roberta-base") + elif args.model_type == "albert": + tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2") + + tokenizer.add_tokens(special) + return tokenizer, ontology + + +def get_optimizer(args: Config, model: nn.Module, adam_epsilon=1e-8) -> Union[RAdam, AdamW]: + no_decay = ["bias", "LayerNorm.weight"] + grouped_parameters = [ + { + "params": [ + p + for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": args.weight_decay, + }, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] + if args.radam: + optimizer = RAdam(grouped_parameters, lr=args.learning_rate, eps=adam_epsilon) + else: + optimizer = AdamW(grouped_parameters, lr=args.learning_rate, eps=adam_epsilon) + return optimizer + + +def get_scheduler(args: Config, optimizer: Optimizer, training_steps: int, warmup_steps=0, warmup_ratio=0.06): + if warmup_steps == 0: # use the warmup ratio instead + warmup_steps = math.ceil(training_steps * warmup_ratio) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=warmup_steps, num_training_steps=training_steps + ) + return scheduler diff --git a/abcd/utils/objects/__init__.py b/abcd/utils/objects/__init__.py new file mode 100644 index 0000000..5d9b59e --- /dev/null +++ b/abcd/utils/objects/__init__.py @@ -0,0 +1,2 @@ +from .data import Speaker, Turn, DialogueItem, Conversation, Scenario, RawData, Targets +from .ontology import Ontology diff --git a/abcd/utils/objects/data.py b/abcd/utils/objects/data.py new file mode 100644 index 0000000..803df69 --- /dev/null +++ b/abcd/utils/objects/data.py @@ -0,0 +1,173 @@ +""" Simple set of typed dicts that match the data format of ABCD. + +This doesn't have any runtime impact, it's just to make it easier to understand what +the data contains in code. 
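+
+A rough usage sketch (hypothetical path), once the raw json has been parsed:
+
+    raw_data: RawData = json.load(open("abcd/data/abcd_v1.1.json"))
+    first_turn: Turn = raw_data["train"][0]["delexed"][0]
+    print(first_turn["speaker"], first_turn["text"])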
+""" +from typing import Dict, List, NamedTuple, Optional +try: + from typing import TypedDict, Literal +except ImportError: + from typing_extensions import TypedDict, Literal # type: ignore + + +Split = Literal["train", "dev", "test"] +Speaker = Literal["customer", "agent", "action"] + +# NOTE: This is still a bit unclear, but the values for this second field appear to be +# related to the next_steps in the ontology json. +NextStep = Literal["retrieve_utterance", "take_action", "end_conversation"] + + +class Targets(NamedTuple): + """ Tuple of five items representing the subtask labels + + Examples: + + - When in an agent's turn: + ```json + "targets": [ + "return_size", + "take_action", + "validate-purchase", + [ + "cminh730", + "cminh730@email.com", + "3348917502" + ], + -1, + ] + ``` + + - When in a user's turn: + ```json + "targets": [ + "return_size", + null, + null, + [], + -1 + ] + ``` + """ + # Intent Classification (text) - 55 subflow options + subflow_name: str + # Nextstep Selection (text) - take_action, retrieve_utterance or end_conversation; 3 options + next_step: Optional[NextStep] + # Action Prediction (text) - the button clicked by the agent; 30 options + action: Optional[str] + # The values that are passed to the action. + # Value Filling (list) - the slot value(s) associated with the action above; 125 options + values: List[str] + # Utterance Ranking (int) - target position within list of candidates; 100 options + some_integer: int + + +class Turn(TypedDict): + # Either "agent", "customer" or "action" + speaker: Speaker + # The utterance of the agent/customer or the system generated response of the action + text: str + # Integer representing the turn number, starting from 1 + turn_count: int + # Tuple of five items representing the subtask labels + targets: Targets + # List of utterance ids representing the pool of 100 candidates to choose from when ranking. + # The surface form text can be found in `utterances.json` where the utt_id is the index. + # Only applicable when the current turn is a "retrieve_utterance" step. + candidates: List[int] + + +class Personal(TypedDict): + customer_name: str + email: str + member_level: str + phone: str + username: str + + +class ProductItem(TypedDict): + brand: str + product_type: str + amount: int + image_url: str + + +class Order(TypedDict): + street_address: str + full_address: str + city: str + num_products: str + order_id: str + packaging: str + payment_method: str + products: List[ProductItem] + purchase_date: str + state: str + zip_code: str + + +class ProductDict(TypedDict): + names: List[str] + amounts: List[int] + + +class Scenario(TypedDict): + """ Typed Dict for the 'ccenario' field of a "conversation" in ABCD dataset. 
+ + Examples: + ```json + "scenario": { + "personal": { + "customer_name": "crystal minh", + "email": "cminh730@email.com", + "member_level": "bronze", + "phone": "(977) 625-2661", + "username": "cminh730" + }, + "order": { + "street_address": "6821 1st ave", + "full_address": "6821 1st ave san mateo, ny 75227", + "city": "san mateo", + "num_products": "1", + "order_id": "3348917502", + "packaging": "yes", + "payment_method": "credit card", + "products": "[{'brand': 'michael_kors', 'product_type': 'jeans', 'amount': 94, 'image_url': 'images/michael_kors-jeans.jpeg'}]", + "purchase_date": "2019-11-06", + "state": "ny", + "zip_code": "75227" + }, + "product": { + "names": [ + "michael_kors jeans" + ], + "amounts": [ + 94 + ] + }, + "flow": "product_defect", + "subflow": "return_size" + } + ``` + """ + personal: Personal + order: Order + product: ProductDict + flow: str + subflow: str + + +class DialogueItem(NamedTuple): + speaker: Speaker + text: str + + +class Conversation(TypedDict): + convo_id: int + scenario: Scenario + original: List[DialogueItem] + delexed: List[Turn] + + +# Could just use Dict[str, List[Conversation]] to be more general as well. +RawData = Dict[Split, List[Conversation]] diff --git a/abcd/utils/objects/ontology.py b/abcd/utils/objects/ontology.py new file mode 100644 index 0000000..043c7c4 --- /dev/null +++ b/abcd/utils/objects/ontology.py @@ -0,0 +1,46 @@ +""" TypedDict for the Ontology dict and its items. """ +from typing import List, Any, Union, Dict + +try: + from typing import TypedDict, Literal +except ImportError: + from typing_extensions import TypedDict, Literal # type: ignore + + +class Intents(TypedDict): + flows: List[str] + subflows: Dict[str, List[str]] + + +class Vocabulary(TypedDict): + tokens: List + # Special tokens: ["[CLS]", "[SEP]", "[UNK]", "[AGENT]", "[CUSTOMER]", "[ACTION]"] + special: List[str] + + +# Action maps the action name to the list of values that need to be entered, e.g: +# "validate-purchase": ["username", "email", "order_id"], +Action = Dict[str, List[str]] + +# "Actions" dicts apparently map the 'section' to the 'buttons' in that section, e.g.: +# "kb_query": { +# "verify-identity": ["customer_name", "account_id", "order_id", "zip_code"], +# ... 
+# } +Actions = Dict[str, Action] + + +class Values(TypedDict): + enumerable: Dict[str, List[str]] + non_enumerable: Dict[str, List[str]] + + +class Ontology(TypedDict): + """ TypedDict for the Ontology, which is in `data/ontology.json` + """ + + intents: Intents + vocabulary: Vocabulary + actions: Actions + values: Values + next_steps: List[str] # ["retrieve_utterance", "take_action", "end_conversation"] diff --git a/abcd/utils/process.py b/abcd/utils/process.py new file mode 100644 index 0000000..98bb6d1 --- /dev/null +++ b/abcd/utils/process.py @@ -0,0 +1,544 @@ +import os, sys, pdb +import csv +import json +import random +import torch +import numpy as np +import pandas as pd +import datetime +from typing import Optional, List, Tuple, Generic, TypeVar, Any +from abc import ABC, abstractmethod +from abcd.components.feature_dataclasses import BaseFeature +from abcd.utils.objects import Conversation, Scenario, Turn, RawData, Targets, Ontology +from tqdm import tqdm as progress_bar +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from typing import Dict, Tuple, List, Union +from abcd.components.feature_dataclasses import ( + ActionFeature, + CompletionFeature, + CascadeFeature, +) +from abcd.components.datasets import ( + # ActionFeature, + # CompletionFeature, + # CascadeFeature, + ActionDataset, + CascadeDataset, +) +from abcd.utils.arguments import Config + + +def setup_dataloader( + datasets: Dict[str, Union[ActionDataset, CascadeDataset]], + batch_size: int, + split: str, +) -> Tuple[DataLoader, int]: + dataset = datasets[split] + num_examples = len(dataset) + sampler = RandomSampler(dataset) if split == "train" else SequentialSampler(dataset) + collate = dataset.collate_func + dataloader = DataLoader( + dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate + ) + print(f"Loaded {split} data with {len(dataloader)} batches") + return dataloader, num_examples + + +def notify_feature_sizes(args: Config, features: Dict): + if args.verbose: + for split, feats in features.items(): + print(f"{split}: {len(feats)} features") + + +def prepare_action_labels(ontology: Ontology): + action_list = [] + for section, buttons in ontology["actions"].items(): + actions = buttons.keys() + action_list.extend(actions) + return {action: idx for idx, action in enumerate(action_list)} + + +def prepare_intent_labels(ontology: Ontology): + intent_list = [] + for flow, subflows in ontology["intents"]["subflows"].items(): + intent_list.extend(subflows) + return {intent: idx for idx, intent in enumerate(intent_list)} + + +def prepare_nextstep_labels(ontology: Ontology): + nextstep_list = ontology["next_steps"] + return {nextstep: idx for idx, nextstep in enumerate(nextstep_list)} + + +def prepare_value_labels(ontology: Ontology): + value_list = [] + for category, values in ontology["values"]["enumerable"].items(): + # value_list.extend(values) + for val in values: + if val not in value_list: # remove exactly one instance of credit_card + value_list.append(val.lower()) + return {slotval: idx for idx, slotval in enumerate(value_list)} + + +FeatureType = TypeVar("FeatureType", bound=BaseFeature, covariant=True) + + +class BaseProcessor(ABC, Generic[FeatureType]): + def __init__(self, args: Config, tokenizer, ontology): + self.task = args.task + self.model_type = args.model_type + self.use_intent = args.use_intent + + self.tokenizer = tokenizer + self.ontology = ontology + + self.prepare_labels(args) + self.prepare_special_tokens(args) + + @abstractmethod + def build_features( + 
self, args: Config, raw_data: Dict[str, List[Conversation]] + ) -> Dict[str, List[FeatureType]]: + """Build a list of Feature objects from the list of conversations for each + split. + + NOTE: This 'Conversation' object is just a typed dict. The actual object is just + a dict. + """ + raise NotImplementedError() + + def prepare_labels(self, args: Config): + self.non_enumerable = self.ontology["values"]["non_enumerable"] + self.enumerable = {} + for category, values in self.ontology["values"]["enumerable"].items(): + self.enumerable[category] = [val.lower() for val in values] + + self.mappers = { + "value": prepare_value_labels(self.ontology), + "action": prepare_action_labels(self.ontology), + "intent": prepare_intent_labels(self.ontology), + "nextstep": prepare_nextstep_labels(self.ontology), + } # utterance is ranking, so not needed + self.start_idx = len(self.mappers["value"]) + + # Break down the slot values by action + self.value_by_action = {} + for section, actions in self.ontology["actions"].items(): + for action, targets in actions.items(): + self.value_by_action[action] = targets + + def prepare_special_tokens(self, args: Config): + special_tokens_count = 3 if args.model_type == "roberta" else 2 + # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. + effective_max = args.max_seq_len - special_tokens_count + cls_token_segment_id = 0 + sequence_a_segment_id = 0 if args.model_type == "roberta" else 1 + pad_token_segment_id = 0 + + self.special = { + "tokens": [ + self.tokenizer.cls_token, + self.tokenizer.sep_token, + self.tokenizer.pad_token, + ], + "ids": [cls_token_segment_id, sequence_a_segment_id, pad_token_segment_id], + "maximum": [effective_max, args.max_seq_len], + } + + def value_to_id( + self, context, action, value, potential_vals + ) -> Tuple[int, List[str]]: + # context is a list of utterances + target_id = -1 + action_tokens = self.tokenizer.tokenize(action) + filtered = [] + for utterance in context: + speaker, text = utterance.split("|") + context_tokens = self.tokenizer.tokenize(text) + for tok in context_tokens: + if tok in filtered: + continue # find uniques this way to preserve order + if len(tok) > 2: + filtered.append(tok) # remove punctuation and special tokens + + effective_max = 100 - ( + len(action_tokens) + 3 + ) # three special tokens will be added + tokens = filtered[-effective_max:] # [CLS] action [SEP] filtered [SEP] + + for option in potential_vals: + if option in self.enumerable: # just look it up + if value in self.enumerable[option]: + target_id = self.mappers["value"][value] + else: + entity = f"<{option}>" # calculate location in the context + if entity in tokens: + target_id = self.start_idx + tokens.index(entity) + + if target_id >= 0: + break # we found our guy, so let's move on + + return target_id, tokens + + def embed_utterance(self, text): + cls_token, sep_token, pad_token = self.special["tokens"] + ( + cls_token_segment_id, + sequence_a_segment_id, + pad_token_segment_id, + ) = self.special["ids"] + effective_max, max_seq_length = self.special["maximum"] + + text = pad_token if text == "" else text + if self.model_type in ["roberta", "large"]: + tokens = self.tokenizer.tokenize(text, add_prefix_space=True) + else: + tokens = self.tokenizer.tokenize(text) + if len(tokens) > effective_max: + tokens = tokens[:effective_max] + + tokens = tokens + [sep_token] + segment_ids = [cls_token_segment_id] + [sequence_a_segment_id] * len(tokens) + tokens = [cls_token] + tokens + # The convention in BERT is: + # (a) For sequence pairs: 
+ # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Since we only ever have input text, the "type_ids" are instead used to indicate + # the speaker of 0 = customer, 1 = agent and 2 = action + # The embedding vectors for `type=0` and `type=1` were learned during pre-training and + # are added to the wordpiece embedding vector (and position vector). Hopefully + # the fine-tuning can overcome this difference in semantic meaning + + input_ids = self.tokenizer.convert_tokens_to_ids(tokens) + # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to. + input_mask = [1] * len(input_ids) + + pad_token_id = self.tokenizer.convert_tokens_to_ids([pad_token])[0] + # Zero-pad up to the sequence length. + padding_length = max_seq_length - len(input_ids) + input_ids = input_ids + ([pad_token_id] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + input_mask = input_mask + ([0] * padding_length) + + assert len(input_ids) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(input_mask) == max_seq_length + + return input_ids, segment_ids, input_mask + + def convert_context_tokens(self, context_tokens): + # context_tokens is a list of pre-tokenized strings, with action name in the front + # and we want a list of embedded vectors + cls_token, sep_token, pad_token = self.special["tokens"] + ( + cls_token_segment_id, + sequence_a_segment_id, + pad_token_segment_id, + ) = self.special["ids"] + + tokens = context_tokens + [sep_token] + segment_ids = [cls_token_segment_id] + [sequence_a_segment_id] * len(tokens) + tokens = [cls_token] + tokens + + token_ids = self.tokenizer.convert_tokens_to_ids(tokens) + # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to. + input_mask = [1] * len(token_ids) + + pad_token_id = self.tokenizer.convert_tokens_to_ids([pad_token])[0] + # Zero-pad up to the sequence length. 
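+        # (The hardcoded 100 below mirrors the "limit of 100 context tokens"
+        # convention used elsewhere for the copy mechanism: every context is
+        # padded to exactly 100 ids.)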
+ padding_length = 100 - len(token_ids) + token_ids = token_ids + ([pad_token_id] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + mask_ids = input_mask + ([0] * padding_length) + + return { + "token_ids": token_ids, + "segment_ids": segment_ids, + "mask_ids": mask_ids, + } + + def action_to_id(self, action): + if self.task == "value": + return action + if " " in action: + action, input_position = action.split(" ") + return self.mappers["action"][action] + + def convert_example( + self, + dialog_history: List[str], + target_ids: Dict[str, Any], + context_tokens: List, + intent: Optional[str] = None, + candidates: List = None, + ) -> Union[ActionFeature, CascadeFeature]: + sep_token = self.special["tokens"][1] + + texts = [ + utterance.split("|")[1] for utterance in dialog_history + ] # drop the speaker + if self.use_intent: + texts = [f"{intent}|{text}" for text in texts] + embedded, segments, mask = self.embed_utterance(f" {sep_token} ".join(texts)) + + embedded_context = self.convert_context_tokens(context_tokens) + if self.task == "ast": + return ActionFeature( + input_ids=embedded, + segment_ids=segments, + input_mask=mask, + label_ids=target_ids, + context=embedded_context, + ) + assert self.task == "cds" + return CascadeFeature( + input_ids=embedded, + segment_ids=segments, + input_mask=mask, + label_ids=target_ids, + context=embedded_context, + candidates=candidates, + ) + + +class ASTProcessor(BaseProcessor[ActionFeature]): + def collect_one_example(self, context, action: str, value, potential_vals): + # actions that don't require any values + if value == "not applicable": + target_ids = {"action": self.action_to_id(action), "value": -1} + feature = self.convert_example(context, target_ids, []) + self.split_feats.append(feature) + + else: # actions that require at least one value + value_id, context_tokens = self.value_to_id( + context, action, value, potential_vals + ) + # context_tokens are used for copying from the context when selecting values + if value_id >= 0: + target_ids = {"action": self.action_to_id(action), "value": value_id} + feature = self.convert_example(context, target_ids, context_tokens) + self.split_feats.append(feature) + + def collect_examples(self, context: List[str], action: str, values: List[str]): + potential_vals = self.value_by_action[action] + # just skip if action does not require value inputs + if len(potential_vals) > 0: + # these two actions require 3 value inputs, so we break it down + if action in ["verify-identity", "validate-purchase"]: + # a smarter model can be made that handles each position conditioned on other values + for position, value in zip(["a", "b", "c"], values): + action_name = action + " " + position + self.collect_one_example( + context, action_name, value, potential_vals + ) + # other actions require a single value to be filled + else: + self.collect_one_example(context, action, values[0], potential_vals) + else: + self.collect_one_example(context, action, "not applicable", potential_vals) + + def build_features( + self, args: Config, raw_data: Dict[str, List[Conversation]] + ) -> Dict[str, List[ActionFeature]]: + features = {} + + for split, data in raw_data.items(): + self.split_feats: List[BaseFeature] = [] + print(f"Building features for {split}") + + # TODO: (See below) + # flows = [] + # subflows = [] + + convo: Conversation + for convo in progress_bar(data, total=len(data)): + so_far: List[str] = [] + # BUG: TypedDict are buggy with mypy? 
these type hints shouldn't be + # necessary: + scenario: Scenario = convo["scenario"] + flow: str = scenario["flow"] + subflow: str = scenario["subflow"] + # print(f"Flow: {flow}") + # print(f"Subflow: {subflow}") + + # TODO: Figure out if we need to save this 'flow/subflow' field + # somewhere inside the Features, and if so, what the best way of doing + # this would be. + + # self._current_flow = flow + # self._current_subflow = subflow + # flows.append(flow) + # subflow.append(subflow) + + turn: Turn + for i, turn in enumerate(convo["delexed"]): + speaker, utt = turn["speaker"], turn["text"] + assert speaker in ["agent", "customer", "action"] + subflow_name = turn["targets"][0] + # print(f"Subflow name in the turn: {subflow_name}, equal to subflow: {subflow_name == subflow}") + + if speaker in ["agent", "customer"]: + # print(f"{speaker} says: '{utt}'") + utt_str = f"{speaker}|{utt}" + so_far.append(utt_str) + else: + assert speaker == "action" + + # create a training example during every action + # _, _, action, values, _ = turn["targets"] + # Use a NamedTuple for the targets: + targets: Targets = Targets(*turn["targets"]) + action = targets.action + values = targets.values + + # FIXME: @lebrice Debugging: This first item in "targets" is + # similar to the "subflow" of the "scenario" dict. + # subflow_name, something, action, values, some_int = targets + + # print(f"Action: {action}, values: {values}") + # print(f"(targets: {targets}") + + context = so_far.copy() # [::-1] to reverse + self.collect_examples(context, targets.action, targets.values) + action_str = f"action|{action}" + so_far.append(action_str) + + features[split] = self.split_feats + return features + + +class CDSProcessor(BaseProcessor[CascadeFeature]): + def collect_one_example(self, dialog_history, targets: Targets, support_items): + intent, nextstep, action, _, utt_id = targets + candidates = [-1] * 100 + context_tokens = [] + action_id, value_id = -1, -1 + + if nextstep == "take_action": + value, potential_vals, convo_id, turn_id = support_items + action_id = self.action_to_id(action) + if value != "not applicable": + value_id, context_tokens = self.value_to_id( + dialog_history, action, value, potential_vals + ) + + elif nextstep == "retrieve_utterance": + candidates, convo_id, turn_id = support_items + + elif nextstep == "end_conversation": + convo_id, turn_id = support_items + + target_ids = { + "intent": self.mappers["intent"][intent], + "nextstep": self.mappers["nextstep"][nextstep], + "action": action_id, + "value": value_id, + "utterance": utt_id, + "convo": convo_id, + "turn": turn_id, + } + feature = self.convert_example( + dialog_history, target_ids, context_tokens, intent, candidates + ) + self.split_feats.append(feature) + + def collect_examples(self, context, targets, convo_id, turn_id): + _, _, action, values, _ = targets + potential_vals = self.value_by_action[action] + + if len(potential_vals) > 0: # just skip if action does not require inputs + if action in ["verify-identity", "validate-purchase"]: # 3 action inputs + for position, value in zip(["a", "b", "c"], values): + action_name = action + " " + position + self.collect_one_example( + context, targets, (value, potential_vals, convo_id, turn_id) + ) + else: + self.collect_one_example( + context, targets, (values[0], potential_vals, convo_id, turn_id) + ) + else: + self.collect_one_example( + context, targets, ("not applicable", potential_vals, convo_id, turn_id) + ) + + def build_features( + self, args: Config, raw_data: RawData + ) -> Dict[str, 
List[CascadeFeature]]:
+        features: Dict[str, List[CascadeFeature]] = {}
+        for split, data in raw_data.items():
+            self.split_feats = []
+            print(f"Building features for {split}")
+
+            for convo in progress_bar(data, total=len(data)):
+                so_far = []
+
+                for turn in convo["delexed"]:
+                    speaker, text = turn["speaker"], turn["text"]
+                    utterance = f"{speaker}|{text}"
+
+                    if speaker == "agent":
+                        context = so_far.copy()
+                        support_items = (
+                            turn["candidates"],
+                            convo["convo_id"],
+                            turn["turn_count"],
+                        )
+                        self.collect_one_example(
+                            context, turn["targets"], support_items
+                        )
+                        so_far.append(utterance)
+                    elif speaker == "action":
+                        context = so_far.copy()
+                        self.collect_examples(
+                            context,
+                            turn["targets"],
+                            convo["convo_id"],
+                            turn["turn_count"],
+                        )
+                        so_far.append(utterance)
+                    else:
+                        so_far.append(utterance)
+
+                context = so_far.copy()  # the entire conversation
+                end_targets = turn["targets"].copy()
+                end_targets[1] = "end_conversation"
+                end_targets[4] = -1
+                support_items = convo["convo_id"], turn["turn_count"]
+                self.collect_one_example(context, end_targets, support_items)
+
+            features[split] = self.split_feats
+        return features
+
+
+def process_data(
+    args: Config, tokenizer, ontology, raw_data, cache_path, from_cache: bool
+) -> Tuple[Dict, Dict]:
+    """Takes in a pre-processed dataset and performs further operations:
+
+    1) Extract the labels
+    2) Embed the inputs
+    3) Store both into features
+    4) Cache the results
+    """
+    processor: BaseProcessor
+    if args.task == "ast":
+        processor = ASTProcessor(args, tokenizer, ontology)
+    elif args.task == "cds":
+        processor = CDSProcessor(args, tokenizer, ontology)
+
+    if from_cache:
+        features = torch.load(cache_path)
+        print("Features loaded successfully.")
+    else:
+        features = processor.build_features(args, raw_data)
+        print(f"Saving features into cached file {cache_path}")
+        torch.save(features, cache_path)
+
+    notify_feature_sizes(args, features)
+    return features, processor.mappers
diff --git a/abcd/utils/sentence.py b/abcd/utils/sentence.py
new file mode 100644
index 0000000..f931ea6
--- /dev/null
+++ b/abcd/utils/sentence.py
@@ -0,0 +1,102 @@
+import gym
+from textworld.gym.spaces import Word
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+from english_words import english_words_set
+from transformers import T5Tokenizer
+from typing import Tuple, Any
+from transformers import GPT2TokenizerFast
+from transformers import BertPreTrainedModel
+from typing import Sequence, List, Union, Dict, Any, Optional, Set
+import numpy as np
+from torch import Tensor
+from transformers import Conversation
+import uuid
+import logging
+from transformers import pipeline
+
+
+class Sentence(gym.Space):
+    def __init__(
+        self,
+        max_length: int,
+        min_length: int = 1,
+        vocabulary: Optional[Sequence[str]] = None,
+    ):
+        super().__init__()
+        self.min_length = min_length
+        self.max_length = max_length
+        # TODO: The length of the sentence should vary.
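+        # A rough usage sketch (hypothetical values):
+        #     space = Sentence(max_length=8)
+        #     sentence = space.sample()  # e.g. "alpha beta gamma"; word count in [1, 8]
+        # NOTE: `sample` below draws a length in *words*, while `contains` checks
+        # len(sample) in *characters*; the two presumably ought to agree.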
+ self.vocabulary: Set[str] = set(vocabulary or english_words_set) + self.vocab_np = np.array(sorted(self.vocabulary)) + self.vocab_length = len(self.vocabulary) + + def contains(self, sample: Union[str, Any]) -> bool: + if not isinstance(sample, str): + return False + if not (self.min_length <= len(sample) <= self.max_length): + return False + words_not_in_vocab = [ + word for word in sample.split() if word not in self.vocabulary + ] + if words_not_in_vocab: + print(f"sample has words not in vocabulary: {words_not_in_vocab}") + return False + return True + + def sample(self): + # TODO: Use a language model instead for sampling. + sentence_length = self.np_random.randint(self.min_length, self.max_length + 1) + ids = self.np_random.choice( + self.vocab_length, size=sentence_length, replace=True + ) + words_np = self.vocab_np[ids] + sentence_str = " ".join(words_np) + return sentence_str + + +class CustomConversation(Conversation): + """ Optional: Extend `Conversation` so the names for the 'user' and 'bot' can be changed. """ + + def __init__( + self, + text: str = None, + conversation_id: uuid.UUID = None, + past_user_inputs: List[str] = None, + generated_responses: List[str] = None, + user_name: str = "user", + bot_name: str = "bot", + ): + super().__init__( + text=text, + conversation_id=conversation_id, + past_user_inputs=past_user_inputs, + generated_responses=generated_responses, + ) + self.user_name = user_name + self.bot_name = bot_name + + def __repr__(self): + """ + Generates a string representation of the conversation. + + Return: + :obj:`str`: + + Example: Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 user >> Going to the movies tonight - any + suggestions? bot >> The Big Lebowski + """ + lines = [f"Conversation id: {self.uuid}"] + for is_user, text in self.iter_texts(): + name = self.user_name if is_user else self.bot_name + lines.append(f"{name}: {text}") + return "\n".join(lines) + + @classmethod + def wrap( + cls, convo: Conversation, user_name: str = "Agent", bot_name: str = "User" + ) -> "CustomConversation": + return CustomConversation( + text=convo.new_user_input, + conversation_id=convo.uuid, + past_user_inputs=convo.past_user_inputs, + generated_responses=convo.generated_responses, + user_name=user_name, + bot_name=bot_name, + ) \ No newline at end of file diff --git a/components/datasets.py b/components/datasets.py deleted file mode 100644 index ad8089e..0000000 --- a/components/datasets.py +++ /dev/null @@ -1,107 +0,0 @@ -import numpy as np -import random -import torch -from torch.utils.data import Dataset - -class BaseFeature(object): - """A single set of features of data.""" - def __init__(self, input_ids, segment_ids, input_mask, label_id, position_ids=None): - self.input_id = input_ids - self.segment_id = segment_ids - self.mask_id = input_mask - self.label_id = label_id - self.position_id = position_ids - -class ActionFeature(BaseFeature): - """ A single set of features with precomputed context token ids""" - def __init__(self, input_ids, segment_ids, input_mask, label_ids, context): - super().__init__(input_ids, segment_ids, input_mask, label_ids['value']) - # token_ids is a batch_size length list, where each item is 100 ids - self.context_token = context['token_ids'] - self.context_segment = context['segment_ids'] - self.context_mask = context['mask_ids'] - self.action_id = label_ids['action'] - -class CompletionFeature(BaseFeature): - """ A single set of completion features with precomputed context token ids""" - def __init__(self, input_ids, 
segment_ids, input_mask, label_ids, context, candidates): - super().__init__(input_ids, segment_ids, input_mask, None) - self.candidates = candidates - self.context_token = context['token_ids'] - self.context_segment = context['segment_ids'] - self.context_mask = context['mask_ids'] - - self.intent_id = label_ids['intent'] - self.nextstep_id = label_ids['nextstep'] - self.action_id = label_ids['action'] - self.value_id = label_ids['value'] - self.utt_id = label_ids['utterance'] - -class CascadeFeature(CompletionFeature): - """ A single set of completion features with precomputed context token ids""" - def __init__(self, input_ids, segment_ids, input_mask, label_ids, context, candidates): - super().__init__(input_ids, segment_ids, input_mask, label_ids, context, candidates) - self.convo_id = label_ids['convo'] - self.turn_count = label_ids['turn'] - -class BaseDataset(Dataset): - - def __init__(self, args, features): - self.data = features - self.model_type = args.model_type - self.num_examples = len(features) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - return self.data[idx] - - def collate_func(self, args, split, raw_data): - raise NotImplementedError() - -class ActionDataset(BaseDataset): - - def collate_func(self, features): - input_ids = torch.tensor([f.input_id for f in features], dtype=torch.long) - segment_ids = torch.tensor([f.segment_id for f in features], dtype=torch.long) - mask_ids = torch.tensor([f.mask_id for f in features], dtype=torch.long) - context_tokens = torch.tensor([f.context_token for f in features], dtype=torch.long) - context_segments = torch.tensor([f.context_segment for f in features], dtype=torch.long) - context_masks = torch.tensor([f.context_mask for f in features], dtype=torch.long) - - action_ids = torch.tensor([f.action_id for f in features], dtype=torch.long) - value_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) - - return (input_ids, segment_ids, mask_ids, context_tokens, context_segments, context_masks, - action_ids, value_ids) - -class CompletionDataset(BaseDataset): - - def collate_func(self, features): - input_ids = torch.tensor([f.input_id for f in features], dtype=torch.long) - segment_ids = torch.tensor([f.segment_id for f in features], dtype=torch.long) - mask_ids = torch.tensor([f.mask_id for f in features], dtype=torch.long) - context_tokens = torch.tensor([f.context_token for f in features], dtype=torch.long) - context_segments = torch.tensor([f.context_segment for f in features], dtype=torch.long) - context_masks = torch.tensor([f.context_mask for f in features], dtype=torch.long) - - intent_ids = torch.tensor([f.intent_id for f in features], dtype=torch.long) - nextstep_ids = torch.tensor([f.nextstep_id for f in features], dtype=torch.long) - action_ids = torch.tensor([f.action_id for f in features], dtype=torch.long) - value_ids = torch.tensor([f.value_id for f in features], dtype=torch.long) - utterance_ids = torch.tensor([f.utt_id for f in features], dtype=torch.long) - all_candidates = torch.tensor([f.candidates for f in features], dtype=torch.long) - - return (input_ids, segment_ids, mask_ids, context_tokens, context_segments, context_masks, - intent_ids, nextstep_ids, action_ids, value_ids, utterance_ids, all_candidates) - -class CascadeDataset(CompletionDataset): - - def collate_func(self, features): - collated_batch = super().collate_func(features) - convo_ids = torch.tensor([f.convo_id for f in features], dtype=torch.long) - turn_counts = torch.tensor([f.turn_count for f in 
features], dtype=torch.long) - cascade_batch = (convo_ids, turn_counts) - - return collated_batch + cascade_batch diff --git a/components/features.py b/components/features.py deleted file mode 100644 index 72da99a..0000000 --- a/components/features.py +++ /dev/null @@ -1,123 +0,0 @@ - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, input_context, target_label, candidates=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - context: list of strings. The untokenized text of the converation so far. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - candidates: list of candidates to choose from for utterance ranking - """ - self.guid = guid - self.context = input_context - self.label = target_label - self.candidates = candidates - -class ActionExample(InputExample): - """A single training/test example for slot value filling. """ - def __init__(self, guid, input_context, target_label, tokens, action): - super().__init__(guid, input_context, target_label) - self.context_tokens = tokens - self.action = action - -class CompleteExample(InputExample): - """A single training/test example for task completion. """ - def __init__(self, guid, input_context, targets, tokens, candidates): - super().__init__(guid, input_context, None, candidates) - self.context_tokens = tokens - - intent, nextstep, action, value_index, utt_index = targets - self.intent_label = intent - self.nextstep_label = nextstep - self.action_label = action - self.value_label = value_index - self.utt_label = utt_index - -class CascadingExample(InputExample): - """A single training/test example for task completion. 
""" - def __init__(self, guid, input_context, targets, tokens, candidates, convo_id, turn_count): - super().__init__(guid, input_context, None, candidates) - self.context_tokens = tokens - - intent, nextstep, action, value_index, utt_index = targets - self.intent_label = intent - self.nextstep_label = nextstep - self.action_label = action - self.value_label = value_index - self.utt_label = utt_index - - self.convo_id = convo_id - self.turn_count = turn_count - -class InputFeatures(object): - """A single set of features of data.""" - def __init__(self, input_ids, segment_ids, input_mask, label_id, position_ids=None): - self.input_id = input_ids - self.segment_id = segment_ids - self.mask_id = input_mask - self.label_id = label_id - self.position_id = position_ids - -class CandidateFeatures(InputFeatures): - """ A single set of features with precomputed candidates """ - def __init__(self, input_ids, segment_ids, input_mask, label_id, candidates, position_ids=None): - super().__init__(input_ids, segment_ids, input_mask, label_id, position_ids) - # candidates is a (batch_size x num_candidates x hidden_dim) tensor - self.candidates = candidates - -class ActionFeatures(InputFeatures): - """ A single set of features with precomputed context token ids""" - def __init__(self, input_ids, segment_ids, input_mask, label_id, - context, action_id, position_ids=None): - super().__init__(input_ids, segment_ids, input_mask, label_id, position_ids) - # token_ids is a batch_size length list, where each item is 100 ids - self.context_token = context['token_ids'] - self.context_segment = context['segment_ids'] - self.context_mask = context['mask_ids'] - self.action_id = action_id - -class CompletionFeatures(InputFeatures): - """ A single set of completion features with precomputed context token ids""" - def __init__(self, input_ids, segment_ids, input_mask, label_ids, candidates, context): - super().__init__(input_ids, segment_ids, input_mask, None) - self.candidates = candidates - self.context_token = context['token_ids'] - self.context_segment = context['segment_ids'] - self.context_mask = context['mask_ids'] - - self.intent_id = label_ids['intent'] - self.nextstep_id = label_ids['nextstep'] - self.action_id = label_ids['action'] - self.value_id = label_ids['value'] - self.utt_id = label_ids['utterance'] - - self.action_mask = int(label_ids['nextstep'] == 1) - self.value_mask = int(label_ids['value'] >= 0) - self.utt_mask = int(label_ids['nextstep'] == 0) - -class CascadingFeatures(InputFeatures): - """ A single set of completion features with precomputed context token ids""" - def __init__(self, input_ids, segment_ids, input_mask, label_ids, - candidates, context, convo_id, turn_count): - super().__init__(input_ids, segment_ids, input_mask, None) - self.candidates = candidates - self.context_token = context['token_ids'] - self.context_segment = context['segment_ids'] - self.context_mask = context['mask_ids'] - - self.intent_id = label_ids['intent'] - self.nextstep_id = label_ids['nextstep'] - self.action_id = label_ids['action'] - self.value_id = label_ids['value'] - self.utt_id = label_ids['utterance'] - - self.action_mask = int(label_ids['nextstep'] == 1) - self.value_mask = int(label_ids['value'] >= 0) - self.utt_mask = int(label_ids['nextstep'] == 0) - - self.convo_id = convo_id - self.turn_count = turn_count diff --git a/components/models.py b/components/models.py deleted file mode 100644 index 3ab7b18..0000000 --- a/components/models.py +++ /dev/null @@ -1,163 +0,0 @@ -import os, sys, pdb -import json 
-import math
-import numpy as np
-import GPUtil
-
-import torch
-from torch import nn
-from torch import optim
-from torch.nn import functional as F
-
-from transformers import BertModel, RobertaModel, AlbertModel
-from transformers.file_utils import WEIGHTS_NAME
-
-class CoreModel(nn.Module):
-    def __init__(self, args, checkpoint_dir):
-        super().__init__()
-        if args.model_type == 'bert':
-            self.encoder = BertModel.from_pretrained('bert-base-uncased')
-        elif args.model_type == 'roberta':
-            self.encoder = RobertaModel.from_pretrained('roberta-base')
-        elif args.model_type == 'albert':
-            self.encoder = AlbertModel.from_pretrained('albert-base-v2')
-
-        self.outputs = ['intent', 'nextstep', 'action', 'value', 'utt']
-        self.checkpoint_dir = checkpoint_dir
-        self.use_intent = args.use_intent
-
-    def forward(self):
-        raise NotImplementedError
-
-    def save_pretrained(self, filepath=None):
-        if filepath is None:
-            filepath = os.path.join(self.checkpoint_dir, 'pytorch_model.pt')
-        torch.save(self.state_dict(), filepath)
-        print(f"Model weights saved in {filepath}")
-
-    @classmethod
-    def from_pretrained(cls, hidden_dim, ontology_size, base_model, device, filepath=None):
-        # Instantiate model.
-        model = cls(hidden_dim, ontology_size, base_model, device)
-        # Load weights and fill them inside the model
-        if filepath is None:
-            filepath = os.path.join(model.checkpoint_dir, 'pytorch_model.pt')
-        model.load_state_dict(torch.load(filepath))
-        model.eval()
-        print(f"Model loaded from {filepath}")
-        return model
-
-class ActionStateTracking(CoreModel):
-    """ An AST model should output predictions for buttons, slots and values. There are multiple ways
-    to accomplish this goal:
-      a. Predict all 3 parts separately and join the results afterwards
-      b. Predict the 231 possible button-slots together and then just the values
-      c. First predict the 30 available buttons alone and then the slot-values together
-      d. First predict the 30 available buttons and then just the values, leaving the slots as implied
-    Option D is reasonable because each value only belongs to a certain slot, so selecting the correct
-    value implies that the slot has also been correctly selected. This final option is implemented below.
-
-    To perform value-filling, the task is further decomposed into copying unique tokens from the context
-    for non-enumerable values (copy_score) or selecting from the ontology for enumerable values (enum_score).
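-
-    The two paths are then fused with a learned scalar gate; schematically (matching the forward pass below):
-        gate = sigmoid(gating_mechanism([pooled_context; copy_context]))    # batch_size x 1
-        value_score = concat(gate * enum_prob, (1 - gate) * copy_prob)      # enum scores, then copy scores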
- """ - - def __init__(self, args, mappers, checkpoint_dir): - super().__init__(args, checkpoint_dir) - self.outputs = ['action', 'value'] - self.mappings = mappers - - self.action_projection = nn.Linear(args.hidden_dim, len(mappers['action'])) - self.enum_projection = nn.Linear(args.hidden_dim, len(mappers['value'])) - self.copy_projection = nn.Linear(args.hidden_dim, 100) # hardcode limit of 100 context tokens - self.gating_mechanism = nn.Linear(args.hidden_dim + 100, 1) # shrink down to scalar - - self.softmax = nn.Softmax(dim=1) - self.sigmoid = nn.Sigmoid() - - def forward(self, full_history, context_tokens): - history_outputs = self.encoder(**full_history) - pooled_history = history_outputs.pooler_output # batch_size x 768 - action_score = self.softmax(self.action_projection(pooled_history)) - enum_prob = self.softmax(self.enum_projection(pooled_history)) - - context_outputs = self.encoder(**context_tokens) - pooled_context = context_outputs.pooler_output # batch_size x hidden - copy_prob = self.softmax(self.copy_projection(pooled_context)) # batch_size x 100 - reverse_copy_proj = self.copy_projection.weight.t() # hidden x 100 - copy_context = torch.matmul(pooled_context, reverse_copy_proj) # batch_size x 100 - joined = torch.cat([pooled_context, copy_context], dim=1) # batch_size x 768+100 - gate = self.sigmoid(self.gating_mechanism(joined)) # batch_size x 1 - - enum_score = gate * enum_prob # batch_size x 126 - copy_score = (1-gate) * copy_prob # batch_size x 100 - value_score = torch.cat([enum_score, copy_score], dim=1) # batch_size x 226 - - return action_score, value_score - -class CascadeDialogSuccess(CoreModel): - """ Unlike the BaseModel, will output 5 predictions, one for each component """ - def __init__(self, args, mappers, checkpoint_dir): - super().__init__(args, checkpoint_dir) - self.outputs = ['intent', 'nextstep', 'action', 'value', 'utterance'] - self.mappings = mappers - - self.intent_projection = nn.Linear(args.hidden_dim, len(mappers['intent'])) - self.nextstep_projection = nn.Linear(args.hidden_dim, len(mappers['nextstep'])) - self.action_projection = nn.Linear(args.hidden_dim, len(mappers['action'])) - - self.candidate_linear = nn.Linear(args.hidden_dim, 128) - self.context_linear = nn.Linear(args.hidden_dim, 128) - - self.enum_projection = nn.Linear(args.hidden_dim, len(mappers['value'])) - self.copy_projection = nn.Linear(args.hidden_dim, 100) # hardcode limit of 100 context tokens - self.gating_mechanism = nn.Linear(args.hidden_dim + 100, 1) # shrink down to scalar - - self.softmax = nn.Softmax(dim=1) - self.sigmoid = nn.Sigmoid() - - def add_candidate_data(self, utt_texts, utt_vectors): - self.utt_texts = utt_texts - self.utt_vectors = utt_vectors - - def forward(self, full_history, context_tokens, tools): - if self.use_intent: - all_candidates, device, _ = tools - else: - all_candidates, device = tools - - history_outputs = self.encoder(**full_history) # batch_size x 768 - pooled_history = history_outputs.pooler_output - intent_score = self.softmax(self.intent_projection(pooled_history)) - nextstep_score = self.softmax(self.nextstep_projection(pooled_history)) - action_score = self.softmax(self.action_projection(pooled_history)) - enum_prob = self.softmax(self.enum_projection(pooled_history)) - - encoded_history = pooled_history.unsqueeze(1) # (batch_size, 1, hidden_dim) - projected_history = self.context_linear(encoded_history) # (batch_size, 1, 128) - - batch_cands = [] - for row in all_candidates: # each row includes 100 positions - vectors = 
[self.utt_vectors[position] for position in row] - batch_cands.append(torch.stack(vectors)) - - candidates = torch.stack(batch_cands).to(device) # batch_size, num_candidates, hidden_dim - candidates = self.candidate_linear(candidates) # (batch_size, num_candidates, 128) - candidates = candidates.transpose(1,2) # (batch_size, 128, num_candidates) - - utt_score = torch.bmm(projected_history, candidates) - utt_score = utt_score.squeeze(1) # (batch_size, num_candidates) - utt_score = self.softmax(utt_score) # normalize into probabilities - - context_outputs = self.encoder(**context_tokens) - pooled_context = context_outputs.pooler_output - copy_prob = self.softmax(self.copy_projection(pooled_context)) # batch_size x 100 - reverse_copy_proj = self.copy_projection.weight.t() - copy_context = torch.matmul(pooled_context, reverse_copy_proj) # batch_size x hidden - joined = torch.cat([pooled_context, copy_context], dim=1) # batch_size x 768+100 - gate = self.sigmoid(self.gating_mechanism(joined)) # batch_size x 1 - - enum_score = gate * enum_prob # batch_size x 125 - copy_score = (1-gate) * copy_prob # batch_size x 100 - value_score = torch.cat([enum_score, copy_score], dim=1) # batch_size x 225 - - return intent_score, nextstep_score, action_score, value_score, utt_score diff --git a/components/systems.py b/components/systems.py deleted file mode 100644 index 6f54f7d..0000000 --- a/components/systems.py +++ /dev/null @@ -1,197 +0,0 @@ -import os, sys, pdb -import random -import numpy as np -import json -import pandas as pd - -class Application(object): - def __init__(self, args, model, processor): - self.task = args.task - self.utt_vectors = model.utt_vectors - self.utt_texts = model.utt_texts - self.device = model.device - - tokenizer = processor.tokenizer - cls_token_segment_id = 0 - sequence_a_segment_id = 0 if args.model_type in ['roberta', 'large'] else 1 - processor.special = { 'tokens': [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token], - 'ids': [cls_token_segment_id, sequence_a_segment_id, 0], 'maximum': [97, 100] } - - self.processor = processor - self.tokenizer = tokenizer - - self.intent_list = processor.intent_labels - self.action_list = processor.action_labels - self.value_list = processor.value_labels - self.enumerable_size = len(self.value_list) - - self.scenario_df = pd.read_csv(f'data/scenarios_0525.csv') - ontology = json.load( open("data/ontology.json", "r") ) - self.non_enumerable = ontology["values"]["non_enumerable"] - self.so_far = [] # hold a list of context utterances - self.action_taken = False - - kb = json.load( open("data/kb.json", "r")) - action_mask_map, intent_mask_map = Application.prepare_masks(kb, ontology) - self.action_mask_map = action_mask_map - self.intent_mask_map = intent_mask_map - - @staticmethod - def prepare_masks(kb, ont): - # record the range that needs to be masked out - val_group_to_range = {} - current_idx = 0 - before_cc = True - num_enumerable_vals = 0 - # print(val_group_to_range) - for val_group, values in ont["values"]["enumerable"].items(): - start = current_idx - - size = len(values) - if 'credit card' in values: - if before_cc: - before_cc = False - else: - size -= 1 - - num_enumerable_vals += size - stop = start + size - val_group_to_range[val_group] = (start, stop) - current_idx = stop - - # build out the action to values mapping - action_mask_map = {} - for category, acts in ont['actions'].items(): - for action, values in acts.items(): - mask = np.zeros(100 + num_enumerable_vals) - mask[num_enumerable_vals:] = 1.0 - - for 
val_group in values:
-                    if val_group in val_group_to_range:
-                        start, stop = val_group_to_range[val_group]
-                        mask[start:stop] = 1.0
-
-                action_mask_map[action] = mask
-
-        # recreate the exact breakdown order from the loader
-        options = []
-        for section, buttons in ont["actions"].items():
-            actions = buttons.keys()
-            options.extend(actions)
-        # double check that all actions in the kb are valid
-        match, error = 0, 0
-        for intent, actions in kb.items():
-            for action in actions:
-                if action in options:
-                    pass
-                else:
-                    print(action)
-                    pdb.set_trace()
-                # assert(action in options)
-        # make the reverse lookup for the id that needs to be masked out
-        action_to_idx = {action: index for index, action in enumerate(options)}
-        # create the actual intent to action mapping
-        intent_mask_map = {}
-        for flow, subflows in ont['intents']['subflows'].items():
-            for intent in subflows:
-                mask = np.zeros(30)
-
-                valid_options = kb[intent]
-                for action in valid_options:
-                    mask[action_to_idx[action]] = 1.0
-
-                intent_mask_map[intent] = mask
-
-        return action_mask_map, intent_mask_map
-
-    def delexicalize_text(self, scene, conversation):
-        """ Given all the utterances within a conversation and the scenario, delexicalize the
-        non-enumerable entities. Inputs:
-        - scene: a dict with detail, personal_info and order info
-        - conversation: a list of utterance tuples where each tuple is (speaker, text, action, pred)
-        Returns:
-        - delex: a list of utterances where the text has been delexicalized
-        """
-        non_enumerable = []
-
-        for slot in self.non_enumerable['personal']:
-            if slot in scene:
-                non_enumerable.append((slot, scene[slot]))
-
-        for slot, value in scene.items():
-            string_val = str(value)
-            if string_val.startswith('$'):
-                non_enumerable.append(('amount', string_val[1:]))
-            if slot == 'order_id':
-                non_enumerable.append((slot, scene[slot]))
-
-        address = scene['address']
-        address_tokens = address.split()
-        address_halves = address.split(',')
-        non_enumerable.append(('street_address', address_halves[0]))
-        non_enumerable.append(('full_address', address))
-        non_enumerable.append(('zip_code', address_tokens[-1]))
-
-        delexed = []
-        for utt in conversation:
-            text = utt.replace('|', 'and').replace('_', ' ').lower()
-            # must be in this order to prevent clash
-            for slot, value in non_enumerable:
-                if str(value) in text:
-                    text = text.replace(str(value), f'<{slot}>')
-
-            delexed.append(text)
-        return delexed
-
-    def sample_scenario(self):
-        scenario = self.scenario_df.sample()
-        flow_detail = json.loads(scenario['Detail'].item())
-        scene = json.loads(scenario['Personal'].item())  # default scene to the personal info
-
-        order = json.loads(scenario['Order'].item())
-        street_address = order['address']
-        scene['address'] = f"{street_address} {order['city']}, {order['state']} {order['zip_code']}"
-
-        for key, value in order.items():
-            if key == 'products':
-                for product in order['products']:
-                    product_name = product['brand'] + ' ' + product['product_type']
-                    scene[product_name] = '$' + str(product['amount'])
-            if key not in ['address', 'city', 'status', 'zip_code', 'products']:
-                scene[key] = value
-        self.scene = scene
-
-        issue = flow_detail['issue']
-        reason = flow_detail['reason']
-        solution = flow_detail['solution']
-        prefix = flow_detail.get('prefix', 'Y')
-        suffix = flow_detail.get('suffix', '')
-        prompt = f"{prefix}ou {issue} because {reason}. Explain your problem to the agent, provide any information that is requested and attempt to {solution}. {suffix}"
-
-        return scene, prompt
-
-    def take_action(self, intent_pred, action_pred, value_pred, context_tokens):
-        top_intent = np.argmax(intent_pred)
-        intent_name = self.intent_list[top_intent]
-
-        # each intent mask should be 30 entries long
-        intent_mask = self.intent_mask_map[intent_name]
-        # now, all non valid actions should go to zero
-        action_pred *= np.array(intent_mask)
-        top_action = np.argmax(action_pred)
-        action_name = self.action_list[top_action]
-
-        # each action mask should be 223 entries long
-        action_mask = self.action_mask_map[action_name]
-        # now, all non valid values should go to zero
-        value_pred *= np.array(action_mask)
-        top_value = np.argmax(value_pred)
-        if top_value < self.enumerable_size:  # part of enumerable
-            value_name = self.value_list[top_value]
-        else:  # copy from context
-            top_value -= self.enumerable_size
-            while top_value >= len(context_tokens):
-                top_value -= len(context_tokens)
-            value_name = context_tokens[top_value]
-
-        return {'Intent': intent_name, 'Action': action_name, 'Value': value_name}
diff --git a/data/images/action_example.png b/images/action_example.png
similarity index 100%
rename from data/images/action_example.png
rename to images/action_example.png
diff --git a/data/images/agent_dashboard.png b/images/agent_dashboard.png
similarity index 100%
rename from data/images/agent_dashboard.png
rename to images/agent_dashboard.png
diff --git a/data/images/customer_site.png b/images/customer_site.png
similarity index 100%
rename from data/images/customer_site.png
rename to images/customer_site.png
diff --git a/data/images/faq_screenshot.png b/images/faq_screenshot.png
similarity index 100%
rename from data/images/faq_screenshot.png
rename to images/faq_screenshot.png
diff --git a/main.py b/main.py
index 21fb9a3..3d099bd 100644
--- a/main.py
+++ b/main.py
@@ -2,182 +2,280 @@
 import random
 import numpy as np
 import torch
+from torch import nn
+from torch import Tensor
 from tqdm import tqdm as progress_bar
+from typing import Dict, Union, Tuple
+
+from abcd.utils.arguments import solicit_params, Config
+from abcd.utils.help import ModelInputDict, ASTTargetsTuple, CDSTargetsTuple
+from abcd.utils.help import set_seed, setup_gpus, check_directories, prepare_inputs, device
+from abcd.utils.load import (
+    load_data,
+    load_tokenizer,
+    load_candidates,
+    get_optimizer,
+    get_scheduler,
+)
+from abcd.utils.process import process_data, setup_dataloader
+from abcd.utils.evaluate import quantify, qualify
+
+from abcd.components.datasets import ActionDataset, CascadeDataset, ActionFeature, CascadeFeature
+from abcd.components.tools import ExperienceLogger
+from abcd.components.models import ActionStateTracking, CDSModelOutput, CascadeDialogSuccess
+
+
+def run_main(
+    args: Config,
+    datasets: Dict[str, Union[ActionDataset, CascadeDataset]],
+    model: Union[ActionStateTracking, CascadeDialogSuccess],
+    exp_logger: ExperienceLogger,
+):
+    if args.task == "cds":
+        utt_data = load_candidates(args)
+        model.add_candidate_data(*utt_data)
+    kb_labels = {}
+    if args.use_kb:
+        kb_labels["intent"] = list(model.mappings["intent"].keys())
+        kb_labels["action"] = list(model.mappings["action"].keys())
+
+    exp_logger.init_tb_writers()
+    run_train(args, datasets, model, exp_logger, kb_labels)
+
+    if args.do_eval:
+        result = run_eval(args, datasets, model, exp_logger, kb_labels, split="test")
+        results = dict((k + f"_{args.filename}", v) for k, v in result.items())
+        print("Test Results -", results)
+
+
+def ast_loss(
+    scores: Tuple[Tensor, Tensor],
+    targets: 
Tuple[Tensor, Tensor], + loss_func: nn.Module +): + action_score, value_score = scores + action_target, value_target = targets + + action_loss = loss_func(action_score, action_target) + value_loss = loss_func(value_score, value_target) + + total_loss = action_loss + value_loss + return total_loss + + +def cds_loss(scores: CDSModelOutput, targets, loss_func): + intent_scores, nextstep_scores, action_scores, value_scores, utt_scores = scores + intent_target, nextstep_target, action_target, value_target, utt_target = targets + + utterance_mask = nextstep_target == 0 # 0 is the index of 'retrieve_utterance' + batch_size, num_candidates = utt_scores.shape + utt_scores = utt_scores * utterance_mask.unsqueeze(1).repeat(1, num_candidates) + utterance_target = utt_target * utterance_mask + + intent_loss = loss_func(intent_scores, intent_target) + nextstep_loss = loss_func(nextstep_scores, nextstep_target) + action_loss = loss_func(action_scores, action_target) + value_loss = loss_func(value_scores, value_target) + + utt_target_ids = utterance_target.unsqueeze(1) # batch_size, 1 + chosen = torch.gather(utt_scores, dim=1, index=utt_target_ids) + correct = chosen.sum() # scalar + + shift = torch.max(utt_scores) # perform log sum exp of the incorrect scores + res = torch.exp(utt_scores - shift) # batch_size, num_candidates + res = torch.log(torch.sum(res, dim=1)) # batch_size + incorrect = torch.sum( + shift + res + ) # add the shift back in to complete the log-sum-exp overflow trick + utt_loss = incorrect - correct + + total_loss = intent_loss + nextstep_loss + action_loss + value_loss + utt_loss + return total_loss + +from typing import overload + +@overload +def run_train( + args: Config, + datasets: Dict[str, ActionDataset], + model: ActionStateTracking, + exp_logger: ExperienceLogger, + kb_labels: Dict + ): + ... + + +@overload +def run_train( + args: Config, + datasets: Dict[str, CascadeDataset], + model: CascadeDialogSuccess, + exp_logger: ExperienceLogger, + kb_labels: Dict + ): + ... + + +def run_train( + args: Config, + datasets: Dict[str, Union[ActionDataset, CascadeDataset]], + model: Union[ActionStateTracking, CascadeDialogSuccess], + exp_logger: ExperienceLogger, + kb_labels: Dict): + dataloader, num_examples = setup_dataloader( + datasets, args.batch_size, split="train" + ) + t_total = len(dataloader) // args.grad_accum_steps * args.epochs + exp_logger.start_train(num_examples, total_step=t_total) + optimizer = get_optimizer(args, model) + scheduler = get_scheduler(args, optimizer, t_total) + loss_func = torch.nn.CrossEntropyLoss(ignore_index=-1) + model.zero_grad() + + for epoch in range(args.epochs): + model.train() + + batch: Union[ActionFeature, CascadeFeature] + for step, batch in enumerate(dataloader): + # batch = tuple(t.to(device) for t in batch) + batch = batch.to(device) + + if args.task == "ast": + # NOTE: This is only here to help narrow down the type hints. 
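+                # (asserting isinstance lets a static checker narrow the declared
+                # Union[ActionFeature, CascadeFeature] and Union[ActionStateTracking,
+                # CascadeDialogSuccess] types down to the AST-specific ones, and it
+                # fails fast at runtime if the wrong dataset or model is wired up)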
+ assert isinstance(batch, ActionFeature) + assert isinstance(model, ActionStateTracking) + full_history, targets, context_tokens, _ = prepare_inputs(args, batch) + scores = model(full_history, context_tokens) + loss = ast_loss(scores, targets, loss_func) + + elif args.task == "cds": + assert isinstance(batch, CascadeFeature) + assert isinstance(model, CascadeDialogSuccess) + + full_history, targets, context_tokens, tools = prepare_inputs( + args, batch + ) + scores = model(full_history, context_tokens, tools) + loss = cds_loss(scores, targets, loss_func) + + if args.grad_accum_steps > 1: + loss = loss / args.grad_accum_steps + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + + if (step + 1) % args.grad_accum_steps == 0: + optimizer.step() + scheduler.step() + model.zero_grad() + result, metric = quantify(args, scores, targets, "train") + exp_logger.log_train(step, loss.item(), result, metric) + + if args.debug and step > 3 * args.log_interval: + break + + result, res_name = run_eval( + args, datasets, model, exp_logger, kb_labels, split="dev" + ) + dev_score = result[res_name] + if dev_score > exp_logger.best_score: + model.save_pretrained(exp_logger.filepath) + exp_logger.best_score = dev_score + exp_logger.log_dev(step + 1, res_name, dev_score) + + +def run_eval(args: Config, + datasets: Dict[str, Union[ActionDataset, CascadeDataset]], + model: Union[ActionStateTracking, CascadeDialogSuccess], + exp_logger: ExperienceLogger, + kb_labels: Dict, + split="dev"): + dataloader, num_examples = setup_dataloader(datasets, args.batch_size, split) + exp_logger.start_eval(num_examples, kind=args.filename) + loss_func = torch.nn.CrossEntropyLoss(ignore_index=-1) + num_outputs = len(model.outputs) + model.eval() + + preds, labels, convo_ids, turn_counts = [], [], [], [] + for batch in progress_bar( + dataloader, total=len(dataloader), desc=f"Epoch {exp_logger.epoch}" + ): + batch = tuple(t.to(device) for t in batch) + full_history, batch_targets, context_tokens, tools = prepare_inputs(args, batch) + + with torch.no_grad(): + if args.task == "ast": + batch_scores = model(full_history, context_tokens) + batch_loss = ast_loss(batch_scores, batch_targets, loss_func) + elif args.task == "cds": + batch_scores = model(full_history, context_tokens, tools) + batch_loss = cds_loss(batch_scores, batch_targets, loss_func) + + if args.cascade: + batch_turn_count = batch_targets.pop() + batch_convo_id = batch_targets.pop() + + if args.quantify or split == "dev": + exp_logger.eval_loss += batch_loss.mean().item() + exp_logger.batch_steps += 1 + + preds.append(batch_scores) + labels.append(batch_targets) + convo_ids.append(batch_convo_id if args.cascade else 0) + turn_counts.append(batch_turn_count if args.cascade else 0) + + if args.debug: + if len(turn_counts) > 10: + break + + grouped_preds = [ + torch.cat([pred[i] for pred in preds], dim=0) for i in range(num_outputs) + ] + grouped_labels = [ + torch.cat([label[i] for label in labels], dim=0) for i in range(num_outputs) + ] + ci_and_tc = ( + (torch.cat(convo_ids, dim=0), torch.cat(turn_counts, dim=0)) + if args.cascade + else (0, 0) + ) + + utils = {"kb_labels": kb_labels, "ci_and_tc": ci_and_tc} + metrics, res_name = quantify(args, grouped_preds, grouped_labels, utils) + exp_logger.end_eval(metrics, kind=args.filename) + return (metrics, res_name) if split == "dev" else metrics -from utils.arguments import solicit_params -from utils.help import set_seed, setup_gpus, check_directories, prepare_inputs, device -from utils.load import 
load_data, load_tokenizer, load_candidates, get_optimizer, get_scheduler -from utils.process import process_data, setup_dataloader -from utils.evaluate import quantify, qualify - -from components.datasets import ActionDataset, CascadeDataset -from components.tools import ExperienceLogger -from components.models import ActionStateTracking, CascadeDialogSuccess - -def run_main(args, datasets, model, exp_logger): - if args.task == 'cds': - utt_data = load_candidates(args) - model.add_candidate_data(*utt_data) - kb_labels = {} - if args.use_kb: - kb_labels['intent'] = list(model.mappings['intent'].keys()) - kb_labels['action'] = list(model.mappings['action'].keys()) - - exp_logger.init_tb_writers() - run_train(args, datasets, model, exp_logger, kb_labels) - - if args.do_eval: - result = run_eval(args, datasets, model, exp_logger, kb_labels, split='test') - results = dict((k + f'_{args.filename}', v) for k, v in result.items()) - print('Test Results -', results) - -def ast_loss(scores, targets, loss_func): - action_score, value_score = scores - action_target, value_target = targets - - action_loss = loss_func(action_score, action_target) - value_loss = loss_func(value_score, value_target) - - total_loss = action_loss + value_loss - return total_loss - -def cds_loss(scores, targets, loss_func): - intent_scores, nextstep_scores, action_scores, value_scores, utt_scores = scores - intent_target, nextstep_target, action_target, value_target, utt_target = targets - - utterance_mask = nextstep_target == 0 # 0 is the index of 'retrieve_utterance' - batch_size, num_candidates = utt_scores.shape - utt_scores = utt_scores * utterance_mask.unsqueeze(1).repeat(1, num_candidates) - utterance_target = utt_target * utterance_mask - - intent_loss = loss_func(intent_scores, intent_target) - nextstep_loss = loss_func(nextstep_scores, nextstep_target) - action_loss = loss_func(action_scores, action_target) - value_loss = loss_func(value_scores, value_target) - - utt_target_ids = utterance_target.unsqueeze(1) # batch_size, 1 - chosen = torch.gather(utt_scores, dim=1, index=utt_target_ids) - correct = chosen.sum() # scalar - - shift = torch.max(utt_scores) # perform log sum exp of the incorrect scores - res = torch.exp(utt_scores - shift) # batch_size, num_candidates - res = torch.log(torch.sum(res, dim=1)) # batch_size - incorrect = torch.sum(shift + res) # add the shift back in to complete the log-sum-exp overflow trick - utt_loss = incorrect - correct - - total_loss = intent_loss + nextstep_loss + action_loss + value_loss + utt_loss - return total_loss - -def run_train(args, datasets, model, exp_logger, kb_labels): - dataloader, num_examples = setup_dataloader(datasets, args.batch_size, split='train') - t_total = len(dataloader) // args.grad_accum_steps * args.epochs - exp_logger.start_train(num_examples, total_step=t_total) - optimizer = get_optimizer(args, model) - scheduler = get_scheduler(args, optimizer, t_total) - loss_func = torch.nn.CrossEntropyLoss(ignore_index=-1) - model.zero_grad() - - for epoch in range(args.epochs): - model.train() - - for step, batch in enumerate(dataloader): - batch = tuple(t.to(device) for t in batch) - - if args.task == 'ast': - full_history, targets, context_tokens, _ = prepare_inputs(args, batch) - scores = model(full_history, context_tokens) - loss = ast_loss(scores, targets, loss_func) - elif args.task == 'cds': - full_history, targets, context_tokens, tools = prepare_inputs(args, batch) - scores = model(full_history, context_tokens, tools) - loss = cds_loss(scores, targets, 
loss_func) - - if args.grad_accum_steps > 1: - loss = loss / args.grad_accum_steps - loss.backward() - torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) - - if (step+1) % args.grad_accum_steps == 0: - optimizer.step() - scheduler.step() - model.zero_grad() - result, metric = quantify(args, scores, targets, "train") - exp_logger.log_train(step, loss.item(), result, metric) - - if args.debug and step > 3*args.log_interval: - break - - result, res_name = run_eval(args, datasets, model, exp_logger, kb_labels, split='dev') - dev_score = result[res_name] - if dev_score > exp_logger.best_score: - model.save_pretrained(exp_logger.filepath) - exp_logger.best_score = dev_score - exp_logger.log_dev(step+1, res_name, dev_score) - -def run_eval(args, datasets, model, exp_logger, kb_labels, split='dev'): - dataloader, num_examples = setup_dataloader(datasets, args.batch_size, split) - exp_logger.start_eval(num_examples, kind=args.filename) - loss_func = torch.nn.CrossEntropyLoss(ignore_index=-1) - num_outputs = len(model.outputs) - model.eval() - - preds, labels, convo_ids, turn_counts = [], [], [], [] - for batch in progress_bar(dataloader, total=len(dataloader), desc=f"Epoch {exp_logger.epoch}"): - batch = tuple(t.to(device) for t in batch) - full_history, batch_targets, context_tokens, tools = prepare_inputs(args, batch) - - with torch.no_grad(): - if args.task == 'ast': - batch_scores = model(full_history, context_tokens) - batch_loss = ast_loss(batch_scores, batch_targets, loss_func) - elif args.task == 'cds': - batch_scores = model(full_history, context_tokens, tools) - batch_loss = cds_loss(batch_scores, batch_targets, loss_func) - - if args.cascade: - batch_turn_count = batch_targets.pop() - batch_convo_id = batch_targets.pop() - - if args.quantify or split=='dev': - exp_logger.eval_loss += batch_loss.mean().item() - exp_logger.batch_steps += 1 - - preds.append(batch_scores) - labels.append(batch_targets) - convo_ids.append(batch_convo_id if args.cascade else 0) - turn_counts.append(batch_turn_count if args.cascade else 0) - - if args.debug: - if len(turn_counts) > 10: - break - grouped_preds = [torch.cat([pred[i] for pred in preds], dim=0) for i in range(num_outputs)] - grouped_labels = [torch.cat([label[i] for label in labels], dim=0) for i in range(num_outputs)] - ci_and_tc = (torch.cat(convo_ids, dim=0), torch.cat(turn_counts, dim=0)) if args.cascade else (0, 0) +def main(args: Config): + args = setup_gpus(args) + set_seed(args) + + ckpt_dir, cache_results = check_directories(args) + raw_data = load_data(args, cache_results[1]) + tokenizer, ontology = load_tokenizer(args) + features, mappings = process_data( + args, tokenizer, ontology, raw_data, *cache_results + ) + exp_logger = ExperienceLogger(args, ckpt_dir) + + datasets: Dict[str, Union[ActionDataset, CascadeDataset]] + model: Union[ActionStateTracking, CascadeDialogSuccess] + + if args.task == "ast": + datasets = { + split: ActionDataset(args, feats) for split, feats in features.items() + } + model = ActionStateTracking(args, mappings, ckpt_dir) + elif args.task == "cds": + datasets = { + split: CascadeDataset(args, feats) for split, feats in features.items() + } + model = CascadeDialogSuccess(args, mappings, ckpt_dir) + + model = model.to(device) + model.encoder.resize_token_embeddings(len(tokenizer)) + run_main(args, datasets, model, exp_logger) - utils = { 'kb_labels': kb_labels, 'ci_and_tc': ci_and_tc } - metrics, res_name = quantify(args, grouped_preds, grouped_labels, utils) - exp_logger.end_eval(metrics, 
kind=args.filename) - return (metrics, res_name) if split == 'dev' else metrics if __name__ == "__main__": - args = solicit_params() - args = setup_gpus(args) - set_seed(args) - - ckpt_dir, cache_results = check_directories(args) - raw_data = load_data(args, cache_results[1]) - tokenizer, ontology = load_tokenizer(args) - features, mappings = process_data(args, tokenizer, ontology, raw_data, *cache_results) - exp_logger = ExperienceLogger(args, ckpt_dir) - - if args.task == 'ast': - datasets = {split: ActionDataset(args, feats) for split, feats in features.items()} - model = ActionStateTracking(args, mappings, ckpt_dir) - elif args.task == 'cds': - datasets = {split: CascadeDataset(args, feats) for split, feats in features.items()} - model = CascadeDialogSuccess(args, mappings, ckpt_dir) - - model = model.to(device) - model.encoder.resize_token_embeddings(len(tokenizer)) - run_main(args, datasets, model, exp_logger) + args: Config = solicit_params() + main(args) \ No newline at end of file diff --git a/run.sh b/run.sh old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a90edc4 --- /dev/null +++ b/setup.py @@ -0,0 +1,31 @@ +import sys +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() +packages = setuptools.find_namespace_packages(include=["abcd*"]) +print("PACKAGES FOUND:", packages) +print(sys.version_info) + +setuptools.setup( + name="abcd", + version="0.0.1", + author="Derek Chen", + author_email="dchen@asapp.com", + description="pip package for the ABCD repo.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/asappresearch/abcd", + packages=packages, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + include_package_data = True, + package_data = { + "data": ["*.json"], + }, + python_requires=">=3.7", + install_requires=["torch", "transformers", "python-dateutil", "simple_parsing"], +) diff --git a/utils/arguments.py b/utils/arguments.py deleted file mode 100644 index dc31ae3..0000000 --- a/utils/arguments.py +++ /dev/null @@ -1,74 +0,0 @@ -from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser - -def solicit_params(): - parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) - parser.add_argument('--seed', help='Random seed', type=int, default=14) - parser.add_argument('--model-type', choices=['roberta', 'bert', 'dialogpt', 'albert'], - help='Which type of encoder and tokenizer to use', default='bert') - parser.add_argument('--task', default='ast', type=str, choices=['ast', 'cds'], - help='choose which of the two major tasks to train the model', ) - parser.add_argument('--debug', default=False, action='store_true', - help='whether or not to go into debug mode, which is faster') - parser.add_argument('-v', '--verbose', default=False, action='store_true', - help='whether or not to have verbose prints') - - # ------ DIRECTORY AND SAVING -------- - parser.add_argument('--output-dir', default='outputs/', type=str) - parser.add_argument('--input-dir', default='data/', type=str) - parser.add_argument('--prefix', type=str, default='0524', - help='distinguish the trial run, often a MM/DD date') - parser.add_argument('--filename', type=str, - help='name of the model if saving, or loading from saved') - parser.add_argument('--suffix', type=str, default='v1', - help='distinguish the saved data, often a version number') - 
parser.add_argument('--filter', default=False, action='store_true', - help='Filter for just errors during evaluation') - - # ------ TRAINING AND EVALUATION -------- - parser.add_argument('--do-eval', default=False, action='store_true', - help='load the best saved model and run evaluation, qualify or quantify flags must be on') - parser.add_argument('--log-interval', default=100, type=int) - parser.add_argument('--qualify', default=False, action='store_true', - help='examine the qualitative outputs of the model in natural language') - parser.add_argument('--quantify', default=False, action='store_true', - help='examine the quantitative outputs of the model in reports') - - # ------- MAJOR MODEL OPTIONS -------- - parser.add_argument('--cascade', default=False, action='store_true', - help='use cascading evaluation rather than turn level') - parser.add_argument('--use-intent', default=False, action='store_true', - help='use an oracle intent classification module') - parser.add_argument('--use-kb', default=False, action='store_true', - help='take advantage of KB guidelines to limit action and value options') - - # ------ DATASET CREATION -------- - parser.add_argument('--version', type=float, default=1.1, - help="which version of the dataset is being used") - # v1.0 was used initially, but v1.1 is released as a significantly cleaner benchmark - parser.add_argument('--build-vocab', default=False, action='store_true', - help='whether to build new vocabulary of Glove vectors') - parser.add_argument('--max-seq-len', default=512, type=int, - help='Maximum number of tokens to truncate each utterance') - - # ------ PARAMETER OPTIMIZATION -------- - param_group = parser.add_argument_group(title='hyperparameters') - parser.add_argument('--radam', default=False, action='store_true', - help='use RAdam optimizer rather than default AdamW') - param_group.add_argument('-lr', '--learning-rate', default=3e-5, type=float, - help='Learning rate alpha for weight updates') - param_group.add_argument('--hidden-dim', default=768, type=int, - help='Number of hidden units, size of hidden dimension') - param_group.add_argument('--drop-prob', default=0.2, type=float, - help='probability of dropping a node, opposite of keep prob') - param_group.add_argument('--grad-accum-steps', default=1, type=int, - help='Number of steps for gradient accumulation') - param_group.add_argument('-reg', '--weight-decay', default=0.003, type=float, - help='weight_decay to regularize the weights') - param_group.add_argument('--batch-size', default=50, type=int, - help='batch size for training and evaluation') - param_group.add_argument('-e', '--epochs', default=14, type=int, - help='Number of epochs or episodes to train') - - args = parser.parse_args() - return args - diff --git a/utils/embed.py b/utils/embed.py deleted file mode 100644 index da0f20d..0000000 --- a/utils/embed.py +++ /dev/null @@ -1,26 +0,0 @@ -import json -import random -import torch - -from tqdm import tqdm as progress_bar -from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel - -tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') -model = BertModel.from_pretrained('bert-base-uncased') - -print("Loading data ...") -utt_texts = json.load(open(f'data/utterances.json', 'r')) -num_cands = len(utt_texts) -utt_vectors = [] - -cand_embeds, cand_segments, cand_masks = [], [], [] -for cand_text in progress_bar(utt_texts, total=num_cands): - cand_inputs = tokenizer(cand_text, return_tensors="pt") - with torch.no_grad(): - cand_outputs = 
model(**cand_inputs) - utt_vectors.append(cand_outputs.pooler_output) - -utt_vectors = torch.cat(utt_vectors) -print("utt_vectors: {}".format(utt_vectors.shape)) -torch.save(utt_vectors, 'data/utt_vectors.pt') - diff --git a/utils/evaluate.py b/utils/evaluate.py deleted file mode 100644 index 42688fb..0000000 --- a/utils/evaluate.py +++ /dev/null @@ -1,383 +0,0 @@ -import os, sys, pdb -import random -import json -import torch -import numpy as np -import time as tm -import pandas as pd - -from torch.utils.data import RandomSampler, SequentialSampler -from torch.utils.data import DataLoader -from collections import defaultdict, OrderedDict, Counter -from sklearn.metrics import accuracy_score - -from components.systems import Application -from utils.help import prepare_inputs -from utils.load import load_guidelines - -def ast_report(predictions, labels): - action_preds, value_preds = predictions - action_labels, value_labels = labels - - size = len(action_preds) - assert(size == len(value_labels)) - - top_action_preds = np.argmax(action_preds, axis=1) - action_match = action_labels == top_action_preds # array of booleans - action_acc = sum(action_match) / float(size) - - top_value_preds = np.argmax(value_preds, axis=1) - value_match = value_labels == top_value_preds - value_acc = sum(value_match) / float(size) - - joint_match = action_match & value_match - joint_acc = sum(joint_match) / float(size) - - full_result = {'Action_Accuracy': round(action_acc, 4), - 'Value_Accuracy': round(value_acc, 4), - 'Joint_Accuracy': round(joint_acc, 4),} - - return full_result, 'Joint_Accuracy' - -def ranking_report(predictions, labels, use_match=False): - full_result = {} - utt_match = [] - - for rank in [1,5,10]: - level = -rank # select the top 5 rather than bottom 5 - num_correct, num_possible = 0, 0 - # vectorized version possible, but a lot less readable - for pred, label in zip(predictions, labels): - top_k_indexes = np.argpartition(pred, kth=level)[level:] - if label in top_k_indexes: - num_correct += 1 - if rank == 1: - utt_match.append(True) - else: - if rank == 1: - utt_match.append(False) - - if label >= 0: # -1 means the turn was take-action or end-of-convo - num_possible += 1 - - rank_name = f'Recall_at_{rank}' - full_result[rank_name] = num_correct / num_possible - - if use_match: - return full_result, utt_match - else: - return full_result, 'Recall_at_5' - -def cds_report(predictions, labels, ci_and_tc, kb_labels=None): - """ Calculated in the form of cascaded evaluation - where each agent example or utterance a scored example""" - intent_pred, nextstep_pred, action_pred, value_pred, utterance_rank = predictions - intent_label, nextstep_label, action_label, value_label, utterance_label = labels - convo_ids = ci_and_tc[0].detach().cpu().numpy() - turn_counts = ci_and_tc[1].detach().cpu().numpy() - - if kb_labels is None: - use_kb = False - else: - use_kb = True - intent_list = kb_labels['intent'] - action_list = kb_labels['action'] - guidelines = load_guidelines() - action_mask_map, intent_mask_map = Application.prepare_masks(*guidelines) - - num_turns = len(nextstep_pred) - assert(num_turns == len(convo_ids)) - - top_intent_preds = np.argmax(intent_pred, axis=1) - intent_match = intent_label == top_intent_preds # array of booleans - intent_acc = sum(intent_match) / float(num_turns) - - top_nextstep_preds = np.argmax(nextstep_pred, axis=1) - nextstep_match = nextstep_label == top_nextstep_preds # array of booleans - nextstep_acc = sum(nextstep_match) / float(num_turns) - - if use_kb: - 
intent_masks = [] - for top_intent in top_intent_preds: - intent_name = intent_list[top_intent] - # each intent mask should be size of 30 long - intent_mask = intent_mask_map[intent_name] - intent_masks.append(intent_mask) - # now, all non valid actions should go to zero - action_pred *= np.array(intent_masks) - - top_action_preds = np.argmax(action_pred, axis=1) - action_match = action_label == top_action_preds # array of booleans - num_turns_include_action = sum(action_label >= 0) - action_acc = sum(action_match) / float(num_turns_include_action) - - if use_kb: - action_masks = [] - for top_action in top_action_preds: - action_name = action_list[top_action] - # each action mask should be size of 223 long - action_mask = action_mask_map[action_name] - action_masks.append(action_mask) - # now, all non valid values should go to zero - value_pred *= np.array(action_masks) - - top_value_preds = np.argmax(value_pred, axis=1) - value_match = value_label == top_value_preds - num_turns_include_value = sum(value_label >= 0) - value_acc = sum(value_match) / float(num_turns_include_value) - - joint_match = action_match & value_match - joint_acc = sum(joint_match) / float(num_turns_include_action) - - recall, utt_match = {}, [] - for rank in [1,5,10]: - level = -rank # select the top 5 rather than bottom 5 - num_correct, num_possible = 0, 0 - for pred, label in zip(utterance_rank, utterance_label): - top_k_indexes = np.argpartition(pred, kth=level)[level:] - if label in top_k_indexes: - num_correct += 1 - if rank == 1: - utt_match.append(True) - else: - if rank == 1: - utt_match.append(False) - - if label >= 0: - num_possible += 1 - recall[str(rank)] = num_correct / num_possible - - # group by convo_ids - unique_convo_ids = list(set(convo_ids)) - conversations = {} - for uci in unique_convo_ids: - turns, correctness = [], [] - row_id = 0 - for convo_id, turn_count in zip(convo_ids, turn_counts): - if convo_id == uci: - turns.append(turn_count) - - correct = False - intent_right = intent_match[row_id] - nextstep_right = nextstep_match[row_id] - - if nextstep_label[row_id] == 0: - if intent_right and nextstep_right and utt_match[row_id]: - correct = True - elif nextstep_label[row_id] == 1: - if intent_right and nextstep_right and joint_match[row_id]: - correct = True - elif nextstep_label[row_id] == 2: - if intent_right and nextstep_right: - correct = True - - correctness.append(correct) - row_id += 1 - - # sort by turn_counts - ordered = [cor for _, cor in sorted( zip(turns,correctness), key=lambda tc: tc[0] )] - conversations[uci] = ordered - - # count how many correct - turn_score, turn_correct = 0, 0 - for convo_id, convo_correctness in conversations.items(): - convo_length = len(convo_correctness) - # we use turn_id rather than the true turn_count since turn counts will skip numbers - # when looping through the conversation due to skipping over customer utterances - for turn_id in range(convo_length): - num_remaining = convo_length - turn_id - - num_correct = 0 - # count up how many were predicted correctly - while turn_id < convo_length and convo_correctness[turn_id]: - num_correct += 1 - turn_id += 1 - - if num_correct > 0: - turn_correct += 1 - # normalize by the number of turns remaining - turn_score += num_correct / num_remaining - - # normalize by total number of turns possible - turn_acc = turn_correct / float(num_turns) - final_score = turn_score / float(num_turns) - - full_result = {'Intent_Accuracy': round(intent_acc, 4), - 'Nextstep_Accuracy': round(nextstep_acc, 4), - 
'Action_Accuracy': round(action_acc, 4), - 'Value_Accuracy': round(value_acc, 4), - 'Joint_Accuracy': round(joint_acc, 4), - 'Recall_at_1': round(recall['1'], 4), - 'Recall_at_5': round(recall['5'], 4), - 'Recall_at_10': round(recall['10'], 4), - 'Turn_Accuracy': round(turn_acc, 4), - 'Cascading_Score': round(final_score, 4) } - - return full_result, 'Cascading_Score' - -def task_completion_report(predictions, labels, kb_labels=None): - intent_pred, nextstep_pred, action_pred, value_pred, utterance_rank = predictions - intent_label, nextstep_label, action_label, value_label, utterance_label = labels - num_turns = len(nextstep_pred) - - if kb_labels is None: - use_kb = False - else: - use_kb = True - intent_list = kb_labels['intent'] - action_list = kb_labels['action'] - guidelines = load_guidelines() - action_mask_map, intent_mask_map = Application.prepare_masks(*guidelines) - - top_intent_preds = np.argmax(intent_pred, axis=1) - intent_match = intent_label == top_intent_preds # array of booleans - intent_acc = sum(intent_match) / float(num_turns) - - top_nextstep_preds = np.argmax(nextstep_pred, axis=1) - nextstep_match = nextstep_label == top_nextstep_preds # array of booleans - nextstep_acc = sum(nextstep_match) / float(num_turns) - - if use_kb: - intent_masks = [] - for top_intent in top_intent_preds: - intent_name = intent_list[top_intent] - # each intent mask should be size of 30 long - intent_mask = intent_mask_map[intent_name] - intent_masks.append(intent_mask) - # now, all non valid actions should go to zero - action_pred *= np.array(intent_masks) - - top_action_preds = np.argmax(action_pred, axis=1) - action_match = action_label == top_action_preds # array of booleans - num_turns_include_action = sum(action_label >= 0) - action_acc = sum(action_match) / float(num_turns_include_action) - - if use_kb: - action_masks = [] - for top_action in top_action_preds: - action_name = action_list[top_action] - # each action mask should be size of 223 long - action_mask = action_mask_map[action_name] - action_masks.append(action_mask) - # now, all non valid values should go to zero - value_pred *= np.array(action_masks) - - top_value_preds = np.argmax(value_pred, axis=1) - value_match = value_label == top_value_preds - num_turns_include_value = sum(value_label >= 0) - value_acc = sum(value_match) / float(num_turns_include_value) - - joint_match = action_match & value_match - joint_acc = sum(joint_match) / float(num_turns_include_action) - - recall, utt_match = ranking_report(utterance_rank, utterance_label, use_match=True) - - assert(num_turns == len(value_label)) - assert(len(intent_pred) == len(nextstep_label)) - assert(len(utt_match) == num_turns) - assert(len(action_match) == len(top_value_preds)) - - turn_correct = 0 - for turn in range(num_turns): - if intent_match[turn] and nextstep_match[turn]: - pass - else: - continue - - if nextstep_label[turn] == 0 and utt_match[turn]: - turn_correct += 1 - elif nextstep_label[turn] == 1 and joint_match[turn]: - turn_correct += 1 - elif nextstep_label[turn] == 2: # end_conversation - turn_correct += 1 - turn_acc = turn_correct / float(num_turns) - - full_result = {'Intent_Accuracy': round(intent_acc, 4), - 'Nextstep_Accuracy': round(nextstep_acc, 4), - 'Action_Accuracy': round(action_acc, 4), - 'Value_Accuracy': round(value_acc, 4), - 'Joint_Accuracy': round(joint_acc, 4), - 'Recall_at_1': round(recall['Recall_at_1'], 4), - 'Recall_at_5': round(recall['Recall_at_5'], 4), - 'Recall_at_10': round(recall['Recall_at_10'], 4), - 'Turn_Accuracy': 
round(turn_acc, 4) } - - return full_result, 'Turn_Accuracy' - -def qualify(args, ids, tokenizer, target_maps, scores, targets): - history_ids, context_ids = ids - action_mapper, value_mapper = target_maps - num_values = len(value_mapper) - pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token) - - action_score, value_score = scores - action_target, value_target = targets - top_action_ids = np.argmax(action_score.detach().cpu().numpy(), axis=1) - top_value_ids = np.argmax(value_score.detach().cpu().numpy(), axis=1) - - for index, (history, context) in enumerate(zip(history_ids, context_ids)): - stripped_history = [x for x in history if x != pad_id] - history_tokens = tokenizer.convert_ids_to_tokens(stripped_history) - history_symbols = ' '.join(history_tokens).replace(' ##', '') - history_text = history_symbols.replace('Ġ', '').replace('', '//').replace(' âĢ Ļ', '\'') - action_pred = action_mapper[top_action_ids[index]] - action_actual = action_mapper[action_target[index].cpu()] - - if args.filter and (action_pred == action_actual): - print('--- Skipping since model is correct ---') - continue - - context_tokens = tokenizer.convert_ids_to_tokens(context) - tvii = top_value_ids[index] - if tvii >= num_values: - tvii -= num_values - value_pred = context_tokens[tvii] - else: - value_pred = value_mapper[tvii] - - vtic = value_target[index].cpu() - if vtic >= num_values: - vtic -= num_values - value_actual = context_tokens[vtic] - else: - value_actual = value_mapper[vtic] - print(index, history_text) - print(f"Predicted Action: {action_pred}, Actual: {action_actual}") - print(f"Predicted Value: {value_pred}, Actual: {value_actual}") - - pdb.set_trace() - -def quantify(args, predictions, labels, utils=None): - assert len(predictions) == len(labels) - - if utils == "train" and not args.verbose: - return predictions, labels - - if args.task == 'ast': - predictions = [pred.detach().cpu().numpy() for pred in predictions] - labels = [label.detach().cpu().numpy() for label in labels] - report, res_name = ast_report(predictions, labels) - - elif args.task == 'cds': - predictions = [pred.detach().cpu().numpy() for pred in predictions] - labels = [label.detach().cpu().numpy() for label in labels] - kb_labels = utils['kb_labels'] if args.use_kb else None - - if args.cascade: - ci_and_tc = utils['ci_and_tc'] - result = cds_report(predictions, labels, ci_and_tc, kb_labels) - report, res_name = result - else: - report, res_name = task_completion_report(predictions, labels, kb_labels) - - return report, res_name - - -if __name__ == '__main__': - class MyModel(): - def __init__(self): - self.utt_vectors = [] - self.utt_texts = [] - - args = {} - run_interaction(args, MyModel()) diff --git a/utils/help.py b/utils/help.py deleted file mode 100644 index 28995d8..0000000 --- a/utils/help.py +++ /dev/null @@ -1,83 +0,0 @@ -import os, sys, pdb -import random -import math -import torch -import numpy as np - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - -def set_seed(args): - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if args.n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) - -def setup_gpus(args): - n_gpu = 0 # set the default to 0 - if torch.cuda.is_available(): - n_gpu = torch.cuda.device_count() - args.n_gpu = n_gpu - if n_gpu > 0: # this is not an 'else' statement and cannot be combined - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True - - if args.debug: - args.epochs = 3 - return args - -def 
check_cache(args, cache_dir): - cache_filename = f"{args.model_type}_{args.task}" - if args.cascade: - cache_filename += "_cascade" - if args.use_intent: - cache_filename += "_intent" - cache_path = os.path.join(cache_dir, cache_filename) - - if os.path.exists(cache_path): - print(f"Loading features from cached file {cache_path}") - return cache_path, True - else: - print(f"Loading raw data and preparing new features") - return cache_path, False - -def check_directories(args): - cache_dir = os.path.join(args.input_dir, 'cache') - checkpoint_folder = f'{args.prefix}_{args.filename}_{args.model_type}_{args.suffix}' - ckpt_dir = os.path.join(args.output_dir, args.task, checkpoint_folder) - directories = [args.input_dir, cache_dir, args.output_dir, ckpt_dir] - - for directory in directories: - if os.path.exists(directory): - if directory == ckpt_dir: - print(f"Warning: {directory} exists and files may be overwritten") - else: - print(f"Creating {directory} directory ...") - os.makedirs(directory) - - cache_results = check_cache(args, cache_dir) - return ckpt_dir, cache_results - -def prepare_inputs(args, batch, speaker_turn=False): - - if args.task == 'ast': - full_history = {'input_ids': batch[0], 'token_type_ids': batch[1], 'attention_mask': batch[2]} - context_tokens = {'input_ids': batch[3], 'token_type_ids': batch[4], 'attention_mask': batch[5]} - targets = [batch[6], batch[7]] # actions and values - tools = device - else: - full_history = {'input_ids': batch[0], 'token_type_ids': batch[1], 'attention_mask': batch[2]} - context_tokens = {'input_ids': batch[3], 'token_type_ids': batch[4], 'attention_mask': batch[5]} - # intent nextstep action value utterance - targets = [batch[6], batch[7], batch[8], batch[9], batch[10]] - candidates = batch[11] - - if args.cascade: - targets.append(batch[15]) # convo_ids - targets.append(batch[16]) # turn_counts - if args.use_intent: - tools = candidates, device, batch[6] - else: - tools = candidates, device - - return full_history, targets, context_tokens, tools diff --git a/utils/load.py b/utils/load.py deleted file mode 100644 index e50cf7e..0000000 --- a/utils/load.py +++ /dev/null @@ -1,63 +0,0 @@ -import os, sys, pdb -import csv -import json -import random -import math -import torch -import numpy as np - -from transformers import BertTokenizer, RobertaTokenizer, AlbertTokenizer -from components.tools import RAdam, AdamW, get_linear_schedule_with_warmup - -def load_data(args, already_cached): - if already_cached: - return [] # no need to load raw_data since we already have a feature cache - else: - data_path = os.path.join(args.input_dir, f"abcd_v{args.version}.json") - raw_data = json.load(open(data_path, 'r')) - return raw_data - -def load_guidelines(): - kb = json.load(open('data/kb.json', 'r')) - ont = json.load(open('data/ontology.json', 'r')) - return kb, ont - -def load_candidates(args): - # The raw agent utterances that are used as candidates when performing utterance ranking - utt_texts = json.load(open(f'{args.input_dir}/utterances.json', 'r')) - # Vectors already been embedded by BERT. 
To embed in some other fashion, use the utt_texts instead - utt_vectors = torch.load(f'{args.input_dir}/utt_vectors.pt') - return utt_texts, utt_vectors - -def load_tokenizer(args): - ontology = json.load(open(f'{args.input_dir}/ontology.json', 'r')) - non_enumerable = ontology['values']['non_enumerable'] - special = [f'<{slot}>' for category, slots in non_enumerable.items() for slot in slots] - - if args.model_type == 'bert': - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - elif args.model_type == 'roberta': - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - elif args.model_type == 'albert': - tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') - - tokenizer.add_tokens(special) - return tokenizer, ontology - -def get_optimizer(args, model, adam_epsilon=1e-8): - no_decay = ['bias', 'LayerNorm.weight'] - grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, - {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.radam: - optimizer = RAdam(grouped_parameters, lr=args.learning_rate, eps=adam_epsilon) - else: - optimizer = AdamW(grouped_parameters, lr=args.learning_rate, eps=adam_epsilon) - return optimizer - -def get_scheduler(args, optimizer, training_steps, warmup_steps=0, warmup_ratio=0.06): - if warmup_steps == 0: # use the warmup ratio instead - warmup_steps = math.ceil(training_steps * warmup_ratio) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=training_steps) - return scheduler diff --git a/utils/process.py b/utils/process.py deleted file mode 100644 index a28f296..0000000 --- a/utils/process.py +++ /dev/null @@ -1,388 +0,0 @@ -import os, sys, pdb -import csv -import json -import random -import torch -import numpy as np -import pandas as pd -import datetime - -from tqdm import tqdm as progress_bar -from components.datasets import ActionFeature, CompletionFeature, CascadeFeature -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler - -def setup_dataloader(datasets, batch_size, split): - dataset = datasets[split] - num_examples = len(dataset) - sampler = RandomSampler(dataset) if split == 'train' else SequentialSampler(dataset) - collate = dataset.collate_func - dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate) - print(f"Loaded {split} data with {len(dataloader)} batches") - return dataloader, num_examples - -def notify_feature_sizes(args, features): - if args.verbose: - for split, feats in features.items(): - print(f"{split}: {len(feats)} features") - -def prepare_action_labels(ontology): - action_list = [] - for section, buttons in ontology["actions"].items(): - actions = buttons.keys() - action_list.extend(actions) - return {action: idx for idx, action in enumerate(action_list)} - -def prepare_intent_labels(ontology): - intent_list = [] - for flow, subflows in ontology["intents"]["subflows"].items(): - intent_list.extend(subflows) - return {intent: idx for idx, intent in enumerate(intent_list)} - -def prepare_nextstep_labels(ontology): - nextstep_list = ontology['next_steps'] - return {nextstep: idx for idx, nextstep in enumerate(nextstep_list)} - -def prepare_value_labels(ontology): - value_list = [] - for category, values in ontology["values"]["enumerable"].items(): - # value_list.extend(values) - for val in values: - if val not in value_list: # remove 
diff --git a/utils/process.py b/utils/process.py
deleted file mode 100644
index a28f296..0000000
--- a/utils/process.py
+++ /dev/null
@@ -1,388 +0,0 @@
-import os, sys, pdb
-import csv
-import json
-import random
-import torch
-import numpy as np
-import pandas as pd
-import datetime
-
-from tqdm import tqdm as progress_bar
-from components.datasets import ActionFeature, CompletionFeature, CascadeFeature
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-
-def setup_dataloader(datasets, batch_size, split):
-    dataset = datasets[split]
-    num_examples = len(dataset)
-    sampler = RandomSampler(dataset) if split == 'train' else SequentialSampler(dataset)
-    collate = dataset.collate_func
-    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate)
-    print(f"Loaded {split} data with {len(dataloader)} batches")
-    return dataloader, num_examples
-
-def notify_feature_sizes(args, features):
-    if args.verbose:
-        for split, feats in features.items():
-            print(f"{split}: {len(feats)} features")
-
-def prepare_action_labels(ontology):
-    action_list = []
-    for section, buttons in ontology["actions"].items():
-        actions = buttons.keys()
-        action_list.extend(actions)
-    return {action: idx for idx, action in enumerate(action_list)}
-
-def prepare_intent_labels(ontology):
-    intent_list = []
-    for flow, subflows in ontology["intents"]["subflows"].items():
-        intent_list.extend(subflows)
-    return {intent: idx for idx, intent in enumerate(intent_list)}
-
-def prepare_nextstep_labels(ontology):
-    nextstep_list = ontology['next_steps']
-    return {nextstep: idx for idx, nextstep in enumerate(nextstep_list)}
-
-def prepare_value_labels(ontology):
-    value_list = []
-    for category, values in ontology["values"]["enumerable"].items():
-        # value_list.extend(values)
-        for val in values:
-            if val not in value_list:  # remove exactly one instance of credit_card
-                value_list.append(val.lower())
-    return {slotval: idx for idx, slotval in enumerate(value_list)}
-
-class BaseProcessor(object):
-
-    def __init__(self, args, tokenizer, ontology):
-        self.task = args.task
-        self.model_type = args.model_type
-        self.use_intent = args.use_intent
-
-        self.tokenizer = tokenizer
-        self.ontology = ontology
-
-        self.prepare_labels(args)
-        self.prepare_special_tokens(args)
-
-    def prepare_labels(self, args):
-        self.non_enumerable = self.ontology["values"]["non_enumerable"]
-        self.enumerable = {}
-        for category, values in self.ontology["values"]["enumerable"].items():
-            self.enumerable[category] = [val.lower() for val in values]
-
-        self.mappers = {
-            'value': prepare_value_labels(self.ontology),
-            'action': prepare_action_labels(self.ontology),
-            'intent': prepare_intent_labels(self.ontology),
-            'nextstep': prepare_nextstep_labels(self.ontology)
-        }  # utterance is ranking, so not needed
-        self.start_idx = len(self.mappers['value'])
-
-        # Break down the slot values by action
-        self.value_by_action = {}
-        for section, actions in self.ontology["actions"].items():
-            for action, targets in actions.items():
-                self.value_by_action[action] = targets
-
-    def prepare_special_tokens(self, args):
-        special_tokens_count = 3 if args.model_type == 'roberta' else 2
-        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
-        effective_max = args.max_seq_len - special_tokens_count
-        cls_token_segment_id = 0
-        sequence_a_segment_id = 0 if args.model_type == 'roberta' else 1
-        pad_token_segment_id = 0
-
-        self.special = {
-            'tokens': [self.tokenizer.cls_token, self.tokenizer.sep_token, self.tokenizer.pad_token],
-            'ids': [cls_token_segment_id, sequence_a_segment_id, pad_token_segment_id],
-            'maximum': [effective_max, args.max_seq_len]
-        }
-
-    def value_to_id(self, context, action, value, potential_vals):
-        # context is a list of utterances
-        target_id = -1
-        action_tokens = self.tokenizer.tokenize(action)
-        filtered = []
-        for utterance in context:
-            speaker, text = utterance.split('|')
-            context_tokens = self.tokenizer.tokenize(text)
-            for tok in context_tokens:
-                if tok in filtered: continue  # find uniques this way to preserve order
-                if len(tok) > 2:
-                    filtered.append(tok)  # remove punctuation and special tokens
-
-        effective_max = 100 - (len(action_tokens) + 3)  # three special tokens will be added
-        tokens = filtered[-effective_max:]  # [CLS] action [SEP] filtered [SEP]
-
-        for option in potential_vals:
-            if option in self.enumerable:  # just look it up
-                if value in self.enumerable[option]:
-                    target_id = self.mappers['value'][value]
-            else:
-                entity = f'<{option}>'  # calculate location in the context
-                if entity in tokens:
-                    target_id = self.start_idx + tokens.index(entity)
-
-            if target_id >= 0: break  # we found our guy, so let's move on
-
-        return target_id, tokens
-
-    def build_features(self, args, raw_data):
-        print("Build features method missing")
-        raise NotImplementedError()
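# A toy illustration (made-up slot values) of the value-id scheme implemented
# by prepare_value_labels and value_to_id above: enumerable values get fixed
# class ids, while non-enumerable values are predicted as a copy position,
# offset by start_idx, pointing at their delexicalized placeholder in context.
value_mapper = {'gold': 0, 'silver': 1, 'bronze': 2}   # as if from prepare_value_labels
start_idx = len(value_mapper)                          # first copy position = 3

print(value_mapper['silver'])                          # enumerable: direct lookup -> 1

tokens = ['hello', 'account', '<username>', 'please']  # filtered context tokens
print(start_idx + tokens.index('<username>'))          # non-enumerable: 3 + 2 = 5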
-    def embed_utterance(self, text):
-        cls_token, sep_token, pad_token = self.special['tokens']
-        cls_token_segment_id, sequence_a_segment_id, pad_token_segment_id = self.special['ids']
-        effective_max, max_seq_length = self.special['maximum']
-
-        text = pad_token if text == '' else text
-        if self.model_type in ['roberta', 'large']:
-            tokens = self.tokenizer.tokenize(text, add_prefix_space=True)
-        else:
-            tokens = self.tokenizer.tokenize(text)
-        if len(tokens) > effective_max:
-            tokens = tokens[:effective_max]
-
-        tokens = tokens + [sep_token]
-        segment_ids = [cls_token_segment_id] + [sequence_a_segment_id] * len(tokens)
-        tokens = [cls_token] + tokens
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #   tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #   type_ids:   0   0    0    0     0       0   0   0   1  1  1   1 1   1
-        # (b) For single sequences:
-        #   tokens:   [CLS] the dog is hairy . [SEP]
-        #   type_ids:   0    0   0  0    0   0   0
-        #
-        # Since we only ever have input text, the "type_ids" are instead used to indicate
-        # the speaker, where 0 = customer, 1 = agent and 2 = action.
-        # The embedding vectors for `type=0` and `type=1` were learned during pre-training and
-        # are added to the wordpiece embedding vector (and position vector). Hopefully
-        # the fine-tuning can overcome this difference in semantic meaning.
-
-        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
-        input_mask = [1] * len(input_ids)
-
-        pad_token_id = self.tokenizer.convert_tokens_to_ids([pad_token])[0]
-        # Zero-pad up to the sequence length.
-        padding_length = max_seq_length - len(input_ids)
-        input_ids = input_ids + ([pad_token_id] * padding_length)
-        segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
-        input_mask = input_mask + ([0] * padding_length)
-
-        assert len(input_ids) == max_seq_length
-        assert len(segment_ids) == max_seq_length
-        assert len(input_mask) == max_seq_length
-
-        return input_ids, segment_ids, input_mask
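# A stand-alone sketch of the same [CLS]/[SEP]/padding logic, runnable in
# isolation; max_seq_length=12 is chosen purely for display purposes.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_seq_length = 12

tokens = tokenizer.tokenize('where is my order') + [tokenizer.sep_token]
segment_ids = [0] + [1] * len(tokens)    # type_ids double as speaker ids in this scheme
tokens = [tokenizer.cls_token] + tokens

input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_mask = [1] * len(input_ids)

pad_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
padding = max_seq_length - len(input_ids)
input_ids += [pad_id] * padding
segment_ids += [0] * padding
input_mask += [0] * padding
assert len(input_ids) == len(segment_ids) == len(input_mask) == max_seq_length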
-    def convert_context_tokens(self, context_tokens):
-        # context_tokens is a list of pre-tokenized strings, with action name in the front
-        # and we want a list of embedded vectors
-        cls_token, sep_token, pad_token = self.special['tokens']
-        cls_token_segment_id, sequence_a_segment_id, pad_token_segment_id = self.special['ids']
-
-        tokens = context_tokens + [sep_token]
-        segment_ids = [cls_token_segment_id] + [sequence_a_segment_id] * len(tokens)
-        tokens = [cls_token] + tokens
-
-        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
-        input_mask = [1] * len(token_ids)
-
-        pad_token_id = self.tokenizer.convert_tokens_to_ids([pad_token])[0]
-        # Zero-pad up to the sequence length.
-        padding_length = 100 - len(token_ids)
-        token_ids = token_ids + ([pad_token_id] * padding_length)
-        segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
-        mask_ids = input_mask + ([0] * padding_length)
-
-        return {'token_ids': token_ids, 'segment_ids': segment_ids, 'mask_ids': mask_ids}
-
-    def action_to_id(self, action):
-        if self.task == 'value':
-            return action
-        if ' ' in action:
-            action, input_position = action.split(' ')
-        return self.mappers['action'][action]
-
-    def convert_example(self, dialog_history, target_ids, context_tokens, intent=None, candidates=None):
-        sep_token = self.special['tokens'][1]
-
-        texts = [utterance.split('|')[1] for utterance in dialog_history]  # drop the speaker
-        if self.use_intent:
-            texts = [f"{intent}|{text}" for text in texts]
-        embedded, segments, mask = self.embed_utterance(f' {sep_token} '.join(texts))
-
-        if self.task == 'ast':
-            embedded_context = self.convert_context_tokens(context_tokens)
-            feature = ActionFeature(input_ids=embedded, segment_ids=segments, input_mask=mask,
-                                    label_ids=target_ids, context=embedded_context)
-        elif self.task == 'cds':
-            embedded_context = self.convert_context_tokens(context_tokens)
-            feature = CascadeFeature(input_ids=embedded, segment_ids=segments, input_mask=mask,
-                                     label_ids=target_ids, context=embedded_context, candidates=candidates)
-
-        return feature
-
-class ASTProcessor(BaseProcessor):
-
-    def collect_one_example(self, context, action, value, potential_vals):
-        # actions that don't require any values
-        if value == 'not applicable':
-            target_ids = {'action': self.action_to_id(action), 'value': -1}
-            feature = self.convert_example(context, target_ids, [])
-            self.split_feats.append(feature)
-
-        else:  # actions that require at least one value
-            value_id, context_tokens = self.value_to_id(context, action, value, potential_vals)
-            # context_tokens are used for copying from the context when selecting values
-            if value_id >= 0:
-                target_ids = {'action': self.action_to_id(action), 'value': value_id}
-                feature = self.convert_example(context, target_ids, context_tokens)
-                self.split_feats.append(feature)
-
-    def collect_examples(self, context, action, values):
-        potential_vals = self.value_by_action[action]
-        # just skip if action does not require value inputs
-        if len(potential_vals) > 0:
-            # these two actions require 3 value inputs, so we break it down
-            if action in ['verify-identity', 'validate-purchase']:
-                # a smarter model can be made that handles each position conditioned on other values
-                for position, value in zip(['a', 'b', 'c'], values):
-                    action_name = action + ' ' + position
-                    self.collect_one_example(context, action_name, value, potential_vals)
-            # other actions require a single value to be filled
-            else:
-                self.collect_one_example(context, action, values[0], potential_vals)
-        else:
-            self.collect_one_example(context, action, 'not applicable', potential_vals)
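# A toy walk-through (hypothetical customer values) of how collect_examples
# above splits a three-input action into one training example per slot position.
action = 'verify-identity'
values = ['alice example', '(555) 123-4567', 'alice@email.com']

for position, value in zip(['a', 'b', 'c'], values):
    action_name = action + ' ' + position    # e.g. 'verify-identity a'
    print(action_name, '->', value)
# action_to_id strips the position suffix again, so all three examples share
# one action label while carrying different value targets.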
-    def build_features(self, args, raw_data):
-        features = {}
-
-        for split, data in raw_data.items():
-            self.split_feats = []
-            print(f"Building features for {split}")
-
-            for convo in progress_bar(data, total=len(data)):
-                so_far = []
-
-                for turn in convo['delexed']:
-                    speaker, utt = turn['speaker'], turn['text']
-                    _, _, action, values, _ = turn['targets']
-
-                    if speaker in ['agent', 'customer']:
-                        utt_str = f'{speaker}|{utt}'
-                        so_far.append(utt_str)
-                    else:  # create a training example during every action
-                        context = so_far.copy()  # [::-1] to reverse
-                        self.collect_examples(context, action, values)
-                        action_str = f'action|{action}'
-                        so_far.append(action_str)
-
-            features[split] = self.split_feats
-        return features
-
-class CDSProcessor(BaseProcessor):
-
-    def collect_one_example(self, dialog_history, targets, support_items):
-        intent, nextstep, action, _, utt_id = targets
-        candidates = [-1]*100
-        context_tokens = []
-        action_id, value_id = -1, -1
-
-        if nextstep == 'take_action':
-            value, potential_vals, convo_id, turn_id = support_items
-            action_id = self.action_to_id(action)
-            if value != 'not applicable':
-                value_id, context_tokens = self.value_to_id(dialog_history, action, value, potential_vals)
-
-        elif nextstep == 'retrieve_utterance':
-            candidates, convo_id, turn_id = support_items
-
-        elif nextstep == 'end_conversation':
-            convo_id, turn_id = support_items
-
-        target_ids = {
-            'intent': self.mappers['intent'][intent],
-            'nextstep': self.mappers['nextstep'][nextstep],
-            'action': action_id,
-            'value': value_id,
-            'utterance': utt_id,
-            'convo': convo_id,
-            'turn': turn_id,
-        }
-        feature = self.convert_example(dialog_history, target_ids, context_tokens, intent, candidates)
-        self.split_feats.append(feature)
-
-    def collect_examples(self, context, targets, convo_id, turn_id):
-        _, _, action, values, _ = targets
-        potential_vals = self.value_by_action[action]
-
-        if len(potential_vals) > 0:  # just skip if action does not require inputs
-            if action in ['verify-identity', 'validate-purchase']:  # 3 action inputs
-                for position, value in zip(['a', 'b', 'c'], values):
-                    # unlike ASTProcessor, the positional action_name is unused here;
-                    # targets are passed through unchanged for each position
-                    action_name = action + ' ' + position
-                    self.collect_one_example(context, targets, (value, potential_vals, convo_id, turn_id))
-            else:
-                self.collect_one_example(context, targets, (values[0], potential_vals, convo_id, turn_id))
-        else:
-            self.collect_one_example(context, targets, ("not applicable", potential_vals, convo_id, turn_id))
-
-    def build_features(self, args, raw_data):
-        features = {}
-
-        for split, data in raw_data.items():
-            self.split_feats = []
-            print(f"Building features for {split}")
-
-            for convo in progress_bar(data, total=len(data)):
-                so_far = []
-
-                for turn in convo['delexed']:
-                    speaker, text = turn['speaker'], turn['text']
-                    utterance = f"{speaker}|{text}"
-
-                    if speaker == 'agent':
-                        context = so_far.copy()
-                        support_items = turn['candidates'], convo['convo_id'], turn['turn_count']
-                        self.collect_one_example(context, turn['targets'], support_items)
-                        so_far.append(utterance)
-                    elif speaker == 'action':
-                        context = so_far.copy()
-                        self.collect_examples(context, turn['targets'], convo['convo_id'], turn['turn_count'])
-                        so_far.append(utterance)
-                    else:
-                        so_far.append(utterance)
-
-                context = so_far.copy()  # the entire conversation
-                end_targets = turn['targets'].copy()
-                end_targets[1] = 'end_conversation'
-                end_targets[4] = -1
-                support_items = convo['convo_id'], turn['turn_count']
-                self.collect_one_example(context, end_targets, support_items)
-
-            features[split] = self.split_feats
-        return features
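# A toy view (made-up intent and ids) of the five-part targets list consumed
# above; after the final turn, build_features clones it and rewrites the
# nextstep and utterance slots to create the end_conversation example.
targets = ['recover_username', 'retrieve_utterance', None, None, 42]
#           intent              nextstep             action values utt_id

end_targets = targets.copy()
end_targets[1] = 'end_conversation'
end_targets[4] = -1
print(end_targets)   # ['recover_username', 'end_conversation', None, None, -1]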
-
-def process_data(args, tokenizer, ontology, raw_data, cache_path, from_cache):
-    # Takes in a pre-processed dataset and performs further operations:
-    # 1) Extract the labels  2) Embed the inputs  3) Store both into features  4) Cache the results
-    if args.task == 'ast':
-        processor = ASTProcessor(args, tokenizer, ontology)
-    elif args.task == 'cds':
-        processor = CDSProcessor(args, tokenizer, ontology)
-
-    if from_cache:
-        features = torch.load(cache_path)
-        print(f"Features loaded successfully.")
-    else:
-        features = processor.build_features(args, raw_data)
-        print(f"Saving features into cached file {cache_path}")
-        torch.save(features, cache_path)
-
-    notify_feature_sizes(args, features)
-    return features, processor.mappers
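Taken together with `check_cache`, `process_data` amounts to a simple disk cache keyed on the task setup. A minimal sketch of the same round-trip, where the path and the placeholder features dict are illustrative stand-ins for the real cache filename and `build_features` output:

import os
import torch

cache_path = 'data/cache/bert_ast'

if os.path.exists(cache_path):
    features = torch.load(cache_path)                   # reuse cached features
else:
    features = {'train': [], 'dev': [], 'test': []}     # stand-in for build_features(...)
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    torch.save(features, cache_path)                    # cache for the next run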