From 0d37321f7ab39f10ae7eaa2670965e258dd7b71e Mon Sep 17 00:00:00 2001 From: pasinger Date: Thu, 17 Mar 2022 12:27:30 -0400 Subject: [PATCH 01/16] Update base.py add class BEEPCycleFeatureMatrix to combine features which were generated at every diagnostic cycle --- beep/features/base.py | 204 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 203 insertions(+), 1 deletion(-) diff --git a/beep/features/base.py b/beep/features/base.py index b70a596f..f3d3fffd 100644 --- a/beep/features/base.py +++ b/beep/features/base.py @@ -72,7 +72,7 @@ def __init__(self, structured_datapath: Union[BEEPDatapath, None], hyperparamete else: self.hyperparameters = self.DEFAULT_HYPERPARAMETERS - if not (structured_datapath is None or structured_datapath.is_structured): + if structured_datapath is not None and not structured_datapath.is_structured: raise BEEPFeaturizationError("BEEPDatapath input is not structured!") self.datapath = structured_datapath @@ -361,3 +361,205 @@ def to_json_file(self, filename): """ d = self.as_dict() dumpfn(d, filename) + +class BEEPCycleFeatureMatrix(MSONable): + """ + Create an ((n battery cycler files) x (j cycles)) x (k features) array composed of + m BEEPFeaturizer objects. + + Args: + beepfeaturizers ([BEEPFeaturizer]): A list of BEEPFeaturizer objects + + """ + + OP_DELIMITER = "::" + + def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): + + if beepfeaturizers: + # initialize emtpy dict of file names + dfs_by_file = {os.path.basename( + bf.paths.get("structured", "no file found") + )[0:-19]: pd.DataFrame(columns=['filename','cycle_index','diag_pos'] + ) for bf in beepfeaturizers} + # big_df_rows = {bf.__class__.__name__: [] for bf in beepfeaturizers} + unique_features = {} + for i, bf in enumerate(beepfeaturizers): + if bf.features is None: + raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} has not created features") + +# elif bf.features.shape[0] != 1: +# raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} features are not 1-dimensional.") + else: + bfcn = bf.__class__.__name__ + +# fname = bf.paths.get("structured", None) + fname = os.path.basename(bf.paths['structured'])[0:-19] + if not fname: + raise BEEPFeatureMatrixError( + "Cannot join features automatically as no linking can be done " + "based on original structured filename." + ) + + # Check for any possible feature collisions using identical featurizers + # on identical files + + # sort params for this featurizer obj by key + params = sorted(list(bf.hyperparameters.items()), key=lambda x: x[0]) + + # Prevent identical features from identical input files + # create a unique operation string for the application of this featurizer + # on a specific file, this op string will be the same as long as + # the featurizer class name, hyperparameters, and class are the same + + param_str = "-".join([f"{k}:{v}" for k, v in params]) + param_hash = hashlib.sha256(param_str.encode("utf-8")).hexdigest() + + # Get an id for this featurizer operation (including hyperparameters) + # regardless of the file it is applied on + feature_op_id = f"{bfcn}{self.OP_DELIMITER}{param_hash}" + + # Get an id for this featurizer operation (including hyperparameters) + # on THIS SPECIFIC file. + file_feature_op_id = f"{fname}{self.OP_DELIMITER}{bfcn}{self.OP_DELIMITER}{param_hash}" + + # Get a unique id for every feature generated by a specific + # featurizer on a specific file. 
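+                    # Illustrative form of one such id (file stem, class, and column names are hypothetical):
+                    #   "<structured_file_stem>::<FeaturizerClassName>::<sha256_of_sorted_hyperparameters>::<feature_column>"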
+ this_file_feature_columns_ids = \ + [ + f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c in bf.features.columns + ] + + # Check to make sure there are no duplicates of the exact same feature for + # the exact same featurizer with the exact same hyperparameters on the exact + # same file. + collisions = {c: f for c, f in unique_features.items() if c in this_file_feature_columns_ids} + if collisions: + raise BEEPFeatureMatrixError( + f"Multiple features generated with identical classes and identical hyperparameters" + f" attempted to be joined into same dataset; \n" + f"{bfcn} features collide with existing: \n{collisions}" + ) + for c in this_file_feature_columns_ids: + unique_features[c] = bfcn + + # Create consistent scheme for naming features regardless of file + df = copy.deepcopy(bf.features) + consistent_column_names = [f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in df.columns] + df.columns = consistent_column_names + +# df.index = [fname] * df.shape[0] +# df.index.rename("filename", inplace=True) + + # create filename column to merge on + df['filename'] = os.path.basename(bf.paths['structured'])[0:-19] + +# df = df.reset_index(drop=True) + + # remove hash from cycle_index and diag_pos column + cycle_index_col = [col for col in df.columns if 'cycle_index' in col] + df.rename(columns={cycle_index_col[0]:'cycle_index'},inplace=True) + + # remove hash from diag_pos column + diag_pos_col = [col for col in df.columns if 'diag_pos' in col] + df.rename(columns={diag_pos_col[0]:'diag_pos'},inplace=True) + + # ensure cycle_index and diag_pos are integers + df['cycle_index'] = df['cycle_index'].astype(int) + df['diag_pos'] = df['diag_pos'].astype(int) + + # append each BEEPFeaturizer df to the corresponding cell dict entry +# dfs_by_file[fname].append(df) + dfs_by_file[fname] = dfs_by_file[fname].merge( + df,how='outer',on=['filename','cycle_index','diag_pos']).sort_values('cycle_index').reset_index(drop=True) +# dfs_by_file[fname] = pd.concat( +# [dfs_by_file[fname],df], +# axis=1,join='outer',ignore_index=True, +# keys=['filename']) +# self.dfs_by_file = dfs_by_file +# self.df = df +# return None + + + rows = [] + self.matrix = pd.DataFrame() + for filename, dfs in dfs_by_file.items(): +# row = pd.concat([row,dfs], axis=1) +# row = row[sorted(row.columns)] +# rows.append(row) + self.matrix = pd.concat([self.matrix,dfs], axis=0, ignore_index=True, join='outer') #, keys=['filename'] + + else: + self.matrix = None + + self.featurizers = beepfeaturizers + + def as_dict(self): + """Serialize a BEEPDatapath as a dictionary. + + Must not be loaded from legacy. + + Returns: + (dict): corresponding to dictionary for serialization. + + """ + + return { + "@module": self.__class__.__module__, + "@class": self.__class__.__name__, + + # Core parts of BEEPFeaturizer + "featurizers": [f.as_dict() for f in self.featurizers], + "matrix": self.matrix.to_dict("list"), + } + + @classmethod + def from_dict(cls, d): + """Create a BEEPDatapath object from a dictionary. + + Args: + d (dict): dictionary represenation. + + Returns: + beep.structure.ProcessedCyclerRun: deserialized ProcessedCyclerRun. + """ + # no need for original datapaths, as their ref paths should + # be in the subobjects + featurizers = [MontyDecoder().process_decoded(f) for f in d["featurizers"]] + return cls(featurizers) + + @classmethod + def from_json_file(cls, filename): + """Load a structured run previously saved to file. + + .json.gz files are supported. + + Loads a BEEPFeatureMatrix from json. 
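+        A minimal usage sketch (the file name here is assumed):
+
+            matrix = BEEPCycleFeatureMatrix.from_json_file("feature_matrix.json.gz")
+            df = matrix.matrix   # rows keyed by filename, cycle_index, and diag_pos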
+ + Can be used in combination with files serialized with BEEPFeatures.to_json_file. + + Args: + filename (str, Pathlike): a json file from a structured run, serialzed with to_json_file. + + Returns: + None + """ + return loadfn(filename) + + def to_json_file(self, filename): + """Save a BEEPFeatureMatrix to disk as a json. + + .json.gz files are supported. + + Not named from_json to avoid conflict with MSONable.from_json(*) + + Args: + filename (str, Pathlike): The filename to save the file to. + omit_raw (bool): If True, saves only structured (NOT RAW) data. + More efficient for saving/writing to disk. + + Returns: + None + """ + d = self.as_dict() + dumpfn(d, filename) \ No newline at end of file From 6bf62f9fc9f5c14f79163d26174051f7060a68c1 Mon Sep 17 00:00:00 2001 From: pasinger Date: Thu, 17 Mar 2022 12:42:27 -0400 Subject: [PATCH 02/16] Update core.py Add CycleFeatures classes and change existing class names to EarlyFeatures where appropriate. HPPCResistanceVoltageCycleFeatures returns hppc resistance features at every diagnostic. DiagnosticCycleFeatures returns some standard quantities, like discharge capacity of various RPTs across all diagnostic cycles. CyclingProtocol returns cycling protocol information for the cell. This was integrated from the ChargingProtocol class written by Amalie Trewartha. --- beep/features/core.py | 267 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 258 insertions(+), 9 deletions(-) diff --git a/beep/features/core.py b/beep/features/core.py index 878b0987..7571ccd9 100644 --- a/beep/features/core.py +++ b/beep/features/core.py @@ -4,15 +4,17 @@ from scipy.interpolate import interp1d from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers -from beep.features.base import BEEPFeaturizer, BEEPFeaturizationError +from beep.features import featurizer_helps +from functools import reduce +from beep.utils.parameters_lookup import get_protocol_parameters +from beep.features.base import BEEPFeaturizer, BEEPFeaturizationError -class HPPCResistanceVoltageFeatures(BEEPFeaturizer): +class HPPCResistanceVoltageEarlyFeatures(BEEPFeaturizer): DEFAULT_HYPERPARAMETERS = { "test_time_filter_sec": 1000000, "cycle_index_filter": 6, - "diag_pos": 1, + "diag_pos":1, "soc_window": 8, "parameters_path": PROTOCOL_PARAMETERS_DIR } @@ -50,8 +52,8 @@ def create_features(self): ).filter(lambda x: ~x["test_time"].isnull().all()) # diffusion features - diffusion_features = featurizer_helpers.get_diffusion_features( - self.datapath, self.hyperparameters["diag_pos"] + diffusion_features = featurizer_helpers.get_diffusion_early_features( + self.datapath, ) hppc_r = pd.DataFrame() @@ -84,9 +86,81 @@ def create_features(self): # merge everything together as a final result dataframe self.features = pd.concat( [hppc_r, hppc_ocv, v_diff, diffusion_features], axis=1) + +class HPPCResistanceVoltageCycleFeatures(BEEPFeaturizer): + DEFAULT_HYPERPARAMETERS = { + "test_time_filter_sec": 1000000, + "cycle_index_filter": 6, + "soc_window": 8, + "parameters_path": PROTOCOL_PARAMETERS_DIR + } + + def validate(self): + val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + if val: + conditions = [] + conditions.append( + any( + [ + "hppc" in x + for x in + self.datapath.diagnostic_summary.cycle_type.unique() + ] + ) + ) + if all(conditions): + return True, None + else: + return False, "HPPC conditions not met for this cycler run" + else: + return val, msg + + def create_features(self): + # Filter out low cycle numbers at the end of the test, 
corresponding to the "final" diagnostic + self.datapath.diagnostic_data = self.datapath.diagnostic_data[ + ~((self.datapath.diagnostic_data.test_time > self.hyperparameters[ + 'test_time_filter_sec']) & + (self.datapath.diagnostic_data.cycle_index < self.hyperparameters[ + 'cycle_index_filter'])) + ] + self.datapath.diagnostic_data = self.datapath.diagnostic_data.groupby( + ["cycle_index", "step_index", "step_index_counter"] + ).filter(lambda x: ~x["test_time"].isnull().all()) + + # Only hppc_resistance_features are able to be calculated without error. + # Xiao Cui should be pulled in to understand the issue with the others features. + + + # diffusion features +# diffusion_features = featurizer_helpers.get_diffusion_cycle_features( +# self.datapath, +# ) + + # hppc resistance features + hppc_resistance_features = featurizer_helpers.get_hppc_resistance_cycle_features( + self.datapath, + ) + + # the variance of ocv features +# hppc_ocv_features = featurizer_helpers.get_hppc_ocv_cycle_features( +# self.datapath, +# ) + # the v_diff features +# v_diff = featurizer_helpers.get_v_diff_cycle_features( +# self.datapath, +# self.hyperparameters["soc_window"], +# self.hyperparameters["parameters_path"] +# ) -class CycleSummaryStats(BEEPFeaturizer): + # merge everything together as a final result dataframe + self.features = pd.concat( + [hppc_resistance_features, + #hppc_ocv_features, + #v_diff, #diffusion_features + ], axis=1) + +class CycleSummaryStatsEarlyFeatures(BEEPFeaturizer): DEFAULT_HYPERPARAMETERS = { "cycle_comp_num": [10, 100], "statistics": ["var", "min", "mean", "skew", "kurtosis", "abs", @@ -252,7 +326,7 @@ def get_summary_statistics(self, array): return np.asarray(stats) -class DiagnosticSummaryStats(CycleSummaryStats): +class DiagnosticSummaryStatsEarlyFeatures(CycleSummaryStats): """ Object corresponding to summary statistics from a diagnostic cycle of specific type. Includes constructors to create the features, object names @@ -472,7 +546,7 @@ def create_features(self): self.features = X -class DeltaQFastCharge(BEEPFeaturizer): +class DeltaQFastChargeEarlyFeatures(BEEPFeaturizer): """ Object corresponding to feature object. Includes constructors to create the features, object names and metadata attributes in the @@ -907,3 +981,178 @@ def get_threshold_targets(self, df): x_to_threshold[indx]] return pd.DataFrame(threshold_dict) + +class DiagnosticCycleFeatures(BEEPFeaturizer): + """ + This class stores fractional levels of degradation in discharge capacity and discharge energy + relative to the first cycle at each diagnostic cycle, grouped by diagnostic cycle type. + + name (str): predictor object name. + X (pandas.DataFrame): features in DataFrame format. + metadata (dict): information about the conditions, data + and code used to produce features + + Hyperparameters: + parameters_dir (str): Full path to directory of parameters to analyse the + diagnostic cycles + """ + DEFAULT_HYPERPARAMETERS = { + "parameters_dir": PROTOCOL_PARAMETERS_DIR, + "nominal_capacity": 4.84, + + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. 
These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + return featurizer_helpers.check_diagnostic_validation(self.datapath) + + def create_features(self): + """ + Generates diagnostic-property features from processed cycler run, including values for n*x method + Args: + self.datapath (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + parameters_path (str): Root directory storing project parameter files. + + Returns: + pd.DataFrame: cycle_index, RPT discharge capacities and energies, aging cycle discharge capacity and energy, + equivalent full cycles of aging cycle discharge, cumulative discharge throughput. + for each diagnostic cycle of the cell + """ + + parameters_path = self.hyperparameters["parameters_dir"] + + # RPT discharge capacities + data_rpt_02C = self.datapath.diagnostic_data.loc[self.datapath.diagnostic_data.cycle_type == 'rpt_0.2C'] + Q_rpt_02C = data_rpt_02C.groupby('cycle_index')[['discharge_capacity','discharge_energy']].max().reset_index(drop=False) + Q_rpt_02C.rename(columns={'discharge_capacity':'rpt_0.2C_discharge_capacity','discharge_energy':'rpt_0.2C_discharge_energy'}, + inplace=True) + Q_rpt_02C = Q_rpt_02C.reset_index(drop=False).rename(columns={'index':'diag_pos'}) + + rpt_02C_cycles = data_rpt_02C.cycle_index.unique() # for referencing last regular cycle before diagnostic + + data_rpt_1C = self.datapath.diagnostic_data.loc[self.datapath.diagnostic_data.cycle_type == 'rpt_1C'] + Q_rpt_1C = data_rpt_1C.groupby('cycle_index')[['discharge_capacity','discharge_energy']].max().reset_index(drop=False) + Q_rpt_1C.rename(columns={'discharge_capacity':'rpt_1C_discharge_capacity','discharge_energy':'rpt_1C_discharge_energy'}, + inplace=True) + Q_rpt_1C = Q_rpt_1C.reset_index(drop=False).rename(columns={'index':'diag_pos'}) + + data_rpt_2C = self.datapath.diagnostic_data.loc[self.datapath.diagnostic_data.cycle_type == 'rpt_2C'] + Q_rpt_2C = data_rpt_2C.groupby('cycle_index')[['discharge_capacity','discharge_energy']].max().reset_index(drop=False) + Q_rpt_2C.rename(columns={'discharge_capacity':'rpt_2C_discharge_capacity','discharge_energy':'rpt_2C_discharge_energy'}, + inplace=True) + Q_rpt_2C = Q_rpt_2C.reset_index(drop=False).rename(columns={'index':'diag_pos'}) + + # cumuative discharge throughput + aging_df = self.datapath.structured_summary[['cycle_index','charge_throughput','energy_throughput','energy_efficiency','charge_duration','CV_time','CV_current','energy_efficiency']] + aging_df = aging_df.loc[aging_df.cycle_index.isin(rpt_02C_cycles - 3)] + + cumulative_discharge_throughput = aging_df[['cycle_index','charge_throughput']].rename(columns={'charge_throughput':'discharge_throughput'}).reset_index(drop=True) + cumulative_discharge_throughput = cumulative_discharge_throughput.reset_index(drop=False).rename(columns={'index':'diag_pos'}) + + cumulative_energy_throughput = aging_df[['cycle_index','energy_throughput']].reset_index(drop=True) + cumulative_energy_throughput = cumulative_energy_throughput.reset_index( + drop=False).rename(columns={'index':'diag_pos'}) + + equivalent_full_cycles = cumulative_discharge_throughput.copy() + equivalent_full_cycles.rename(columns={'discharge_throughput':'equivalent_full_cycles'},inplace=True) + equivalent_full_cycles['equivalent_full_cycles'] = 
equivalent_full_cycles['equivalent_full_cycles']/self.hyperparameters['nominal_capacity'] + + # Q_aging_pre_diag - discharge capacity of aging cycle before diagnostic + Q_aging_pre_diag = self.datapath.structured_data.groupby('cycle_index')[ + 'discharge_capacity'].max().loc[rpt_02C_cycles[1:] - 3].reset_index(drop=False) # ignore first diagnostic, adjust cycle index to Q_aging_pre_diag + Q_aging_pre_diag.rename(columns={'discharge_capacity':'Q_aging_pre_diag'},inplace=True) + Q_aging_pre_diag = Q_aging_pre_diag.reset_index( + drop=False).rename(columns={'index':'diag_pos'}) + Q_aging_pre_diag['diag_pos'] = Q_aging_pre_diag['diag_pos'] + 1 # since, first diag is ignored, add one to diag_pos + + # Q_aging_post_diag - discharge capacity of aging cycle after diagnostic + Q_aging_post_diag = self.datapath.structured_data.groupby('cycle_index')[ + 'discharge_capacity'].max().loc[rpt_02C_cycles + 3].reset_index(drop=False) # does not ignore first diag since Q_aging exists after first diag + Q_aging_post_diag.rename(columns={'discharge_capacity':'Q_aging_post_diag'},inplace=True) + Q_aging_post_diag = Q_aging_post_diag.reset_index( + drop=False).rename(columns={'index':'diag_pos'}) + + # Diagnostic time + diagnostic_time = data_rpt_02C.groupby('cycle_index')['test_time'].min().reset_index(drop=False).rename(columns={'test_time':'diagnostic_time'}) + diagnostic_time = diagnostic_time.reset_index( + drop=False).rename(columns={'index':'diag_pos'}) + + # Combine dataframes + df_list = [Q_rpt_02C,Q_rpt_1C,Q_rpt_2C, + cumulative_discharge_throughput, + cumulative_energy_throughput, + equivalent_full_cycles, + Q_aging_pre_diag, + Q_aging_post_diag, + diagnostic_time] + + for df in df_list: + df['cycle_index'] = df['cycle_index'].copy().astype(int) + df['diag_pos'] = df['diag_pos'].copy().astype(int) + + cycle_features = reduce(lambda x,y: pd.merge(x,y,on=['cycle_index','diag_pos'],how='outer'), df_list) + self.features = cycle_features.sort_values('cycle_index').reset_index(drop=True) + +class CyclingProtocol(BEEPFeaturizer): + """ + This class stores information about the charging protocol used + name (str): predictor object name. + X (pandas.DataFrame): features in DataFrame format. + metadata (dict): information about the conditions, data + and code used to produce features + Hyperparameters: + parameters_dir (str): Full path to directory of charging protocol parameters + quantities ([str]): list of parameters to return + """ + DEFAULT_HYPERPARAMETERS = { + "parameters_dir": PROTOCOL_PARAMETERS_DIR, + "quantities": ["charge_constant_current_1", "charge_constant_current_2", + "charge_cutoff_voltage", "charge_constant_voltage_time", + "discharge_constant_current", "discharge_cutoff_voltage"], + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. 
These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + if not ('raw' in self.datapath.paths.keys() or 'structured' in self.datapath.paths.keys()): + message = "datapath paths not set, unable to fetch charging protocol" + return False, message + else: + return featurizer_helpers.check_diagnostic_validation(self.datapath) + + def create_features(self): + """ + Fetches charging protocol features + """ + + parameters_path = self.hyperparameters["parameters_dir"] + file_path = self.datapath.paths['raw'] if 'raw' in self.datapath.paths.keys() else self.datapath.paths['structured'] + + parameters, _ = get_protocol_parameters(file_path, parameters_path) + + parameters = parameters[self.hyperparameters["quantities"]] + parameters['cycle_index'] = int(0) # create a cycle index column for merging with other featurizers + parameters['diag_pos'] = int(0) # create a diag_pos column for merging with other featurizers + self.features = parameters From 48f0c0faae1d0ba9e935ab146f27da5d205036d2 Mon Sep 17 00:00:00 2001 From: pasinger Date: Thu, 17 Mar 2022 12:50:35 -0400 Subject: [PATCH 03/16] Update intracell_losses_v2.py Update DEFAULT_HYPERPARAMETERS and dependent calls to match updates to intracell_analysis.py. Modify some variable names for easier understanding and consistency across files (e.g. using "diag_pos" in for loop instead of "i"). Update dataframe construction to match updates in intracell_analysis.py. --- beep/features/intracell_losses_v2.py | 80 ++++++++++++++++++---------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/beep/features/intracell_losses_v2.py b/beep/features/intracell_losses_v2.py index ab0413d1..516a4cdc 100644 --- a/beep/features/intracell_losses_v2.py +++ b/beep/features/intracell_losses_v2.py @@ -1,5 +1,5 @@ import os - +import numpy as np import pandas as pd from beep import PROTOCOL_PARAMETERS_DIR @@ -7,7 +7,6 @@ from beep.features.base import BEEPFeaturizer from beep.features.intracell_analysis_v2 import IntracellAnalysisV2 - DEFAULT_CELL_INFO_DIR = os.path.join(PROTOCOL_PARAMETERS_DIR, "intracell_info") @@ -24,19 +23,24 @@ class IntracellCyclesV2(BEEPFeaturizer): """ DEFAULT_HYPERPARAMETERS = { - "diagnostic_cycle_type": 'rpt_0.2C', - "step_type": 0, - # Paths for anode files should be absolute - # Defaults are for the specified names in the current dir - "anode_file": os.path.join( - DEFAULT_CELL_INFO_DIR, - 'anode_test.csv' - ), - "cathode_file": os.path.join( - DEFAULT_CELL_INFO_DIR, - 'cathode_test.csv' - ), - } + 'pe_pristine_files_dict':{}, + 'pe_pristine_usecols':['Ecell/V','Capacity/mA.h', + 'SOC_aligned','c_rate', + 'BVV_c_rate','Voltage_aligned'], + 'ne_1_pristine_files_dict':{}, + 'ne_1_pristine_usecols':['Ecell/V','Capacity/mA.h', + 'SOC_aligned','c_rate', + 'BVV_c_rate','Voltage_aligned'], + 'Q_fc_nom':4.84, + 'C_nom':-0.2, + 'cycle_type':'rpt_0.2C', + 'step_type': 1, + 'error_type':'dVdQ', + 'error_weighting':'dQdV', + 'dvdq_bound':None, + 'ne_2pos_file':None, + 'ne_2neg_file':None + } def validate(self): """ @@ -58,7 +62,7 @@ def validate(self): # Ensure overlap of cycle indices above threshold and matching cycle type eol_cycle_index_list = self.datapath.diagnostic_summary[ - (self.datapath.diagnostic_summary.cycle_type == self.hyperparameters["diagnostic_cycle_type"]) & + (self.datapath.diagnostic_summary.cycle_type == self.hyperparameters["cycle_type"]) & (self.datapath.diagnostic_summary.discharge_capacity > IntracellAnalysisV2.THRESHOLD) 
].cycle_index.to_list() if not eol_cycle_index_list: @@ -93,10 +97,19 @@ def create_features(self): (pd.DataFrame) containing the cell material parameters as a function of cycle index """ ia = IntracellAnalysisV2( - self.hyperparameters["cathode_file"], - self.hyperparameters["anode_file"], - cycle_type=self.hyperparameters["diagnostic_cycle_type"], - step_type=self.hyperparameters["step_type"] + pe_pristine_files_dict=self.hyperparameters["pe_pristine_files_dict"], + pe_pristine_usecols=self.hyperparameters["pe_pristine_usecols"], + ne_1_pristine_files_dict=self.hyperparameters["ne_1_pristine_files_dict"], + ne_1_pristine_usecols=self.hyperparameters["ne_1_pristine_usecols"], + Q_fc_nom=self.hyperparameters["Q_fc_nom"], + C_nom=self.hyperparameters["C_nom"], + cycle_type=self.hyperparameters["cycle_type"], + step_type=self.hyperparameters["step_type"], + error_type=self.hyperparameters["error_type"], + error_weighting=self.hyperparameters["error_weighting"], + dvdq_bound=self.hyperparameters["dvdq_bound"], + ne_2pos_file=self.hyperparameters["ne_2pos_file"], + ne_2neg_file=self.hyperparameters["ne_2neg_file"] ) # (cell_init_aligned, cell_init_profile, PE_matched, NE_matched) = ia.intracell_wrapper_init( @@ -111,15 +124,17 @@ def create_features(self): # initialize dicts before for loop dataset_dict_of_cell_degradation_path = dict() real_cell_dict_of_profiles = dict() - for i, cycle_index in enumerate(eol_cycle_index_list): + for diag_pos, cycle_index in enumerate(eol_cycle_index_list): loss_dict, profiles_dict = ia.intracell_values_wrapper_ah(cycle_index, self.datapath ) + loss_dict[cycle_index] = np.append(diag_pos,loss_dict[cycle_index]) dataset_dict_of_cell_degradation_path.update(loss_dict) - real_cell_dict_of_profiles.update(profiles_dict) +# real_cell_dict_of_profiles.update(profiles_dict) degradation_df = pd.DataFrame(dataset_dict_of_cell_degradation_path, - index=['rmse_error', 'LLI_opt', 'Q_pe_opt', 'Q_ne_opt', 'x_NE_2', + index=['diag_pos','rmse_error', 'LLI_opt', 'Q_pe_opt', 'Q_ne_opt', 'x_NE_2', + 'IR_coef_pe_opt','IR_coef_ne_opt', 'pe_voltage_FC4p2V', 'pe_voltage_FC4p1V', 'pe_voltage_FC4p0V', 'pe_voltage_FC3p9V', 'pe_voltage_FC3p8V', 'pe_voltage_FC3p7V', 'pe_voltage_FC3p6V', 'pe_voltage_FC3p5V', 'pe_voltage_FC3p4V', @@ -143,6 +158,8 @@ def create_features(self): 'ne_soc_FC3p2V', 'ne_soc_FC3p1V', 'ne_soc_FC3p0V', 'ne_soc_FC2p9V', 'ne_soc_FC2p8V', 'ne_soc_FC2p7V', 'Q_fc', 'Q_pe', 'Q_ne', 'Q_li']).T + degradation_df = degradation_df.reset_index().rename(columns={'index':'cycle_index'}) + degradation_df['diag_pos'] = degradation_df['diag_pos'].astype(int) self.features = degradation_df @@ -173,10 +190,17 @@ def create_features(self): """ ia = IntracellAnalysisV2( - self.hyperparameters["cathode_file"], - self.hyperparameters["anode_file"], - cycle_type=self.hyperparameters["diagnostic_cycle_type"], - step_type=self.hyperparameters["step_type"] + pe_pristine_dict=self.hyperparameters["pe_pristine_dict"], + ne_1_pristine_dict=self.hyperparameters["ne_1_pristine_dict"], + Q_fc_nom=self.hyperparameters["Q_fc_nom"], + C_nom=self.hyperparameters["C_nom"], + cycle_type=self.hyperparameters["cycle_type"], + step_type=self.hyperparameters["step_type"], + error_type=self.hyperparameters["error_type"], + error_weighting=self.hyperparameters["error_weighting"], + dvdq_bound=self.hyperparameters["dvdq_bound"], + ne_2pos_file=self.hyperparameters["ne_2pos_file"], + ne_2neg_file=self.hyperparameters["ne_2neg_file"] ) # (cell_init_aligned, cell_init_profile, PE_matched, NE_matched) = 
ia.intracell_wrapper_init( @@ -228,4 +252,4 @@ def create_features(self): diag_1_names = ["diag_1_" + name for name in degradation_df.columns] values = {0: degradation_df.iloc[0].tolist() + degradation_df.iloc[1].tolist()} features_df = pd.DataFrame(values, index=diag_0_names+diag_1_names).T - self.features = features_df + self.features = features_df \ No newline at end of file From e4c463c09bd73fafe4729f39dbee2dd9593f98d6 Mon Sep 17 00:00:00 2001 From: pasinger Date: Thu, 17 Mar 2022 13:00:37 -0400 Subject: [PATCH 04/16] Update intracell_analysis_v2.py Modify reference electrode data to be a dictionary of rates and rate file names, instead of file names for one rate; these changes are necessary for other updates which seek to correct for effective rate in the electrodes and utilize multiple rate data of the electrodes. Incorporate changes for using alternative error metrics, such as dVdQ instead of the V-Q curve. --- beep/features/intracell_analysis_v2.py | 1474 +++++++++++++----------- 1 file changed, 782 insertions(+), 692 deletions(-) diff --git a/beep/features/intracell_analysis_v2.py b/beep/features/intracell_analysis_v2.py index a88b44ee..bf42f9b7 100644 --- a/beep/features/intracell_analysis_v2.py +++ b/beep/features/intracell_analysis_v2.py @@ -1,5 +1,7 @@ import numpy as np import pandas as pd +import matplotlib.pyplot as plt +from matplotlib import cm from scipy.interpolate import interp1d from scipy.spatial import distance from scipy.optimize import differential_evolution @@ -16,11 +18,21 @@ class IntracellAnalysisV2: THRESHOLD = 4.84 * 0.0 def __init__(self, - pe_pristine_file, - ne_pristine_file, + pe_pristine_files_dict={}, + pe_pristine_usecols=['Ecell/V','Capacity/mA.h', + 'SOC_aligned','c_rate', + 'BVV_c_rate','Voltage_aligned'], + ne_1_pristine_files_dict={}, + ne_1_pristine_usecols=['Ecell/V','Capacity/mA.h', + 'SOC_aligned','c_rate', + 'BVV_c_rate','Voltage_aligned'], + Q_fc_nom=4.84, + C_nom=-0.2, cycle_type='rpt_0.2C', step_type=0, error_type='V-Q', + error_weighting='uniform', + dvdq_bound=None, ne_2pos_file=None, ne_2neg_file=None ): @@ -28,18 +40,35 @@ def __init__(self, Invokes the cell electrode analysis class. This is a class designed to fit the cell and electrode parameters in order to determine changes of electrodes within the full cell from only full cell cycling data. Args: - pe_pristine_file (str): file name for the half cell data of the pristine (uncycled) positive - electrode - ne_pristine_file (str): file name for the half cell data of the pristine (uncycled) negative - electrode + pe_pristine_dict (str): dictionary for the half cell rate data of the pristine (uncycled) positive + electrode. Keys are the rate and the entries are dataframes of Voltage vs. SOC. + ne_1_pristine_dict (str): dictionary for the half cell rate data of the pristine (uncycled) negative + electrode. Keys are the rate and the entris are dataframes of Voltage vs. SOC. 
cycle_type (str): type of diagnostic cycle for the fitting step_type (int): charge or discharge (0 for charge, 1 for discharge) - error_type (str): defines which error metric is to be used + error_type (str): defines which error metric is to be used ne_2neg_file (str): file name of the data for the negative component of the anode ne_2pos_file (str): file name of the data for the positive component of the anode """ - self.pe_pristine = pd.read_csv(pe_pristine_file, usecols=['SOC_aligned', 'Voltage_aligned']) - self.ne_1_pristine = pd.read_csv(ne_pristine_file, usecols=['SOC_aligned', 'Voltage_aligned']) + + self.pe_pristine_files_dict = pe_pristine_files_dict + self.pe_pristine_dict = {} + [self.pe_pristine_dict.update( + {key:pd.read_csv(pe_pristine_files_dict[key], + usecols=pe_pristine_usecols) + }) for key in pe_pristine_files_dict] + + self.ne_1_pristine_files_dict = ne_1_pristine_files_dict + self.ne_1_pristine_dict = {} + [self.ne_1_pristine_dict.update( + {key:pd.read_csv(ne_1_pristine_files_dict[key], + usecols=ne_1_pristine_usecols) + }) for key in ne_1_pristine_files_dict] + + + + self.Q_fc_nom = Q_fc_nom + self.C_nom = C_nom if ne_2neg_file and ne_2pos_file: self.ne_2_pristine_pos = pd.read_csv(ne_2pos_file) @@ -47,7 +76,7 @@ def __init__(self, else: self.ne_2_pristine_pos = pd.DataFrame() self.ne_2_pristine_neg = pd.DataFrame() - + if step_type == 0: self.capacity_col = 'charge_capacity' else: @@ -56,63 +85,72 @@ def __init__(self, self.cycle_type = cycle_type self.step_type = step_type self.error_type = error_type + self.error_weighting = error_weighting + self.dvdq_bound = dvdq_bound + def process_beep_cycle_data_for_candidate_halfcell_analysis_ah(self, - cell_struct, - cycle_index): + cell_struct, + cycle_index): """ Ingests BEEP structured cycling data and cycle_index and returns a Dataframe of evenly spaced capacity with corresponding voltage. 
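+        A brief call sketch (the pristine file dicts, structured datapath, and cycle index are assumed):
+
+            ia = IntracellAnalysisV2(pe_pristine_files_dict=pe_files,
+                                     ne_1_pristine_files_dict=ne_files)
+            profile = ia.process_beep_cycle_data_for_candidate_halfcell_analysis_ah(cell_struct, cycle_index)
+            # profile columns: 'Q_aligned' (1001 evenly spaced points) and 'Voltage_aligned'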
- + Inputs: cell_struct (MaccorDatapath): BEEP structured cycling data cycle_index (int): cycle number at which to evaluate - + Outputs: - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned (evenly spaced) + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned (evenly spaced) and Voltage_aligned """ # filter the data down to the diagnostic type of interest diag_type_cycles = cell_struct.diagnostic_data.loc[cell_struct.diagnostic_data['cycle_type'] == self.cycle_type] - real_cell_candidate_charge_profile = diag_type_cycles.loc[ + real_cell_candidate_profile = diag_type_cycles.loc[ (diag_type_cycles.cycle_index == cycle_index) - & (diag_type_cycles.step_type == 0) # step_type = 0 is charge, 1 is discharge + & (diag_type_cycles.step_type == self.step_type) # step_type = 0 is charge, 1 is discharge & (diag_type_cycles.voltage < self.FC_UPPER_VOLTAGE) - & (diag_type_cycles[self.capacity_col] > 0)][['voltage', 'charge_capacity']] - + & (diag_type_cycles[self.capacity_col] > 0)][['voltage', self.capacity_col]] + + # modify discharge data to match the V-Q directionality of charge data (increasing capacity w/ increasing voltage) + if self.step_type == 1: + real_cell_candidate_profile[self.capacity_col] = np.nanmax(real_cell_candidate_profile[self.capacity_col]) - real_cell_candidate_profile[self.capacity_col].copy() +# real_cell_candidate_profile['voltage'] = np.flipud(real_cell_candidate_profile['voltage'].copy()) + + # renaming capacity,voltage column - real_cell_candidate_charge_profile['Q'] = real_cell_candidate_charge_profile['charge_capacity'] - - real_cell_candidate_charge_profile['Voltage'] = real_cell_candidate_charge_profile['voltage'] - real_cell_candidate_charge_profile.drop('voltage', axis=1, inplace=True) - + real_cell_candidate_profile['Q'] = real_cell_candidate_profile[self.capacity_col] + + real_cell_candidate_profile['Voltage'] = real_cell_candidate_profile['voltage'] + real_cell_candidate_profile.drop('voltage', axis=1, inplace=True) + # interpolate voltage along evenly spaced capacity axis - q_vec = np.linspace(0, np.max(real_cell_candidate_charge_profile['Q']), 1001) + Q_vec = np.linspace(0, np.max(real_cell_candidate_profile['Q']),1001) - real_cell_candidate_charge_profile_aligned = pd.DataFrame() - real_cell_candidate_charge_profile_interper = interp1d(real_cell_candidate_charge_profile['Q'], - real_cell_candidate_charge_profile['Voltage'], + real_cell_candidate_profile_aligned = pd.DataFrame() + real_cell_candidate_profile_interper = interp1d(real_cell_candidate_profile['Q'], + real_cell_candidate_profile['Voltage'], bounds_error=False, - fill_value=( - self.FC_LOWER_VOLTAGE, self.FC_UPPER_VOLTAGE)) - real_cell_candidate_charge_profile_aligned['Voltage_aligned'] = real_cell_candidate_charge_profile_interper( - q_vec) - - real_cell_candidate_charge_profile_aligned['Q_aligned'] = q_vec + fill_value=(self.FC_LOWER_VOLTAGE, self.FC_UPPER_VOLTAGE)) + real_cell_candidate_profile_aligned['Voltage_aligned'] = real_cell_candidate_profile_interper( + Q_vec) - return real_cell_candidate_charge_profile_aligned + real_cell_candidate_profile_aligned['Q_aligned'] = Q_vec + return real_cell_candidate_profile_aligned + def _impose_electrode_scale(self, - pe_pristine=pd.DataFrame(), - ne_1_pristine=pd.DataFrame(), - ne_2_pristine_pos=pd.DataFrame(), - ne_2_pristine_neg=pd.DataFrame(), - lli=0.0, q_pe=0.0, q_ne=0.0, x_ne_2=0.0): + pe_pristine=pd.DataFrame(), + ne_1_pristine=pd.DataFrame(), + ne_2_pristine_pos=pd.DataFrame(), + 
ne_2_pristine_neg=pd.DataFrame(), + lli=0.0, Q_pe=0.0, Q_ne=0.0, x_ne_2=0.0): + """ - Scales the reference electrodes according to specified capacities and + Scales the reference electrodes according to specified capacities and offsets their capacities according to lli. Blends negative electrode materials. - + Inputs: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode @@ -120,73 +158,76 @@ def _impose_electrode_scale(self, electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - lli (float): Loss of Lithium Inventory - capacity of the misalignment between + lli (float): Loss of Lithium Inventory - capacity of the misalignment between cathode and anode zero-capacity - q_pe (float): capacity of the positive electrode (cathode) - q_ne (float): capacity of the negative electrode (anode) - x_ne_2 (float): fraction of ne_2_pristine_pos or ne_2_pristine_neg + Q_pe (float): capacity of the positive electrode (cathode) + Q_ne (float): capacity of the negative electrode (anode) + x_ne_2 (float): fraction of ne_2_pristine_pos or ne_2_pristine_neg (positive or negative value, respectively) to ne_1_pristine - + Outputs: - pe_degraded (Dataframe): positive electrode with imposed capacity + pe_degraded (Dataframe): positive electrode with imposed capacity scale to emulate degradation - ne_degraded (Dataframe): negative electrode with imposed capacity + ne_degraded (Dataframe): negative electrode with imposed capacity scale and capacity offset to emulate degradation """ + # Blend negative electrodes ne_pristine = blend_electrodes(ne_1_pristine, ne_2_pristine_pos, ne_2_pristine_neg, x_ne_2) - - # rescaling pristine electrodes to q_pe and q_ne - pe_q_scaled = pe_pristine.copy() - pe_q_scaled['Q_aligned'] = (pe_q_scaled['SOC_aligned'] / 100) * q_pe - ne_q_scaled = ne_pristine.copy() - ne_q_scaled['Q_aligned'] = (ne_q_scaled['SOC_aligned'] / 100) * q_ne - + + # rescaling pristine electrodes to Q_pe and Q_ne + pe_Q_scaled = pe_pristine.copy() + pe_Q_scaled['Q_aligned'] = (pe_Q_scaled['SOC_aligned']/np.nanmax(pe_Q_scaled['SOC_aligned']))*Q_pe + ne_Q_scaled = ne_pristine.copy() + ne_Q_scaled['Q_aligned'] = (ne_Q_scaled['SOC_aligned']/np.nanmax(ne_Q_scaled['SOC_aligned']))*Q_ne + # translate pristine ne electrode with lli - ne_q_scaled['Q_aligned'] = ne_q_scaled['Q_aligned'] + lli - + ne_Q_scaled['Q_aligned'] = ne_Q_scaled['Q_aligned'] + lli + # Re-interpolate to align dataframes for differencing - lower_q = np.min((np.min(pe_q_scaled['Q_aligned']), - np.min(ne_q_scaled['Q_aligned']))) - upper_q = np.max((np.max(pe_q_scaled['Q_aligned']), - np.max(ne_q_scaled['Q_aligned']))) - q_vec = np.linspace(lower_q, upper_q, 1001) - + lower_Q = np.min((np.min(pe_Q_scaled['Q_aligned']), + np.min(ne_Q_scaled['Q_aligned']))) + upper_Q = np.max((np.max(pe_Q_scaled['Q_aligned']), + np.max(ne_Q_scaled['Q_aligned']))) + Q_vec = np.linspace(lower_Q, upper_Q, 1001) + # Actually aligning the electrode Q's - pe_pristine_interper = interp1d(pe_q_scaled['Q_aligned'], - pe_q_scaled['Voltage_aligned'], bounds_error=False) - pe_degraded = pe_q_scaled.copy() - pe_degraded['Q_aligned'] = q_vec - pe_degraded['Voltage_aligned'] = pe_pristine_interper(q_vec) - - ne_pristine_interper = interp1d(ne_q_scaled['Q_aligned'], - ne_q_scaled['Voltage_aligned'], bounds_error=False) - ne_degraded = ne_q_scaled.copy() - ne_degraded['Q_aligned'] = q_vec - ne_degraded['Voltage_aligned'] = ne_pristine_interper(q_vec) 
+ pe_pristine_interper = interp1d(pe_Q_scaled['Q_aligned'], + pe_Q_scaled['Voltage_aligned'], bounds_error=False) + pe_degraded = pe_Q_scaled.copy() + pe_degraded['Q_aligned'] = Q_vec + pe_degraded['Voltage_aligned'] = pe_pristine_interper(Q_vec) + + + ne_pristine_interper = interp1d(ne_Q_scaled['Q_aligned'], + ne_Q_scaled['Voltage_aligned'], bounds_error=False) + ne_degraded = ne_Q_scaled.copy() + ne_degraded['Q_aligned'] = Q_vec + ne_degraded['Voltage_aligned'] = ne_pristine_interper(Q_vec) # Returning pe and ne degraded on an Ah basis return pe_degraded, ne_degraded - + def halfcell_degradation_matching_ah(self, x, *params): """ - Calls underlying functions to impose degradation through electrode + Calls underlying functions to impose degradation through electrode capacity scale and alignment through LLI. Modifies emulated full cell data to be within full cell voltage range and calibrates (zeros) capacity at the lowest permissible voltage. Interpolates real and emulated data onto a common capacity axis. - + Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned (evenly spaced) and Voltage_aligned + Outputs: pe_out_zeroed (Dataframe): cathode capacity and voltage columns scaled, offset, and aligned along capacity @@ -198,38 +239,60 @@ def halfcell_degradation_matching_ah(self, x, *params): emulated_full_cell_aligned (Dataframe): capacity and voltage interpolated evenly across capacity for the emulated cell data """ - + lli = x[0] - q_pe = x[1] - q_ne = x[2] - x_ne_2 = x[3] - - (pe_pristine, - ne_1_pristine, - ne_2_pristine_pos, - ne_2_pristine_neg, - real_cell_candidate_charge_profile_aligned) = params + Q_pe = x[1] + Q_ne = x[2] + IR_coef_pe = x[3] + IR_coef_ne = x[4] + x_ne_2 = x[5] + + # Q_fc_nom (scalar) : nominal capacity of pristine cell + # C_nom (scalar) : nominal C-rate of pristine cell + Q_fc_nom = self.Q_fc_nom + C_nom = self.C_nom + + pe_pristine_dict, ne_1_pristine_dict, ne_2_pristine_pos, ne_2_pristine_neg, real_cell_candidate_profile_aligned= params + + if len(pe_pristine_dict) == 1: + pe_pristine = pe_pristine_dict[list(pe_pristine_dict.keys())[0]] + elif len(pe_pristine_dict) > 1: + pe_at_I_eff, R_at_SOC_pe = ( + self._get_effective_electrode_rate_data(pe_pristine_dict, Q_pe, Q_fc_nom, C_nom, IR_coef_pe) + ) + pe_pristine = pe_at_I_eff # the effective rate profile is now the reference profile at pristine state + else: + raise ValueError('No dictionary entries in pe_pristine_dict') + + if len(ne_1_pristine_dict) == 1: + ne_1_pristine = ne_1_pristine_dict[list(ne_1_pristine_dict.keys())[0]] + elif len(ne_1_pristine_dict) > 1: + + ne_at_I_eff, R_at_SOC_ne = ( + self._get_effective_electrode_rate_data( ne_1_pristine_dict, Q_ne, Q_fc_nom, C_nom, IR_coef_ne) + ) + ne_1_pristine = ne_at_I_eff # the effective rate profile is now the reference profile at pristine state + else: + raise ValueError('No dictionary entries in ne_pristine_dict') - # output degraded ne and pe (on a AH basis, with electrode alignment - # (NaNs for voltage, when no capacity actually at the 
corresponding capacity index)) + # output degraded ne and pe (on a AH basis, with electrode alignment (NaNs for voltage, when no capacity actually at the corresponding capacity index)) pe_out, ne_out = self._impose_electrode_scale(pe_pristine, ne_1_pristine, - ne_2_pristine_pos, ne_2_pristine_neg, - lli, q_pe, - q_ne, x_ne_2) - + ne_2_pristine_pos, ne_2_pristine_neg, + lli, Q_pe, + Q_ne, x_ne_2) + # PE - NE = full cell voltage emulated_full_cell_with_degradation = pd.DataFrame() emulated_full_cell_with_degradation['Q_aligned'] = pe_out['Q_aligned'].copy() emulated_full_cell_with_degradation['Voltage_aligned'] = pe_out['Voltage_aligned'] - ne_out['Voltage_aligned'] - + # Replace emulated full cell values outside of voltage range with NaN emulated_full_cell_with_degradation['Voltage_aligned'].loc[ emulated_full_cell_with_degradation['Voltage_aligned'] < self.FC_LOWER_VOLTAGE] = np.nan emulated_full_cell_with_degradation['Voltage_aligned'].loc[ emulated_full_cell_with_degradation['Voltage_aligned'] > self.FC_UPPER_VOLTAGE] = np.nan - - # Center the emulated full cell and half cell curves onto the same Q at which the real (degraded) - # capacity measurement started (self.FC_LOWER_VOLTAGE) + + ## Center the emulated full cell and half cell curves onto the same Q at which the real (degraded) capacity measurement started (self.FC_LOWER_VOLTAGE) emulated_full_cell_with_degradation_zeroed = pd.DataFrame() emulated_full_cell_with_degradation_zeroed['Voltage_aligned'] = emulated_full_cell_with_degradation[ @@ -237,8 +300,8 @@ def halfcell_degradation_matching_ah(self, x, *params): zeroing_value = emulated_full_cell_with_degradation['Q_aligned'].loc[ np.nanargmin(emulated_full_cell_with_degradation['Voltage_aligned']) - ] - + ] + emulated_full_cell_with_degradation_zeroed['Q_aligned'] = \ (emulated_full_cell_with_degradation['Q_aligned'].copy() - zeroing_value) @@ -249,69 +312,72 @@ def halfcell_degradation_matching_ah(self, x, *params): ne_out_zeroed['Q_aligned'] = ne_out['Q_aligned'] - zeroing_value # Interpolate full cell profiles across same Q range - min_q = np.min( - real_cell_candidate_charge_profile_aligned['Q_aligned'].loc[ - ~real_cell_candidate_charge_profile_aligned['Voltage_aligned'].isna()]) - max_q = np.max( - real_cell_candidate_charge_profile_aligned['Q_aligned'].loc[ - ~real_cell_candidate_charge_profile_aligned['Voltage_aligned'].isna()]) + min_Q = np.min( + real_cell_candidate_profile_aligned['Q_aligned'].loc[ + ~real_cell_candidate_profile_aligned['Voltage_aligned'].isna()]) + max_Q = np.max( + (real_cell_candidate_profile_aligned['Q_aligned'].loc[ + ~real_cell_candidate_profile_aligned['Voltage_aligned'].isna()].max(), + emulated_full_cell_with_degradation_zeroed['Q_aligned'].loc[ + ~emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].isna()].max()) + ) emulated_interper = interp1d(emulated_full_cell_with_degradation_zeroed['Q_aligned'].loc[ ~emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].isna()], emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].loc[ ~emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].isna()], bounds_error=False) real_interper = interp1d( - real_cell_candidate_charge_profile_aligned['Q_aligned'].loc[ - ~real_cell_candidate_charge_profile_aligned['Voltage_aligned'].isna()], - real_cell_candidate_charge_profile_aligned['Voltage_aligned'].loc[ - ~real_cell_candidate_charge_profile_aligned['Voltage_aligned'].isna()], + real_cell_candidate_profile_aligned['Q_aligned'].loc[ + 
~real_cell_candidate_profile_aligned['Voltage_aligned'].isna()], + real_cell_candidate_profile_aligned['Voltage_aligned'].loc[ + ~real_cell_candidate_profile_aligned['Voltage_aligned'].isna()], bounds_error=False) - q_vec = np.linspace(min_q, max_q, 1001) + Q_vec = np.linspace(min_Q, max_Q, 1001) emulated_aligned = pd.DataFrame() - emulated_aligned['Q_aligned'] = q_vec - emulated_aligned['Voltage_aligned'] = emulated_interper(q_vec) + emulated_aligned['Q_aligned'] = Q_vec + emulated_aligned['Voltage_aligned'] = emulated_interper(Q_vec) real_aligned = pd.DataFrame() - real_aligned['Q_aligned'] = q_vec - real_aligned['Voltage_aligned'] = real_interper(q_vec) + real_aligned['Q_aligned'] = Q_vec + real_aligned['Voltage_aligned'] = real_interper(Q_vec) return pe_out_zeroed, ne_out_zeroed, real_aligned, emulated_aligned - def get_dqdv_over_v_from_degradation_matching_ah(self, x, *params): + def get_dQdV_over_V_from_degradation_matching_ah(self, x, *params): """ - This function imposes degradation scaling ,then outputs the dqdv representation of the emulated cell data. - + This function imposes degradation scaling ,then outputs the dQdV representation of the emulated cell data. + Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned (evenly spaced) and Voltage_aligned - + Outputs: pe_out_zeroed (Dataframe): cathode capacity and voltage columns scaled, offset, and aligned along capacity ne_out_zeroed (Dataframe): anode capacity and voltage columns scaled, offset, and aligned along capacity - dq_dv_over_v_real (Dataframe): dqdv across voltage for the real cell data - dq_dv_over_v_emulated (Dataframe): dqdv across voltage for the emulated cell data + dq_dv_over_v_real (Dataframe): dQdV across voltage for the real cell data + dq_dv_over_v_emulated (Dataframe): dQdV across voltage for the emulated cell data df_real_interped (Dataframe): capacity and voltage interpolated evenly across capacity for the real cell data emulated_full_cell_interped (Dataframe): capacity and voltage interpolated evenly across capacity for the emulated cell data """ - + pe_out_zeroed, ne_out_zeroed, df_real_interped, emulated_full_cell_interped = \ self.halfcell_degradation_matching_ah(x, *params) - # Calculate dqdv from full cell profiles + # Calculate dQdV from full cell profiles dq_dv_real = pd.DataFrame(np.gradient(df_real_interped['Q_aligned'], df_real_interped['Voltage_aligned']), columns=['dQdV']).ewm(0.1).mean() dq_dv_emulated = pd.DataFrame( @@ -331,25 +397,25 @@ def get_dqdv_over_v_from_degradation_matching_ah(self, x, *params): v_dq_dv_interper_real = interp1d(dq_dv_real['Voltage_aligned'].loc[~dq_dv_real['Voltage_aligned'].isna()], dq_dv_real['dQdV'].loc[~dq_dv_real['Voltage_aligned'].isna()], bounds_error=False, fill_value=0) - v_q_interper_real = interp1d(dq_dv_real['Voltage_aligned'].loc[~dq_dv_real['Voltage_aligned'].isna()], - dq_dv_real['Q_aligned'].loc[~dq_dv_real['Voltage_aligned'].isna()], - bounds_error=False, fill_value=(0, np.max(df_real_interped['Q_aligned']))) + v_Q_interper_real = 
interp1d(dq_dv_real['Voltage_aligned'].loc[~dq_dv_real['Voltage_aligned'].isna()], + dq_dv_real['Q_aligned'].loc[~dq_dv_real['Voltage_aligned'].isna()], + bounds_error=False, fill_value=(0, np.max(df_real_interped['Q_aligned']))) v_dq_dv_interper_emulated = interp1d(dq_dv_emulated['Voltage_aligned'].loc[ ~dq_dv_emulated['Voltage_aligned'].isna()], dq_dv_emulated['dQdV'].loc[~dq_dv_emulated['Voltage_aligned'].isna()], bounds_error=False, fill_value=0) - v_q_interper_emulated = interp1d(dq_dv_emulated['Voltage_aligned'].loc[ - ~dq_dv_emulated['Voltage_aligned'].isna()], - dq_dv_emulated['Q_aligned'].loc[~dq_dv_emulated['Voltage_aligned'].isna()], - bounds_error=False, fill_value=(0, np.max(df_real_interped['Q_aligned']))) + v_Q_interper_emulated = interp1d(dq_dv_emulated['Voltage_aligned'].loc[ + ~dq_dv_emulated['Voltage_aligned'].isna()], + dq_dv_emulated['Q_aligned'].loc[~dq_dv_emulated['Voltage_aligned'].isna()], + bounds_error=False, fill_value=(0,np.max(df_real_interped['Q_aligned']))) dq_dv_over_v_real = pd.DataFrame(v_dq_dv_interper_real(voltage_vec), columns=['dQdV']).fillna(0) - dq_dv_over_v_real['Q_aligned'] = v_q_interper_real(voltage_vec) + dq_dv_over_v_real['Q_aligned'] = v_Q_interper_real(voltage_vec) dq_dv_over_v_real['Voltage_aligned'] = voltage_vec dq_dv_over_v_emulated = pd.DataFrame(v_dq_dv_interper_emulated(voltage_vec), columns=['dQdV']).fillna(0) - dq_dv_over_v_emulated['Q_aligned'] = v_q_interper_emulated(voltage_vec) + dq_dv_over_v_emulated['Q_aligned'] = v_Q_interper_emulated(voltage_vec) dq_dv_over_v_emulated['Voltage_aligned'] = voltage_vec return (pe_out_zeroed, @@ -359,20 +425,20 @@ def get_dqdv_over_v_from_degradation_matching_ah(self, x, *params): df_real_interped, emulated_full_cell_interped) - def get_dvdq_over_q_from_degradation_matching_ah(self, x, *params): + def get_dVdQ_over_Q_from_degradation_matching_ah(self, x, *params): """ This function imposes degradation scaling ,then outputs the dVdQ representation of the emulated cell data. 
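+        A call sketch (values are illustrative; halfcell_degradation_matching_ah unpacks the
+        optimization vector as [lli, Q_pe, Q_ne, IR_coef_pe, IR_coef_ne, x_ne_2]):
+
+            x = [0.1, 5.0, 5.2, 1.0, 1.0, 0.0]
+            params = (ia.pe_pristine_dict, ia.ne_1_pristine_dict,
+                      ia.ne_2_pristine_pos, ia.ne_2_pristine_neg,
+                      real_cell_candidate_profile_aligned)
+            (pe, ne, dvdq_real, dvdq_emulated,
+             real_interped, emulated_interped) = ia.get_dVdQ_over_Q_from_degradation_matching_ah(x, *params)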
Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned - (evenly spaced) and Voltage_aligned + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned + (evenly spaced) and Voltage_aligned Outputs: pe_out_zeroed (Dataframe): cathode capacity and voltage columns scaled, offset, and aligned along capacity @@ -383,18 +449,23 @@ def get_dvdq_over_q_from_degradation_matching_ah(self, x, *params): df_real_interped (Dataframe): capacity and voltage interpolated evenly across capacity for the real cell data emulated_full_cell_interped (Dataframe): capacity and voltage interpolated evenly - across capacity for the emulated cell data + across capacity for the emulated cell data """ - + pe_out_zeroed, ne_out_zeroed, df_real_interped, emulated_full_cell_interped = \ self.halfcell_degradation_matching_ah(x, *params) - # Calculate dQdV from full cell profiles + # Calculate dVdQ from full cell profiles +# dv_dq_real = pd.DataFrame(np.gradient(df_real_interped['Voltage_aligned'], df_real_interped['Q_aligned']), +# columns=['dVdQ']).ewm(alpha=0.3).mean() +# dv_dq_emulated = pd.DataFrame( +# np.gradient(emulated_full_cell_interped['Voltage_aligned'], emulated_full_cell_interped['Q_aligned']), +# columns=['dVdQ']).ewm(alpha=0.3).mean() dv_dq_real = pd.DataFrame(np.gradient(df_real_interped['Voltage_aligned'], df_real_interped['Q_aligned']), - columns=['dVdQ']).ewm(0.1).mean() + columns=['dVdQ']) dv_dq_emulated = pd.DataFrame( np.gradient(emulated_full_cell_interped['Voltage_aligned'], emulated_full_cell_interped['Q_aligned']), - columns=['dVdQ']).ewm(0.1).mean() + columns=['dVdQ']) # Include original data dv_dq_real['Q_aligned'] = df_real_interped['Q_aligned'] @@ -404,27 +475,27 @@ def get_dvdq_over_q_from_degradation_matching_ah(self, x, *params): dv_dq_emulated['Voltage_aligned'] = emulated_full_cell_interped['Voltage_aligned'] # Q interpolation not needed, as interpolated over Q by default - + return (pe_out_zeroed, ne_out_zeroed, dv_dq_real, dv_dq_emulated, df_real_interped, emulated_full_cell_interped) - - def get_v_over_q_from_degradation_matching_ah(self, x, *params): + + def get_V_over_Q_from_degradation_matching_ah(self, x, *params): """ This function imposes degradation scaling ,then outputs the V-Q representation of the emulated cell data. 
Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned (evenly spaced) and Voltage_aligned Outputs: pe_out_zeroed (Dataframe): cathode capacity and voltage columns scaled, @@ -440,13 +511,14 @@ def get_v_over_q_from_degradation_matching_ah(self, x, *params): self.halfcell_degradation_matching_ah(x, *params) min_soc_full_cell = np.min(real_aligned.loc[~real_aligned.Voltage_aligned.isna()].Q_aligned) - max_soc_full_cell = np.max(real_aligned.loc[~real_aligned.Voltage_aligned.isna()].Q_aligned) + max_soc_full_cell = np.max((real_aligned.loc[~real_aligned.Voltage_aligned.isna()].Q_aligned.max(), + emulated_aligned.loc[~emulated_aligned.Voltage_aligned.isna()].Q_aligned.max())) soc_vec_full_cell = np.linspace(min_soc_full_cell, max_soc_full_cell, 1001) emulated_full_cell_interper = interp1d( - emulated_aligned.Q_aligned.loc[~real_aligned.Voltage_aligned.isna()], - emulated_aligned.Voltage_aligned.loc[~real_aligned.Voltage_aligned.isna()], + emulated_aligned.Q_aligned.loc[~emulated_aligned.Voltage_aligned.isna()], + emulated_aligned.Voltage_aligned.loc[~emulated_aligned.Voltage_aligned.isna()], bounds_error=False) real_full_cell_interper = interp1d(real_aligned.Q_aligned.loc[~real_aligned.Voltage_aligned.isna()], real_aligned.Voltage_aligned.loc[~real_aligned.Voltage_aligned.isna()], @@ -462,24 +534,24 @@ def get_v_over_q_from_degradation_matching_ah(self, x, *params): df_real_interped['Q_aligned'] = soc_vec_full_cell df_real_interped['Voltage_aligned'] = real_full_cell_interper(soc_vec_full_cell) return pe_out_zeroed, ne_out_zeroed, df_real_interped, emulated_full_cell_interped - - def get_v_over_q_from_degradation_matching_ah_no_real(self, x, *params): + + + def get_V_over_Q_from_degradation_matching_ah_no_real(self, x, *params): """ - This function imposes degradation scaling ,then outputs the V-Q representation of the - emulated cell data, in the absence of real cell data. - + This function imposes degradation scaling ,then outputs the V-Q representation of the emulated cell data, in the absence of real cell data. 
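+        A call sketch (values are illustrative; in this no-real-data path the pristine inputs are
+        single-rate DataFrames and the vector carries four entries, [lli, Q_pe, Q_ne, x_ne_2]):
+
+            x = [0.1, 5.0, 5.2, 0.0]
+            params = (pe_pristine_df, ne_1_pristine_df,
+                      ia.ne_2_pristine_pos, ia.ne_2_pristine_neg)
+            pe, ne, emulated = ia.get_V_over_Q_from_degradation_matching_ah_no_real(x, *params)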
+ Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned (evenly spaced) and Voltage_aligned - + Outputs: pe_out_zeroed (Dataframe): cathode capacity and voltage columns scaled, offset, and aligned along capacity @@ -487,15 +559,15 @@ def get_v_over_q_from_degradation_matching_ah_no_real(self, x, *params): offset, and aligned along capacity emulated_full_cell_interped (Dataframe): capacity and voltage interpolated evenly across capacity for the emulated cell data - + """ (pe_out_zeroed, ne_out_zeroed, emulated_aligned) = \ self.halfcell_degradation_matching_ah_no_real(x, *params) - min_q_full_cell = np.min(emulated_aligned.loc[~emulated_aligned.Voltage_aligned.isna()].Q_aligned) - max_q_full_cell = np.max(emulated_aligned.loc[~emulated_aligned.Voltage_aligned.isna()].Q_aligned) + min_Q_full_cell = np.min(emulated_aligned.loc[~emulated_aligned.Voltage_aligned.isna()].Q_aligned) + max_Q_full_cell = np.max(emulated_aligned.loc[~emulated_aligned.Voltage_aligned.isna()].Q_aligned) - q_vec_full_cell = np.linspace(min_q_full_cell, max_q_full_cell, 1001) + Q_vec_full_cell = np.linspace(min_Q_full_cell, max_Q_full_cell, 1001) emulated_full_cell_interper = interp1d( emulated_aligned.Q_aligned.loc[~emulated_aligned.Voltage_aligned.isna()], @@ -504,30 +576,30 @@ def get_v_over_q_from_degradation_matching_ah_no_real(self, x, *params): # Interpolate the emulated full-cell profile emulated_full_cell_interped = pd.DataFrame() - emulated_full_cell_interped['Q_aligned'] = q_vec_full_cell - emulated_full_cell_interped['Voltage_aligned'] = emulated_full_cell_interper(q_vec_full_cell) + emulated_full_cell_interped['Q_aligned'] = Q_vec_full_cell + emulated_full_cell_interped['Voltage_aligned'] = emulated_full_cell_interper(Q_vec_full_cell) return pe_out_zeroed, ne_out_zeroed, emulated_full_cell_interped - + def halfcell_degradation_matching_ah_no_real(self, x, *params): """ - Calls underlying functions to impose degradation through electrode + Calls underlying functions to impose degradation through electrode capacity scale and alignment through LLI. Modifies emulated full cell data to be within full cell voltage range and calibrates (zeros) capacity at the lowest permissible voltage. 
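The zeroing step described above can be summarized as: build the emulated full-cell voltage as the cathode/anode difference, blank out anything outside the permissible full-cell window, then shift capacity so that zero sits at the lowest in-window voltage. A minimal sketch, assuming two half-cell DataFrames already aligned on the same Q_aligned grid (the 2.7 V / 4.2 V limits here stand in for the class constants FC_LOWER_VOLTAGE / FC_UPPER_VOLTAGE, and the helper name is illustrative):

    import numpy as np
    import pandas as pd

    V_LOW, V_HIGH = 2.7, 4.2   # illustrative full-cell voltage window

    def emulate_and_zero(pe, ne):
        """Emulated full cell = PE minus NE, masked to the voltage window and
        re-zeroed at the lowest permissible voltage."""
        full = pd.DataFrame({"Q_aligned": pe["Q_aligned"]})
        full["Voltage_aligned"] = pe["Voltage_aligned"] - ne["Voltage_aligned"]

        # Replace values outside the full-cell voltage window with NaN.
        outside = (full["Voltage_aligned"] < V_LOW) | (full["Voltage_aligned"] > V_HIGH)
        full.loc[outside, "Voltage_aligned"] = np.nan

        # Zero capacity where the in-window voltage is lowest, mirroring where a
        # real capacity measurement would start.
        zeroing_value = full["Q_aligned"].iloc[np.nanargmin(full["Voltage_aligned"])]
        full["Q_aligned"] = full["Q_aligned"] - zeroing_value
        return full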
- + Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned (evenly spaced) and Voltage_aligned - + Outputs: pe_out_zeroed (Dataframe): cathode capacity and voltage columns scaled, offset, and aligned along capacity @@ -536,40 +608,38 @@ def halfcell_degradation_matching_ah_no_real(self, x, *params): emulated_aligned (Dataframe): full cell data corresponding to the imposed degradation """ lli = x[0] - q_pe = x[1] - q_ne = x[2] + Q_pe = x[1] + Q_ne = x[2] x_ne_2 = x[3] pe_pristine, ne_1_pristine, ne_2_pristine_pos, ne_2_pristine_neg = params pe_out, ne_out = self._impose_electrode_scale(pe_pristine, ne_1_pristine, - ne_2_pristine_pos, ne_2_pristine_neg, - lli, q_pe, - q_ne, - x_ne_2) - # outputs degraded ne and pe (on a AH basis, with electrode alignment (NaNs for voltage, when no overlap)) - + ne_2_pristine_pos, ne_2_pristine_neg, + lli, Q_pe, + Q_ne, x_ne_2) #outputs degraded ne and pe (on a AH basis, with electrode alignment (NaNs for voltage, when no overlap)) + emulated_full_cell_with_degradation = pd.DataFrame() emulated_full_cell_with_degradation['Q_aligned'] = pe_out['Q_aligned'].copy() emulated_full_cell_with_degradation['Voltage_aligned'] = pe_out['Voltage_aligned'] - ne_out['Voltage_aligned'] - + # Replace emulated full cell values outside of voltage range with NaN emulated_full_cell_with_degradation['Voltage_aligned'].loc[ emulated_full_cell_with_degradation['Voltage_aligned'] < self.FC_LOWER_VOLTAGE] = np.nan emulated_full_cell_with_degradation['Voltage_aligned'].loc[ emulated_full_cell_with_degradation['Voltage_aligned'] > self.FC_UPPER_VOLTAGE] = np.nan - # Center the emulated full cell and half cell curves onto the same Q at which the real (degraded) - # capacity measurement started (self.FC_LOWER_VOLTAGE) + + ## Center the emulated full cell and half cell curves onto the same Q at which the real (degraded) capacity measurement started (self.FC_LOWER_VOLTAGE) emulated_full_cell_with_degradation_zeroed = pd.DataFrame() emulated_full_cell_with_degradation_zeroed['Voltage_aligned'] = emulated_full_cell_with_degradation[ 'Voltage_aligned'] - + zeroing_value = emulated_full_cell_with_degradation['Q_aligned'].loc[ np.nanargmin(emulated_full_cell_with_degradation['Voltage_aligned']) - ] - + ] + emulated_full_cell_with_degradation_zeroed['Q_aligned'] = \ (emulated_full_cell_with_degradation['Q_aligned'] - zeroing_value) @@ -580,336 +650,382 @@ def halfcell_degradation_matching_ah_no_real(self, x, *params): ne_out_zeroed['Q_aligned'] = ne_out['Q_aligned'] - zeroing_value # Interpolate full profiles across same Q range - min_q = np.min( + min_Q = np.min( emulated_full_cell_with_degradation_zeroed['Q_aligned'].loc[ ~emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].isna()]) - max_q = np.max( + max_Q = np.max( emulated_full_cell_with_degradation_zeroed['Q_aligned'].loc[ ~emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].isna()]) - + emulated_interper = interp1d(emulated_full_cell_with_degradation_zeroed['Q_aligned'].loc[ 
~emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].isna()], emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].loc[ ~emulated_full_cell_with_degradation_zeroed['Voltage_aligned'].isna()], bounds_error=False) - q_vec = np.linspace(min_q, max_q, 1001) + Q_vec = np.linspace(min_Q, max_Q, 1001) emulated_aligned = pd.DataFrame() - emulated_aligned['Q_aligned'] = q_vec - emulated_aligned['Voltage_aligned'] = emulated_interper(q_vec) + emulated_aligned['Q_aligned'] = Q_vec + emulated_aligned['Voltage_aligned'] = emulated_interper(Q_vec) return pe_out_zeroed, ne_out_zeroed, emulated_aligned - + def _get_error_from_degradation_matching_ah(self, x, *params): """ Wrapper function which selects the correct error sub routine and returns its error value. - + Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned - (evenly spaced) and Voltage_aligned - + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned + (evenly spaced) and Voltage_aligned + Outputs: error value (float) - output of the specified error sub function """ error_type = self.error_type if error_type == 'V-Q': - return self._get_error_from_degradation_matching_v_q(x, *params)[0] + return self._get_error_from_degradation_matching_V_Q(x,*params)[0] elif error_type == 'dVdQ': - return self._get_error_from_degradation_matching_dvdq(x, *params)[0] + return self._get_error_from_degradation_matching_dVdQ(x,*params)[0] elif error_type == 'dQdV': - return self._get_error_from_degradation_matching_dqdv(x, *params)[0] + return self._get_error_from_degradation_matching_dQdV(x,*params)[0] else: - return self._get_error_from_degradation_matching_v_q(x, *params)[0] - - def _get_error_from_degradation_matching_v_q(self, x, *params): + return self._get_error_from_degradation_matching_V_Q(x,*params)[0] + + def _get_error_from_degradation_matching_V_Q(self, x, *params): """ Error function returning the mean standardized Euclidean distance of each point of the real curve to the closest value on the emulated curve in the V-Q representation. 
- + Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned - (evenly spaced) and Voltage_aligned - + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned + (evenly spaced) and Voltage_aligned + Outputs: error (float): output of the specified error sub function error_vector (array): vector containingEuclidean distance of each point of the real curve to the closest value on the emulated curve in the V-Q representation - xa (Dataframe): real full cell data used for error analysis - xb (Dataframe): emulated full cell data used for error analysis + XA (Dataframe): real full cell data used for error analysis + XB (Dataframe): emulated full cell data used for error analysis """ - + try: (pe_out_zeroed, ne_out_zeroed, real_aligned, emulated_aligned - ) = self.get_v_over_q_from_degradation_matching_ah(x, *params) - - xa = real_aligned.dropna() - xb = emulated_aligned.dropna() - error_matrix = distance.cdist(xa, xb, 'seuclidean') - error_vector = error_matrix.min(axis=1) + ) = self.get_V_over_Q_from_degradation_matching_ah(x, *params) + + XA = real_aligned.dropna() + XB = emulated_aligned.dropna() + error_matrix = distance.cdist(XA, XB, 'seuclidean') + error_vector = error_matrix.min(axis = 1) error = error_vector.mean() except ValueError: error = 100 return error, None, None, None - return error, error_vector, xa, xb - + return error, error_vector, XA, XB + # Pairwise euclidean from premade dQdV - - def _get_error_from_degradation_matching_dqdv(self, x, *params): + def _get_error_from_degradation_matching_dQdV(self, x, *params): """ Error function returning the mean standardized Euclidean distance of each point of the real curve to the closest value on the emulated curve in the dQdV representation. 
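The V-Q error implemented above (and reused for dQdV and dVdQ below) is a nearest-neighbor curve distance: every real point is matched to its closest emulated point under a standardized Euclidean metric, and the mean of those minima is the objective. A compact sketch of just that metric; the function name is illustrative, and both inputs are assumed to be two-column DataFrames such as (Q_aligned, Voltage_aligned):

    import numpy as np
    import pandas as pd
    from scipy.spatial import distance

    def curve_matching_error(real, emulated):
        """Mean standardized Euclidean distance from each real point to the
        nearest emulated point."""
        xa = real.dropna().to_numpy()
        xb = emulated.dropna().to_numpy()
        # 'seuclidean' scales each column by its variance, so capacity (Ah) and
        # voltage (V) contribute on comparable footing.
        error_matrix = distance.cdist(xa, xb, "seuclidean")
        error_vector = error_matrix.min(axis=1)   # nearest emulated point per real point
        return error_vector.mean(), error_vector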
- + Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned - (evenly spaced) and Voltage_aligned - + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned + (evenly spaced) and Voltage_aligned + Outputs: error (float): output of the specified error sub function error_vector (array): vector containing Euclidean distance of each point of the real curve to the closest value on the emulated curve in the dQdV representation - xa (Dataframe): real full cell data used for error analysis - xb (Dataframe): emulated full cell data used for error analysis + XA (Dataframe): real full cell data used for error analysis + XB (Dataframe): emulated full cell data used for error analysis """ - + try: # Call dQdV generating function - (pe_out_zeroed, - ne_out_zeroed, - dqdv_over_v_real, - dqdv_over_v_emulated, - df_real_interped, - emulated_full_cell_interped) = self.get_dqdv_over_v_from_degradation_matching_ah(x, *params) - - xa = dqdv_over_v_real[['Voltage_aligned', 'dQdV']].dropna() - xb = dqdv_over_v_emulated[['Voltage_aligned', 'dQdV']].dropna() - error_matrix = distance.cdist(xa, xb, 'seuclidean') - error_vector = error_matrix.min(axis=1) + (PE_out_zeroed, + NE_out_zeroed, + dQdV_over_v_real, + dQdV_over_v_emulated, + df_real_interped, + emulated_full_cell_interped) = self.get_dQdV_over_V_from_degradation_matching_ah(x, *params) + + XA = dQdV_over_v_real[[ 'Voltage_aligned','dQdV']].dropna() + XB = dQdV_over_v_emulated[['Voltage_aligned', 'dQdV']].dropna() + error_matrix = distance.cdist(XA, XB, 'seuclidean') + error_vector = error_matrix.min(axis = 1) error = error_vector.mean() - + except ValueError: error = 100 return error, None, None, None - return error, error_vector, xa, xb - - def _get_error_from_degradation_matching_dvdq(self, x, *params): + return error, error_vector, XA, XB + + def _get_error_from_degradation_matching_dVdQ(self, x, *params): """ Error function returning the mean standardized Euclidean distance of each point of the real curve to the closest value on the emulated curve in the dVdQ representation. 
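The dVdQ variant that follows adds two refinements on top of that metric: points whose dVdQ exceeds self.dvdq_bound can be dropped (so the steep ends of the curve do not dominate), and the remaining per-point distances can be weighted by the real cell's dQdV so that flat, high-capacity plateaus count more. A small sketch of the weighting step only, assuming the raw distance vector and the matching dQdV values are already aligned element-wise (the helper name is illustrative):

    import numpy as np

    def apply_error_weighting(error_vector_raw, dqdv_real, weighting="dQdV"):
        """Scale each real point's nearest-neighbor distance by its dQdV value,
        or leave the distances untouched for uniform weighting."""
        if weighting == "dQdV":
            return np.multiply(np.asarray(error_vector_raw), np.asarray(dqdv_real))
        if weighting == "uniform":
            return np.asarray(error_vector_raw)
        raise ValueError('Unknown weighting scheme; check "error_weighting".')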
- + Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned - (evenly spaced) and Voltage_aligned - + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned + (evenly spaced) and Voltage_aligned + Outputs: error (float): output of the specified error sub function error_vector (array): vector containing Euclidean distance of each point of the real curve to the closest value on the emulated curve in the dVdQ representation - xa (Dataframe): real full cell data used for error analysis - xb (Dataframe): emulated full cell data used for error analysis + error_vector_weighted (array): error_vector multiplied by dQdV values as weighting + XA (Dataframe): real full cell data used for error analysis + XB (Dataframe): emulated full cell data used for error analysis """ - - try: - (pe_out_zeroed, - ne_out_zeroed, - dvdq_over_q_real, - dvdq_over_q_emulated, - df_real_interped, - emulated_full_cell_interped) = self.get_dvdq_over_q_from_degradation_matching_ah(x, *params) - - xa = dvdq_over_q_real[['Q_aligned', 'dVdQ']].dropna() - xb = dvdq_over_q_emulated[['Q_aligned', 'dVdQ']].dropna() - - # down-select to values with capacity more than 0.5 Ahr to eliminate high-slope region of dVdQ - xa = xa.loc[(xa.Q_aligned > 0.5)] - xb = xb.loc[(xb.Q_aligned > 0.5)] - - error_matrix = distance.cdist(xa, xb, 'seuclidean') - error_vector = error_matrix.min(axis=1) - error = error_vector.mean() - - except ValueError: - error = 100 - return error, None, None, None - return error, error_vector, xa, xb - + +# try: + (PE_out_zeroed, + NE_out_zeroed, + dVdQ_over_Q_real, + dVdQ_over_Q_emulated, + df_real_interped, + emulated_full_cell_interped) = self.get_dVdQ_over_Q_from_degradation_matching_ah(x, *params) + + XA = dVdQ_over_Q_real[[ 'Q_aligned','dVdQ']].dropna() + XB = dVdQ_over_Q_emulated[['Q_aligned', 'dVdQ']].dropna() + + if self.dvdq_bound is not None: + # down-select to values with dVdQ less than 0.8 V/Ahr to eliminate high-slope/high-valued region of dVdQ + XA = XA.loc[XA.dVdQ < self.dvdq_bound] + XB = XB.loc[XB.dVdQ < self.dvdq_bound] + + error_matrix = distance.cdist(XA, XB, 'seuclidean') + error_vector_raw = error_matrix.min(axis = 1) + + # apply weighting scheme + if self.error_weighting == 'dQdV': + dQdV_over_v_real = self.get_dQdV_over_V_from_degradation_matching_ah(x, *params)[2] + if self.dvdq_bound is not None: + error_vector_weighted = np.multiply( + error_vector_raw,dQdV_over_v_real['dQdV'].loc[(XA.dVdQ < 0.8).index]) + elif self.dvdq_bound is None: + error_vector_weighted = np.multiply( + error_vector_raw,dQdV_over_v_real['dQdV']) + elif self.error_weighting == 'uniform': + error_vector_weighted = error_vector_raw + else: + raise ValueError('Unknown weighting scheme provided. 
Check the value of "error_weighting".') + + error = error_vector_weighted.mean() + +# except ValueError: +# print('ValueError') +# error = 100 +# return error, None, None, None, None + return error, error_vector_raw, error_vector_weighted, XA, XB + def _get_error_from_synthetic_fitting_ah(self, x, *params): """ Wrapper function which selects the correct error sub routine and returns its error value. This function is specific to fitting synthetic data rather than real cycling data. - + Inputs: - x (list): [LLI, q_pe, q_ne, x_ne_2] - *params: + x (list): [LLI, Q_pe, Q_ne, x_ne_2] + *params: pe_pristine (Dataframe): half cell data of the pristine (uncycled) positive electrode ne_pristine (Dataframe): half cell data of the pristine (uncycled) negative electrode ne_2_pos (Dataframe): half cell data for the positive component of the anode ne_2_neg (Dataframe): half cell data for the negative component of the anode - real_cell_candidate_charge_profile_aligned (Dataframe): columns Q_aligned - (evenly spaced) and Voltage_aligned - + real_cell_candidate_profile_aligned (Dataframe): columns Q_aligned + (evenly spaced) and Voltage_aligned + Outputs: error value (float) - output of the specified error sub function """ - + error_type = self.error_type - + try: if error_type == 'V-Q': - return self._get_error_from_degradation_matching_v_q(x, *params)[0] + return self._get_error_from_degradation_matching_V_Q(x, *params)[0] elif error_type == 'dVdQ': - return self._get_error_from_degradation_matching_dvdq(x, *params)[0] + return self._get_error_from_degradation_matching_dVdQ(x, *params)[0] elif error_type == 'dQdV': - return self._get_error_from_degradation_matching_dvdq(x, *params)[0] + return self._get_error_from_degradation_matching_dQdV(x, *params)[0] else: - return self._get_error_from_degradation_matching_v_q(x, *params)[0] - except RuntimeError: + return self._get_error_from_degradation_matching_V_Q(x, *params)[0] + except: print("Can't return error") return 100 - + def intracell_values_wrapper_ah(self, - cycle_index, - cell_struct, - degradation_bounds=None - ): + cycle_index, + cell_struct, + degradation_bounds=None + ): """ Wrapper function to solve capacity sizing and offset of reference electrodes to real full cell cycle data. 
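At its core, the wrapper described above is a bounded global optimization: scipy's differential_evolution searches the degradation parameters (LLI, Q_pe, Q_ne, the IR coefficients, x_NE_2) that minimize the curve-matching error. A stripped-down sketch with a toy objective; in the patch the objective is self._get_error_from_degradation_matching_ah and the half-cell references plus the real cycle profile are passed through args, so everything below apart from the bounds and solver settings is illustrative:

    import numpy as np
    from scipy.optimize import differential_evolution

    def toy_error(x, *args):
        """Stand-in objective: distance of (LLI, Q_pe, Q_ne) from a made-up target."""
        target = np.array([0.5, 4.8, 5.0])
        return float(np.sum((np.asarray(x) - target) ** 2))

    bounds = (
        (0, 2.0),    # LLI
        (2.5, 6.5),  # Q_pe
        (2.5, 6.5),  # Q_ne
    )

    result = differential_evolution(
        toy_error, bounds,
        strategy="best1bin", maxiter=1000, popsize=15, tol=0.01,
        mutation=0.5, recombination=0.7, seed=0,
        polish=True, init="latinhypercube",
        updating="deferred", workers=1,   # the patch uses workers=-1 for parallel evaluation
    )
    print(result.x, result.fun)   # best-fit parameters and the residual error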
- + Inputs: cycle_index (int): the index of the cycle of interest of the structured real cycling data cell_struct (MaccorDatapath): BEEP structured cycling data Outputs: loss_dict (dict): dictionary with key of cycle index and entry of a list of - error, lli_opt, q_pe_opt, q_ne_opt, x_ne_2, Q_li + error, LLI_opt, Q_pe_opt, Q_ne_opt, x_NE_2, Q_li profiles_dict (dict): dictionary with key of cycle index and entry of a dictionary containing various key/entry pairs of resulting from the fitting """ + if degradation_bounds is None: - degradation_bounds = ((0, 3), # LLI - (2.5, 6.5), # q_pe - (2.5, 6.5), # q_ne - (1, 1), # (-1,1) x_ne_2 + degradation_bounds = ((0, 2.0), # LLI + (2.5, 6.5), # Q_pe (Q_pe must be > LLI) + (2.5, 6.5), # Q_ne + (1,1), #IR coef pe + (1,1), #IR coef ne + (1, 1), # (-1,1) x_NE_2 ) - real_cell_candidate_charge_profile_aligned = self.process_beep_cycle_data_for_candidate_halfcell_analysis_ah( + real_cell_candidate_profile_aligned = self.process_beep_cycle_data_for_candidate_halfcell_analysis_ah( cell_struct, cycle_index) + + degradation_optimization_result = differential_evolution(self._get_error_from_degradation_matching_ah, degradation_bounds, - args=(self.pe_pristine, - self.ne_1_pristine, + args=(self.pe_pristine_dict, + self.ne_1_pristine_dict, self.ne_2_pristine_pos, self.ne_2_pristine_neg, - real_cell_candidate_charge_profile_aligned + real_cell_candidate_profile_aligned, ), - strategy='best1bin', maxiter=100000, - popsize=15, tol=0.001, mutation=0.5, + strategy='best1bin', maxiter=1000, + popsize=15, tol=0.01, mutation=0.5, recombination=0.7, - seed=1, + seed=0, callback=None, disp=False, polish=True, init='latinhypercube', atol=0, updating='deferred', workers=-1, constraints=() ) - # print(degradation_optimization_result.x) #BVV - - (pe_out_zeroed, - ne_out_zeroed, - dqdv_over_v_real, - dqdv_over_v_emulated, +# print(degradation_optimization_result.x) #BVV + (PE_out_zeroed, + NE_out_zeroed, + dVdQ_over_Q_real, + dVdQ_over_Q_emulated, df_real_interped, - emulated_full_cell_interped) = self.get_dqdv_over_v_from_degradation_matching_ah( + emulated_full_cell_interped) = self.get_dVdQ_over_Q_from_degradation_matching_ah( degradation_optimization_result.x, - self.pe_pristine, - self.ne_1_pristine, + self.pe_pristine_dict, + self.ne_1_pristine_dict, self.ne_2_pristine_pos, self.ne_2_pristine_neg, - real_cell_candidate_charge_profile_aligned) + real_cell_candidate_profile_aligned) + + (dQdV_over_v_real, + dQdV_over_v_emulated) = self.get_dQdV_over_V_from_degradation_matching_ah( + degradation_optimization_result.x, + self.pe_pristine_dict, + self.ne_1_pristine_dict, + self.ne_2_pristine_pos, + self.ne_2_pristine_neg, + real_cell_candidate_profile_aligned)[2:4] # - electrode_info_df = get_electrode_info_ah(pe_out_zeroed, ne_out_zeroed) + electrode_info_df = self.get_electrode_info_ah(PE_out_zeroed, NE_out_zeroed) # error = degradation_optimization_result.fun - lli_opt = degradation_optimization_result.x[0] - q_pe_opt = degradation_optimization_result.x[1] - q_ne_opt = degradation_optimization_result.x[2] - x_ne_2 = degradation_optimization_result.x[3] - - loss_dict = {cycle_index: np.append([error, lli_opt, q_pe_opt, q_ne_opt, - x_ne_2], - electrode_info_df.iloc[-1].values) + LLI_opt = degradation_optimization_result.x[0] + Q_pe_opt = degradation_optimization_result.x[1] + Q_ne_opt = degradation_optimization_result.x[2] + IR_coef_pe_opt = degradation_optimization_result.x[3] + IR_coef_ne_opt = degradation_optimization_result.x[4] + x_NE_2 = 
degradation_optimization_result.x[5] + + loss_dict = {cycle_index: np.append([error, LLI_opt, Q_pe_opt, Q_ne_opt,IR_coef_pe_opt,IR_coef_ne_opt, + x_NE_2], + electrode_info_df.iloc[-1].values) + } + + + error, error_vector_raw, error_vector_weighted, XA, XB = self._get_error_from_degradation_matching_dVdQ( + [LLI_opt, Q_pe_opt, Q_ne_opt,IR_coef_pe_opt,IR_coef_ne_opt, x_NE_2], + self.pe_pristine_dict, + self.ne_1_pristine_dict, + self.ne_2_pristine_pos, + self.ne_2_pristine_neg, + real_cell_candidate_profile_aligned) + + profiles_per_cycle_dict = {'NE_zeroed' : NE_out_zeroed, + 'PE_zeroed' : PE_out_zeroed, + 'dVdQ_over_Q_real': dVdQ_over_Q_real, + 'dVdQ_over_Q_emulated': dVdQ_over_Q_emulated, + 'dQdV_over_v_real':dQdV_over_v_real, + 'dQdV_over_v_emulated':dQdV_over_v_emulated, + 'df_real_interped': df_real_interped, + 'emulated_full_cell_interped': emulated_full_cell_interped , + 'real_cell_candidate_profile_aligned': real_cell_candidate_profile_aligned, + 'error_vector_raw':error_vector_raw, + 'error_vector_weighted':error_vector_weighted, + 'XA':XA, + 'XB':XB + } + profiles_dict = {cycle_index: profiles_per_cycle_dict} return loss_dict, profiles_dict - + def solve_emulated_degradation(self, - forward_simulated_profile, - degradation_bounds=None - ): + forward_simulated_profile, + degradation_bounds=None + ): + """ - - + + """ if degradation_bounds is None: - degradation_bounds = ((0, 3), # LLI - (2.5, 6.5), # q_pe - (2.5, 6.5), # q_ne - (1, 1), # (-1,1) x_ne_2 + degradation_bounds = ((0, 2.0), # LLI + (2.5, 6.5), # Q_pe + (2.5, 6.5), # Q_ne + (1, 1), # (-1,1) x_NE_2 ) degradation_optimization_result = differential_evolution(self._get_error_from_synthetic_fitting_ah, degradation_bounds, - args=(self.pe_pristine, - self.ne_1_pristine, + args=(self.pe_pristine_dict, + self.ne_1_pristine_dict, self.ne_2_pristine_pos, self.ne_2_pristine_neg, forward_simulated_profile, @@ -925,19 +1041,319 @@ def solve_emulated_degradation(self, ) return degradation_optimization_result + + def get_electrode_info_ah(self, pe_out_zeroed, ne_out_zeroed): + """ + Calculates a variety of half-cell metrics at various positions in the full-cell profile. + Inputs: + pe_out_zeroed (Dataframe): cathode capacity and voltage columns scaled, + offset, and aligned along capacity + ne_out_zeroed (Dataframe): anode capacity and voltage columns scaled, + offset, and aligned along capacity Outputs: + electrode_info_df (Dataframe): dataframe containing a variety of half-cell metrics + at various positions in the emulated full-cell profile. + + pe_voltage_FC4p2V: voltage of the positive electrode (cathode) corresponding + to the full cell at 4.2V + ... + pe_voltage_FC2p7V: voltage of the positive electrode (cathode) corresponding + to the full cell at 2.7V + + pe_soc_FC4p2V: state of charge of the positive electrode corresponding + to the full cell at 4.2V + ...
+ pe_soc_FC2p7V: state of charge of the positive electrode corresponding + to the full cell at 2.7V + + ne_voltage_FC4p2V: voltage of the negative electrode (anode) corresponding + to the full cell at 4.2V + ... + ne_voltage_FC2p7V: voltage of the negative electrode (anode) corresponding + to the full cell at 2.7V + + ne_soc_FC4p2V: state of charge of the anode electrode corresponding + to the full cell at 4.2V + ... + ne_soc_FC2p7V: state of charge of the anode electrode corresponding + to the full cell at 2.7V + + Q_fc: capacity of the full cecll within the full cell voltage limits + Q_pe: capacity of the cathode + Q_ne: capacity of the anode [Ahr] + Q_li + """ + pe_minus_ne_zeroed = pd.DataFrame(pe_out_zeroed['Voltage_aligned'] - ne_out_zeroed['Voltage_aligned'], + columns=['Voltage_aligned']) + pe_minus_ne_zeroed['Q_aligned'] = pe_out_zeroed['Q_aligned'] + + electrode_info_df = pd.DataFrame(index=[0]) + + electrode_info_df['pe_voltage_FC4p2V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.2))].Voltage_aligned + electrode_info_df['pe_voltage_FC4p1V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.1))].Voltage_aligned + electrode_info_df['pe_voltage_FC4p0V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.0))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p9V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.9))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p8V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.8))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p7V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.7))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p6V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.6))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p5V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.5))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p4V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.4))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p3V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.3))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p2V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.2))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p1V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.1))].Voltage_aligned + electrode_info_df['pe_voltage_FC3p0V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.0))].Voltage_aligned + electrode_info_df['pe_voltage_FC2p9V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.9))].Voltage_aligned + electrode_info_df['pe_voltage_FC2p8V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.8))].Voltage_aligned + electrode_info_df['pe_voltage_FC2p7V'] = pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.7))].Voltage_aligned + + electrode_info_df['pe_soc_FC4p2V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.2))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 4.2V + 
electrode_info_df['pe_soc_FC4p1V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.1))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 4.1V + electrode_info_df['pe_soc_FC4p0V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.0))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 4.0V + electrode_info_df['pe_soc_FC3p9V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.9))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.9V + electrode_info_df['pe_soc_FC3p8V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.8))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.8V + electrode_info_df['pe_soc_FC3p7V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.7))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.7V + electrode_info_df['pe_soc_FC3p6V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.6))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.6V + electrode_info_df['pe_soc_FC3p5V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.5))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.5V + electrode_info_df['pe_soc_FC3p4V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.4))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.4V + electrode_info_df['pe_soc_FC3p3V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.3))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + 
np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.3V + electrode_info_df['pe_soc_FC3p2V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.2))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.2V + electrode_info_df['pe_soc_FC3p1V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.1))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.1V + electrode_info_df['pe_soc_FC3p0V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.0))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.0V + electrode_info_df['pe_soc_FC2p9V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.9))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 2.9V + electrode_info_df['pe_soc_FC2p8V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.8))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 2.8V + electrode_info_df['pe_soc_FC2p7V'] = ((pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.7))].Q_aligned - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - + np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) + ) # 2.7V + + electrode_info_df['ne_voltage_FC4p2V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.2))].Voltage_aligned + electrode_info_df['ne_voltage_FC4p1V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.1))].Voltage_aligned + electrode_info_df['ne_voltage_FC4p0V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.0))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p9V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.9))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p8V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.8))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p7V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.7))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p6V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.6))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p5V'] = 
ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.5))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p4V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.4))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p3V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.3))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p2V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.2))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p1V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.1))].Voltage_aligned + electrode_info_df['ne_voltage_FC3p0V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.0))].Voltage_aligned + electrode_info_df['ne_voltage_FC2p9V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.9))].Voltage_aligned + electrode_info_df['ne_voltage_FC2p8V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.8))].Voltage_aligned + electrode_info_df['ne_voltage_FC2p7V'] = ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.7))].Voltage_aligned + + electrode_info_df['ne_soc_FC4p2V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.2))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 4.2V + electrode_info_df['ne_soc_FC4p1V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.1))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 4.1V + electrode_info_df['ne_soc_FC4p0V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.0))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 4.0V + electrode_info_df['ne_soc_FC3p9V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.9))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.9V + electrode_info_df['ne_soc_FC3p8V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.8))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3Q_aligned.8V + electrode_info_df['ne_soc_FC3p7V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.7))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + 
np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.7V + electrode_info_df['ne_soc_FC3p6V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.6))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.6V + electrode_info_df['ne_soc_FC3p5V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.5))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.5V + electrode_info_df['ne_soc_FC3p4V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.4))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.4V + electrode_info_df['ne_soc_FC3p3V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.3))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.3V + electrode_info_df['ne_soc_FC3p2V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.2))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.2V + electrode_info_df['ne_soc_FC3p1V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.1))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.1V + electrode_info_df['ne_soc_FC3p0V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.0))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 3.0V + electrode_info_df['ne_soc_FC2p9V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.9))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 2.9V + electrode_info_df['ne_soc_FC2p8V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.8))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + 
np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 2.8V + electrode_info_df['ne_soc_FC2p7V'] = ((ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.7))].Q_aligned - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( + np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - + np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) + ) # 2.7V + + electrode_info_df['Q_fc'] = pe_minus_ne_zeroed.loc[ + np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.20))].Q_aligned + + electrode_info_df['Q_pe'] = np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) + + electrode_info_df['Q_ne'] = np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) + + electrode_info_df['Q_li'] = np.max(pe_minus_ne_zeroed['Q_aligned'].loc[~pe_minus_ne_zeroed.Voltage_aligned.isna()]) - np.min(pe_minus_ne_zeroed['Q_aligned'].loc[~pe_minus_ne_zeroed.Voltage_aligned.isna()]) + + + return electrode_info_df + + def _get_effective_electrode_rate_data(self,electrode_rate_dict=dict(), + Q_electrode=4.84,Q_fc_nom=4.84, + C_nom=-0.2,IR_coef=1.0): + """ + Inputs: + electrode_rate_dict (dict): dictonary of rate data for an electrode. Keys are float representing C rate; entries are dataframe of rate data with columns Voltage_aligned and SOC_aligned. Sign of cathode rate keys should be same as full cell (positive for charge, negative for discharge). Sign of anode rate keys should be opposite of the full cell. + Q_electrode (float): capacity of the electrode in Ahr + Q_fc_nom (float): nominal capacity of full cell which was used to define the nominal C rate + C_nom (float): nominal current rate in full cell (negative for discharge) + IR_coef (float): scaling coefficient of overpotential term, used in fitting + + Outputs: + electrode_at_I_eff (DataFrame): electrode data modified for effective rate, based on the supplied dictionary of rates and an overpotential scaling term. 
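The rate correction described in the Outputs entry above (and implemented in the body that follows) boils down to: at every SOC point, fit voltage as a quadratic in current across the measured rates, take dV/dI at the effective current I_eff as a local resistance-like term R_at_SOC, and then IR-shift the curve taken from the closest measured rate. A compact illustration of the per-SOC fit; the numbers are invented, and only the numpy polynomial machinery mirrors the patch:

    import numpy as np
    from numpy.polynomial import polynomial as P

    # Voltage measured at one SOC point for three C-rates (illustrative values;
    # the real loop pulls these from electrode_rate_dict at each SOC_aligned point).
    rates = np.array([-0.1, -0.2, -0.5])
    v_at_soc = np.array([3.62, 3.60, 3.55])
    i_eff = -0.3   # effective electrode rate, C_nom / (Q_electrode / Q_fc_nom)

    # Quadratic fit of V(I); its first derivative evaluated at i_eff is the local
    # dV/dI that the method accumulates into R_at_SOC for the IR correction.
    r_at_soc = P.Polynomial.fit(rates, v_at_soc, deg=2).deriv(m=1)(i_eff)
    print(r_at_soc)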
+ + """ + + I_eff = C_nom / (Q_electrode/Q_fc_nom) + C_nom_sign = np.sign(list(electrode_rate_dict.keys())[0]) + + R_at_SOC = [] + for SOC in electrode_rate_dict[list(electrode_rate_dict.keys())[0]]['SOC_aligned']: + + V_at_SOC_for_rates = [] + for rate in electrode_rate_dict.keys(): + V_at_SOC_for_rates = np.append(V_at_SOC_for_rates, + electrode_rate_dict[rate]['Voltage_aligned'].loc[ + electrode_rate_dict[rate]['SOC_aligned'] == SOC + ]) + R_at_SOC = np.append(R_at_SOC, + np.polynomial.polynomial.Polynomial.fit( + list(electrode_rate_dict.keys()), + V_at_SOC_for_rates,deg=2).deriv(m=1)(I_eff) + ) + electrode_at_I_eff = pd.DataFrame() + electrode_at_I_eff['SOC_aligned'] = electrode_rate_dict[ + list(electrode_rate_dict.keys())[0]]['SOC_aligned'] + + closest_rate_key = list(electrode_rate_dict.keys())[ + np.argmin(np.abs( + np.array(list(electrode_rate_dict.keys())) - I_eff))] + electrode_at_I_eff['Voltage_aligned'] = electrode_rate_dict[ + closest_rate_key]['Voltage_aligned'] + IR_coef*(I_eff - (C_nom*C_nom_sign)*R_at_SOC) + + return electrode_at_I_eff, R_at_SOC + +def blend_electrodes(electrode_1, electrode_2_pos, electrode_2_neg, x_2): ## this function needs revisited """ Blends two electrode materials from their SOC-V profiles to form a blended electrode. - + Inputs: electrode_1: Primary material in electrode, typically Gr. DataFrame supplied with SOC evenly spaced and voltage. electrode_2: Secondary material in electrode, typically Si. DataFrame supplied with SOC evenly spaced and voltage as an additional column. x_2: Fraction of electrode_2 material's capacity (not mass). Supplied as scalar value. - + Outputs: df_blended_soc_mod (Dataframe): blended electrode with SOC_aligned and Voltage_aligned columns """ @@ -992,329 +1408,3 @@ def blend_electrodes(electrode_1, electrode_2_pos, electrode_2_neg, x_2): df_blended_soc_mod['SOC_aligned'] = soc_vec / np.max(soc_vec) * 100 return df_blended_soc_mod - -def get_electrode_info_ah(pe_out_zeroed, ne_out_zeroed): - """ - Calculates a variety of half-cell metrics at various positions in the full-cell profile. - - Inputs: - pe_out_zeroed (Dataframe): cathode capacity and voltage columns scaled, - offset, and aligned along capacity - ne_out_zeroed (Dataframe): anode capacity and voltage columns scaled, - offset, and aligned along capacity - - Outputs: - electrode_info_df (Dataframe): dataframe containing a variety of half-cell metrics - at various positions in the emulated full-cell profile. - - pe_voltage_FC4p2V: voltage of the positive electrode (catahode) corresponding - to the full cell at 4.2V - ... - pe_voltage_FC2p7V: voltage of the positive electrode (catahode) corresponding - to the full cell at 2.7V - - pe_soc_FC4p2V: state of charge of the positive electrode corresponding - to the full cell at 4.2V - ... - pe_soc_FC2p7V: state of charge of the positive electrode corresponding - to the full cell at 2.7V - - ne_voltage_FC4p2V: voltage of the negative electrode (anode) corresponding - to the full cell at 4.2V - ... - ne_voltage_FC2p7V: voltage of the negative electrode (anode) corresponding - to the full cell at 2.7V - - ne_soc_FC4p2V: state of charge of the anode electrode corresponding - to the full cell at 4.2V - ... 
- ne_soc_FC2p7V: state of charge of the anode electrode corresponding - to the full cell at 2.7V - - Q_fc: capacity of the full cecll within the full cell voltage limits - q_pe: capacity of the cathode - q_ne: capacity of the anode [Ahr] - Q_li - """ - pe_minus_ne_zeroed = pd.DataFrame(pe_out_zeroed['Voltage_aligned'] - ne_out_zeroed['Voltage_aligned'], - columns=['Voltage_aligned']) - pe_minus_ne_zeroed['Q_aligned'] = pe_out_zeroed['Q_aligned'] - - electrode_info_df = pd.DataFrame(index=[0]) - - electrode_info_df['pe_voltage_FC4p2V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.2))].Voltage_aligned - electrode_info_df['pe_voltage_FC4p1V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.1))].Voltage_aligned - electrode_info_df['pe_voltage_FC4p0V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.0))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p9V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.9))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p8V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.8))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p7V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.7))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p6V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.6))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p5V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.5))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p4V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.4))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p3V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.3))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p2V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.2))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p1V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.1))].Voltage_aligned - electrode_info_df['pe_voltage_FC3p0V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.0))].Voltage_aligned - electrode_info_df['pe_voltage_FC2p9V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.9))].Voltage_aligned - electrode_info_df['pe_voltage_FC2p8V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.8))].Voltage_aligned - electrode_info_df['pe_voltage_FC2p7V'] = pe_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.7))].Voltage_aligned - - electrode_info_df['pe_soc_FC4p2V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.2))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 4.2V - electrode_info_df['pe_soc_FC4p1V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.1))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - 
) # 4.1V - electrode_info_df['pe_soc_FC4p0V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.0))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 4.0V - electrode_info_df['pe_soc_FC3p9V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.9))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.9V - electrode_info_df['pe_soc_FC3p8V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.8))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.8V - electrode_info_df['pe_soc_FC3p7V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.7))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.7V - electrode_info_df['pe_soc_FC3p6V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.6))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.6V - electrode_info_df['pe_soc_FC3p5V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.5))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.5V - electrode_info_df['pe_soc_FC3p4V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.4))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.4V - electrode_info_df['pe_soc_FC3p3V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.3))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.3V - electrode_info_df['pe_soc_FC3p2V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.2))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - 
np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.2V - electrode_info_df['pe_soc_FC3p1V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.1))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.1V - electrode_info_df['pe_soc_FC3p0V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.0))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.0V - electrode_info_df['pe_soc_FC2p9V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.9))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 2.9V - electrode_info_df['pe_soc_FC2p8V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.8))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 2.8V - electrode_info_df['pe_soc_FC2p7V'] = ( - (pe_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.7))].Q_aligned - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - np.min(pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()])) - ) # 2.7V - - electrode_info_df['ne_voltage_FC4p2V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.2))].Voltage_aligned - electrode_info_df['ne_voltage_FC4p1V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.1))].Voltage_aligned - electrode_info_df['ne_voltage_FC4p0V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.0))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p9V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.9))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p8V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.8))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p7V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.7))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p6V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.6))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p5V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.5))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p4V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.4))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p3V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.3))].Voltage_aligned - 
electrode_info_df['ne_voltage_FC3p2V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.2))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p1V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.1))].Voltage_aligned - electrode_info_df['ne_voltage_FC3p0V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.0))].Voltage_aligned - electrode_info_df['ne_voltage_FC2p9V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.9))].Voltage_aligned - electrode_info_df['ne_voltage_FC2p8V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.8))].Voltage_aligned - electrode_info_df['ne_voltage_FC2p7V'] = ne_out_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.7))].Voltage_aligned - - electrode_info_df['ne_soc_FC4p2V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.2))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 4.2V - electrode_info_df['ne_soc_FC4p1V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.1))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 4.1V - electrode_info_df['ne_soc_FC4p0V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.0))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 4.0V - electrode_info_df['ne_soc_FC3p9V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.9))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.9V - electrode_info_df['ne_soc_FC3p8V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.8))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3Q_aligned.8V - electrode_info_df['ne_soc_FC3p7V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.7))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.7V - electrode_info_df['ne_soc_FC3p6V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.6))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - 
np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.6V - electrode_info_df['ne_soc_FC3p5V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.5))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.5V - electrode_info_df['ne_soc_FC3p4V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.4))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.4V - electrode_info_df['ne_soc_FC3p3V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.3))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.3V - electrode_info_df['ne_soc_FC3p2V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.2))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.2V - electrode_info_df['ne_soc_FC3p1V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.1))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.1V - electrode_info_df['ne_soc_FC3p0V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 3.0))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 3.0V - electrode_info_df['ne_soc_FC2p9V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.9))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 2.9V - electrode_info_df['ne_soc_FC2p8V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.8))].Q_aligned - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 2.8V - electrode_info_df['ne_soc_FC2p7V'] = ( - (ne_out_zeroed.loc[np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 2.7))].Q_aligned - - 
np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) / ( - np.max(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - np.min(ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()])) - ) # 2.7V - - electrode_info_df['Q_fc'] = pe_minus_ne_zeroed.loc[ - np.argmin(np.abs(pe_minus_ne_zeroed.Voltage_aligned - 4.20))].Q_aligned - - electrode_info_df['Q_pe'] = np.max( - pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - np.min( - pe_out_zeroed['Q_aligned'].loc[~pe_out_zeroed['Voltage_aligned'].isna()]) - - electrode_info_df['Q_ne'] = np.max( - ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - np.min( - ne_out_zeroed['Q_aligned'].loc[~ne_out_zeroed['Voltage_aligned'].isna()]) - - electrode_info_df['Q_li'] = np.max( - pe_minus_ne_zeroed['Q_aligned'].loc[~pe_minus_ne_zeroed.Voltage_aligned.isna()]) - np.min( - pe_minus_ne_zeroed['Q_aligned'].loc[~pe_minus_ne_zeroed.Voltage_aligned.isna()]) - - return electrode_info_df From 7d4d2dcb60955573ca27667a8f6b1f15041b6006 Mon Sep 17 00:00:00 2001 From: ardunn Date: Fri, 18 Mar 2022 16:10:51 -0700 Subject: [PATCH 05/16] test commit for @pasinger --- beep/features/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/beep/features/base.py b/beep/features/base.py index f3d3fffd..7e707259 100644 --- a/beep/features/base.py +++ b/beep/features/base.py @@ -361,7 +361,8 @@ def to_json_file(self, filename): """ d = self.as_dict() dumpfn(d, filename) - + + class BEEPCycleFeatureMatrix(MSONable): """ Create an ((n battery cycler files) x (j cycles)) x (k features) array composed of From e85dd0e9bf7b1ad2d0d4b3cda933773460e38d6d Mon Sep 17 00:00:00 2001 From: ardunn Date: Tue, 29 Mar 2022 16:51:51 -0700 Subject: [PATCH 06/16] basic formatting etc. --- beep/features/base.py | 4 +- beep/features/core.py | 240 ++++++++++++++++++++++++++---------------- 2 files changed, 150 insertions(+), 94 deletions(-) diff --git a/beep/features/base.py b/beep/features/base.py index 7e707259..0bb965b8 100644 --- a/beep/features/base.py +++ b/beep/features/base.py @@ -293,7 +293,7 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): self.featurizers = beepfeaturizers def as_dict(self): - """Serialize a BEEPDatapath as a dictionary. + """Serialize a BEEPFeatureMatrix as a dictionary. Must not be loaded from legacy. @@ -313,7 +313,7 @@ def as_dict(self): @classmethod def from_dict(cls, d): - """Create a BEEPDatapath object from a dictionary. + """Create a BEEPFeatureMatrix object from a dictionary. Args: d (dict): dictionary represenation. 
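# The two docstring fixes above make the serialization contract explicit:
# as_dict() captures the child featurizers along with the assembled matrix,
# and from_dict() rebuilds the object from the decoded featurizer list alone.
# A minimal round-trip sketch, assuming `featurizers` is a list of
# BEEPFeaturizer objects on which create_features() has already been called
# (the variable names are hypothetical):

matrix = BEEPFeatureMatrix(featurizers)
rebuilt = BEEPFeatureMatrix.from_dict(matrix.as_dict())
assert rebuilt.matrix.shape == matrix.matrix.shape

# matrix.to_json_file("feature_matrix.json.gz") writes the same dictionary to
# disk via monty's dumpfn.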
diff --git a/beep/features/core.py b/beep/features/core.py index 7571ccd9..6887f303 100644 --- a/beep/features/core.py +++ b/beep/features/core.py @@ -4,17 +4,18 @@ from scipy.interpolate import interp1d from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helps +from beep.features import featurizer_helpers from functools import reduce from beep.utils.parameters_lookup import get_protocol_parameters from beep.features.base import BEEPFeaturizer, BEEPFeaturizationError + class HPPCResistanceVoltageEarlyFeatures(BEEPFeaturizer): DEFAULT_HYPERPARAMETERS = { "test_time_filter_sec": 1000000, "cycle_index_filter": 6, - "diag_pos":1, + "diag_pos": 1, "soc_window": 8, "parameters_path": PROTOCOL_PARAMETERS_DIR } @@ -86,7 +87,8 @@ def create_features(self): # merge everything together as a final result dataframe self.features = pd.concat( [hppc_r, hppc_ocv, v_diff, diffusion_features], axis=1) - + + class HPPCResistanceVoltageCycleFeatures(BEEPFeaturizer): DEFAULT_HYPERPARAMETERS = { "test_time_filter_sec": 1000000, @@ -94,7 +96,7 @@ class HPPCResistanceVoltageCycleFeatures(BEEPFeaturizer): "soc_window": 8, "parameters_path": PROTOCOL_PARAMETERS_DIR } - + def validate(self): val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) if val: @@ -114,7 +116,7 @@ def validate(self): return False, "HPPC conditions not met for this cycler run" else: return val, msg - + def create_features(self): # Filter out low cycle numbers at the end of the test, corresponding to the "final" diagnostic self.datapath.diagnostic_data = self.datapath.diagnostic_data[ @@ -126,40 +128,40 @@ def create_features(self): self.datapath.diagnostic_data = self.datapath.diagnostic_data.groupby( ["cycle_index", "step_index", "step_index_counter"] ).filter(lambda x: ~x["test_time"].isnull().all()) - + # Only hppc_resistance_features are able to be calculated without error. # Xiao Cui should be pulled in to understand the issue with the others features. 
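# HPPCResistanceVoltageCycleFeatures follows the standard BEEPFeaturizer
# lifecycle: construct it from an already-structured datapath, confirm the run
# contains HPPC diagnostic cycles via validate(), then call create_features()
# to populate .features with a per-diagnostic-cycle resistance feature frame.
# A minimal usage sketch (the `structured_datapath` variable is hypothetical
# and assumed to be a structured BEEPDatapath):

featurizer = HPPCResistanceVoltageCycleFeatures(structured_datapath)
is_valid, message = featurizer.validate()
if is_valid:
    featurizer.create_features()
    hppc_per_cycle = featurizer.features
else:
    print(f"Skipping HPPC featurization: {message}")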
- - + # diffusion features -# diffusion_features = featurizer_helpers.get_diffusion_cycle_features( -# self.datapath, -# ) - + # diffusion_features = featurizer_helpers.get_diffusion_cycle_features( + # self.datapath, + # ) + # hppc resistance features hppc_resistance_features = featurizer_helpers.get_hppc_resistance_cycle_features( self.datapath, ) # the variance of ocv features -# hppc_ocv_features = featurizer_helpers.get_hppc_ocv_cycle_features( -# self.datapath, -# ) + # hppc_ocv_features = featurizer_helpers.get_hppc_ocv_cycle_features( + # self.datapath, + # ) # the v_diff features -# v_diff = featurizer_helpers.get_v_diff_cycle_features( -# self.datapath, -# self.hyperparameters["soc_window"], -# self.hyperparameters["parameters_path"] -# ) + # v_diff = featurizer_helpers.get_v_diff_cycle_features( + # self.datapath, + # self.hyperparameters["soc_window"], + # self.hyperparameters["parameters_path"] + # ) # merge everything together as a final result dataframe self.features = pd.concat( - [hppc_resistance_features, - #hppc_ocv_features, - #v_diff, #diffusion_features - ], axis=1) - + [hppc_resistance_features, + # hppc_ocv_features, + # v_diff, #diffusion_features + ], axis=1) + + class CycleSummaryStatsEarlyFeatures(BEEPFeaturizer): DEFAULT_HYPERPARAMETERS = { "cycle_comp_num": [10, 100], @@ -577,9 +579,11 @@ def validate(self): bool: True/False indication of ability to proceed with feature generation """ - if not self.datapath.structured_summary.index.max() > self.hyperparameters["final_pred_cycle"]: + if not self.datapath.structured_summary.index.max() > \ + self.hyperparameters["final_pred_cycle"]: return False, "Structured summary index max is less than final pred cycle" - elif not self.datapath.structured_summary.index.min() <= self.hyperparameters["init_pred_cycle"]: + elif not self.datapath.structured_summary.index.min() <= \ + self.hyperparameters["init_pred_cycle"]: return False, "Structured summary index min is more than initial pred cycle" elif "cycle_index" not in self.datapath.structured_summary.columns: return False, "Structured summary missing critical data: 'cycle_index'" @@ -587,7 +591,8 @@ def validate(self): return False, "Structured data missing critical data: 'cycle_index'" elif not self.hyperparameters["mid_pred_cycle"] > 10: return False, "Middle pred. 
cycle less than threshold value of 10" - elif not self.hyperparameters["final_pred_cycle"] > self.hyperparameters["mid_pred_cycle"]: + elif not self.hyperparameters["final_pred_cycle"] > \ + self.hyperparameters["mid_pred_cycle"]: return False, "Final pred cycle less than middle pred cycle" else: return True, None @@ -899,7 +904,8 @@ def get_threshold_targets(self, df): if filter_kinks: if np.any(df['fractional_metric'].diff().diff() < filter_kinks): last_good_cycle = df[ - df['fractional_metric'].diff().diff() < filter_kinks]['cycle_index'].min() + df['fractional_metric'].diff().diff() < filter_kinks][ + 'cycle_index'].min() df = df[df['cycle_index'] < last_good_cycle] x_axes = [] @@ -968,7 +974,8 @@ def get_threshold_targets(self, df): 'initial_regular_throughput'].values[ 0] x_to_threshold.append(real_throughput_to_threshold) - interpolation_axes = interpolation_axes + ["real_regular_throughput"] + interpolation_axes = interpolation_axes + [ + "real_regular_throughput"] threshold_dict = { 'initial_regular_throughput': @@ -981,7 +988,8 @@ def get_threshold_targets(self, df): x_to_threshold[indx]] return pd.DataFrame(threshold_dict) - + + class DiagnosticCycleFeatures(BEEPFeaturizer): """ This class stores fractional levels of degradation in discharge capacity and discharge energy @@ -999,7 +1007,7 @@ class DiagnosticCycleFeatures(BEEPFeaturizer): DEFAULT_HYPERPARAMETERS = { "parameters_dir": PROTOCOL_PARAMETERS_DIR, "nominal_capacity": 4.84, - + } def validate(self): @@ -1033,79 +1041,121 @@ def create_features(self): """ parameters_path = self.hyperparameters["parameters_dir"] - + # RPT discharge capacities - data_rpt_02C = self.datapath.diagnostic_data.loc[self.datapath.diagnostic_data.cycle_type == 'rpt_0.2C'] - Q_rpt_02C = data_rpt_02C.groupby('cycle_index')[['discharge_capacity','discharge_energy']].max().reset_index(drop=False) - Q_rpt_02C.rename(columns={'discharge_capacity':'rpt_0.2C_discharge_capacity','discharge_energy':'rpt_0.2C_discharge_energy'}, - inplace=True) - Q_rpt_02C = Q_rpt_02C.reset_index(drop=False).rename(columns={'index':'diag_pos'}) - - rpt_02C_cycles = data_rpt_02C.cycle_index.unique() # for referencing last regular cycle before diagnostic - - data_rpt_1C = self.datapath.diagnostic_data.loc[self.datapath.diagnostic_data.cycle_type == 'rpt_1C'] - Q_rpt_1C = data_rpt_1C.groupby('cycle_index')[['discharge_capacity','discharge_energy']].max().reset_index(drop=False) - Q_rpt_1C.rename(columns={'discharge_capacity':'rpt_1C_discharge_capacity','discharge_energy':'rpt_1C_discharge_energy'}, - inplace=True) - Q_rpt_1C = Q_rpt_1C.reset_index(drop=False).rename(columns={'index':'diag_pos'}) - - data_rpt_2C = self.datapath.diagnostic_data.loc[self.datapath.diagnostic_data.cycle_type == 'rpt_2C'] - Q_rpt_2C = data_rpt_2C.groupby('cycle_index')[['discharge_capacity','discharge_energy']].max().reset_index(drop=False) - Q_rpt_2C.rename(columns={'discharge_capacity':'rpt_2C_discharge_capacity','discharge_energy':'rpt_2C_discharge_energy'}, - inplace=True) - Q_rpt_2C = Q_rpt_2C.reset_index(drop=False).rename(columns={'index':'diag_pos'}) - + data_rpt_02C = self.datapath.diagnostic_data.loc[ + self.datapath.diagnostic_data.cycle_type == 'rpt_0.2C'] + Q_rpt_02C = data_rpt_02C.groupby('cycle_index')[ + ['discharge_capacity', 'discharge_energy']].max().reset_index( + drop=False) + Q_rpt_02C.rename( + columns={'discharge_capacity': 'rpt_0.2C_discharge_capacity', + 'discharge_energy': 'rpt_0.2C_discharge_energy'}, + inplace=True) + Q_rpt_02C = Q_rpt_02C.reset_index(drop=False).rename( 
+ columns={'index': 'diag_pos'}) + + rpt_02C_cycles = data_rpt_02C.cycle_index.unique() # for referencing last regular cycle before diagnostic + + data_rpt_1C = self.datapath.diagnostic_data.loc[ + self.datapath.diagnostic_data.cycle_type == 'rpt_1C'] + Q_rpt_1C = data_rpt_1C.groupby('cycle_index')[ + ['discharge_capacity', 'discharge_energy']].max().reset_index( + drop=False) + Q_rpt_1C.rename( + columns={'discharge_capacity': 'rpt_1C_discharge_capacity', + 'discharge_energy': 'rpt_1C_discharge_energy'}, + inplace=True) + Q_rpt_1C = Q_rpt_1C.reset_index(drop=False).rename( + columns={'index': 'diag_pos'}) + + data_rpt_2C = self.datapath.diagnostic_data.loc[ + self.datapath.diagnostic_data.cycle_type == 'rpt_2C'] + Q_rpt_2C = data_rpt_2C.groupby('cycle_index')[ + ['discharge_capacity', 'discharge_energy']].max().reset_index( + drop=False) + Q_rpt_2C.rename( + columns={'discharge_capacity': 'rpt_2C_discharge_capacity', + 'discharge_energy': 'rpt_2C_discharge_energy'}, + inplace=True) + Q_rpt_2C = Q_rpt_2C.reset_index(drop=False).rename( + columns={'index': 'diag_pos'}) + # cumuative discharge throughput - aging_df = self.datapath.structured_summary[['cycle_index','charge_throughput','energy_throughput','energy_efficiency','charge_duration','CV_time','CV_current','energy_efficiency']] + aging_df = self.datapath.structured_summary[ + ['cycle_index', 'charge_throughput', 'energy_throughput', + 'energy_efficiency', 'charge_duration', 'CV_time', 'CV_current', + 'energy_efficiency']] aging_df = aging_df.loc[aging_df.cycle_index.isin(rpt_02C_cycles - 3)] - - cumulative_discharge_throughput = aging_df[['cycle_index','charge_throughput']].rename(columns={'charge_throughput':'discharge_throughput'}).reset_index(drop=True) - cumulative_discharge_throughput = cumulative_discharge_throughput.reset_index(drop=False).rename(columns={'index':'diag_pos'}) - cumulative_energy_throughput = aging_df[['cycle_index','energy_throughput']].reset_index(drop=True) + cumulative_discharge_throughput = aging_df[ + ['cycle_index', 'charge_throughput']].rename( + columns={'charge_throughput': 'discharge_throughput'}).reset_index( + drop=True) + cumulative_discharge_throughput = cumulative_discharge_throughput.reset_index( + drop=False).rename(columns={'index': 'diag_pos'}) + + cumulative_energy_throughput = aging_df[ + ['cycle_index', 'energy_throughput']].reset_index(drop=True) cumulative_energy_throughput = cumulative_energy_throughput.reset_index( - drop=False).rename(columns={'index':'diag_pos'}) - + drop=False).rename(columns={'index': 'diag_pos'}) + equivalent_full_cycles = cumulative_discharge_throughput.copy() - equivalent_full_cycles.rename(columns={'discharge_throughput':'equivalent_full_cycles'},inplace=True) - equivalent_full_cycles['equivalent_full_cycles'] = equivalent_full_cycles['equivalent_full_cycles']/self.hyperparameters['nominal_capacity'] - + equivalent_full_cycles.rename( + columns={'discharge_throughput': 'equivalent_full_cycles'}, + inplace=True) + equivalent_full_cycles['equivalent_full_cycles'] = \ + equivalent_full_cycles['equivalent_full_cycles'] / self.hyperparameters[ + 'nominal_capacity'] + # Q_aging_pre_diag - discharge capacity of aging cycle before diagnostic Q_aging_pre_diag = self.datapath.structured_data.groupby('cycle_index')[ - 'discharge_capacity'].max().loc[rpt_02C_cycles[1:] - 3].reset_index(drop=False) # ignore first diagnostic, adjust cycle index to Q_aging_pre_diag - Q_aging_pre_diag.rename(columns={'discharge_capacity':'Q_aging_pre_diag'},inplace=True) + 
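# Bookkeeping note for the frames assembled in this method: the
# equivalent_full_cycles column computed above is simply cumulative discharge
# throughput divided by the nominal_capacity hyperparameter (4.84 Ah by
# default), so, for example, 484 Ah of discharge throughput corresponds to
# 484 / 4.84 = 100 equivalent full cycles. Each intermediate frame also keeps
# integer 'cycle_index' and 'diag_pos' columns, which is what lets the final
# reduce(pd.merge(..., on=['cycle_index', 'diag_pos'], how='outer'), df_list)
# further below align every per-diagnostic quantity row by row.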
'discharge_capacity'].max().loc[rpt_02C_cycles[1:] - 3].reset_index( + drop=False) # ignore first diagnostic, adjust cycle index to Q_aging_pre_diag + Q_aging_pre_diag.rename( + columns={'discharge_capacity': 'Q_aging_pre_diag'}, inplace=True) Q_aging_pre_diag = Q_aging_pre_diag.reset_index( - drop=False).rename(columns={'index':'diag_pos'}) - Q_aging_pre_diag['diag_pos'] = Q_aging_pre_diag['diag_pos'] + 1 # since, first diag is ignored, add one to diag_pos - + drop=False).rename(columns={'index': 'diag_pos'}) + Q_aging_pre_diag['diag_pos'] = Q_aging_pre_diag[ + 'diag_pos'] + 1 # since, first diag is ignored, add one to diag_pos + # Q_aging_post_diag - discharge capacity of aging cycle after diagnostic - Q_aging_post_diag = self.datapath.structured_data.groupby('cycle_index')[ - 'discharge_capacity'].max().loc[rpt_02C_cycles + 3].reset_index(drop=False) # does not ignore first diag since Q_aging exists after first diag - Q_aging_post_diag.rename(columns={'discharge_capacity':'Q_aging_post_diag'},inplace=True) + Q_aging_post_diag = \ + self.datapath.structured_data.groupby('cycle_index')[ + 'discharge_capacity'].max().loc[rpt_02C_cycles + 3].reset_index( + drop=False) # does not ignore first diag since Q_aging exists after first diag + Q_aging_post_diag.rename( + columns={'discharge_capacity': 'Q_aging_post_diag'}, inplace=True) Q_aging_post_diag = Q_aging_post_diag.reset_index( - drop=False).rename(columns={'index':'diag_pos'}) - + drop=False).rename(columns={'index': 'diag_pos'}) + # Diagnostic time - diagnostic_time = data_rpt_02C.groupby('cycle_index')['test_time'].min().reset_index(drop=False).rename(columns={'test_time':'diagnostic_time'}) + diagnostic_time = data_rpt_02C.groupby('cycle_index')[ + 'test_time'].min().reset_index(drop=False).rename( + columns={'test_time': 'diagnostic_time'}) diagnostic_time = diagnostic_time.reset_index( - drop=False).rename(columns={'index':'diag_pos'}) - + drop=False).rename(columns={'index': 'diag_pos'}) + # Combine dataframes - df_list = [Q_rpt_02C,Q_rpt_1C,Q_rpt_2C, - cumulative_discharge_throughput, - cumulative_energy_throughput, - equivalent_full_cycles, - Q_aging_pre_diag, - Q_aging_post_diag, - diagnostic_time] + df_list = [Q_rpt_02C, Q_rpt_1C, Q_rpt_2C, + cumulative_discharge_throughput, + cumulative_energy_throughput, + equivalent_full_cycles, + Q_aging_pre_diag, + Q_aging_post_diag, + diagnostic_time] for df in df_list: df['cycle_index'] = df['cycle_index'].copy().astype(int) df['diag_pos'] = df['diag_pos'].copy().astype(int) - - cycle_features = reduce(lambda x,y: pd.merge(x,y,on=['cycle_index','diag_pos'],how='outer'), df_list) - self.features = cycle_features.sort_values('cycle_index').reset_index(drop=True) - + + cycle_features = reduce( + lambda x, y: pd.merge(x, y, on=['cycle_index', 'diag_pos'], + how='outer'), df_list) + self.features = cycle_features.sort_values('cycle_index').reset_index( + drop=True) + + class CyclingProtocol(BEEPFeaturizer): """ This class stores information about the charging protocol used @@ -1121,7 +1171,8 @@ class CyclingProtocol(BEEPFeaturizer): "parameters_dir": PROTOCOL_PARAMETERS_DIR, "quantities": ["charge_constant_current_1", "charge_constant_current_2", "charge_cutoff_voltage", "charge_constant_voltage_time", - "discharge_constant_current", "discharge_cutoff_voltage"], + "discharge_constant_current", + "discharge_cutoff_voltage"], } def validate(self): @@ -1136,7 +1187,8 @@ def validate(self): Returns: bool: True/False indication of ability to proceed with feature generation """ - if not ('raw' in 
self.datapath.paths.keys() or 'structured' in self.datapath.paths.keys()): + if not ( + 'raw' in self.datapath.paths.keys() or 'structured' in self.datapath.paths.keys()): message = "datapath paths not set, unable to fetch charging protocol" return False, message else: @@ -1148,11 +1200,15 @@ def create_features(self): """ parameters_path = self.hyperparameters["parameters_dir"] - file_path = self.datapath.paths['raw'] if 'raw' in self.datapath.paths.keys() else self.datapath.paths['structured'] + file_path = self.datapath.paths[ + 'raw'] if 'raw' in self.datapath.paths.keys() else \ + self.datapath.paths['structured'] parameters, _ = get_protocol_parameters(file_path, parameters_path) parameters = parameters[self.hyperparameters["quantities"]] - parameters['cycle_index'] = int(0) # create a cycle index column for merging with other featurizers - parameters['diag_pos'] = int(0) # create a diag_pos column for merging with other featurizers + parameters['cycle_index'] = int( + 0) # create a cycle index column for merging with other featurizers + parameters['diag_pos'] = int( + 0) # create a diag_pos column for merging with other featurizers self.features = parameters From 01294eba1c33d3371be1e4bed459e1bed6873771 Mon Sep 17 00:00:00 2001 From: ardunn Date: Tue, 29 Mar 2022 17:29:54 -0700 Subject: [PATCH 07/16] add base classes for AllCycles and PerCycle featurizer types --- beep/features/base.py | 210 ++++++++++++++++++++++++++++++------------ 1 file changed, 153 insertions(+), 57 deletions(-) diff --git a/beep/features/base.py b/beep/features/base.py index 0bb965b8..570d7c12 100644 --- a/beep/features/base.py +++ b/beep/features/base.py @@ -57,7 +57,8 @@ class BEEPFeaturizer(MSONable, abc.ABC): DEFAULT_HYPERPARAMETERS = {} - def __init__(self, structured_datapath: Union[BEEPDatapath, None], hyperparameters: Union[dict, None] = None): + def __init__(self, structured_datapath: Union[BEEPDatapath, None], + hyperparameters: Union[dict, None] = None): # If all required hyperparameters are specified, use those # If some subset of required hyperparameters are specified, throw error # If no hyperparameters are specified, use defaults @@ -73,7 +74,8 @@ def __init__(self, structured_datapath: Union[BEEPDatapath, None], hyperparamete self.hyperparameters = self.DEFAULT_HYPERPARAMETERS if structured_datapath is not None and not structured_datapath.is_structured: - raise BEEPFeaturizationError("BEEPDatapath input is not structured!") + raise BEEPFeaturizationError( + "BEEPDatapath input is not structured!") self.datapath = structured_datapath self.features = None @@ -117,7 +119,8 @@ def as_dict(self): """ if self.features is None: - raise BEEPFeaturizationError("Cannot serialize features which have not been generated.") + raise BEEPFeaturizationError( + "Cannot serialize features which have not been generated.") features = self.features.to_dict("list") @@ -196,13 +199,66 @@ def to_json_file(self, filename): dumpfn(d, filename) +class BEEPAllCyclesFeaturizer(BEEPFeaturizer): + """Base class for featurizers that return a constant number of features + for any number of cycles in a structured datapath. + + These features are typically used for early prediction. + + A BEEPAllCyclesFeaturizer always returns the same number of features + for files for datapaths with any number of samples. 
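    For example, a minimal concrete subclass only has to implement the two
    abstract BEEPFeaturizer methods, validate() and create_features(), and
    leave a single-row frame in self.features (a sketch; the class name and
    feature column are illustrative):

        class MedianDischargeCapacity(BEEPAllCyclesFeaturizer):
            def validate(self):
                return True, None

            def create_features(self):
                summary = self.datapath.structured_summary
                self.features = pd.DataFrame(
                    {"median_discharge_capacity": [summary.discharge_capacity.median()]}
                )

    However many cycles the structured file contains, the output stays a
    single row of k features.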
Thus, + + + [Datapath w/ 2 cycles] ---> (vector of k features) + + [Datapath w/ 100 cycles] ---> (vector of k features) + """ + PER_CYCLE = False + + +class BEEPPerCycleFeaturizer(BEEPFeaturizer): + """Base class for featurizers that return a vector of features for + EACH cycle in a structured datapath. + + These features are generally used for analysis + + A BEEPPerCycleFeaturizer always returns an (n x k) matrix of features + for datapaths with n cycles each producing k features. Thus, + + [Datapath w/ 2 cycles] ---> (2 x k feature matrix) + + [Datapath w/ 100 cycles] ---> (100 x k feature matrix) + + """ + PER_CYCLE = True + + class BEEPFeatureMatrix(MSONable): """ - Create an (n battery cycler files) x (k features) array composed of - m BEEPFeaturizer objects. + Create an array composed of BEEPFeaturizer objects. + + The array may either be: + + PER-CYCLER-RUN, using BEEPAllCyclesFeaturizer. + One feature vector per cycler file, resulting in an array w. dimenions: + (n battery cycler files) x (k features) + + OR: + + PER-CYCLE, using BEEPPerCycleFeaturizer. + One feature vector per cycle, resulting in an array w. dimensions: + (n total cycles) x (k features) + + Sets of featurizer objects must exclusively belong to EITHER of these + two paradigms (base classes), but may not be mixed. + + So a set of featurizrs may be -per-cycler-file OR per-cycle, but not + both. Args: - beepfeaturizers ([BEEPFeaturizer]): A list of BEEPFeaturizer objects + beepfeaturizers ([BEEPFeaturizer]): A list of BEEPFeaturizer objects, + either ALL BEEPAllCyclesFeaturizer child objects OR ALL + BEEPPerCycleFeaturizer child objects. """ @@ -211,14 +267,30 @@ class BEEPFeatureMatrix(MSONable): def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): if beepfeaturizers: - dfs_by_file = {bf.paths.get("structured", "no file found"): [] for bf in beepfeaturizers} + bfs_types_per_cycle = [bf.PER_CYCLE for bf in beepfeaturizers] + + # the array should be either all True or all False + if all(bfs_types_per_cycle): + self.per_cycle = True + elif not any(bfs_types_per_cycle): + self.per_cycle = False + else: + raise TypeError( + f"Featurizer types are mixed!\n" + f"BEEPFeatureMatrix can only use EITHER a set of ALL " + f"BEEPAllCyclesFeaturizers OR a set of ALL " + f"BEEPPerCycleFeaturizers.") + + + dfs_by_file = {bf.paths.get("structured", f"no file found_{i}"): [] + for i, bf in enumerate(beepfeaturizers)} # big_df_rows = {bf.__class__.__name__: [] for bf in beepfeaturizers} unique_features = {} for i, bf in enumerate(beepfeaturizers): + if bf.features is None: - raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} has not created features") - elif bf.features.shape[0] != 1: - raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} features are not 1-dimensional.") + raise BEEPFeatureMatrixError( + f"BEEPFeaturizer {bf} has not created features") else: bfcn = bf.__class__.__name__ @@ -233,7 +305,8 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # on identical files # sort params for this featurizer obj by key - params = sorted(list(bf.hyperparameters.items()), key=lambda x: x[0]) + params = sorted(list(bf.hyperparameters.items()), + key=lambda x: x[0]) # Prevent identical features from identical input files # create a unique operation string for the application of this featurizer @@ -241,7 +314,8 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # the featurizer class name, hyperparameters, and class are the same param_str = "-".join([f"{k}:{v}" for k, v in params]) - param_hash = 
hashlib.sha256(param_str.encode("utf-8")).hexdigest() + param_hash = hashlib.sha256( + param_str.encode("utf-8")).hexdigest() # Get an id for this featurizer operation (including hyperparameters) # regardless of the file it is applied on @@ -255,13 +329,15 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # featurizer on a specific file. this_file_feature_columns_ids = \ [ - f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c in bf.features.columns + f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c + in bf.features.columns ] # Check to make sure there are no duplicates of the exact same feature for # the exact same featurizer with the exact same hyperparameters on the exact # same file. - collisions = {c: f for c, f in unique_features.items() if c in this_file_feature_columns_ids} + collisions = {c: f for c, f in unique_features.items() if + c in this_file_feature_columns_ids} if collisions: raise BEEPFeatureMatrixError( f"Multiple features generated with identical classes and identical hyperparameters" @@ -273,7 +349,9 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # Create consistent scheme for naming features regardless of file df = copy.deepcopy(bf.features) - consistent_column_names = [f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in df.columns] + consistent_column_names = [ + f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in + df.columns] df.columns = consistent_column_names df.index = [fname] * df.shape[0] @@ -323,7 +401,8 @@ def from_dict(cls, d): """ # no need for original datapaths, as their ref paths should # be in the subobjects - featurizers = [MontyDecoder().process_decoded(f) for f in d["featurizers"]] + featurizers = [MontyDecoder().process_decoded(f) for f in + d["featurizers"]] return cls(featurizers) @classmethod @@ -381,20 +460,22 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # initialize emtpy dict of file names dfs_by_file = {os.path.basename( bf.paths.get("structured", "no file found") - )[0:-19]: pd.DataFrame(columns=['filename','cycle_index','diag_pos'] - ) for bf in beepfeaturizers} + )[0:-19]: pd.DataFrame( + columns=['filename', 'cycle_index', 'diag_pos'] + ) for bf in beepfeaturizers} # big_df_rows = {bf.__class__.__name__: [] for bf in beepfeaturizers} unique_features = {} for i, bf in enumerate(beepfeaturizers): if bf.features is None: - raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} has not created features") - -# elif bf.features.shape[0] != 1: -# raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} features are not 1-dimensional.") + raise BEEPFeatureMatrixError( + f"BEEPFeaturizer {bf} has not created features") + + # elif bf.features.shape[0] != 1: + # raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} features are not 1-dimensional.") else: bfcn = bf.__class__.__name__ -# fname = bf.paths.get("structured", None) + # fname = bf.paths.get("structured", None) fname = os.path.basename(bf.paths['structured'])[0:-19] if not fname: raise BEEPFeatureMatrixError( @@ -406,7 +487,8 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # on identical files # sort params for this featurizer obj by key - params = sorted(list(bf.hyperparameters.items()), key=lambda x: x[0]) + params = sorted(list(bf.hyperparameters.items()), + key=lambda x: x[0]) # Prevent identical features from identical input files # create a unique operation string for the application of this featurizer @@ -414,7 +496,8 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # the featurizer class name, hyperparameters, and class are 
the same param_str = "-".join([f"{k}:{v}" for k, v in params]) - param_hash = hashlib.sha256(param_str.encode("utf-8")).hexdigest() + param_hash = hashlib.sha256( + param_str.encode("utf-8")).hexdigest() # Get an id for this featurizer operation (including hyperparameters) # regardless of the file it is applied on @@ -428,13 +511,15 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # featurizer on a specific file. this_file_feature_columns_ids = \ [ - f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c in bf.features.columns + f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c + in bf.features.columns ] # Check to make sure there are no duplicates of the exact same feature for # the exact same featurizer with the exact same hyperparameters on the exact # same file. - collisions = {c: f for c, f in unique_features.items() if c in this_file_feature_columns_ids} + collisions = {c: f for c, f in unique_features.items() if + c in this_file_feature_columns_ids} if collisions: raise BEEPFeatureMatrixError( f"Multiple features generated with identical classes and identical hyperparameters" @@ -446,49 +531,59 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # Create consistent scheme for naming features regardless of file df = copy.deepcopy(bf.features) - consistent_column_names = [f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in df.columns] + consistent_column_names = [ + f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in + df.columns] df.columns = consistent_column_names -# df.index = [fname] * df.shape[0] -# df.index.rename("filename", inplace=True) + # df.index = [fname] * df.shape[0] + # df.index.rename("filename", inplace=True) # create filename column to merge on - df['filename'] = os.path.basename(bf.paths['structured'])[0:-19] - -# df = df.reset_index(drop=True) + df['filename'] = os.path.basename(bf.paths['structured'])[ + 0:-19] + + # df = df.reset_index(drop=True) # remove hash from cycle_index and diag_pos column - cycle_index_col = [col for col in df.columns if 'cycle_index' in col] - df.rename(columns={cycle_index_col[0]:'cycle_index'},inplace=True) - + cycle_index_col = [col for col in df.columns if + 'cycle_index' in col] + df.rename(columns={cycle_index_col[0]: 'cycle_index'}, + inplace=True) + # remove hash from diag_pos column - diag_pos_col = [col for col in df.columns if 'diag_pos' in col] - df.rename(columns={diag_pos_col[0]:'diag_pos'},inplace=True) - + diag_pos_col = [col for col in df.columns if + 'diag_pos' in col] + df.rename(columns={diag_pos_col[0]: 'diag_pos'}, + inplace=True) + # ensure cycle_index and diag_pos are integers df['cycle_index'] = df['cycle_index'].astype(int) df['diag_pos'] = df['diag_pos'].astype(int) - + # append each BEEPFeaturizer df to the corresponding cell dict entry -# dfs_by_file[fname].append(df) + # dfs_by_file[fname].append(df) dfs_by_file[fname] = dfs_by_file[fname].merge( - df,how='outer',on=['filename','cycle_index','diag_pos']).sort_values('cycle_index').reset_index(drop=True) -# dfs_by_file[fname] = pd.concat( -# [dfs_by_file[fname],df], -# axis=1,join='outer',ignore_index=True, -# keys=['filename']) -# self.dfs_by_file = dfs_by_file -# self.df = df -# return None - - + df, how='outer', + on=['filename', 'cycle_index', 'diag_pos']).sort_values( + 'cycle_index').reset_index(drop=True) + # dfs_by_file[fname] = pd.concat( + # [dfs_by_file[fname],df], + # axis=1,join='outer',ignore_index=True, + # keys=['filename']) + # self.dfs_by_file = dfs_by_file + # self.df = df + # return None + rows = [] self.matrix = 
pd.DataFrame() for filename, dfs in dfs_by_file.items(): -# row = pd.concat([row,dfs], axis=1) -# row = row[sorted(row.columns)] -# rows.append(row) - self.matrix = pd.concat([self.matrix,dfs], axis=0, ignore_index=True, join='outer') #, keys=['filename'] + # row = pd.concat([row,dfs], axis=1) + # row = row[sorted(row.columns)] + # rows.append(row) + self.matrix = pd.concat([self.matrix, dfs], axis=0, + ignore_index=True, + join='outer') # , keys=['filename'] else: self.matrix = None @@ -526,7 +621,8 @@ def from_dict(cls, d): """ # no need for original datapaths, as their ref paths should # be in the subobjects - featurizers = [MontyDecoder().process_decoded(f) for f in d["featurizers"]] + featurizers = [MontyDecoder().process_decoded(f) for f in + d["featurizers"]] return cls(featurizers) @classmethod @@ -563,4 +659,4 @@ def to_json_file(self, filename): None """ d = self.as_dict() - dumpfn(d, filename) \ No newline at end of file + dumpfn(d, filename) From f5780c890f6a4c3052889d90d5d06e052a58090b Mon Sep 17 00:00:00 2001 From: ardunn Date: Tue, 29 Mar 2022 19:16:31 -0700 Subject: [PATCH 08/16] refactoring 1 --- beep/features/all_cycles/__init__.py | 0 beep/features/all_cycles/delta_q.py | 183 +++ beep/features/all_cycles/hppc.py | 89 ++ .../{ => all_cycles}/intracell_analysis.py | 0 .../{ => all_cycles}/intracell_analysis_v2.py | 0 .../{ => all_cycles}/intracell_losses.py | 0 .../{ => all_cycles}/intracell_losses_v2.py | 0 beep/features/all_cycles/summary.py | 392 ++++++ beep/features/all_cycles/targets.py | 262 ++++ beep/features/base.py | 662 --------- beep/features/core.py | 1197 ----------------- beep/features/featurizer.py | 233 ++++ beep/features/matrix.py | 476 +++++++ beep/features/per_cycle/__init__.py | 0 beep/features/per_cycle/diagnostic.py | 235 ++++ beep/features/per_cycle/hppc.py | 80 ++ 16 files changed, 1950 insertions(+), 1859 deletions(-) create mode 100644 beep/features/all_cycles/__init__.py create mode 100644 beep/features/all_cycles/delta_q.py create mode 100644 beep/features/all_cycles/hppc.py rename beep/features/{ => all_cycles}/intracell_analysis.py (100%) rename beep/features/{ => all_cycles}/intracell_analysis_v2.py (100%) rename beep/features/{ => all_cycles}/intracell_losses.py (100%) rename beep/features/{ => all_cycles}/intracell_losses_v2.py (100%) create mode 100644 beep/features/all_cycles/summary.py create mode 100644 beep/features/all_cycles/targets.py delete mode 100644 beep/features/base.py create mode 100644 beep/features/featurizer.py create mode 100644 beep/features/matrix.py create mode 100644 beep/features/per_cycle/__init__.py create mode 100644 beep/features/per_cycle/diagnostic.py create mode 100644 beep/features/per_cycle/hppc.py diff --git a/beep/features/all_cycles/__init__.py b/beep/features/all_cycles/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/beep/features/all_cycles/delta_q.py b/beep/features/all_cycles/delta_q.py new file mode 100644 index 00000000..6f8d2ed0 --- /dev/null +++ b/beep/features/all_cycles/delta_q.py @@ -0,0 +1,183 @@ +import numpy as np +import pandas as pd +from scipy.stats import skew, kurtosis + +from beep.features.featurizer import BEEPAllCyclesFeaturizer + + +class DeltaQFastCharge(BEEPAllCyclesFeaturizer): + """ + Object corresponding to feature object. Includes constructors + to create the features, object names and metadata attributes in the + object + name (str): predictor object name. + X (pandas.DataFrame): features in DataFrame format. 
+ metadata (dict): information about the conditions, data + and code used to produce features + """ + DEFAULT_HYPERPARAMETERS = { + "init_pred_cycle": 10, + "mid_pred_cycle": 91, + "final_pred_cycle": 100, + "n_nominal_cycles": 40 + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + + Args: + self.datapath (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + + if not self.datapath.structured_summary.index.max() > \ + self.hyperparameters["final_pred_cycle"]: + return False, "Structured summary index max is less than final pred cycle" + elif not self.datapath.structured_summary.index.min() <= \ + self.hyperparameters["init_pred_cycle"]: + return False, "Structured summary index min is more than initial pred cycle" + elif "cycle_index" not in self.datapath.structured_summary.columns: + return False, "Structured summary missing critical data: 'cycle_index'" + elif "cycle_index" not in self.datapath.structured_data.columns: + return False, "Structured data missing critical data: 'cycle_index'" + elif not self.hyperparameters["mid_pred_cycle"] > 10: + return False, "Middle pred. cycle less than threshold value of 10" + elif not self.hyperparameters["final_pred_cycle"] > \ + self.hyperparameters["mid_pred_cycle"]: + return False, "Final pred cycle less than middle pred cycle" + else: + return True, None + + def create_features(self): + """ + """ + i_final = self.hyperparameters[ + "final_pred_cycle"] - 1 # python indexing + i_mid = self.hyperparameters["mid_pred_cycle"] - 1 + + summary = self.datapath.structured_summary + self.hyperparameters[ + "n_nominal_cycles" + ] = 40 # For nominal capacity, use median discharge capacity of first n cycles + + if "step_type" in self.datapath.structured_data.columns: + interpolated_df = self.datapath.structured_data[ + self.datapath.structured_data.step_type == "discharge" + ] + else: + interpolated_df = self.datapath.structured_data + X = pd.DataFrame(np.zeros((1, 20))) + labels = [] + # Discharge capacity, cycle 2 = Q(n=2) + X[0] = summary.discharge_capacity.iloc[1] + labels.append("discharge_capacity_cycle_2") + + # Max discharge capacity - discharge capacity, cycle 2 = max_n(Q(n)) - Q(n=2) + X[1] = max( + summary.discharge_capacity.iloc[np.arange(i_final + 1)] + - summary.discharge_capacity.iloc[1] + ) + labels.append("max_discharge_capacity_difference") + + # Discharge capacity, cycle 100 = Q(n=100) + X[2] = summary.discharge_capacity.iloc[i_final] + labels.append("discharge_capacity_cycle_100") + + # Feature representing time-temperature integral over cycles 2 to 100 + X[3] = np.nansum( + summary.time_temperature_integrated.iloc[np.arange(i_final + 1)]) + labels.append("integrated_time_temperature_cycles_1:100") + + # Mean of charge times of first 5 cycles + X[4] = np.nanmean(summary.charge_duration.iloc[1:6]) + labels.append("charge_time_cycles_1:5") + + # Descriptors based on capacity loss between cycles 10 and 100. 
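# The lines below form DeltaQ(V) = Qd(cycle 100) - Qd(cycle 10) on the
# voltage-interpolated discharge curves and then compress it into log-scaled
# summary statistics (features X[5]..X[10]). A self-contained numerical sketch
# of that reduction (the toy capacity arrays are illustrative only):
#
#     qd_10 = np.array([1.10, 1.08, 1.05, 1.00])    # Ah on a fixed voltage grid
#     qd_100 = np.array([1.07, 1.04, 1.00, 0.94])
#     qd_diff = qd_100 - qd_10
#     log_abs_var = np.log10(np.abs(np.nanvar(qd_diff)))   # the "variance of dQ(V)" feature
#     log_abs_min = np.log10(np.abs(np.nanmin(qd_diff)))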
+ Qd_final = interpolated_df.discharge_capacity[ + interpolated_df.cycle_index == i_final + ] + Qd_10 = interpolated_df.discharge_capacity[ + interpolated_df.cycle_index == 9] + + Qd_diff = Qd_final.values - Qd_10.values + + # If DeltaQ(V) is not an empty array, compute summary stats, else initialize with np.nan + # Cells discharged rapidly over a narrow voltage window run into have no interpolated discharge steps + if len(Qd_diff): + X[5] = np.log10(np.abs(np.nanmin(Qd_diff))) # Minimum + X[6] = np.log10(np.abs(np.nanmean(Qd_diff))) # Mean + X[7] = np.log10(np.abs(np.nanvar(Qd_diff))) # Variance + X[8] = np.log10(np.abs(skew(Qd_diff))) # Skewness + X[9] = np.log10(np.abs(kurtosis(Qd_diff))) # Kurtosis + X[10] = np.log10(np.abs(Qd_diff[0])) # First difference + else: + X[5:11] = np.nan + + labels.append("abs_min_discharge_capacity_difference_cycles_2:100") + labels.append("abs_mean_discharge_capacity_difference_cycles_2:100") + labels.append("abs_variance_discharge_capacity_difference_cycles_2:100") + labels.append("abs_skew_discharge_capacity_difference_cycles_2:100") + labels.append("abs_kurtosis_discharge_capacity_difference_cycles_2:100") + labels.append("abs_first_discharge_capacity_difference_cycles_2:100") + + X[11] = np.max(summary.temperature_maximum.iloc[ + list(range(1, i_final + 1))]) # Max T + labels.append("max_temperature_cycles_1:100") + + X[12] = np.min(summary.temperature_minimum.iloc[ + list(range(1, i_final + 1))]) # Min T + labels.append("min_temperature_cycles_1:100") + + # Slope and intercept of linear fit to discharge capacity as a fn of cycle #, cycles 2 to 100 + + X[13], X[14] = np.polyfit( + list(range(1, i_final + 1)), + summary.discharge_capacity.iloc[list(range(1, i_final + 1))], + 1, + ) + + labels.append("slope_discharge_capacity_cycle_number_2:100") + labels.append("intercept_discharge_capacity_cycle_number_2:100") + + # Slope and intercept of linear fit to discharge capacity as a fn of cycle #, cycles 91 to 100 + X[15], X[16] = np.polyfit( + list(range(i_mid, i_final + 1)), + summary.discharge_capacity.iloc[list(range(i_mid, i_final + 1))], + 1, + ) + labels.append("slope_discharge_capacity_cycle_number_91:100") + labels.append("intercept_discharge_capacity_cycle_number_91:100") + + IR_trend = summary.dc_internal_resistance.iloc[ + list(range(1, i_final + 1))] + if any(v == 0 for v in IR_trend): + IR_trend[IR_trend == 0] = np.nan + + # Internal resistance minimum + X[17] = np.nanmin(IR_trend) + labels.append("min_internal_resistance_cycles_2:100") + + # Internal resistance at cycle 2 + X[18] = summary.dc_internal_resistance.iloc[1] + labels.append("internal_resistance_cycle_2") + + # Internal resistance at cycle 100 - cycle 2 + X[19] = ( + summary.dc_internal_resistance.iloc[i_final] - + summary.dc_internal_resistance.iloc[1] + ) + labels.append("internal_resistance_difference_cycles_2:100") + + # Nominal capacity + end = self.hyperparameters["n_nominal_cycles"] + X[20] = np.median(summary.discharge_capacity.iloc[0: end]) + labels.append("nominal_capacity_by_median") + + X.columns = labels + self.features = X \ No newline at end of file diff --git a/beep/features/all_cycles/hppc.py b/beep/features/all_cycles/hppc.py new file mode 100644 index 00000000..45849dd6 --- /dev/null +++ b/beep/features/all_cycles/hppc.py @@ -0,0 +1,89 @@ +import numpy as np +import pandas as pd +from scipy.stats import skew, kurtosis +from scipy.interpolate import interp1d + +from beep import PROTOCOL_PARAMETERS_DIR +from beep.features import featurizer_helpers +from functools 
import reduce +from beep.utils.parameters_lookup import get_protocol_parameters + +from beep.features.featurizer import BEEPAllCyclesFeaturizer + + +class HPPCResistanceVoltageAllCycles(BEEPAllCyclesFeaturizer): + DEFAULT_HYPERPARAMETERS = { + "test_time_filter_sec": 1000000, + "cycle_index_filter": 6, + "diag_pos": 1, + "soc_window": 8, + "parameters_path": PROTOCOL_PARAMETERS_DIR + } + + def validate(self): + val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + if val: + conditions = [] + conditions.append( + any( + [ + "hppc" in x + for x in + self.datapath.diagnostic_summary.cycle_type.unique() + ] + ) + ) + if all(conditions): + return True, None + else: + return False, "HPPC conditions not met for this cycler run" + else: + return val, msg + + def create_features(self): + # Filter out low cycle numbers at the end of the test, corresponding to the "final" diagnostic + self.datapath.diagnostic_data = self.datapath.diagnostic_data[ + ~((self.datapath.diagnostic_data.test_time > self.hyperparameters[ + 'test_time_filter_sec']) & + (self.datapath.diagnostic_data.cycle_index < self.hyperparameters[ + 'cycle_index_filter'])) + ] + self.datapath.diagnostic_data = self.datapath.diagnostic_data.groupby( + ["cycle_index", "step_index", "step_index_counter"] + ).filter(lambda x: ~x["test_time"].isnull().all()) + + # diffusion features + diffusion_features = featurizer_helpers.get_diffusion_early_features( + self.datapath, + ) + + hppc_r = pd.DataFrame() + # the 9 by 6 dataframe + df_dr = featurizer_helpers.get_dr_df( + self.datapath, self.hyperparameters["diag_pos"] + ) + # transform this dataframe to be 1 by 54 + columns = df_dr.columns + for column in columns: + for r in range(len(df_dr[column])): + name = column + str(r) + hppc_r[name] = [df_dr[column][r]] + + # the variance of ocv features + hppc_ocv = featurizer_helpers.get_hppc_ocv( + self.datapath, + self.hyperparameters["diag_pos"], + parameters_path=self.hyperparameters["parameters_path"] + ) + + # the v_diff features + v_diff = featurizer_helpers.get_v_diff( + self.datapath, + self.hyperparameters["diag_pos"], + self.hyperparameters["soc_window"], + self.hyperparameters["parameters_path"] + ) + + # merge everything together as a final result dataframe + self.features = pd.concat( + [hppc_r, hppc_ocv, v_diff, diffusion_features], axis=1) \ No newline at end of file diff --git a/beep/features/intracell_analysis.py b/beep/features/all_cycles/intracell_analysis.py similarity index 100% rename from beep/features/intracell_analysis.py rename to beep/features/all_cycles/intracell_analysis.py diff --git a/beep/features/intracell_analysis_v2.py b/beep/features/all_cycles/intracell_analysis_v2.py similarity index 100% rename from beep/features/intracell_analysis_v2.py rename to beep/features/all_cycles/intracell_analysis_v2.py diff --git a/beep/features/intracell_losses.py b/beep/features/all_cycles/intracell_losses.py similarity index 100% rename from beep/features/intracell_losses.py rename to beep/features/all_cycles/intracell_losses.py diff --git a/beep/features/intracell_losses_v2.py b/beep/features/all_cycles/intracell_losses_v2.py similarity index 100% rename from beep/features/intracell_losses_v2.py rename to beep/features/all_cycles/intracell_losses_v2.py diff --git a/beep/features/all_cycles/summary.py b/beep/features/all_cycles/summary.py new file mode 100644 index 00000000..98dd9a1e --- /dev/null +++ b/beep/features/all_cycles/summary.py @@ -0,0 +1,392 @@ +import numpy as np +import pandas as pd +from 
scipy.stats import skew, kurtosis + +from beep.features.featurizer import BEEPAllCyclesFeaturizer + + + +class CycleSummaryStats(BEEPAllCyclesFeaturizer): + DEFAULT_HYPERPARAMETERS = { + "cycle_comp_num": [10, 100], + "statistics": ["var", "min", "mean", "skew", "kurtosis", "abs", + "square"] + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + + # TODO: not sure this is necessary + # Check for data in each of the selected cycles + index_1, index_2 = self.hyperparameters['cycle_comp_num'] + cycle_1 = self.datapath.structured_data[ + self.datapath.structured_data.cycle_index == index_1] + cycle_2 = self.datapath.structured_data[ + self.datapath.structured_data.cycle_index == index_2] + if len(cycle_1) == 0 or len(cycle_2) == 0: + return False, "Length of one or more comparison cycles is zero" + + # TODO: check whether this is good + # Check for relevant data + required_columns = [ + 'charge_capacity', + 'discharge_capacity', + 'charge_energy', + 'discharge_energy', + ] + pcycler_run_columns = self.datapath.structured_data.columns + if not all( + [column in pcycler_run_columns for column in required_columns]): + return False, f"Required column not present in all structured data " \ + f"(must have all of: {required_columns})" + + return True, None + + def create_features(self): + """ + Generate features listed in early prediction manuscript using both diagnostic and regular cycles + + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun) + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + parameters_path (str): Root directory storing project parameter files. 
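        With the default hyperparameters assumed here, the 28 output columns
        are named statistic-by-quantity, built with the same comprehension used
        at the end of this method:

            statistics = ["var", "min", "mean", "skew", "kurtosis", "abs", "square"]
            quantities = ["charging_capacity", "discharging_capacity",
                          "charging_energy", "discharging_energy"]
            columns = [y + "_" + x for x in quantities for y in statistics]
            # 28 names, from "var_charging_capacity" through "square_discharging_energy"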
+ + Returns: + X (pd.DataFrame): Dataframe containing the feature + """ + + # TODO: extend this dataframe and uncomment energy features when + # structuring is refactored + X = pd.DataFrame(np.zeros((1, 28))) + + reg_cycle_comp_num = self.hyperparameters.get("cycle_comp_num") + cycle_comp_1 = self.datapath.structured_data[ + self.datapath.structured_data.cycle_index == reg_cycle_comp_num[1] + ] + cycle_comp_0 = self.datapath.structured_data[ + self.datapath.structured_data.cycle_index == reg_cycle_comp_num[0] + ] + Qc100_1 = cycle_comp_1[ + cycle_comp_1.step_type == "charge"].charge_capacity + Qc10_1 = cycle_comp_0[ + cycle_comp_0.step_type == "charge"].charge_capacity + QcDiff = Qc100_1.values - Qc10_1.values + QcDiff = QcDiff[~np.isnan(QcDiff)] + + X.loc[0, 0:6] = self.get_summary_statistics(QcDiff) + + Qd100_1 = cycle_comp_1[ + cycle_comp_1.step_type == "discharge"].discharge_capacity + Qd10_1 = cycle_comp_0[ + cycle_comp_0.step_type == "discharge"].discharge_capacity + QdDiff = Qd100_1.values - Qd10_1.values + QdDiff = QdDiff[~np.isnan(QdDiff)] + + X.loc[0, 7:13] = self.get_summary_statistics(QdDiff) + + # # Charging Energy features + Ec100_1 = cycle_comp_1[cycle_comp_1.step_type == "charge"].charge_energy + Ec10_1 = cycle_comp_0[cycle_comp_0.step_type == "charge"].charge_energy + EcDiff = Ec100_1.values - Ec10_1.values + EcDiff = EcDiff[~np.isnan(EcDiff)] + + X.loc[0, 14:20] = self.get_summary_statistics(EcDiff) + + # # Discharging Energy features + Ed100_1 = cycle_comp_1[ + cycle_comp_1.step_type == "charge"].discharge_energy + Ed10_1 = cycle_comp_0[ + cycle_comp_0.step_type == "charge"].discharge_energy + EdDiff = Ed100_1.values - Ed10_1.values + EdDiff = EdDiff[~np.isnan(EdDiff)] + + X.loc[0, 21:27] = self.get_summary_statistics(EdDiff) + + quantities = [ + "charging_capacity", + "discharging_capacity", + "charging_energy", + "discharging_energy", + ] + + X.columns = [y + "_" + x for x in quantities for y in + self.hyperparameters["statistics"]] + + self.features = X + + def get_summary_statistics(self, array): + """ + Static method for getting values corresponding + to standard 7 operations that many beep features + use, i.e. log of absolute value of each of + variance, min, mean, skew, kurtosis, the sum of + the absolute values and the sum of squares + + Args: + array (list, np.ndarray): array of values to get + standard operation values for, e.g. cycle + discharging capacity, QcDiff, etc. 
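        A short usage sketch (the input values are illustrative; the datapath
        argument may be None because only hyperparameters are needed here):

            stats_featurizer = CycleSummaryStats(None)
            qc_diff = np.array([-0.011, -0.013, -0.016, -0.021])
            summary_stats = stats_featurizer.get_summary_statistics(qc_diff)
            # length-7 vector with the default statistics, in order:
            # log10|var|, log10|min|, log10|mean|, log10|skew|, log10|kurtosis|,
            # log10(sum of |x|), log10(sum of x**2)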
+ + Returns: + [float]: list of features + + """ + + stats_names = self.hyperparameters["statistics"] + supported_stats = self.DEFAULT_HYPERPARAMETERS["statistics"] + + if any(s not in supported_stats for s in stats_names): + raise ValueError( + f"Unsupported statistics in {stats_names}: supported statistics are {supported_stats}") + + stats = [] + + if "var" in stats_names: + stats.append(np.log10(np.absolute(np.var(array)))) + if "min" in stats_names: + stats.append(np.log10(np.absolute(min(array)))) + if "mean" in stats_names: + stats.append(np.log10(np.absolute(np.mean(array)))) + if "skew" in stats_names: + stats.append(np.log10(np.absolute(skew(array)))) + if "kurtosis" in stats_names: + stats.append(np.log10( + np.absolute(kurtosis(array, fisher=False, bias=False)))) + if "abs" in stats_names: + stats.append(np.log10(np.sum(np.absolute(array)))) + if "square" in stats_names: + stats.append(np.log10(np.sum(np.square(array)))) + + return np.asarray(stats) + + +class DiagnosticSummaryStats(CycleSummaryStats): + """ + Object corresponding to summary statistics from a diagnostic cycle of + specific type. Includes constructors to create the features, object names + and metadata attributes in the object. Inherits from RegularCycleSummaryStats + to reuse standard feature generation + + name (str): predictor object name. + X (pandas.DataFrame): features in DataFrame format. + metadata (dict): information about the conditions, data + and code used to produce features + """ + DEFAULT_HYPERPARAMETERS = { + "test_time_filter_sec": 1000000, + "cycle_index_filter": 6, + "diagnostic_cycle_type": 'rpt_0.2C', + "diag_pos_list": [0, 1], + "statistics": ["var", "min", "mean", "skew", "kurtosis", "abs", + "square"], + "parameters_path": PROTOCOL_PARAMETERS_DIR + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. 
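The seven statistics implemented in get_summary_statistics above all reduce a difference vector to the log10 of an absolute (or summed) quantity. A self-contained numerical sketch of the same math on a toy array, calling numpy/scipy directly:

import numpy as np
from scipy.stats import skew, kurtosis

# Toy difference vector standing in for e.g. QcDiff or QdDiff
array = np.array([0.010, -0.020, 0.015, -0.005, 0.020])

features = [
    np.log10(np.abs(np.var(array))),                               # "var"
    np.log10(np.abs(array.min())),                                 # "min"
    np.log10(np.abs(np.mean(array))),                              # "mean"
    np.log10(np.abs(skew(array))),                                 # "skew"
    np.log10(np.abs(kurtosis(array, fisher=False, bias=False))),   # "kurtosis"
    np.log10(np.sum(np.abs(array))),                               # "abs"
    np.log10(np.sum(np.square(array))),                            # "square"
]
print(np.asarray(features))  # seven log-scaled scalars, one per statistic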
These could be filters for column or row operations + + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + if val: + df = self.datapath.diagnostic_summary + df = df[ + df.cycle_type == self.hyperparameters["diagnostic_cycle_type"]] + if df.cycle_index.nunique() >= max( + self.hyperparameters["diag_pos_list"]) + 1: + return True, None + else: + return False, "Diagnostic cycles insufficient for featurization" + else: + return val, msg + + def get_summary_diff( + self, + pos=None, + cycle_types=("rpt_0.2C", "rpt_1C", "rpt_2C"), + metrics=( + "discharge_capacity", "discharge_energy", "charge_capacity", + "charge_energy") + ): + """ + Helper function to calculate difference between summary values in the diagnostic cycles + + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun) + pos (list): position of the diagnostics to use in the calculation + cycle_types (list): calculate difference for these diagnostic types + metrics (str): Calculate difference for these metrics + + Returns: + values (list): List of difference values to insert into the dataframe + names (list): List of column headers to use in the creation of the dataframe + """ + pos = self.hyperparameters["diag_pos_list"] if not pos else pos + + values = [] + names = [] + for cycle_type in cycle_types: + diag_type_summary = self.datapath.diagnostic_summary[ + self.datapath.diagnostic_summary.cycle_type == cycle_type] + for metric in metrics: + diff = (diag_type_summary.iloc[pos[1]][metric] - + diag_type_summary.iloc[pos[0]][metric]) \ + / diag_type_summary.iloc[pos[0]][metric] + values.append(diff) + names.append("diag_sum_diff_" + str(pos[0]) + "_" + str( + pos[1]) + "_" + cycle_type + metric) + return values, names + + def create_features(self): + """ + Generate features listed in early prediction manuscript using both diagnostic and regular cycles + + Args: + self.datapathn (beep.structure.ProcessedCyclerRun) + self.hyperparameters (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + parameters_path (str): Root directory storing project parameter files. 
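get_summary_diff above reports, for each diagnostic cycle type and metric, the fractional change between two diagnostic positions. A minimal sketch of that calculation on made-up summary numbers (the values and the rpt_0.2C label are illustrative):

import pandas as pd

# Toy diagnostic summary for one cycle type, one row per diagnostic position
diag_type_summary = pd.DataFrame({
    "discharge_capacity": [4.85, 4.70],
    "discharge_energy": [17.2, 16.5],
})
pos = [0, 1]

values, names = [], []
for metric in ["discharge_capacity", "discharge_energy"]:
    diff = (diag_type_summary.iloc[pos[1]][metric]
            - diag_type_summary.iloc[pos[0]][metric]) / diag_type_summary.iloc[pos[0]][metric]
    values.append(diff)
    # Same naming pattern as get_summary_diff (cycle_type and metric concatenated)
    names.append("diag_sum_diff_" + str(pos[0]) + "_" + str(pos[1]) + "_" + "rpt_0.2C" + metric)

print(dict(zip(names, values)))  # e.g. discharge_capacity drops by ~3.1% between positions 0 and 1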
+ + Returns: + X (pd.DataFrame): Dataframe containing the feature + """ + # Filter out "final" diagnostic cycles that have been appended to the end of the file with the wrong + # cycle number(test time is monotonic) + self.datapath.diagnostic_data = self.datapath.diagnostic_data[ + ~((self.datapath.diagnostic_data.test_time > self.hyperparameters[ + 'test_time_filter_sec']) & + (self.datapath.diagnostic_data.cycle_index < self.hyperparameters[ + 'cycle_index_filter'])) + ] + self.datapath.diagnostic_data = self.datapath.diagnostic_data.groupby( + ["cycle_index", "step_index", "step_index_counter"] + ).filter(lambda x: ~x["test_time"].isnull().all()) + + diag_intrp = self.datapath.diagnostic_data + + X = pd.DataFrame(np.zeros((1, 54))) + + # Calculate the cycles and the steps for the selected diagnostics + cycles = diag_intrp.cycle_index[diag_intrp.cycle_type == + self.hyperparameters[ + "diagnostic_cycle_type"]].unique() + step_dict_0 = featurizer_helpers.get_step_index( + self.datapath, + cycle_type=self.hyperparameters["diagnostic_cycle_type"], + diag_pos=self.hyperparameters["diag_pos_list"][0], + parameters_path=self.hyperparameters["parameters_path"] + ) + step_dict_1 = featurizer_helpers.get_step_index( + self.datapath, + cycle_type=self.hyperparameters["diagnostic_cycle_type"], + diag_pos=self.hyperparameters["diag_pos_list"][1], + parameters_path=self.hyperparameters["parameters_path"] + ) + + # Create masks for each position in the data + mask_pos_0_charge = ((diag_intrp.cycle_index == cycles[ + self.hyperparameters["diag_pos_list"][0]]) & + (diag_intrp.step_index == step_dict_0[ + self.hyperparameters[ + "diagnostic_cycle_type"] + '_charge'])) + mask_pos_1_charge = ((diag_intrp.cycle_index == cycles[ + self.hyperparameters["diag_pos_list"][1]]) & + (diag_intrp.step_index == step_dict_1[ + self.hyperparameters[ + "diagnostic_cycle_type"] + '_charge'])) + mask_pos_0_discharge = ((diag_intrp.cycle_index == cycles[ + self.hyperparameters["diag_pos_list"][0]]) & + (diag_intrp.step_index == + step_dict_0[self.hyperparameters[ + "diagnostic_cycle_type"] + '_discharge'])) + mask_pos_1_discharge = ((diag_intrp.cycle_index == cycles[ + self.hyperparameters["diag_pos_list"][1]]) & + (diag_intrp.step_index == + step_dict_1[self.hyperparameters[ + "diagnostic_cycle_type"] + '_discharge'])) + + # Charging Capacity features + Qc_1 = diag_intrp.charge_capacity[mask_pos_1_charge] + Qc_0 = diag_intrp.charge_capacity[mask_pos_0_charge] + QcDiff = Qc_1.values - Qc_0.values + QcDiff = QcDiff[~np.isnan(QcDiff)] + + X.loc[0, 0:6] = self.get_summary_statistics(QcDiff) + + # Discharging Capacity features + Qd_1 = diag_intrp.discharge_capacity[mask_pos_1_discharge] + Qd_0 = diag_intrp.discharge_capacity[mask_pos_0_discharge] + QdDiff = Qd_1.values - Qd_0.values + QdDiff = QdDiff[~np.isnan(QdDiff)] + + X.loc[0, 7:13] = self.get_summary_statistics(QdDiff) + + # Charging Energy features + Ec_1 = diag_intrp.charge_energy[mask_pos_1_charge] + Ec_0 = diag_intrp.charge_energy[mask_pos_0_charge] + EcDiff = Ec_1.values - Ec_0.values + EcDiff = EcDiff[~np.isnan(EcDiff)] + + X.loc[0, 14:20] = self.get_summary_statistics(EcDiff) + + # Discharging Energy features + Ed_1 = diag_intrp.discharge_energy[mask_pos_1_discharge] + Ed_0 = diag_intrp.discharge_energy[mask_pos_0_discharge] + EdDiff = Ed_1.values - Ed_0.values + EdDiff = EdDiff[~np.isnan(EdDiff)] + + X.loc[0, 21:27] = self.get_summary_statistics(EdDiff) + + # Charging dQdV features + dQdVc_1 = diag_intrp.charge_dQdV[mask_pos_1_charge] + dQdVc_0 = 
diag_intrp.charge_dQdV[mask_pos_0_charge] + dQdVcDiff = dQdVc_1.values - dQdVc_0.values + dQdVcDiff = dQdVcDiff[~np.isnan(dQdVcDiff)] + + X.loc[0, 28:34] = self.get_summary_statistics(dQdVcDiff) + + # Discharging Capacity features + dQdVd_1 = diag_intrp.discharge_dQdV[mask_pos_1_discharge] + dQdVd_0 = diag_intrp.discharge_dQdV[mask_pos_0_discharge] + dQdVdDiff = dQdVd_1.values - dQdVd_0.values + dQdVdDiff = dQdVdDiff[~np.isnan(dQdVdDiff)] + + X.loc[0, 35:41] = self.get_summary_statistics(dQdVdDiff) + + X.loc[0, 42:53], names = self.get_summary_diff( + self.hyperparameters["diag_pos_list"] + ) + + quantities = [ + "charging_capacity", + "discharging_capacity", + "charging_energy", + "discharging_energy", + "charging_dQdV", + "discharging_dQdV", + ] + + X.columns = [y + "_" + x for x in quantities for y in + self.hyperparameters["statistics"]] + names + self.features = X diff --git a/beep/features/all_cycles/targets.py b/beep/features/all_cycles/targets.py new file mode 100644 index 00000000..7419b3a9 --- /dev/null +++ b/beep/features/all_cycles/targets.py @@ -0,0 +1,262 @@ +import numpy as np +import pandas as pd +from scipy.interpolate import interp1d + +from beep import PROTOCOL_PARAMETERS_DIR +from beep.features import featurizer_helpers + +from beep.features.featurizer import BEEPFeaturizer, BEEPFeaturizationError + + +class TrajectoryFastCharge(BEEPFeaturizer): + """ + Object corresponding to cycle numbers at which the capacity drops below + specific percentages of the initial capacity. Computed on the discharge + portion of the regular fast charge cycles. + + """ + + DEFAULT_HYPERPARAMETERS = { + "thresh_max_cap": 0.98, + "thresh_min_cap": 0.78, + "interval_cap": 0.03 + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + cap = self.datapath.structured_summary.discharge_capacity + cap_ratio = cap.min() / cap.max() + max_cap = self.hyperparameters["thresh_max_cap"] + if not cap_ratio < max_cap: + return False, f"thresh_max_cap hyperparameter exceeded: {cap_ratio} !< {max_cap}" + else: + return True, None + + def create_features(self): + """ + Calculate the outcomes from the input data. In particular, the number of cycles + where we expect to reach certain thresholds of capacity loss + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + parameters_path (str): Root directory storing project parameter files. 
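The thresh_max_cap, thresh_min_cap and interval_cap hyperparameters above describe a grid of capacity-retention thresholds; the real computation is delegated to the datapath's capacities_to_cycles method in the create_features body that follows. As a rough illustration only, here is a standalone sketch on synthetic data (the threshold grid, column names and search logic are assumptions, not the datapath's actual implementation):

import numpy as np
import pandas as pd

# Synthetic per-cycle summary with a slowly fading discharge capacity
summary = pd.DataFrame({
    "cycle_index": np.arange(1, 1001),
    "discharge_capacity": np.linspace(1.00, 0.75, 1000),
})

thresh_max_cap, thresh_min_cap, interval_cap = 0.98, 0.78, 0.03
thresholds = np.arange(thresh_max_cap, thresh_min_cap, -interval_cap)  # 0.98, 0.95, ..., 0.80

initial_capacity = summary.discharge_capacity.iloc[0]
cycles_to_threshold = {
    f"capacity_{threshold:.2f}": int(
        summary.cycle_index[summary.discharge_capacity <= threshold * initial_capacity].iloc[0]
    )
    for threshold in thresholds
}
print(cycles_to_threshold)  # first cycle at which capacity falls below each retention level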
+ + Returns: + pd.DataFrame: cycles at which capacity/energy degradation exceeds thresholds + """ + y = self.datapath.capacities_to_cycles( + self.hyperparameters["thresh_max_cap"], + self.hyperparameters["thresh_min_cap"], + self.hyperparameters["interval_cap"], + ) + self.features = y + + +class DiagnosticProperties(BEEPFeaturizer): + """ + This class stores fractional levels of degradation in discharge capacity and discharge energy + relative to the first cycle at each diagnostic cycle, grouped by diagnostic cycle type. + + name (str): predictor object name. + X (pandas.DataFrame): features in DataFrame format. + metadata (dict): information about the conditions, data + and code used to produce features + + Hyperparameters: + parameters_dir (str): Full path to directory of parameters to analyse the + diagnostic cycles + quantities ([str]): Quantities to extract/get fractional metrics for + diagnostic cycles + cycle_type (str): Type of diagnostic cycle being used to measure the + fractional metric + metric (str): The metric being used for fractional capacity + interpolation_axes (list): List of column names to use for + x_axis interpolation (distance to threshold) + threshold (float): Value for the fractional metric to be considered above + or below threshold + filter_kinks (float): If set, cutoff value for the second derivative of + the fractional metric (cells with an abrupt change in degradation + rate might have something else going on). Typical value might be 0.04 + extrapolate_threshold (bool): Should threshold crossing point be + extrapolated for cells that have not yet reached the threshold + (warning: this uses a linear extrapolation from the last two + diagnostic cycles) + """ + DEFAULT_HYPERPARAMETERS = { + "parameters_dir": PROTOCOL_PARAMETERS_DIR, + "quantities": ['discharge_energy', 'discharge_capacity'], + "threshold": 0.8, + "metric": "discharge_energy", + "filter_kinks": None, + "interpolation_axes": ["normalized_regular_throughput", "cycle_index"], + "cycle_type": "rpt_1C", + "extrapolate_threshold": True + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + return featurizer_helpers.check_diagnostic_validation(self.datapath) + + def create_features(self): + """ + Generates diagnostic-property features from processed cycler run, including values for n*x method + Args: + self.datapath (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + parameters_path (str): Root directory storing project parameter files. 
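A hypothetical end-to-end usage sketch for DiagnosticProperties. It assumes structured_datapath is an already-loaded, structured BEEPDatapath obtained elsewhere; the validate()/create_features() flow comes from the featurizer base class, and the import path follows the new beep/features/all_cycles/targets.py module added by this patch:

from beep.features.all_cycles.targets import DiagnosticProperties

# structured_datapath: assumed to be a structured BEEPDatapath loaded beforehand
featurizer = DiagnosticProperties(
    structured_datapath=structured_datapath,
    hyperparameters=None,  # None falls back to DEFAULT_HYPERPARAMETERS
)

ok, message = featurizer.validate()
if ok:
    featurizer.create_features()
    print(featurizer.features)  # single-row DataFrame of threshold-crossing targets for this cell
else:
    print(f"Cannot featurize this run: {message}")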
+ + Returns: + pd.DataFrame: with "cycle_index", "fractional_metric", "x", "n", "cycle_type" and "metric" columns, rows + for each diagnostic cycle of the cell + """ + + parameters_path = self.hyperparameters["parameters_dir"] + + cycle_types = self.datapath.diagnostic_summary.cycle_type.unique() + X = pd.DataFrame() + for quantity in self.hyperparameters["quantities"]: + for cycle_type in cycle_types: + summary_diag_cycle_type = featurizer_helpers.get_fractional_quantity_remaining_nx( + self.datapath, quantity, cycle_type, + parameters_path=parameters_path + ) + + summary_diag_cycle_type.loc[:, "cycle_type"] = cycle_type + summary_diag_cycle_type.loc[:, "metric"] = quantity + X = X.append(summary_diag_cycle_type) + + X_condensed = self.get_threshold_targets(X) + self.features = X_condensed + + def get_threshold_targets(self, df): + """ + Apply a threshold via interpolation for determining various + metrics (e.g., discharge energy) from diagnostic cycles. + + Args: + df (pd.DataFrame): A dataframe of diagnostic cycle data + for a single battery cycler run. + + Returns: + (pd.DataFrame): Contains a vector for interpolated/intercept + data for determining threshold. + + """ + cycle_type = self.hyperparameters["cycle_type"] + metric = self.hyperparameters["metric"] + interpolation_axes = self.hyperparameters["interpolation_axes"] + threshold = self.hyperparameters["threshold"] + filter_kinks = self.hyperparameters["filter_kinks"] + extrapolate_threshold = self.hyperparameters["extrapolate_threshold"] + + if filter_kinks: + if np.any(df['fractional_metric'].diff().diff() < filter_kinks): + last_good_cycle = df[ + df['fractional_metric'].diff().diff() < filter_kinks][ + 'cycle_index'].min() + df = df[df['cycle_index'] < last_good_cycle] + + x_axes = [] + for type in interpolation_axes: + x_axes.append(df[type]) + y_interpolation_axis = df['fractional_metric'] + + # Logic around how to deal with cells that have not crossed threshold + if df['fractional_metric'].min() > threshold and \ + not extrapolate_threshold: + BEEPFeaturizationError( + "DiagnosticProperties data has not crossed threshold " + "and extrapolation inaccurate" + ) + elif df['fractional_metric'].min() > threshold and \ + extrapolate_threshold: + fill_value = "extrapolate" + bounds_error = False + x_linspaces = [] + for x_axis in x_axes: + y1 = y_interpolation_axis.iloc[-2] + y2 = y_interpolation_axis.iloc[-1] + x1 = x_axis.iloc[-2] + x2 = x_axis.iloc[-1] + x_thresh_extrap = (threshold - 0.1 - y1) * (x2 - x1) / ( + y2 - y1) + x1 + x_linspaces.append( + np.linspace(x_axis.min(), x_thresh_extrap, num=1000) + ) + else: + fill_value = np.nan + bounds_error = True + x_linspaces = [] + for x_axis in x_axes: + x_linspaces.append( + np.linspace(x_axis.min(), x_axis.max(), num=1000)) + + f_axis = [] + for x_axis in x_axes: + f_axis.append( + interp1d( + x_axis, + y_interpolation_axis, + kind='linear', + bounds_error=bounds_error, + fill_value=fill_value + ) + ) + + x_to_threshold = [] + for indx, x_linspace in enumerate(x_linspaces): + crossing_array = abs(f_axis[indx](x_linspace) - threshold) + x_to_threshold.append(x_linspace[np.argmin(crossing_array)]) + + if ~(x_to_threshold[0] > 0) or ~(x_to_threshold[1] > 0): + raise BEEPFeaturizationError( + "DiagnosticProperties data does not have a positive value " + "to threshold" + ) + + if "normalized_regular_throughput" in interpolation_axes: + real_throughput_to_threshold = x_to_threshold[ + interpolation_axes.index( + "normalized_regular_throughput")] * \ + df[ + 
'initial_regular_throughput'].values[ + 0] + x_to_threshold.append(real_throughput_to_threshold) + interpolation_axes = interpolation_axes + [ + "real_regular_throughput"] + + threshold_dict = { + 'initial_regular_throughput': + df['initial_regular_throughput'].values[0], + } + + for indx, x_axis in enumerate(interpolation_axes): + threshold_dict[ + cycle_type + metric + str(threshold) + '_' + x_axis] = [ + x_to_threshold[indx]] + + return pd.DataFrame(threshold_dict) \ No newline at end of file diff --git a/beep/features/base.py b/beep/features/base.py deleted file mode 100644 index 570d7c12..00000000 --- a/beep/features/base.py +++ /dev/null @@ -1,662 +0,0 @@ -# Copyright [2020] [Toyota Research Institute] -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -For creating features and organizing them into datasets. - -""" -import os -import copy -import abc -import json -import hashlib -from typing import Union, Tuple, List - -import pandas as pd -from monty.io import zopen -from monty.json import MSONable, MontyDecoder -from monty.serialization import loadfn, dumpfn - -from beep.structure.base import BEEPDatapath - - -class BEEPFeaturizationError(BaseException): - """Raise when a featurization-specific error occurs""" - pass - - -class BEEPFeatureMatrixError(BaseException): - """ Raise when there is a BEEP-specific problem with a dataset""" - pass - - -class BEEPFeaturizer(MSONable, abc.ABC): - """ - Base class for all beep feature generation. - - From a structured battery file representing many cycles of one cell, - (AKA a structured datapath), produce a feature vector. 
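The threshold-crossing search in get_threshold_targets above can be illustrated in isolation: interpolate the fractional metric over a fine grid and take the grid point closest to the threshold. A small self-contained sketch with synthetic numbers:

import numpy as np
from scipy.interpolate import interp1d

# Synthetic fractional metric (e.g. fractional discharge energy) vs cycle index
cycle_index = np.array([50.0, 150.0, 250.0, 350.0, 450.0])
fractional_metric = np.array([0.98, 0.93, 0.88, 0.83, 0.79])
threshold = 0.8

f = interp1d(cycle_index, fractional_metric, kind="linear",
             bounds_error=True, fill_value=np.nan)
x_linspace = np.linspace(cycle_index.min(), cycle_index.max(), num=1000)

# Same idea as get_threshold_targets: argmin of |interpolated metric - threshold|
crossing_array = np.abs(f(x_linspace) - threshold)
x_to_threshold = x_linspace[np.argmin(crossing_array)]
print(x_to_threshold)  # ~425 cycles to reach 80% of the initial metric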
- - Works for generating both - - Vectors X to use as training vectors - - Vectors or scalars y to use as ML targets - (as problems may have multiple metrics to predict) - - """ - - DEFAULT_HYPERPARAMETERS = {} - - def __init__(self, structured_datapath: Union[BEEPDatapath, None], - hyperparameters: Union[dict, None] = None): - # If all required hyperparameters are specified, use those - # If some subset of required hyperparameters are specified, throw error - # If no hyperparameters are specified, use defaults - if hyperparameters: - if all(k in hyperparameters for k in self.DEFAULT_HYPERPARAMETERS): - self.hyperparameters = hyperparameters - else: - raise BEEPFeaturizationError( - f"Features cannot be created with incomplete set of " - f"hyperparameters {hyperparameters.keys()} < " - f"{self.DEFAULT_HYPERPARAMETERS.keys()}!") - else: - self.hyperparameters = self.DEFAULT_HYPERPARAMETERS - - if structured_datapath is not None and not structured_datapath.is_structured: - raise BEEPFeaturizationError( - "BEEPDatapath input is not structured!") - self.datapath = structured_datapath - - self.features = None - - # In case these features are loaded from file - # Allow attrs which can hold relevant metadata without having - # to reload the original datapath - self.paths = self.datapath.paths if self.datapath else {} - self.metadata = self.datapath.metadata.raw if self.datapath else {} - self.linked_semiunique_id = self.datapath.semiunique_id if self.datapath else None - - @abc.abstractmethod - def validate(self) -> Tuple[bool, Union[str, None]]: - """ - Validate a featurizer on it's ingested datapath. - - Returns: - (bool, str/None): The validation result and it's message. - - """ - raise NotImplementedError - - @abc.abstractmethod - def create_features(self) -> None: - """ - Should assign a dataframe to self.features. - - Returns: - None - """ - raise NotImplementedError - - def as_dict(self): - """Serialize a BEEPDatapath as a dictionary. - - Must not be loaded from legacy. - - Returns: - (dict): corresponding to dictionary for serialization. - - """ - - if self.features is None: - raise BEEPFeaturizationError( - "Cannot serialize features which have not been generated.") - - features = self.features.to_dict("list") - - return { - "@module": self.__class__.__module__, - "@class": self.__class__.__name__, - - # Core parts of BEEPFeaturizer - "features": features, - "hyperparameters": self.hyperparameters, - "paths": self.paths, - "metadata": self.metadata, - "linked_datapath_semiunique_id": self.linked_semiunique_id - } - - @classmethod - def from_dict(cls, d): - """Create a BEEPDatapath object from a dictionary. - - Args: - d (dict): dictionary represenation. - - Returns: - beep.structure.ProcessedCyclerRun: deserialized ProcessedCyclerRun. - """ - - # no need for original datapath - bf = cls(structured_datapath=None, hyperparameters=d["hyperparameters"]) - bf.features = pd.DataFrame(d["features"]) - bf.paths = d["paths"] - bf.metadata = d["metadata"] - bf.linked_semiunique_id = d["linked_datapath_semiunique_id"] - return bf - - @classmethod - def from_json_file(cls, filename): - """Load a structured run previously saved to file. - - .json.gz files are supported. - - Loads a BEEPFeaturizer from json. - - Can be used in combination with files serialized with BEEPFeatures.to_json_file. - - Args: - filename (str, Pathlike): a json file from a structured run, serialzed with to_json_file. 
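For orientation, the featurizer contract above boils down to two methods: validate() returning (bool, message) and create_features() assigning a DataFrame to self.features. A hypothetical minimal subclass sketch, assuming the same contract is exposed by the new beep.features.featurizer module that the added files import from (the class and its single feature are made up):

import pandas as pd

from beep.features.featurizer import BEEPFeaturizer  # assumed post-patch location of the base class


class MedianDischargeCapacity(BEEPFeaturizer):
    """Toy featurizer: a single feature, the median summary discharge capacity."""

    DEFAULT_HYPERPARAMETERS = {}

    def validate(self):
        if "discharge_capacity" in self.datapath.structured_summary.columns:
            return True, None
        return False, "structured_summary has no discharge_capacity column"

    def create_features(self):
        self.features = pd.DataFrame({
            "median_discharge_capacity": [
                self.datapath.structured_summary.discharge_capacity.median()
            ]
        })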
- - Returns: - None - """ - with zopen(filename, "r") as f: - d = json.load(f) - - # Add this structured file path to the paths dict - paths = d.get("paths", {}) - paths["features"] = os.path.abspath(filename) - d["paths"] = paths - return cls.from_dict(d) - - def to_json_file(self, filename): - """Save a BEEPFeatures to disk as a json. - - .json.gz files are supported. - - Not named from_json to avoid conflict with MSONable.from_json(*) - - Args: - filename (str, Pathlike): The filename to save the file to. - omit_raw (bool): If True, saves only structured (NOT RAW) data. - More efficient for saving/writing to disk. - - Returns: - None - """ - d = self.as_dict() - dumpfn(d, filename) - - -class BEEPAllCyclesFeaturizer(BEEPFeaturizer): - """Base class for featurizers that return a constant number of features - for any number of cycles in a structured datapath. - - These features are typically used for early prediction. - - A BEEPAllCyclesFeaturizer always returns the same number of features - for files for datapaths with any number of samples. Thus, - - - [Datapath w/ 2 cycles] ---> (vector of k features) - - [Datapath w/ 100 cycles] ---> (vector of k features) - """ - PER_CYCLE = False - - -class BEEPPerCycleFeaturizer(BEEPFeaturizer): - """Base class for featurizers that return a vector of features for - EACH cycle in a structured datapath. - - These features are generally used for analysis - - A BEEPPerCycleFeaturizer always returns an (n x k) matrix of features - for datapaths with n cycles each producing k features. Thus, - - [Datapath w/ 2 cycles] ---> (2 x k feature matrix) - - [Datapath w/ 100 cycles] ---> (100 x k feature matrix) - - """ - PER_CYCLE = True - - -class BEEPFeatureMatrix(MSONable): - """ - Create an array composed of BEEPFeaturizer objects. - - The array may either be: - - PER-CYCLER-RUN, using BEEPAllCyclesFeaturizer. - One feature vector per cycler file, resulting in an array w. dimenions: - (n battery cycler files) x (k features) - - OR: - - PER-CYCLE, using BEEPPerCycleFeaturizer. - One feature vector per cycle, resulting in an array w. dimensions: - (n total cycles) x (k features) - - Sets of featurizer objects must exclusively belong to EITHER of these - two paradigms (base classes), but may not be mixed. - - So a set of featurizrs may be -per-cycler-file OR per-cycle, but not - both. - - Args: - beepfeaturizers ([BEEPFeaturizer]): A list of BEEPFeaturizer objects, - either ALL BEEPAllCyclesFeaturizer child objects OR ALL - BEEPPerCycleFeaturizer child objects. 
- - """ - - OP_DELIMITER = "::" - - def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): - - if beepfeaturizers: - bfs_types_per_cycle = [bf.PER_CYCLE for bf in beepfeaturizers] - - # the array should be either all True or all False - if all(bfs_types_per_cycle): - self.per_cycle = True - elif not any(bfs_types_per_cycle): - self.per_cycle = False - else: - raise TypeError( - f"Featurizer types are mixed!\n" - f"BEEPFeatureMatrix can only use EITHER a set of ALL " - f"BEEPAllCyclesFeaturizers OR a set of ALL " - f"BEEPPerCycleFeaturizers.") - - - dfs_by_file = {bf.paths.get("structured", f"no file found_{i}"): [] - for i, bf in enumerate(beepfeaturizers)} - # big_df_rows = {bf.__class__.__name__: [] for bf in beepfeaturizers} - unique_features = {} - for i, bf in enumerate(beepfeaturizers): - - if bf.features is None: - raise BEEPFeatureMatrixError( - f"BEEPFeaturizer {bf} has not created features") - else: - bfcn = bf.__class__.__name__ - - fname = bf.paths.get("structured", None) - if not fname: - raise BEEPFeatureMatrixError( - "Cannot join features automatically as no linking can be done " - "based on original structured filename." - ) - - # Check for any possible feature collisions using identical featurizers - # on identical files - - # sort params for this featurizer obj by key - params = sorted(list(bf.hyperparameters.items()), - key=lambda x: x[0]) - - # Prevent identical features from identical input files - # create a unique operation string for the application of this featurizer - # on a specific file, this op string will be the same as long as - # the featurizer class name, hyperparameters, and class are the same - - param_str = "-".join([f"{k}:{v}" for k, v in params]) - param_hash = hashlib.sha256( - param_str.encode("utf-8")).hexdigest() - - # Get an id for this featurizer operation (including hyperparameters) - # regardless of the file it is applied on - feature_op_id = f"{bfcn}{self.OP_DELIMITER}{param_hash}" - - # Get an id for this featurizer operation (including hyperparameters) - # on THIS SPECIFIC file. - file_feature_op_id = f"{fname}{self.OP_DELIMITER}{bfcn}{self.OP_DELIMITER}{param_hash}" - - # Get a unique id for every feature generated by a specific - # featurizer on a specific file. - this_file_feature_columns_ids = \ - [ - f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c - in bf.features.columns - ] - - # Check to make sure there are no duplicates of the exact same feature for - # the exact same featurizer with the exact same hyperparameters on the exact - # same file. 
- collisions = {c: f for c, f in unique_features.items() if - c in this_file_feature_columns_ids} - if collisions: - raise BEEPFeatureMatrixError( - f"Multiple features generated with identical classes and identical hyperparameters" - f" attempted to be joined into same dataset; \n" - f"{bfcn} features collide with existing: \n{collisions}" - ) - for c in this_file_feature_columns_ids: - unique_features[c] = bfcn - - # Create consistent scheme for naming features regardless of file - df = copy.deepcopy(bf.features) - consistent_column_names = [ - f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in - df.columns] - df.columns = consistent_column_names - - df.index = [fname] * df.shape[0] - df.index.rename("filename", inplace=True) - dfs_by_file[fname].append(df) - - rows = [] - for filename, dfs in dfs_by_file.items(): - row = pd.concat(dfs, axis=1) - row = row[sorted(row.columns)] - rows.append(row) - self.matrix = pd.concat(rows, axis=0) - - else: - self.matrix = None - - self.featurizers = beepfeaturizers - - def as_dict(self): - """Serialize a BEEPFeatureMatrix as a dictionary. - - Must not be loaded from legacy. - - Returns: - (dict): corresponding to dictionary for serialization. - - """ - - return { - "@module": self.__class__.__module__, - "@class": self.__class__.__name__, - - # Core parts of BEEPFeaturizer - "featurizers": [f.as_dict() for f in self.featurizers], - "matrix": self.matrix.to_dict("list"), - } - - @classmethod - def from_dict(cls, d): - """Create a BEEPFeatureMatrix object from a dictionary. - - Args: - d (dict): dictionary represenation. - - Returns: - beep.structure.ProcessedCyclerRun: deserialized ProcessedCyclerRun. - """ - # no need for original datapaths, as their ref paths should - # be in the subobjects - featurizers = [MontyDecoder().process_decoded(f) for f in - d["featurizers"]] - return cls(featurizers) - - @classmethod - def from_json_file(cls, filename): - """Load a structured run previously saved to file. - - .json.gz files are supported. - - Loads a BEEPFeatureMatrix from json. - - Can be used in combination with files serialized with BEEPFeatures.to_json_file. - - Args: - filename (str, Pathlike): a json file from a structured run, serialzed with to_json_file. - - Returns: - None - """ - return loadfn(filename) - - def to_json_file(self, filename): - """Save a BEEPFeatureMatrix to disk as a json. - - .json.gz files are supported. - - Not named from_json to avoid conflict with MSONable.from_json(*) - - Args: - filename (str, Pathlike): The filename to save the file to. - omit_raw (bool): If True, saves only structured (NOT RAW) data. - More efficient for saving/writing to disk. - - Returns: - None - """ - d = self.as_dict() - dumpfn(d, filename) - - -class BEEPCycleFeatureMatrix(MSONable): - """ - Create an ((n battery cycler files) x (j cycles)) x (k features) array composed of - m BEEPFeaturizer objects. 
- - Args: - beepfeaturizers ([BEEPFeaturizer]): A list of BEEPFeaturizer objects - - """ - - OP_DELIMITER = "::" - - def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): - - if beepfeaturizers: - # initialize emtpy dict of file names - dfs_by_file = {os.path.basename( - bf.paths.get("structured", "no file found") - )[0:-19]: pd.DataFrame( - columns=['filename', 'cycle_index', 'diag_pos'] - ) for bf in beepfeaturizers} - # big_df_rows = {bf.__class__.__name__: [] for bf in beepfeaturizers} - unique_features = {} - for i, bf in enumerate(beepfeaturizers): - if bf.features is None: - raise BEEPFeatureMatrixError( - f"BEEPFeaturizer {bf} has not created features") - - # elif bf.features.shape[0] != 1: - # raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} features are not 1-dimensional.") - else: - bfcn = bf.__class__.__name__ - - # fname = bf.paths.get("structured", None) - fname = os.path.basename(bf.paths['structured'])[0:-19] - if not fname: - raise BEEPFeatureMatrixError( - "Cannot join features automatically as no linking can be done " - "based on original structured filename." - ) - - # Check for any possible feature collisions using identical featurizers - # on identical files - - # sort params for this featurizer obj by key - params = sorted(list(bf.hyperparameters.items()), - key=lambda x: x[0]) - - # Prevent identical features from identical input files - # create a unique operation string for the application of this featurizer - # on a specific file, this op string will be the same as long as - # the featurizer class name, hyperparameters, and class are the same - - param_str = "-".join([f"{k}:{v}" for k, v in params]) - param_hash = hashlib.sha256( - param_str.encode("utf-8")).hexdigest() - - # Get an id for this featurizer operation (including hyperparameters) - # regardless of the file it is applied on - feature_op_id = f"{bfcn}{self.OP_DELIMITER}{param_hash}" - - # Get an id for this featurizer operation (including hyperparameters) - # on THIS SPECIFIC file. - file_feature_op_id = f"{fname}{self.OP_DELIMITER}{bfcn}{self.OP_DELIMITER}{param_hash}" - - # Get a unique id for every feature generated by a specific - # featurizer on a specific file. - this_file_feature_columns_ids = \ - [ - f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c - in bf.features.columns - ] - - # Check to make sure there are no duplicates of the exact same feature for - # the exact same featurizer with the exact same hyperparameters on the exact - # same file. 
- collisions = {c: f for c, f in unique_features.items() if - c in this_file_feature_columns_ids} - if collisions: - raise BEEPFeatureMatrixError( - f"Multiple features generated with identical classes and identical hyperparameters" - f" attempted to be joined into same dataset; \n" - f"{bfcn} features collide with existing: \n{collisions}" - ) - for c in this_file_feature_columns_ids: - unique_features[c] = bfcn - - # Create consistent scheme for naming features regardless of file - df = copy.deepcopy(bf.features) - consistent_column_names = [ - f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in - df.columns] - df.columns = consistent_column_names - - # df.index = [fname] * df.shape[0] - # df.index.rename("filename", inplace=True) - - # create filename column to merge on - df['filename'] = os.path.basename(bf.paths['structured'])[ - 0:-19] - - # df = df.reset_index(drop=True) - - # remove hash from cycle_index and diag_pos column - cycle_index_col = [col for col in df.columns if - 'cycle_index' in col] - df.rename(columns={cycle_index_col[0]: 'cycle_index'}, - inplace=True) - - # remove hash from diag_pos column - diag_pos_col = [col for col in df.columns if - 'diag_pos' in col] - df.rename(columns={diag_pos_col[0]: 'diag_pos'}, - inplace=True) - - # ensure cycle_index and diag_pos are integers - df['cycle_index'] = df['cycle_index'].astype(int) - df['diag_pos'] = df['diag_pos'].astype(int) - - # append each BEEPFeaturizer df to the corresponding cell dict entry - # dfs_by_file[fname].append(df) - dfs_by_file[fname] = dfs_by_file[fname].merge( - df, how='outer', - on=['filename', 'cycle_index', 'diag_pos']).sort_values( - 'cycle_index').reset_index(drop=True) - # dfs_by_file[fname] = pd.concat( - # [dfs_by_file[fname],df], - # axis=1,join='outer',ignore_index=True, - # keys=['filename']) - # self.dfs_by_file = dfs_by_file - # self.df = df - # return None - - rows = [] - self.matrix = pd.DataFrame() - for filename, dfs in dfs_by_file.items(): - # row = pd.concat([row,dfs], axis=1) - # row = row[sorted(row.columns)] - # rows.append(row) - self.matrix = pd.concat([self.matrix, dfs], axis=0, - ignore_index=True, - join='outer') # , keys=['filename'] - - else: - self.matrix = None - - self.featurizers = beepfeaturizers - - def as_dict(self): - """Serialize a BEEPDatapath as a dictionary. - - Must not be loaded from legacy. - - Returns: - (dict): corresponding to dictionary for serialization. - - """ - - return { - "@module": self.__class__.__module__, - "@class": self.__class__.__name__, - - # Core parts of BEEPFeaturizer - "featurizers": [f.as_dict() for f in self.featurizers], - "matrix": self.matrix.to_dict("list"), - } - - @classmethod - def from_dict(cls, d): - """Create a BEEPDatapath object from a dictionary. - - Args: - d (dict): dictionary represenation. - - Returns: - beep.structure.ProcessedCyclerRun: deserialized ProcessedCyclerRun. - """ - # no need for original datapaths, as their ref paths should - # be in the subobjects - featurizers = [MontyDecoder().process_decoded(f) for f in - d["featurizers"]] - return cls(featurizers) - - @classmethod - def from_json_file(cls, filename): - """Load a structured run previously saved to file. - - .json.gz files are supported. - - Loads a BEEPFeatureMatrix from json. - - Can be used in combination with files serialized with BEEPFeatures.to_json_file. - - Args: - filename (str, Pathlike): a json file from a structured run, serialzed with to_json_file. 
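The per-cycle join above lines features from different featurizers up on (filename, cycle_index, diag_pos) rather than stacking one row per file. A minimal sketch of that outer merge on toy frames (feature column names and values are made up):

import pandas as pd

# Per-cycle features produced by two hypothetical featurizers on the same cell
f1 = pd.DataFrame({
    "filename": ["cell_A"] * 3,
    "cycle_index": [10, 110, 210],
    "diag_pos": [0, 1, 2],
    "var_discharge_capacity::abc123": [0.10, 0.20, 0.30],
})
f2 = pd.DataFrame({
    "filename": ["cell_A"] * 3,
    "cycle_index": [10, 110, 210],
    "diag_pos": [0, 1, 2],
    "resistance_rpt_0.2C::def456": [1.00, 1.10, 1.30],
})

merged = (
    f1.merge(f2, how="outer", on=["filename", "cycle_index", "diag_pos"])
      .sort_values("cycle_index")
      .reset_index(drop=True)
)
print(merged)  # one row per (filename, cycle_index, diag_pos), feature columns side by side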
- - Returns: - None - """ - return loadfn(filename) - - def to_json_file(self, filename): - """Save a BEEPFeatureMatrix to disk as a json. - - .json.gz files are supported. - - Not named from_json to avoid conflict with MSONable.from_json(*) - - Args: - filename (str, Pathlike): The filename to save the file to. - omit_raw (bool): If True, saves only structured (NOT RAW) data. - More efficient for saving/writing to disk. - - Returns: - None - """ - d = self.as_dict() - dumpfn(d, filename) diff --git a/beep/features/core.py b/beep/features/core.py index 6887f303..3c0cb953 100644 --- a/beep/features/core.py +++ b/beep/features/core.py @@ -11,1204 +11,7 @@ from beep.features.base import BEEPFeaturizer, BEEPFeaturizationError -class HPPCResistanceVoltageEarlyFeatures(BEEPFeaturizer): - DEFAULT_HYPERPARAMETERS = { - "test_time_filter_sec": 1000000, - "cycle_index_filter": 6, - "diag_pos": 1, - "soc_window": 8, - "parameters_path": PROTOCOL_PARAMETERS_DIR - } - def validate(self): - val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) - if val: - conditions = [] - conditions.append( - any( - [ - "hppc" in x - for x in - self.datapath.diagnostic_summary.cycle_type.unique() - ] - ) - ) - if all(conditions): - return True, None - else: - return False, "HPPC conditions not met for this cycler run" - else: - return val, msg - def create_features(self): - # Filter out low cycle numbers at the end of the test, corresponding to the "final" diagnostic - self.datapath.diagnostic_data = self.datapath.diagnostic_data[ - ~((self.datapath.diagnostic_data.test_time > self.hyperparameters[ - 'test_time_filter_sec']) & - (self.datapath.diagnostic_data.cycle_index < self.hyperparameters[ - 'cycle_index_filter'])) - ] - self.datapath.diagnostic_data = self.datapath.diagnostic_data.groupby( - ["cycle_index", "step_index", "step_index_counter"] - ).filter(lambda x: ~x["test_time"].isnull().all()) - # diffusion features - diffusion_features = featurizer_helpers.get_diffusion_early_features( - self.datapath, - ) - hppc_r = pd.DataFrame() - # the 9 by 6 dataframe - df_dr = featurizer_helpers.get_dr_df( - self.datapath, self.hyperparameters["diag_pos"] - ) - # transform this dataframe to be 1 by 54 - columns = df_dr.columns - for column in columns: - for r in range(len(df_dr[column])): - name = column + str(r) - hppc_r[name] = [df_dr[column][r]] - - # the variance of ocv features - hppc_ocv = featurizer_helpers.get_hppc_ocv( - self.datapath, - self.hyperparameters["diag_pos"], - parameters_path=self.hyperparameters["parameters_path"] - ) - - # the v_diff features - v_diff = featurizer_helpers.get_v_diff( - self.datapath, - self.hyperparameters["diag_pos"], - self.hyperparameters["soc_window"], - self.hyperparameters["parameters_path"] - ) - - # merge everything together as a final result dataframe - self.features = pd.concat( - [hppc_r, hppc_ocv, v_diff, diffusion_features], axis=1) - - -class HPPCResistanceVoltageCycleFeatures(BEEPFeaturizer): - DEFAULT_HYPERPARAMETERS = { - "test_time_filter_sec": 1000000, - "cycle_index_filter": 6, - "soc_window": 8, - "parameters_path": PROTOCOL_PARAMETERS_DIR - } - - def validate(self): - val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) - if val: - conditions = [] - conditions.append( - any( - [ - "hppc" in x - for x in - self.datapath.diagnostic_summary.cycle_type.unique() - ] - ) - ) - if all(conditions): - return True, None - else: - return False, "HPPC conditions not met for this cycler run" - else: - return val, msg - - def 
create_features(self): - # Filter out low cycle numbers at the end of the test, corresponding to the "final" diagnostic - self.datapath.diagnostic_data = self.datapath.diagnostic_data[ - ~((self.datapath.diagnostic_data.test_time > self.hyperparameters[ - 'test_time_filter_sec']) & - (self.datapath.diagnostic_data.cycle_index < self.hyperparameters[ - 'cycle_index_filter'])) - ] - self.datapath.diagnostic_data = self.datapath.diagnostic_data.groupby( - ["cycle_index", "step_index", "step_index_counter"] - ).filter(lambda x: ~x["test_time"].isnull().all()) - - # Only hppc_resistance_features are able to be calculated without error. - # Xiao Cui should be pulled in to understand the issue with the others features. - - # diffusion features - # diffusion_features = featurizer_helpers.get_diffusion_cycle_features( - # self.datapath, - # ) - - # hppc resistance features - hppc_resistance_features = featurizer_helpers.get_hppc_resistance_cycle_features( - self.datapath, - ) - - # the variance of ocv features - # hppc_ocv_features = featurizer_helpers.get_hppc_ocv_cycle_features( - # self.datapath, - # ) - - # the v_diff features - # v_diff = featurizer_helpers.get_v_diff_cycle_features( - # self.datapath, - # self.hyperparameters["soc_window"], - # self.hyperparameters["parameters_path"] - # ) - - # merge everything together as a final result dataframe - self.features = pd.concat( - [hppc_resistance_features, - # hppc_ocv_features, - # v_diff, #diffusion_features - ], axis=1) - - -class CycleSummaryStatsEarlyFeatures(BEEPFeaturizer): - DEFAULT_HYPERPARAMETERS = { - "cycle_comp_num": [10, 100], - "statistics": ["var", "min", "mean", "skew", "kurtosis", "abs", - "square"] - } - - def validate(self): - """ - This function determines if the input data has the necessary attributes for - creation of this feature class. It should test for all of the possible reasons - that feature generation would fail for this particular input data. - - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - Returns: - bool: True/False indication of ability to proceed with feature generation - """ - - # TODO: not sure this is necessary - # Check for data in each of the selected cycles - index_1, index_2 = self.hyperparameters['cycle_comp_num'] - cycle_1 = self.datapath.structured_data[ - self.datapath.structured_data.cycle_index == index_1] - cycle_2 = self.datapath.structured_data[ - self.datapath.structured_data.cycle_index == index_2] - if len(cycle_1) == 0 or len(cycle_2) == 0: - return False, "Length of one or more comparison cycles is zero" - - # TODO: check whether this is good - # Check for relevant data - required_columns = [ - 'charge_capacity', - 'discharge_capacity', - 'charge_energy', - 'discharge_energy', - ] - pcycler_run_columns = self.datapath.structured_data.columns - if not all( - [column in pcycler_run_columns for column in required_columns]): - return False, f"Required column not present in all structured data " \ - f"(must have all of: {required_columns})" - - return True, None - - def create_features(self): - """ - Generate features listed in early prediction manuscript using both diagnostic and regular cycles - - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun) - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. 
These could be filters for column or row operations - parameters_path (str): Root directory storing project parameter files. - - Returns: - X (pd.DataFrame): Dataframe containing the feature - """ - - # TODO: extend this dataframe and uncomment energy features when - # structuring is refactored - X = pd.DataFrame(np.zeros((1, 28))) - - reg_cycle_comp_num = self.hyperparameters.get("cycle_comp_num") - cycle_comp_1 = self.datapath.structured_data[ - self.datapath.structured_data.cycle_index == reg_cycle_comp_num[1] - ] - cycle_comp_0 = self.datapath.structured_data[ - self.datapath.structured_data.cycle_index == reg_cycle_comp_num[0] - ] - Qc100_1 = cycle_comp_1[ - cycle_comp_1.step_type == "charge"].charge_capacity - Qc10_1 = cycle_comp_0[ - cycle_comp_0.step_type == "charge"].charge_capacity - QcDiff = Qc100_1.values - Qc10_1.values - QcDiff = QcDiff[~np.isnan(QcDiff)] - - X.loc[0, 0:6] = self.get_summary_statistics(QcDiff) - - Qd100_1 = cycle_comp_1[ - cycle_comp_1.step_type == "discharge"].discharge_capacity - Qd10_1 = cycle_comp_0[ - cycle_comp_0.step_type == "discharge"].discharge_capacity - QdDiff = Qd100_1.values - Qd10_1.values - QdDiff = QdDiff[~np.isnan(QdDiff)] - - X.loc[0, 7:13] = self.get_summary_statistics(QdDiff) - - # # Charging Energy features - Ec100_1 = cycle_comp_1[cycle_comp_1.step_type == "charge"].charge_energy - Ec10_1 = cycle_comp_0[cycle_comp_0.step_type == "charge"].charge_energy - EcDiff = Ec100_1.values - Ec10_1.values - EcDiff = EcDiff[~np.isnan(EcDiff)] - - X.loc[0, 14:20] = self.get_summary_statistics(EcDiff) - - # # Discharging Energy features - Ed100_1 = cycle_comp_1[ - cycle_comp_1.step_type == "charge"].discharge_energy - Ed10_1 = cycle_comp_0[ - cycle_comp_0.step_type == "charge"].discharge_energy - EdDiff = Ed100_1.values - Ed10_1.values - EdDiff = EdDiff[~np.isnan(EdDiff)] - - X.loc[0, 21:27] = self.get_summary_statistics(EdDiff) - - quantities = [ - "charging_capacity", - "discharging_capacity", - "charging_energy", - "discharging_energy", - ] - - X.columns = [y + "_" + x for x in quantities for y in - self.hyperparameters["statistics"]] - - self.features = X - - def get_summary_statistics(self, array): - """ - Static method for getting values corresponding - to standard 7 operations that many beep features - use, i.e. log of absolute value of each of - variance, min, mean, skew, kurtosis, the sum of - the absolute values and the sum of squares - - Args: - array (list, np.ndarray): array of values to get - standard operation values for, e.g. cycle - discharging capacity, QcDiff, etc. 
- - Returns: - [float]: list of features - - """ - - stats_names = self.hyperparameters["statistics"] - supported_stats = self.DEFAULT_HYPERPARAMETERS["statistics"] - - if any(s not in supported_stats for s in stats_names): - raise ValueError( - f"Unsupported statistics in {stats_names}: supported statistics are {supported_stats}") - - stats = [] - - if "var" in stats_names: - stats.append(np.log10(np.absolute(np.var(array)))) - if "min" in stats_names: - stats.append(np.log10(np.absolute(min(array)))) - if "mean" in stats_names: - stats.append(np.log10(np.absolute(np.mean(array)))) - if "skew" in stats_names: - stats.append(np.log10(np.absolute(skew(array)))) - if "kurtosis" in stats_names: - stats.append(np.log10( - np.absolute(kurtosis(array, fisher=False, bias=False)))) - if "abs" in stats_names: - stats.append(np.log10(np.sum(np.absolute(array)))) - if "square" in stats_names: - stats.append(np.log10(np.sum(np.square(array)))) - - return np.asarray(stats) - - -class DiagnosticSummaryStatsEarlyFeatures(CycleSummaryStats): - """ - Object corresponding to summary statistics from a diagnostic cycle of - specific type. Includes constructors to create the features, object names - and metadata attributes in the object. Inherits from RegularCycleSummaryStats - to reuse standard feature generation - - name (str): predictor object name. - X (pandas.DataFrame): features in DataFrame format. - metadata (dict): information about the conditions, data - and code used to produce features - """ - DEFAULT_HYPERPARAMETERS = { - "test_time_filter_sec": 1000000, - "cycle_index_filter": 6, - "diagnostic_cycle_type": 'rpt_0.2C', - "diag_pos_list": [0, 1], - "statistics": ["var", "min", "mean", "skew", "kurtosis", "abs", - "square"], - "parameters_path": PROTOCOL_PARAMETERS_DIR - } - - def validate(self): - """ - This function determines if the input data has the necessary attributes for - creation of this feature class. It should test for all of the possible reasons - that feature generation would fail for this particular input data. - - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. 
These could be filters for column or row operations - - Returns: - bool: True/False indication of ability to proceed with feature generation - """ - val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) - if val: - df = self.datapath.diagnostic_summary - df = df[ - df.cycle_type == self.hyperparameters["diagnostic_cycle_type"]] - if df.cycle_index.nunique() >= max( - self.hyperparameters["diag_pos_list"]) + 1: - return True, None - else: - return False, "Diagnostic cycles insufficient for featurization" - else: - return val, msg - - def get_summary_diff( - self, - pos=None, - cycle_types=("rpt_0.2C", "rpt_1C", "rpt_2C"), - metrics=( - "discharge_capacity", "discharge_energy", "charge_capacity", - "charge_energy") - ): - """ - Helper function to calculate difference between summary values in the diagnostic cycles - - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun) - pos (list): position of the diagnostics to use in the calculation - cycle_types (list): calculate difference for these diagnostic types - metrics (str): Calculate difference for these metrics - - Returns: - values (list): List of difference values to insert into the dataframe - names (list): List of column headers to use in the creation of the dataframe - """ - pos = self.hyperparameters["diag_pos_list"] if not pos else pos - - values = [] - names = [] - for cycle_type in cycle_types: - diag_type_summary = self.datapath.diagnostic_summary[ - self.datapath.diagnostic_summary.cycle_type == cycle_type] - for metric in metrics: - diff = (diag_type_summary.iloc[pos[1]][metric] - - diag_type_summary.iloc[pos[0]][metric]) \ - / diag_type_summary.iloc[pos[0]][metric] - values.append(diff) - names.append("diag_sum_diff_" + str(pos[0]) + "_" + str( - pos[1]) + "_" + cycle_type + metric) - return values, names - - def create_features(self): - """ - Generate features listed in early prediction manuscript using both diagnostic and regular cycles - - Args: - self.datapathn (beep.structure.ProcessedCyclerRun) - self.hyperparameters (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - parameters_path (str): Root directory storing project parameter files. 
- - Returns: - X (pd.DataFrame): Dataframe containing the feature - """ - # Filter out "final" diagnostic cycles that have been appended to the end of the file with the wrong - # cycle number(test time is monotonic) - self.datapath.diagnostic_data = self.datapath.diagnostic_data[ - ~((self.datapath.diagnostic_data.test_time > self.hyperparameters[ - 'test_time_filter_sec']) & - (self.datapath.diagnostic_data.cycle_index < self.hyperparameters[ - 'cycle_index_filter'])) - ] - self.datapath.diagnostic_data = self.datapath.diagnostic_data.groupby( - ["cycle_index", "step_index", "step_index_counter"] - ).filter(lambda x: ~x["test_time"].isnull().all()) - - diag_intrp = self.datapath.diagnostic_data - - X = pd.DataFrame(np.zeros((1, 54))) - - # Calculate the cycles and the steps for the selected diagnostics - cycles = diag_intrp.cycle_index[diag_intrp.cycle_type == - self.hyperparameters[ - "diagnostic_cycle_type"]].unique() - step_dict_0 = featurizer_helpers.get_step_index( - self.datapath, - cycle_type=self.hyperparameters["diagnostic_cycle_type"], - diag_pos=self.hyperparameters["diag_pos_list"][0], - parameters_path=self.hyperparameters["parameters_path"] - ) - step_dict_1 = featurizer_helpers.get_step_index( - self.datapath, - cycle_type=self.hyperparameters["diagnostic_cycle_type"], - diag_pos=self.hyperparameters["diag_pos_list"][1], - parameters_path=self.hyperparameters["parameters_path"] - ) - - # Create masks for each position in the data - mask_pos_0_charge = ((diag_intrp.cycle_index == cycles[ - self.hyperparameters["diag_pos_list"][0]]) & - (diag_intrp.step_index == step_dict_0[ - self.hyperparameters[ - "diagnostic_cycle_type"] + '_charge'])) - mask_pos_1_charge = ((diag_intrp.cycle_index == cycles[ - self.hyperparameters["diag_pos_list"][1]]) & - (diag_intrp.step_index == step_dict_1[ - self.hyperparameters[ - "diagnostic_cycle_type"] + '_charge'])) - mask_pos_0_discharge = ((diag_intrp.cycle_index == cycles[ - self.hyperparameters["diag_pos_list"][0]]) & - (diag_intrp.step_index == - step_dict_0[self.hyperparameters[ - "diagnostic_cycle_type"] + '_discharge'])) - mask_pos_1_discharge = ((diag_intrp.cycle_index == cycles[ - self.hyperparameters["diag_pos_list"][1]]) & - (diag_intrp.step_index == - step_dict_1[self.hyperparameters[ - "diagnostic_cycle_type"] + '_discharge'])) - - # Charging Capacity features - Qc_1 = diag_intrp.charge_capacity[mask_pos_1_charge] - Qc_0 = diag_intrp.charge_capacity[mask_pos_0_charge] - QcDiff = Qc_1.values - Qc_0.values - QcDiff = QcDiff[~np.isnan(QcDiff)] - - X.loc[0, 0:6] = self.get_summary_statistics(QcDiff) - - # Discharging Capacity features - Qd_1 = diag_intrp.discharge_capacity[mask_pos_1_discharge] - Qd_0 = diag_intrp.discharge_capacity[mask_pos_0_discharge] - QdDiff = Qd_1.values - Qd_0.values - QdDiff = QdDiff[~np.isnan(QdDiff)] - - X.loc[0, 7:13] = self.get_summary_statistics(QdDiff) - - # Charging Energy features - Ec_1 = diag_intrp.charge_energy[mask_pos_1_charge] - Ec_0 = diag_intrp.charge_energy[mask_pos_0_charge] - EcDiff = Ec_1.values - Ec_0.values - EcDiff = EcDiff[~np.isnan(EcDiff)] - - X.loc[0, 14:20] = self.get_summary_statistics(EcDiff) - - # Discharging Energy features - Ed_1 = diag_intrp.discharge_energy[mask_pos_1_discharge] - Ed_0 = diag_intrp.discharge_energy[mask_pos_0_discharge] - EdDiff = Ed_1.values - Ed_0.values - EdDiff = EdDiff[~np.isnan(EdDiff)] - - X.loc[0, 21:27] = self.get_summary_statistics(EdDiff) - - # Charging dQdV features - dQdVc_1 = diag_intrp.charge_dQdV[mask_pos_1_charge] - dQdVc_0 = 
diag_intrp.charge_dQdV[mask_pos_0_charge] - dQdVcDiff = dQdVc_1.values - dQdVc_0.values - dQdVcDiff = dQdVcDiff[~np.isnan(dQdVcDiff)] - - X.loc[0, 28:34] = self.get_summary_statistics(dQdVcDiff) - - # Discharging Capacity features - dQdVd_1 = diag_intrp.discharge_dQdV[mask_pos_1_discharge] - dQdVd_0 = diag_intrp.discharge_dQdV[mask_pos_0_discharge] - dQdVdDiff = dQdVd_1.values - dQdVd_0.values - dQdVdDiff = dQdVdDiff[~np.isnan(dQdVdDiff)] - - X.loc[0, 35:41] = self.get_summary_statistics(dQdVdDiff) - - X.loc[0, 42:53], names = self.get_summary_diff( - self.hyperparameters["diag_pos_list"] - ) - - quantities = [ - "charging_capacity", - "discharging_capacity", - "charging_energy", - "discharging_energy", - "charging_dQdV", - "discharging_dQdV", - ] - - X.columns = [y + "_" + x for x in quantities for y in - self.hyperparameters["statistics"]] + names - self.features = X - - -class DeltaQFastChargeEarlyFeatures(BEEPFeaturizer): - """ - Object corresponding to feature object. Includes constructors - to create the features, object names and metadata attributes in the - object - name (str): predictor object name. - X (pandas.DataFrame): features in DataFrame format. - metadata (dict): information about the conditions, data - and code used to produce features - """ - DEFAULT_HYPERPARAMETERS = { - "init_pred_cycle": 10, - "mid_pred_cycle": 91, - "final_pred_cycle": 100, - "n_nominal_cycles": 40 - } - - def validate(self): - """ - This function determines if the input data has the necessary attributes for - creation of this feature class. It should test for all of the possible reasons - that feature generation would fail for this particular input data. - - Args: - self.datapath (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - Returns: - bool: True/False indication of ability to proceed with feature generation - """ - - if not self.datapath.structured_summary.index.max() > \ - self.hyperparameters["final_pred_cycle"]: - return False, "Structured summary index max is less than final pred cycle" - elif not self.datapath.structured_summary.index.min() <= \ - self.hyperparameters["init_pred_cycle"]: - return False, "Structured summary index min is more than initial pred cycle" - elif "cycle_index" not in self.datapath.structured_summary.columns: - return False, "Structured summary missing critical data: 'cycle_index'" - elif "cycle_index" not in self.datapath.structured_data.columns: - return False, "Structured data missing critical data: 'cycle_index'" - elif not self.hyperparameters["mid_pred_cycle"] > 10: - return False, "Middle pred. cycle less than threshold value of 10" - elif not self.hyperparameters["final_pred_cycle"] > \ - self.hyperparameters["mid_pred_cycle"]: - return False, "Final pred cycle less than middle pred cycle" - else: - return True, None - - def create_features(self): - """ - Generate features listed in early prediction manuscript, primarily related to the - so called delta Q feature - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - self.hyperparameters (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - parameters_path (str): Root directory storing project parameter files. 
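The slope/intercept features in the create_features body that follows come from a first-order polynomial fit of discharge capacity against cycle number. A self-contained sketch of that fit on synthetic fade data:

import numpy as np
import pandas as pd

# Synthetic discharge capacity fading roughly linearly over 100 cycles
rng = np.random.RandomState(0)
discharge_capacity = pd.Series(1.10 - 0.0004 * np.arange(1, 101) + 0.001 * rng.randn(100))

i_final = 99  # python index of cycle 100
slope, intercept = np.polyfit(
    list(range(1, i_final + 1)),
    discharge_capacity.iloc[list(range(1, i_final + 1))],
    1,
)
print(slope, intercept)  # fade rate per cycle and extrapolated capacity at cycle 0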
- - Returns: - pd.DataFrame: features indicative of degradation, derived from the input data - """ - i_final = self.hyperparameters[ - "final_pred_cycle"] - 1 # python indexing - i_mid = self.hyperparameters["mid_pred_cycle"] - 1 - - summary = self.datapath.structured_summary - self.hyperparameters[ - "n_nominal_cycles" - ] = 40 # For nominal capacity, use median discharge capacity of first n cycles - - if "step_type" in self.datapath.structured_data.columns: - interpolated_df = self.datapath.structured_data[ - self.datapath.structured_data.step_type == "discharge" - ] - else: - interpolated_df = self.datapath.structured_data - X = pd.DataFrame(np.zeros((1, 20))) - labels = [] - # Discharge capacity, cycle 2 = Q(n=2) - X[0] = summary.discharge_capacity.iloc[1] - labels.append("discharge_capacity_cycle_2") - - # Max discharge capacity - discharge capacity, cycle 2 = max_n(Q(n)) - Q(n=2) - X[1] = max( - summary.discharge_capacity.iloc[np.arange(i_final + 1)] - - summary.discharge_capacity.iloc[1] - ) - labels.append("max_discharge_capacity_difference") - - # Discharge capacity, cycle 100 = Q(n=100) - X[2] = summary.discharge_capacity.iloc[i_final] - labels.append("discharge_capacity_cycle_100") - - # Feature representing time-temperature integral over cycles 2 to 100 - X[3] = np.nansum( - summary.time_temperature_integrated.iloc[np.arange(i_final + 1)]) - labels.append("integrated_time_temperature_cycles_1:100") - - # Mean of charge times of first 5 cycles - X[4] = np.nanmean(summary.charge_duration.iloc[1:6]) - labels.append("charge_time_cycles_1:5") - - # Descriptors based on capacity loss between cycles 10 and 100. - Qd_final = interpolated_df.discharge_capacity[ - interpolated_df.cycle_index == i_final - ] - Qd_10 = interpolated_df.discharge_capacity[ - interpolated_df.cycle_index == 9] - - Qd_diff = Qd_final.values - Qd_10.values - - # If DeltaQ(V) is not an empty array, compute summary stats, else initialize with np.nan - # Cells discharged rapidly over a narrow voltage window run into have no interpolated discharge steps - if len(Qd_diff): - X[5] = np.log10(np.abs(np.nanmin(Qd_diff))) # Minimum - X[6] = np.log10(np.abs(np.nanmean(Qd_diff))) # Mean - X[7] = np.log10(np.abs(np.nanvar(Qd_diff))) # Variance - X[8] = np.log10(np.abs(skew(Qd_diff))) # Skewness - X[9] = np.log10(np.abs(kurtosis(Qd_diff))) # Kurtosis - X[10] = np.log10(np.abs(Qd_diff[0])) # First difference - else: - X[5:11] = np.nan - - labels.append("abs_min_discharge_capacity_difference_cycles_2:100") - labels.append("abs_mean_discharge_capacity_difference_cycles_2:100") - labels.append("abs_variance_discharge_capacity_difference_cycles_2:100") - labels.append("abs_skew_discharge_capacity_difference_cycles_2:100") - labels.append("abs_kurtosis_discharge_capacity_difference_cycles_2:100") - labels.append("abs_first_discharge_capacity_difference_cycles_2:100") - - X[11] = np.max(summary.temperature_maximum.iloc[ - list(range(1, i_final + 1))]) # Max T - labels.append("max_temperature_cycles_1:100") - - X[12] = np.min(summary.temperature_minimum.iloc[ - list(range(1, i_final + 1))]) # Min T - labels.append("min_temperature_cycles_1:100") - - # Slope and intercept of linear fit to discharge capacity as a fn of cycle #, cycles 2 to 100 - - X[13], X[14] = np.polyfit( - list(range(1, i_final + 1)), - summary.discharge_capacity.iloc[list(range(1, i_final + 1))], - 1, - ) - - labels.append("slope_discharge_capacity_cycle_number_2:100") - labels.append("intercept_discharge_capacity_cycle_number_2:100") - - # Slope and intercept of 
linear fit to discharge capacity as a fn of cycle #, cycles 91 to 100 - X[15], X[16] = np.polyfit( - list(range(i_mid, i_final + 1)), - summary.discharge_capacity.iloc[list(range(i_mid, i_final + 1))], - 1, - ) - labels.append("slope_discharge_capacity_cycle_number_91:100") - labels.append("intercept_discharge_capacity_cycle_number_91:100") - - IR_trend = summary.dc_internal_resistance.iloc[ - list(range(1, i_final + 1))] - if any(v == 0 for v in IR_trend): - IR_trend[IR_trend == 0] = np.nan - - # Internal resistance minimum - X[17] = np.nanmin(IR_trend) - labels.append("min_internal_resistance_cycles_2:100") - - # Internal resistance at cycle 2 - X[18] = summary.dc_internal_resistance.iloc[1] - labels.append("internal_resistance_cycle_2") - - # Internal resistance at cycle 100 - cycle 2 - X[19] = ( - summary.dc_internal_resistance.iloc[i_final] - - summary.dc_internal_resistance.iloc[1] - ) - labels.append("internal_resistance_difference_cycles_2:100") - - # Nominal capacity - end = self.hyperparameters["n_nominal_cycles"] - X[20] = np.median(summary.discharge_capacity.iloc[0: end]) - labels.append("nominal_capacity_by_median") - - X.columns = labels - self.features = X - - -class TrajectoryFastCharge(DeltaQFastCharge): - """ - Object corresponding to cycle numbers at which the capacity drops below - specific percentages of the initial capacity. Computed on the discharge - portion of the regular fast charge cycles. - - """ - - DEFAULT_HYPERPARAMETERS = { - "thresh_max_cap": 0.98, - "thresh_min_cap": 0.78, - "interval_cap": 0.03 - } - - def validate(self): - """ - This function determines if the input data has the necessary attributes for - creation of this feature class. It should test for all of the possible reasons - that feature generation would fail for this particular input data. - - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - Returns: - bool: True/False indication of ability to proceed with feature generation - """ - cap = self.datapath.structured_summary.discharge_capacity - cap_ratio = cap.min() / cap.max() - max_cap = self.hyperparameters["thresh_max_cap"] - if not cap_ratio < max_cap: - return False, f"thresh_max_cap hyperparameter exceeded: {cap_ratio} !< {max_cap}" - else: - return True, None - - def create_features(self): - """ - Calculate the outcomes from the input data. In particular, the number of cycles - where we expect to reach certain thresholds of capacity loss - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - parameters_path (str): Root directory storing project parameter files. - - Returns: - pd.DataFrame: cycles at which capacity/energy degradation exceeds thresholds - """ - y = self.datapath.capacities_to_cycles( - self.hyperparameters["thresh_max_cap"], - self.hyperparameters["thresh_min_cap"], - self.hyperparameters["interval_cap"], - ) - self.features = y - - -class DiagnosticProperties(BEEPFeaturizer): - """ - This class stores fractional levels of degradation in discharge capacity and discharge energy - relative to the first cycle at each diagnostic cycle, grouped by diagnostic cycle type. - - name (str): predictor object name. 
- X (pandas.DataFrame): features in DataFrame format. - metadata (dict): information about the conditions, data - and code used to produce features - - Hyperparameters: - parameters_dir (str): Full path to directory of parameters to analyse the - diagnostic cycles - quantities ([str]): Quantities to extract/get fractional metrics for - diagnostic cycles - cycle_type (str): Type of diagnostic cycle being used to measure the - fractional metric - metric (str): The metric being used for fractional capacity - interpolation_axes (list): List of column names to use for - x_axis interpolation (distance to threshold) - threshold (float): Value for the fractional metric to be considered above - or below threshold - filter_kinks (float): If set, cutoff value for the second derivative of - the fractional metric (cells with an abrupt change in degradation - rate might have something else going on). Typical value might be 0.04 - extrapolate_threshold (bool): Should threshold crossing point be - extrapolated for cells that have not yet reached the threshold - (warning: this uses a linear extrapolation from the last two - diagnostic cycles) - """ - DEFAULT_HYPERPARAMETERS = { - "parameters_dir": PROTOCOL_PARAMETERS_DIR, - "quantities": ['discharge_energy', 'discharge_capacity'], - "threshold": 0.8, - "metric": "discharge_energy", - "filter_kinks": None, - "interpolation_axes": ["normalized_regular_throughput", "cycle_index"], - "cycle_type": "rpt_1C", - "extrapolate_threshold": True - } - - def validate(self): - """ - This function determines if the input data has the necessary attributes for - creation of this feature class. It should test for all of the possible reasons - that feature generation would fail for this particular input data. - - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - Returns: - bool: True/False indication of ability to proceed with feature generation - """ - return featurizer_helpers.check_diagnostic_validation(self.datapath) - - def create_features(self): - """ - Generates diagnostic-property features from processed cycler run, including values for n*x method - Args: - self.datapath (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - parameters_path (str): Root directory storing project parameter files. 
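The threshold logic implemented in get_threshold_targets below amounts to interpolating the fractional metric against each interpolation axis on a dense grid and reading off where it crosses the threshold, with an optional linear extrapolation for cells that never reach it. A self-contained sketch of the interpolation step, using invented fade data:

import numpy as np
from scipy.interpolate import interp1d

# Invented fade data: fractional discharge energy vs. cycle index
cycle_index = np.array([0, 100, 200, 300, 400])
fractional_metric = np.array([1.00, 0.95, 0.90, 0.84, 0.79])
threshold = 0.8

f = interp1d(cycle_index, fractional_metric, kind="linear", bounds_error=True)
grid = np.linspace(cycle_index.min(), cycle_index.max(), num=1000)
cycles_to_threshold = grid[np.argmin(np.abs(f(grid) - threshold))]
# roughly 380 cycles: where the metric first reaches 80% of its initial value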
- - Returns: - pd.DataFrame: with "cycle_index", "fractional_metric", "x", "n", "cycle_type" and "metric" columns, rows - for each diagnostic cycle of the cell - """ - - parameters_path = self.hyperparameters["parameters_dir"] - - cycle_types = self.datapath.diagnostic_summary.cycle_type.unique() - X = pd.DataFrame() - for quantity in self.hyperparameters["quantities"]: - for cycle_type in cycle_types: - summary_diag_cycle_type = featurizer_helpers.get_fractional_quantity_remaining_nx( - self.datapath, quantity, cycle_type, - parameters_path=parameters_path - ) - - summary_diag_cycle_type.loc[:, "cycle_type"] = cycle_type - summary_diag_cycle_type.loc[:, "metric"] = quantity - X = X.append(summary_diag_cycle_type) - - X_condensed = self.get_threshold_targets(X) - self.features = X_condensed - - def get_threshold_targets(self, df): - """ - Apply a threshold via interpolation for determining various - metrics (e.g., discharge energy) from diagnostic cycles. - - Args: - df (pd.DataFrame): A dataframe of diagnostic cycle data - for a single battery cycler run. - - Returns: - (pd.DataFrame): Contains a vector for interpolated/intercept - data for determining threshold. - - """ - cycle_type = self.hyperparameters["cycle_type"] - metric = self.hyperparameters["metric"] - interpolation_axes = self.hyperparameters["interpolation_axes"] - threshold = self.hyperparameters["threshold"] - filter_kinks = self.hyperparameters["filter_kinks"] - extrapolate_threshold = self.hyperparameters["extrapolate_threshold"] - - if filter_kinks: - if np.any(df['fractional_metric'].diff().diff() < filter_kinks): - last_good_cycle = df[ - df['fractional_metric'].diff().diff() < filter_kinks][ - 'cycle_index'].min() - df = df[df['cycle_index'] < last_good_cycle] - - x_axes = [] - for type in interpolation_axes: - x_axes.append(df[type]) - y_interpolation_axis = df['fractional_metric'] - - # Logic around how to deal with cells that have not crossed threshold - if df['fractional_metric'].min() > threshold and \ - not extrapolate_threshold: - BEEPFeaturizationError( - "DiagnosticProperties data has not crossed threshold " - "and extrapolation inaccurate" - ) - elif df['fractional_metric'].min() > threshold and \ - extrapolate_threshold: - fill_value = "extrapolate" - bounds_error = False - x_linspaces = [] - for x_axis in x_axes: - y1 = y_interpolation_axis.iloc[-2] - y2 = y_interpolation_axis.iloc[-1] - x1 = x_axis.iloc[-2] - x2 = x_axis.iloc[-1] - x_thresh_extrap = (threshold - 0.1 - y1) * (x2 - x1) / ( - y2 - y1) + x1 - x_linspaces.append( - np.linspace(x_axis.min(), x_thresh_extrap, num=1000) - ) - else: - fill_value = np.nan - bounds_error = True - x_linspaces = [] - for x_axis in x_axes: - x_linspaces.append( - np.linspace(x_axis.min(), x_axis.max(), num=1000)) - - f_axis = [] - for x_axis in x_axes: - f_axis.append( - interp1d( - x_axis, - y_interpolation_axis, - kind='linear', - bounds_error=bounds_error, - fill_value=fill_value - ) - ) - - x_to_threshold = [] - for indx, x_linspace in enumerate(x_linspaces): - crossing_array = abs(f_axis[indx](x_linspace) - threshold) - x_to_threshold.append(x_linspace[np.argmin(crossing_array)]) - - if ~(x_to_threshold[0] > 0) or ~(x_to_threshold[1] > 0): - raise BEEPFeaturizationError( - "DiagnosticProperties data does not have a positive value " - "to threshold" - ) - - if "normalized_regular_throughput" in interpolation_axes: - real_throughput_to_threshold = x_to_threshold[ - interpolation_axes.index( - "normalized_regular_throughput")] * \ - df[ - 
'initial_regular_throughput'].values[ - 0] - x_to_threshold.append(real_throughput_to_threshold) - interpolation_axes = interpolation_axes + [ - "real_regular_throughput"] - - threshold_dict = { - 'initial_regular_throughput': - df['initial_regular_throughput'].values[0], - } - - for indx, x_axis in enumerate(interpolation_axes): - threshold_dict[ - cycle_type + metric + str(threshold) + '_' + x_axis] = [ - x_to_threshold[indx]] - - return pd.DataFrame(threshold_dict) - - -class DiagnosticCycleFeatures(BEEPFeaturizer): - """ - This class stores fractional levels of degradation in discharge capacity and discharge energy - relative to the first cycle at each diagnostic cycle, grouped by diagnostic cycle type. - - name (str): predictor object name. - X (pandas.DataFrame): features in DataFrame format. - metadata (dict): information about the conditions, data - and code used to produce features - - Hyperparameters: - parameters_dir (str): Full path to directory of parameters to analyse the - diagnostic cycles - """ - DEFAULT_HYPERPARAMETERS = { - "parameters_dir": PROTOCOL_PARAMETERS_DIR, - "nominal_capacity": 4.84, - - } - - def validate(self): - """ - This function determines if the input data has the necessary attributes for - creation of this feature class. It should test for all of the possible reasons - that feature generation would fail for this particular input data. - - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - Returns: - bool: True/False indication of ability to proceed with feature generation - """ - return featurizer_helpers.check_diagnostic_validation(self.datapath) - - def create_features(self): - """ - Generates diagnostic-property features from processed cycler run, including values for n*x method - Args: - self.datapath (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - parameters_path (str): Root directory storing project parameter files. - - Returns: - pd.DataFrame: cycle_index, RPT discharge capacities and energies, aging cycle discharge capacity and energy, - equivalent full cycles of aging cycle discharge, cumulative discharge throughput. 
- for each diagnostic cycle of the cell - """ - - parameters_path = self.hyperparameters["parameters_dir"] - - # RPT discharge capacities - data_rpt_02C = self.datapath.diagnostic_data.loc[ - self.datapath.diagnostic_data.cycle_type == 'rpt_0.2C'] - Q_rpt_02C = data_rpt_02C.groupby('cycle_index')[ - ['discharge_capacity', 'discharge_energy']].max().reset_index( - drop=False) - Q_rpt_02C.rename( - columns={'discharge_capacity': 'rpt_0.2C_discharge_capacity', - 'discharge_energy': 'rpt_0.2C_discharge_energy'}, - inplace=True) - Q_rpt_02C = Q_rpt_02C.reset_index(drop=False).rename( - columns={'index': 'diag_pos'}) - - rpt_02C_cycles = data_rpt_02C.cycle_index.unique() # for referencing last regular cycle before diagnostic - - data_rpt_1C = self.datapath.diagnostic_data.loc[ - self.datapath.diagnostic_data.cycle_type == 'rpt_1C'] - Q_rpt_1C = data_rpt_1C.groupby('cycle_index')[ - ['discharge_capacity', 'discharge_energy']].max().reset_index( - drop=False) - Q_rpt_1C.rename( - columns={'discharge_capacity': 'rpt_1C_discharge_capacity', - 'discharge_energy': 'rpt_1C_discharge_energy'}, - inplace=True) - Q_rpt_1C = Q_rpt_1C.reset_index(drop=False).rename( - columns={'index': 'diag_pos'}) - - data_rpt_2C = self.datapath.diagnostic_data.loc[ - self.datapath.diagnostic_data.cycle_type == 'rpt_2C'] - Q_rpt_2C = data_rpt_2C.groupby('cycle_index')[ - ['discharge_capacity', 'discharge_energy']].max().reset_index( - drop=False) - Q_rpt_2C.rename( - columns={'discharge_capacity': 'rpt_2C_discharge_capacity', - 'discharge_energy': 'rpt_2C_discharge_energy'}, - inplace=True) - Q_rpt_2C = Q_rpt_2C.reset_index(drop=False).rename( - columns={'index': 'diag_pos'}) - - # cumuative discharge throughput - aging_df = self.datapath.structured_summary[ - ['cycle_index', 'charge_throughput', 'energy_throughput', - 'energy_efficiency', 'charge_duration', 'CV_time', 'CV_current', - 'energy_efficiency']] - aging_df = aging_df.loc[aging_df.cycle_index.isin(rpt_02C_cycles - 3)] - - cumulative_discharge_throughput = aging_df[ - ['cycle_index', 'charge_throughput']].rename( - columns={'charge_throughput': 'discharge_throughput'}).reset_index( - drop=True) - cumulative_discharge_throughput = cumulative_discharge_throughput.reset_index( - drop=False).rename(columns={'index': 'diag_pos'}) - - cumulative_energy_throughput = aging_df[ - ['cycle_index', 'energy_throughput']].reset_index(drop=True) - cumulative_energy_throughput = cumulative_energy_throughput.reset_index( - drop=False).rename(columns={'index': 'diag_pos'}) - - equivalent_full_cycles = cumulative_discharge_throughput.copy() - equivalent_full_cycles.rename( - columns={'discharge_throughput': 'equivalent_full_cycles'}, - inplace=True) - equivalent_full_cycles['equivalent_full_cycles'] = \ - equivalent_full_cycles['equivalent_full_cycles'] / self.hyperparameters[ - 'nominal_capacity'] - - # Q_aging_pre_diag - discharge capacity of aging cycle before diagnostic - Q_aging_pre_diag = self.datapath.structured_data.groupby('cycle_index')[ - 'discharge_capacity'].max().loc[rpt_02C_cycles[1:] - 3].reset_index( - drop=False) # ignore first diagnostic, adjust cycle index to Q_aging_pre_diag - Q_aging_pre_diag.rename( - columns={'discharge_capacity': 'Q_aging_pre_diag'}, inplace=True) - Q_aging_pre_diag = Q_aging_pre_diag.reset_index( - drop=False).rename(columns={'index': 'diag_pos'}) - Q_aging_pre_diag['diag_pos'] = Q_aging_pre_diag[ - 'diag_pos'] + 1 # since, first diag is ignored, add one to diag_pos - - # Q_aging_post_diag - discharge capacity of aging cycle after 
diagnostic - Q_aging_post_diag = \ - self.datapath.structured_data.groupby('cycle_index')[ - 'discharge_capacity'].max().loc[rpt_02C_cycles + 3].reset_index( - drop=False) # does not ignore first diag since Q_aging exists after first diag - Q_aging_post_diag.rename( - columns={'discharge_capacity': 'Q_aging_post_diag'}, inplace=True) - Q_aging_post_diag = Q_aging_post_diag.reset_index( - drop=False).rename(columns={'index': 'diag_pos'}) - - # Diagnostic time - diagnostic_time = data_rpt_02C.groupby('cycle_index')[ - 'test_time'].min().reset_index(drop=False).rename( - columns={'test_time': 'diagnostic_time'}) - diagnostic_time = diagnostic_time.reset_index( - drop=False).rename(columns={'index': 'diag_pos'}) - - # Combine dataframes - df_list = [Q_rpt_02C, Q_rpt_1C, Q_rpt_2C, - cumulative_discharge_throughput, - cumulative_energy_throughput, - equivalent_full_cycles, - Q_aging_pre_diag, - Q_aging_post_diag, - diagnostic_time] - - for df in df_list: - df['cycle_index'] = df['cycle_index'].copy().astype(int) - df['diag_pos'] = df['diag_pos'].copy().astype(int) - - cycle_features = reduce( - lambda x, y: pd.merge(x, y, on=['cycle_index', 'diag_pos'], - how='outer'), df_list) - self.features = cycle_features.sort_values('cycle_index').reset_index( - drop=True) - - -class CyclingProtocol(BEEPFeaturizer): - """ - This class stores information about the charging protocol used - name (str): predictor object name. - X (pandas.DataFrame): features in DataFrame format. - metadata (dict): information about the conditions, data - and code used to produce features - Hyperparameters: - parameters_dir (str): Full path to directory of charging protocol parameters - quantities ([str]): list of parameters to return - """ - DEFAULT_HYPERPARAMETERS = { - "parameters_dir": PROTOCOL_PARAMETERS_DIR, - "quantities": ["charge_constant_current_1", "charge_constant_current_2", - "charge_cutoff_voltage", "charge_constant_voltage_time", - "discharge_constant_current", - "discharge_cutoff_voltage"], - } - - def validate(self): - """ - This function determines if the input data has the necessary attributes for - creation of this feature class. It should test for all of the possible reasons - that feature generation would fail for this particular input data. - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. 
These could be filters for column or row operations - Returns: - bool: True/False indication of ability to proceed with feature generation - """ - if not ( - 'raw' in self.datapath.paths.keys() or 'structured' in self.datapath.paths.keys()): - message = "datapath paths not set, unable to fetch charging protocol" - return False, message - else: - return featurizer_helpers.check_diagnostic_validation(self.datapath) - - def create_features(self): - """ - Fetches charging protocol features - """ - - parameters_path = self.hyperparameters["parameters_dir"] - file_path = self.datapath.paths[ - 'raw'] if 'raw' in self.datapath.paths.keys() else \ - self.datapath.paths['structured'] - - parameters, _ = get_protocol_parameters(file_path, parameters_path) - - parameters = parameters[self.hyperparameters["quantities"]] - parameters['cycle_index'] = int( - 0) # create a cycle index column for merging with other featurizers - parameters['diag_pos'] = int( - 0) # create a diag_pos column for merging with other featurizers - self.features = parameters diff --git a/beep/features/featurizer.py b/beep/features/featurizer.py new file mode 100644 index 00000000..7a3e57cf --- /dev/null +++ b/beep/features/featurizer.py @@ -0,0 +1,233 @@ +# Copyright [2020] [Toyota Research Institute] +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +For creating features and organizing them into datasets. + +""" +import os +import abc +import json +from typing import Union, Tuple + +import pandas as pd +from monty.io import zopen +from monty.json import MSONable +from monty.serialization import dumpfn + +from beep.structure.base import BEEPDatapath + + +class BEEPFeaturizationError(BaseException): + """Raise when a featurization-specific error occurs""" + pass + + +class BEEPFeaturizer(MSONable, abc.ABC): + """ + Base class for all beep feature generation. + + From a structured battery file representing many cycles of one cell, + (AKA a structured datapath), produce a feature vector. 
+ + Works for generating both + - Vectors X to use as training vectors + - Vectors or scalars y to use as ML targets + (as problems may have multiple metrics to predict) + + """ + + DEFAULT_HYPERPARAMETERS = {} + + def __init__(self, structured_datapath: Union[BEEPDatapath, None], + hyperparameters: Union[dict, None] = None): + # If all required hyperparameters are specified, use those + # If some subset of required hyperparameters are specified, throw error + # If no hyperparameters are specified, use defaults + if hyperparameters: + if all(k in hyperparameters for k in self.DEFAULT_HYPERPARAMETERS): + self.hyperparameters = hyperparameters + else: + raise BEEPFeaturizationError( + f"Features cannot be created with incomplete set of " + f"hyperparameters {hyperparameters.keys()} < " + f"{self.DEFAULT_HYPERPARAMETERS.keys()}!") + else: + self.hyperparameters = self.DEFAULT_HYPERPARAMETERS + + if structured_datapath is not None and not structured_datapath.is_structured: + raise BEEPFeaturizationError( + "BEEPDatapath input is not structured!") + self.datapath = structured_datapath + + self.features = None + + # In case these features are loaded from file + # Allow attrs which can hold relevant metadata without having + # to reload the original datapath + self.paths = self.datapath.paths if self.datapath else {} + self.metadata = self.datapath.metadata.raw if self.datapath else {} + self.linked_semiunique_id = self.datapath.semiunique_id if self.datapath else None + + @abc.abstractmethod + def validate(self) -> Tuple[bool, Union[str, None]]: + """ + Validate a featurizer on it's ingested datapath. + + Returns: + (bool, str/None): The validation result and it's message. + + """ + raise NotImplementedError + + @abc.abstractmethod + def create_features(self) -> None: + """ + Should assign a dataframe to self.features. + + Returns: + None + """ + raise NotImplementedError + + def as_dict(self): + """Serialize a BEEPDatapath as a dictionary. + + Must not be loaded from legacy. + + Returns: + (dict): corresponding to dictionary for serialization. + + """ + + if self.features is None: + raise BEEPFeaturizationError( + "Cannot serialize features which have not been generated.") + + features = self.features.to_dict("list") + + return { + "@module": self.__class__.__module__, + "@class": self.__class__.__name__, + + # Core parts of BEEPFeaturizer + "features": features, + "hyperparameters": self.hyperparameters, + "paths": self.paths, + "metadata": self.metadata, + "linked_datapath_semiunique_id": self.linked_semiunique_id + } + + @classmethod + def from_dict(cls, d): + """Create a BEEPDatapath object from a dictionary. + + Args: + d (dict): dictionary represenation. + + Returns: + beep.structure.ProcessedCyclerRun: deserialized ProcessedCyclerRun. + """ + + # no need for original datapath + bf = cls(structured_datapath=None, hyperparameters=d["hyperparameters"]) + bf.features = pd.DataFrame(d["features"]) + bf.paths = d["paths"] + bf.metadata = d["metadata"] + bf.linked_semiunique_id = d["linked_datapath_semiunique_id"] + return bf + + @classmethod + def from_json_file(cls, filename): + """Load a structured run previously saved to file. + + .json.gz files are supported. + + Loads a BEEPFeaturizer from json. + + Can be used in combination with files serialized with BEEPFeatures.to_json_file. + + Args: + filename (str, Pathlike): a json file from a structured run, serialzed with to_json_file. 
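To make the abstract contract above concrete, here is a minimal hypothetical subclass. The class name, hyperparameter, and feature are invented for illustration; only the validate / create_features / self.features pattern comes from the base class, and real featurizers in this patch subclass the BEEPAllCyclesFeaturizer or BEEPPerCycleFeaturizer bases defined further down.

import pandas as pd

from beep.features.featurizer import BEEPFeaturizer  # module added by this patch


class MedianDischargeCapacity(BEEPFeaturizer):
    """Hypothetical example: a single summary feature per cell."""
    DEFAULT_HYPERPARAMETERS = {"n_cycles": 40}

    def validate(self):
        if "discharge_capacity" not in self.datapath.structured_summary.columns:
            return False, "structured summary has no discharge_capacity column"
        return True, None

    def create_features(self):
        n = self.hyperparameters["n_cycles"]
        q = self.datapath.structured_summary.discharge_capacity.iloc[:n]
        self.features = pd.DataFrame({"median_discharge_capacity": [q.median()]})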
+ + Returns: + None + """ + with zopen(filename, "r") as f: + d = json.load(f) + + # Add this structured file path to the paths dict + paths = d.get("paths", {}) + paths["features"] = os.path.abspath(filename) + d["paths"] = paths + return cls.from_dict(d) + + def to_json_file(self, filename): + """Save a BEEPFeatures to disk as a json. + + .json.gz files are supported. + + Not named from_json to avoid conflict with MSONable.from_json(*) + + Args: + filename (str, Pathlike): The filename to save the file to. + omit_raw (bool): If True, saves only structured (NOT RAW) data. + More efficient for saving/writing to disk. + + Returns: + None + """ + d = self.as_dict() + dumpfn(d, filename) + + +class BEEPAllCyclesFeaturizer(BEEPFeaturizer): + """Base class for featurizers that return a constant number of features + for any number of cycles in a structured datapath. + + These features are typically used for early prediction. + + A BEEPAllCyclesFeaturizer always returns the same number of features + for files for datapaths with any number of samples. Thus, + + + [Datapath w/ 2 cycles] ---> (vector of k features) + + [Datapath w/ 100 cycles] ---> (vector of k features) + """ + PER_CYCLE = False + + +class BEEPPerCycleFeaturizer(BEEPFeaturizer): + """Base class for featurizers that return a vector of features for + EACH cycle in a structured datapath. + + These features are generally used for analysis + + A BEEPPerCycleFeaturizer always returns an (n x k) matrix of features + for datapaths with n cycles each producing k features. Thus, + + [Datapath w/ 2 cycles] ---> (2 x k feature matrix) + + [Datapath w/ 100 cycles] ---> (100 x k feature matrix) + + """ + PER_CYCLE = True + SPECIAL_COLUMNS = ("cycle_index", "diag_pos") + + + + + + diff --git a/beep/features/matrix.py b/beep/features/matrix.py new file mode 100644 index 00000000..355899cb --- /dev/null +++ b/beep/features/matrix.py @@ -0,0 +1,476 @@ + +import copy +import hashlib +from typing import List + +import pandas as pd +from monty.json import MSONable, MontyDecoder +from monty.serialization import loadfn, dumpfn + +from beep.features.featurizer import BEEPFeaturizer, BEEPPerCycleFeaturizer + + +class BEEPFeatureMatrixError(BaseException): + """ Raise when there is a BEEP-specific problem with a dataset""" + pass + + +class BEEPFeatureMatrix(MSONable): + """ + Create an array composed of BEEPFeaturizer objects. + + The array may either be: + + PER-CYCLER-RUN, using BEEPAllCyclesFeaturizer. + One feature vector per cycler file, resulting in an array w. dimenions: + (n battery cycler files) x (k features) + + OR: + + PER-CYCLE, using BEEPPerCycleFeaturizer. + One feature vector per cycle, resulting in an array w. dimensions: + (n total cycles) x (k features) + + Sets of featurizer objects must exclusively belong to EITHER of these + two paradigms (base classes), but may not be mixed. + + So a set of featurizrs may be -per-cycler-file OR per-cycle, but not + both. + + Args: + beepfeaturizers ([BEEPFeaturizer]): A list of BEEPFeaturizer objects, + either ALL BEEPAllCyclesFeaturizer child objects OR ALL + BEEPPerCycleFeaturizer child objects. 
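The constructor that follows keeps columns collision-free by hashing each featurizer's sorted hyperparameters into an operation id and appending it, with the "::" delimiter, to every non-special column name, so the same featurizer class with the same hyperparameters always produces the same column names regardless of file. A sketch of that naming scheme; the class name, hyperparameter values, and raw column name are placeholders:

import hashlib

OP_DELIMITER = "::"
class_name = "HPPCResistanceVoltagePerCycle"                   # placeholder featurizer
hyperparameters = {"soc_window": 8, "cycle_index_filter": 6}   # placeholder values

params = sorted(hyperparameters.items(), key=lambda x: x[0])
param_str = "-".join(f"{k}:{v}" for k, v in params)
param_hash = hashlib.sha256(param_str.encode("utf-8")).hexdigest()

feature_op_id = f"{class_name}{OP_DELIMITER}{param_hash}"
renamed_column = f"some_raw_feature{OP_DELIMITER}{feature_op_id}"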
+ + """ + + OP_DELIMITER = "::" + + def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): + + if beepfeaturizers: + bfs_types_per_cycle = [bf.PER_CYCLE for bf in beepfeaturizers] + + # the array should be either all True or all False + if all(bfs_types_per_cycle): + self.per_cycle = True + elif not any(bfs_types_per_cycle): + self.per_cycle = False + else: + raise TypeError( + f"Featurizer types are mixed!\n" + f"BEEPFeatureMatrix can only use EITHER a set of ALL " + f"BEEPAllCyclesFeaturizers OR a set of ALL " + f"BEEPPerCycleFeaturizers.") + dfs_by_file = {bf.paths.get("structured", "no file found"): [] for bf in beepfeaturizers} + + unique_features = {} + for i, bf in enumerate(beepfeaturizers): + + if bf.features is None: + raise BEEPFeatureMatrixError( + f"BEEPFeaturizer {bf} has not created features") + else: + bfcn = bf.__class__.__name__ + + fname = bf.paths.get("structured", None) + if not fname: + raise BEEPFeatureMatrixError( + "Cannot join features automatically as no linking can be done " + "based on original structured filename." + ) + + + # Ensure no per-cycle featurizer is missing required special columns for merge + if self.per_cycle: + missing_special_columns = set() + for sf in BEEPPerCycleFeaturizer.SPECIAL_COLUMNS: + if sf not in bf.features.columns: + missing_special_columns.add(sf) + if missing_special_columns: + raise BEEPFeatureMatrixError( + f"Per cycle featurizer object missing special columns: {missing_special_columns}" + ) + + # Check for any possible feature collisions using identical featurizers + # on identical files + + # sort params for this featurizer obj by key + params = sorted(list(bf.hyperparameters.items()), key=lambda x: x[0]) + + # Prevent identical features from identical input files + # create a unique operation string for the application of this featurizer + # on a specific file, this op string will be the same as long as + # the featurizer class name, hyperparameters, and class are the same + + param_str = "-".join([f"{k}:{v}" for k, v in params]) + param_hash = hashlib.sha256( + param_str.encode("utf-8")).hexdigest() + + # Get an id for this featurizer operation (including hyperparameters) + # regardless of the file it is applied on + feature_op_id = f"{bfcn}{self.OP_DELIMITER}{param_hash}" + + # Get an id for this featurizer operation (including hyperparameters) + # on THIS SPECIFIC file. + file_feature_op_id = f"{fname}{self.OP_DELIMITER}{bfcn}{self.OP_DELIMITER}{param_hash}" + + # Get a unique id for every feature generated by a specific + # featurizer on a specific file. + this_file_feature_columns_ids = \ + [ + f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c + in bf.features.columns + ] + + # Check to make sure there are no duplicates of the exact same feature for + # the exact same featurizer with the exact same hyperparameters on the exact + # same file. 
+ collisions = {c: f for c, f in unique_features.items() if + c in this_file_feature_columns_ids} + if collisions: + raise BEEPFeatureMatrixError( + f"Multiple features generated with identical classes and identical hyperparameters" + f" attempted to be joined into same dataset; \n" + f"{bfcn} features collide with existing: \n{collisions}" + ) + for c in this_file_feature_columns_ids: + unique_features[c] = bfcn + + # Create consistent scheme for naming features regardless of file + # Only rename non-special column names + df = copy.deepcopy(bf.features) + + special_column_names = BEEPPerCycleFeaturizer.SPECIAL_COLUMNS if self.per_cycle else set() + consistent_column_names = [ + f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in df.columns if c not in special_column_names + ] + df.columns = consistent_column_names + + # ensure cycle_index and diag_pos are integers + if self.per_cycle: + for col in BEEPPerCycleFeaturizer.SPECIAL_COLUMNS: + df[col] = df[col].astype(int) + + df.index = [fname] * df.shape[0] + df.index.rename("filename", inplace=True) + dfs_by_file[fname].append(df) + + blocks = [] + # concat dfs by file across columns + for filename, dfs in dfs_by_file.items(): + + if self.per_cycle: + rows = pd.concat( + dfs, + axis=1, + join="outer", + ignore_index=False, + keys=list(BEEPPerCycleFeaturizer.SPECIAL_COLUMNS) + ) + + else: + + rows = pd.concat(dfs, axis=1) + rows = rows[sorted(rows.columns)] + + blocks.append(rows) + + # concat all dfs for all files across rows + self.matrix = pd.concat(blocks, axis=0) + + else: + self.matrix = None + + self.featurizers = beepfeaturizers + + def as_dict(self): + """Serialize a BEEPFeatureMatrix as a dictionary. + + Must not be loaded from legacy. + + Returns: + (dict): corresponding to dictionary for serialization. + + """ + + return { + "@module": self.__class__.__module__, + "@class": self.__class__.__name__, + + # Core parts of BEEPFeaturizer + "featurizers": [f.as_dict() for f in self.featurizers], + "matrix": self.matrix.to_dict("list"), + } + + @classmethod + def from_dict(cls, d): + """Create a BEEPFeatureMatrix object from a dictionary. + + Args: + d (dict): dictionary represenation. + + Returns: + beep.structure.ProcessedCyclerRun: deserialized ProcessedCyclerRun. + """ + # no need for original datapaths, as their ref paths should + # be in the subobjects + featurizers = [MontyDecoder().process_decoded(f) for f in + d["featurizers"]] + return cls(featurizers) + + @classmethod + def from_json_file(cls, filename): + """Load a structured run previously saved to file. + + .json.gz files are supported. + + Loads a BEEPFeatureMatrix from json. + + Can be used in combination with files serialized with BEEPFeatures.to_json_file. + + Args: + filename (str, Pathlike): a json file from a structured run, serialzed with to_json_file. + + Returns: + None + """ + return loadfn(filename) + + def to_json_file(self, filename): + """Save a BEEPFeatureMatrix to disk as a json. + + .json.gz files are supported. + + Not named from_json to avoid conflict with MSONable.from_json(*) + + Args: + filename (str, Pathlike): The filename to save the file to. + omit_raw (bool): If True, saves only structured (NOT RAW) data. + More efficient for saving/writing to disk. + + Returns: + None + """ + d = self.as_dict() + dumpfn(d, filename) + + +# class BEEPCycleFeatureMatrix(MSONable): +# """ +# Create an ((n battery cycler files) x (j cycles)) x (k features) array composed of +# m BEEPFeaturizer objects. 
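A hypothetical end-to-end sketch of how the featurizer and matrix classes above are intended to be combined; `datapaths` is assumed to be an existing list of structured BEEPDatapath objects, and the output filename is a placeholder:

from beep.features.matrix import BEEPFeatureMatrix
from beep.features.per_cycle.hppc import HPPCResistanceVoltagePerCycle

featurizers = []
for dp in datapaths:                       # assumed: structured BEEPDatapath objects
    bf = HPPCResistanceVoltagePerCycle(dp)
    ok, message = bf.validate()
    if not ok:
        print(f"skipping {dp.paths.get('structured')}: {message}")
        continue
    bf.create_features()
    featurizers.append(bf)

fm = BEEPFeatureMatrix(featurizers)        # (total cycles) x (features), per-cycle mode
fm.to_json_file("feature_matrix.json.gz")
fm_reloaded = BEEPFeatureMatrix.from_json_file("feature_matrix.json.gz")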
+# +# Args: +# beepfeaturizers ([BEEPFeaturizer]): A list of BEEPFeaturizer objects +# +# """ +# +# OP_DELIMITER = "::" +# +# def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): +# +# if beepfeaturizers: +# # initialize emtpy dict of file names +# dfs_by_file = {os.path.basename( +# bf.paths.get("structured", "no file found") +# )[0:-19]: pd.DataFrame( +# columns=['filename', 'cycle_index', 'diag_pos'] +# ) for bf in beepfeaturizers} +# # big_df_rows = {bf.__class__.__name__: [] for bf in beepfeaturizers} +# unique_features = {} +# for i, bf in enumerate(beepfeaturizers): +# if bf.features is None: +# raise BEEPFeatureMatrixError( +# f"BEEPFeaturizer {bf} has not created features") +# +# # elif bf.features.shape[0] != 1: +# # raise BEEPFeatureMatrixError(f"BEEPFeaturizer {bf} features are not 1-dimensional.") +# else: +# bfcn = bf.__class__.__name__ +# +# # fname = bf.paths.get("structured", None) +# fname = os.path.basename(bf.paths['structured'])[0:-19] +# if not fname: +# raise BEEPFeatureMatrixError( +# "Cannot join features automatically as no linking can be done " +# "based on original structured filename." +# ) +# +# # Check for any possible feature collisions using identical featurizers +# # on identical files +# +# # sort params for this featurizer obj by key +# params = sorted(list(bf.hyperparameters.items()), +# key=lambda x: x[0]) +# +# # Prevent identical features from identical input files +# # create a unique operation string for the application of this featurizer +# # on a specific file, this op string will be the same as long as +# # the featurizer class name, hyperparameters, and class are the same +# +# param_str = "-".join([f"{k}:{v}" for k, v in params]) +# param_hash = hashlib.sha256( +# param_str.encode("utf-8")).hexdigest() +# +# # Get an id for this featurizer operation (including hyperparameters) +# # regardless of the file it is applied on +# feature_op_id = f"{bfcn}{self.OP_DELIMITER}{param_hash}" +# +# # Get an id for this featurizer operation (including hyperparameters) +# # on THIS SPECIFIC file. +# file_feature_op_id = f"{fname}{self.OP_DELIMITER}{bfcn}{self.OP_DELIMITER}{param_hash}" +# +# # Get a unique id for every feature generated by a specific +# # featurizer on a specific file. +# this_file_feature_columns_ids = \ +# [ +# f"{file_feature_op_id}{self.OP_DELIMITER}{c}" for c +# in bf.features.columns +# ] +# +# # Check to make sure there are no duplicates of the exact same feature for +# # the exact same featurizer with the exact same hyperparameters on the exact +# # same file. 
+# collisions = {c: f for c, f in unique_features.items() if +# c in this_file_feature_columns_ids} +# if collisions: +# raise BEEPFeatureMatrixError( +# f"Multiple features generated with identical classes and identical hyperparameters" +# f" attempted to be joined into same dataset; \n" +# f"{bfcn} features collide with existing: \n{collisions}" +# ) +# for c in this_file_feature_columns_ids: +# unique_features[c] = bfcn +# +# # Create consistent scheme for naming features regardless of file +# df = copy.deepcopy(bf.features) +# consistent_column_names = [ +# f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in +# df.columns] +# df.columns = consistent_column_names +# +# # df.index = [fname] * df.shape[0] +# # df.index.rename("filename", inplace=True) +# +# # create filename column to merge on +# df['filename'] = os.path.basename(bf.paths['structured'])[ +# 0:-19] +# +# # df = df.reset_index(drop=True) +# +# # remove hash from cycle_index and diag_pos column +# cycle_index_col = [col for col in df.columns if +# 'cycle_index' in col] +# df.rename(columns={cycle_index_col[0]: 'cycle_index'}, +# inplace=True) +# +# # remove hash from diag_pos column +# diag_pos_col = [col for col in df.columns if +# 'diag_pos' in col] +# df.rename(columns={diag_pos_col[0]: 'diag_pos'}, +# inplace=True) +# +# # ensure cycle_index and diag_pos are integers +# df['cycle_index'] = df['cycle_index'].astype(int) +# df['diag_pos'] = df['diag_pos'].astype(int) +# +# # append each BEEPFeaturizer df to the corresponding cell dict entry +# # dfs_by_file[fname].append(df) +# dfs_by_file[fname] = dfs_by_file[fname].merge( +# df, how='outer', +# on=['filename', 'cycle_index', 'diag_pos']).sort_values( +# 'cycle_index').reset_index(drop=True) +# # dfs_by_file[fname] = pd.concat( +# # [dfs_by_file[fname],df], +# # axis=1,join='outer',ignore_index=True, +# # keys=['filename']) +# # self.dfs_by_file = dfs_by_file +# # self.df = df +# # return None +# +# rows = [] +# self.matrix = pd.DataFrame() +# for filename, dfs in dfs_by_file.items(): +# # row = pd.concat([row,dfs], axis=1) +# # row = row[sorted(row.columns)] +# # rows.append(row) +# self.matrix = pd.concat([self.matrix, dfs], axis=0, +# ignore_index=True, +# join='outer') # , keys=['filename'] +# +# else: +# self.matrix = None +# +# self.featurizers = beepfeaturizers +# +# def as_dict(self): +# """Serialize a BEEPDatapath as a dictionary. +# +# Must not be loaded from legacy. +# +# Returns: +# (dict): corresponding to dictionary for serialization. +# +# """ +# +# return { +# "@module": self.__class__.__module__, +# "@class": self.__class__.__name__, +# +# # Core parts of BEEPFeaturizer +# "featurizers": [f.as_dict() for f in self.featurizers], +# "matrix": self.matrix.to_dict("list"), +# } +# +# @classmethod +# def from_dict(cls, d): +# """Create a BEEPDatapath object from a dictionary. +# +# Args: +# d (dict): dictionary represenation. +# +# Returns: +# beep.structure.ProcessedCyclerRun: deserialized ProcessedCyclerRun. +# """ +# # no need for original datapaths, as their ref paths should +# # be in the subobjects +# featurizers = [MontyDecoder().process_decoded(f) for f in +# d["featurizers"]] +# return cls(featurizers) +# +# @classmethod +# def from_json_file(cls, filename): +# """Load a structured run previously saved to file. +# +# .json.gz files are supported. +# +# Loads a BEEPFeatureMatrix from json. +# +# Can be used in combination with files serialized with BEEPFeatures.to_json_file. 
+# +# Args: +# filename (str, Pathlike): a json file from a structured run, serialzed with to_json_file. +# +# Returns: +# None +# """ +# return loadfn(filename) +# +# def to_json_file(self, filename): +# """Save a BEEPFeatureMatrix to disk as a json. +# +# .json.gz files are supported. +# +# Not named from_json to avoid conflict with MSONable.from_json(*) +# +# Args: +# filename (str, Pathlike): The filename to save the file to. +# omit_raw (bool): If True, saves only structured (NOT RAW) data. +# More efficient for saving/writing to disk. +# +# Returns: +# None +# """ +# d = self.as_dict() +# dumpfn(d, filename) \ No newline at end of file diff --git a/beep/features/per_cycle/__init__.py b/beep/features/per_cycle/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/beep/features/per_cycle/diagnostic.py b/beep/features/per_cycle/diagnostic.py new file mode 100644 index 00000000..766cf022 --- /dev/null +++ b/beep/features/per_cycle/diagnostic.py @@ -0,0 +1,235 @@ + +import numpy as np +import pandas as pd +from scipy.stats import skew, kurtosis +from scipy.interpolate import interp1d + +from beep import PROTOCOL_PARAMETERS_DIR +from beep.features import featurizer_helpers +from functools import reduce +from beep.utils.parameters_lookup import get_protocol_parameters + +from beep.features.featurizer import BEEPPerCycleFeaturizer + +class DiagnosticCycleFeatures(BEEPPerCycleFeaturizer: + """ + This class stores fractional levels of degradation in discharge capacity and discharge energy + relative to the first cycle at each diagnostic cycle, grouped by diagnostic cycle type. + + name (str): predictor object name. + X (pandas.DataFrame): features in DataFrame format. + metadata (dict): information about the conditions, data + and code used to produce features + + Hyperparameters: + parameters_dir (str): Full path to directory of parameters to analyse the + diagnostic cycles + """ + DEFAULT_HYPERPARAMETERS = { + "parameters_dir": PROTOCOL_PARAMETERS_DIR, + "nominal_capacity": 4.84, + + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + return featurizer_helpers.check_diagnostic_validation(self.datapath) + + def create_features(self): + """ + Generates diagnostic-property features from processed cycler run, including values for n*x method + Args: + self.datapath (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. These could be filters for column or row operations + parameters_path (str): Root directory storing project parameter files. + + Returns: + pd.DataFrame: cycle_index, RPT discharge capacities and energies, aging cycle discharge capacity and energy, + equivalent full cycles of aging cycle discharge, cumulative discharge throughput. 
+ for each diagnostic cycle of the cell + """ + + parameters_path = self.hyperparameters["parameters_dir"] + + # RPT discharge capacities + data_rpt_02C = self.datapath.diagnostic_data.loc[ + self.datapath.diagnostic_data.cycle_type == 'rpt_0.2C'] + Q_rpt_02C = data_rpt_02C.groupby('cycle_index')[ + ['discharge_capacity', 'discharge_energy']].max().reset_index( + drop=False) + Q_rpt_02C.rename( + columns={'discharge_capacity': 'rpt_0.2C_discharge_capacity', + 'discharge_energy': 'rpt_0.2C_discharge_energy'}, + inplace=True) + Q_rpt_02C = Q_rpt_02C.reset_index(drop=False).rename( + columns={'index': 'diag_pos'}) + + rpt_02C_cycles = data_rpt_02C.cycle_index.unique() # for referencing last regular cycle before diagnostic + + data_rpt_1C = self.datapath.diagnostic_data.loc[ + self.datapath.diagnostic_data.cycle_type == 'rpt_1C'] + Q_rpt_1C = data_rpt_1C.groupby('cycle_index')[ + ['discharge_capacity', 'discharge_energy']].max().reset_index( + drop=False) + Q_rpt_1C.rename( + columns={'discharge_capacity': 'rpt_1C_discharge_capacity', + 'discharge_energy': 'rpt_1C_discharge_energy'}, + inplace=True) + Q_rpt_1C = Q_rpt_1C.reset_index(drop=False).rename( + columns={'index': 'diag_pos'}) + + data_rpt_2C = self.datapath.diagnostic_data.loc[ + self.datapath.diagnostic_data.cycle_type == 'rpt_2C'] + Q_rpt_2C = data_rpt_2C.groupby('cycle_index')[ + ['discharge_capacity', 'discharge_energy']].max().reset_index( + drop=False) + Q_rpt_2C.rename( + columns={'discharge_capacity': 'rpt_2C_discharge_capacity', + 'discharge_energy': 'rpt_2C_discharge_energy'}, + inplace=True) + Q_rpt_2C = Q_rpt_2C.reset_index(drop=False).rename( + columns={'index': 'diag_pos'}) + + # cumuative discharge throughput + aging_df = self.datapath.structured_summary[ + ['cycle_index', 'charge_throughput', 'energy_throughput', + 'energy_efficiency', 'charge_duration', 'CV_time', 'CV_current', + 'energy_efficiency']] + aging_df = aging_df.loc[aging_df.cycle_index.isin(rpt_02C_cycles - 3)] + + cumulative_discharge_throughput = aging_df[ + ['cycle_index', 'charge_throughput']].rename( + columns={'charge_throughput': 'discharge_throughput'}).reset_index( + drop=True) + cumulative_discharge_throughput = cumulative_discharge_throughput.reset_index( + drop=False).rename(columns={'index': 'diag_pos'}) + + cumulative_energy_throughput = aging_df[ + ['cycle_index', 'energy_throughput']].reset_index(drop=True) + cumulative_energy_throughput = cumulative_energy_throughput.reset_index( + drop=False).rename(columns={'index': 'diag_pos'}) + + equivalent_full_cycles = cumulative_discharge_throughput.copy() + equivalent_full_cycles.rename( + columns={'discharge_throughput': 'equivalent_full_cycles'}, + inplace=True) + equivalent_full_cycles['equivalent_full_cycles'] = \ + equivalent_full_cycles['equivalent_full_cycles'] / self.hyperparameters[ + 'nominal_capacity'] + + # Q_aging_pre_diag - discharge capacity of aging cycle before diagnostic + Q_aging_pre_diag = self.datapath.structured_data.groupby('cycle_index')[ + 'discharge_capacity'].max().loc[rpt_02C_cycles[1:] - 3].reset_index( + drop=False) # ignore first diagnostic, adjust cycle index to Q_aging_pre_diag + Q_aging_pre_diag.rename( + columns={'discharge_capacity': 'Q_aging_pre_diag'}, inplace=True) + Q_aging_pre_diag = Q_aging_pre_diag.reset_index( + drop=False).rename(columns={'index': 'diag_pos'}) + Q_aging_pre_diag['diag_pos'] = Q_aging_pre_diag[ + 'diag_pos'] + 1 # since, first diag is ignored, add one to diag_pos + + # Q_aging_post_diag - discharge capacity of aging cycle after 
diagnostic + Q_aging_post_diag = \ + self.datapath.structured_data.groupby('cycle_index')[ + 'discharge_capacity'].max().loc[rpt_02C_cycles + 3].reset_index( + drop=False) # does not ignore first diag since Q_aging exists after first diag + Q_aging_post_diag.rename( + columns={'discharge_capacity': 'Q_aging_post_diag'}, inplace=True) + Q_aging_post_diag = Q_aging_post_diag.reset_index( + drop=False).rename(columns={'index': 'diag_pos'}) + + # Diagnostic time + diagnostic_time = data_rpt_02C.groupby('cycle_index')[ + 'test_time'].min().reset_index(drop=False).rename( + columns={'test_time': 'diagnostic_time'}) + diagnostic_time = diagnostic_time.reset_index( + drop=False).rename(columns={'index': 'diag_pos'}) + + # Combine dataframes + df_list = [Q_rpt_02C, Q_rpt_1C, Q_rpt_2C, + cumulative_discharge_throughput, + cumulative_energy_throughput, + equivalent_full_cycles, + Q_aging_pre_diag, + Q_aging_post_diag, + diagnostic_time] + + for df in df_list: + df['cycle_index'] = df['cycle_index'].copy().astype(int) + df['diag_pos'] = df['diag_pos'].copy().astype(int) + + cycle_features = reduce( + lambda x, y: pd.merge(x, y, on=['cycle_index', 'diag_pos'], + how='outer'), df_list) + self.features = cycle_features.sort_values('cycle_index').reset_index( + drop=True) + + +class CyclingProtocol(BEEPPerCycleFeaturizer): + """ + This class stores information about the charging protocol used + name (str): predictor object name. + X (pandas.DataFrame): features in DataFrame format. + metadata (dict): information about the conditions, data + and code used to produce features + Hyperparameters: + parameters_dir (str): Full path to directory of charging protocol parameters + quantities ([str]): list of parameters to return + """ + DEFAULT_HYPERPARAMETERS = { + "parameters_dir": PROTOCOL_PARAMETERS_DIR, + "quantities": ["charge_constant_current_1", "charge_constant_current_2", + "charge_cutoff_voltage", "charge_constant_voltage_time", + "discharge_constant_current", + "discharge_cutoff_voltage"], + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. 
These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + if not ( + 'raw' in self.datapath.paths.keys() or 'structured' in self.datapath.paths.keys()): + message = "datapath paths not set, unable to fetch charging protocol" + return False, message + else: + return featurizer_helpers.check_diagnostic_validation(self.datapath) + + def create_features(self): + """ + Fetches charging protocol features + """ + + parameters_path = self.hyperparameters["parameters_dir"] + file_path = self.datapath.paths[ + 'raw'] if 'raw' in self.datapath.paths.keys() else \ + self.datapath.paths['structured'] + + parameters, _ = get_protocol_parameters(file_path, parameters_path) + + parameters = parameters[self.hyperparameters["quantities"]] + parameters['cycle_index'] = int( + 0) # create a cycle index column for merging with other featurizers + parameters['diag_pos'] = int( + 0) # create a diag_pos column for merging with other featurizers + self.features = parameters diff --git a/beep/features/per_cycle/hppc.py b/beep/features/per_cycle/hppc.py new file mode 100644 index 00000000..4f6408ba --- /dev/null +++ b/beep/features/per_cycle/hppc.py @@ -0,0 +1,80 @@ +import pandas as pd + +from beep import PROTOCOL_PARAMETERS_DIR +from beep.features import featurizer_helpers + +from beep.features.featurizer import BEEPPerCycleFeaturizer + + + +class HPPCResistanceVoltagePerCycle(BEEPPerCycleFeaturizer): + DEFAULT_HYPERPARAMETERS = { + "test_time_filter_sec": 1000000, + "cycle_index_filter": 6, + "soc_window": 8, + "parameters_path": PROTOCOL_PARAMETERS_DIR + } + + def validate(self): + val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + if val: + conditions = [] + conditions.append( + any( + [ + "hppc" in x + for x in + self.datapath.diagnostic_summary.cycle_type.unique() + ] + ) + ) + if all(conditions): + return True, None + else: + return False, "HPPC conditions not met for this cycler run" + else: + return val, msg + + def create_features(self): + # Filter out low cycle numbers at the end of the test, corresponding to the "final" diagnostic + self.datapath.diagnostic_data = self.datapath.diagnostic_data[ + ~((self.datapath.diagnostic_data.test_time > self.hyperparameters[ + 'test_time_filter_sec']) & + (self.datapath.diagnostic_data.cycle_index < self.hyperparameters[ + 'cycle_index_filter'])) + ] + self.datapath.diagnostic_data = self.datapath.diagnostic_data.groupby( + ["cycle_index", "step_index", "step_index_counter"] + ).filter(lambda x: ~x["test_time"].isnull().all()) + + # Only hppc_resistance_features are able to be calculated without error. + # Xiao Cui should be pulled in to understand the issue with the others features. 
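The boolean filter at the top of create_features above removes rows whose test_time is large but whose cycle_index is small, i.e. the "final" diagnostic that gets appended to the end of the file under a wrong cycle number. A toy illustration using the default thresholds (1,000,000 s and cycle index 6); the rows are invented:

import pandas as pd

diag = pd.DataFrame({
    "cycle_index": [3, 4, 5, 2000, 4],
    "test_time":   [1.0e5, 1.2e5, 1.4e5, 9.0e6, 9.1e6],
})
kept = diag[~((diag.test_time > 1_000_000) & (diag.cycle_index < 6))]
# only the last row (test_time 9.1e6 s but cycle_index 4) is dropped;
# the genuine late-life row with cycle_index 2000 is kept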
+ + # diffusion features + # diffusion_features = featurizer_helpers.get_diffusion_cycle_features( + # self.datapath, + # ) + + # hppc resistance features + hppc_resistance_features = featurizer_helpers.get_hppc_resistance_cycle_features( + self.datapath, + ) + + # the variance of ocv features + # hppc_ocv_features = featurizer_helpers.get_hppc_ocv_cycle_features( + # self.datapath, + # ) + + # the v_diff features + # v_diff = featurizer_helpers.get_v_diff_cycle_features( + # self.datapath, + # self.hyperparameters["soc_window"], + # self.hyperparameters["parameters_path"] + # ) + + # merge everything together as a final result dataframe + self.features = pd.concat( + [hppc_resistance_features, + # hppc_ocv_features, + # v_diff, #diffusion_features + ], axis=1) From f0778f9b41abfce172507cd66a0da676ac97b912 Mon Sep 17 00:00:00 2001 From: ardunn Date: Tue, 29 Mar 2022 19:22:28 -0700 Subject: [PATCH 09/16] refactoring featurizers 2 --- beep/features/all_cycles/summary.py | 2 +- beep/features/per_cycle/diagnostic.py | 62 +------------------------- beep/features/per_cycle/protocol.py | 64 +++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 61 deletions(-) create mode 100644 beep/features/per_cycle/protocol.py diff --git a/beep/features/all_cycles/summary.py b/beep/features/all_cycles/summary.py index 98dd9a1e..9e1c0e1a 100644 --- a/beep/features/all_cycles/summary.py +++ b/beep/features/all_cycles/summary.py @@ -2,10 +2,10 @@ import pandas as pd from scipy.stats import skew, kurtosis +from beep import PROTOCOL_PARAMETERS_DIR from beep.features.featurizer import BEEPAllCyclesFeaturizer - class CycleSummaryStats(BEEPAllCyclesFeaturizer): DEFAULT_HYPERPARAMETERS = { "cycle_comp_num": [10, 100], diff --git a/beep/features/per_cycle/diagnostic.py b/beep/features/per_cycle/diagnostic.py index 766cf022..9ebbca16 100644 --- a/beep/features/per_cycle/diagnostic.py +++ b/beep/features/per_cycle/diagnostic.py @@ -1,17 +1,14 @@ -import numpy as np import pandas as pd -from scipy.stats import skew, kurtosis -from scipy.interpolate import interp1d from beep import PROTOCOL_PARAMETERS_DIR from beep.features import featurizer_helpers from functools import reduce -from beep.utils.parameters_lookup import get_protocol_parameters from beep.features.featurizer import BEEPPerCycleFeaturizer -class DiagnosticCycleFeatures(BEEPPerCycleFeaturizer: + +class DiagnosticFeaturesPerCycle(BEEPPerCycleFeaturizer): """ This class stores fractional levels of degradation in discharge capacity and discharge energy relative to the first cycle at each diagnostic cycle, grouped by diagnostic cycle type. @@ -177,59 +174,4 @@ def create_features(self): drop=True) -class CyclingProtocol(BEEPPerCycleFeaturizer): - """ - This class stores information about the charging protocol used - name (str): predictor object name. - X (pandas.DataFrame): features in DataFrame format. - metadata (dict): information about the conditions, data - and code used to produce features - Hyperparameters: - parameters_dir (str): Full path to directory of charging protocol parameters - quantities ([str]): list of parameters to return - """ - DEFAULT_HYPERPARAMETERS = { - "parameters_dir": PROTOCOL_PARAMETERS_DIR, - "quantities": ["charge_constant_current_1", "charge_constant_current_2", - "charge_cutoff_voltage", "charge_constant_voltage_time", - "discharge_constant_current", - "discharge_cutoff_voltage"], - } - def validate(self): - """ - This function determines if the input data has the necessary attributes for - creation of this feature class. 
It should test for all of the possible reasons - that feature generation would fail for this particular input data. - Args: - processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run - params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object - gets featurized. These could be filters for column or row operations - Returns: - bool: True/False indication of ability to proceed with feature generation - """ - if not ( - 'raw' in self.datapath.paths.keys() or 'structured' in self.datapath.paths.keys()): - message = "datapath paths not set, unable to fetch charging protocol" - return False, message - else: - return featurizer_helpers.check_diagnostic_validation(self.datapath) - - def create_features(self): - """ - Fetches charging protocol features - """ - - parameters_path = self.hyperparameters["parameters_dir"] - file_path = self.datapath.paths[ - 'raw'] if 'raw' in self.datapath.paths.keys() else \ - self.datapath.paths['structured'] - - parameters, _ = get_protocol_parameters(file_path, parameters_path) - - parameters = parameters[self.hyperparameters["quantities"]] - parameters['cycle_index'] = int( - 0) # create a cycle index column for merging with other featurizers - parameters['diag_pos'] = int( - 0) # create a diag_pos column for merging with other featurizers - self.features = parameters diff --git a/beep/features/per_cycle/protocol.py b/beep/features/per_cycle/protocol.py new file mode 100644 index 00000000..23a73be3 --- /dev/null +++ b/beep/features/per_cycle/protocol.py @@ -0,0 +1,64 @@ + +from beep import PROTOCOL_PARAMETERS_DIR +from beep.features import featurizer_helpers +from beep.utils.parameters_lookup import get_protocol_parameters + +from beep.features.featurizer import BEEPPerCycleFeaturizer + + +class CyclingProtocolPerCycle(BEEPPerCycleFeaturizer): + """ + This class stores information about the charging protocol used + name (str): predictor object name. + X (pandas.DataFrame): features in DataFrame format. + metadata (dict): information about the conditions, data + and code used to produce features + Hyperparameters: + parameters_dir (str): Full path to directory of charging protocol parameters + quantities ([str]): list of parameters to return + """ + DEFAULT_HYPERPARAMETERS = { + "parameters_dir": PROTOCOL_PARAMETERS_DIR, + "quantities": ["charge_constant_current_1", "charge_constant_current_2", + "charge_cutoff_voltage", "charge_constant_voltage_time", + "discharge_constant_current", + "discharge_cutoff_voltage"], + } + + def validate(self): + """ + This function determines if the input data has the necessary attributes for + creation of this feature class. It should test for all of the possible reasons + that feature generation would fail for this particular input data. + Args: + processed_cycler_run (beep.structure.ProcessedCyclerRun): data from cycler run + params_dict (dict): dictionary of parameters governing how the ProcessedCyclerRun object + gets featurized. 
These could be filters for column or row operations + Returns: + bool: True/False indication of ability to proceed with feature generation + """ + if not ( + 'raw' in self.datapath.paths.keys() or 'structured' in self.datapath.paths.keys()): + message = "datapath paths not set, unable to fetch charging protocol" + return False, message + else: + return featurizer_helpers.check_diagnostic_validation(self.datapath) + + def create_features(self): + """ + Fetches charging protocol features + """ + + parameters_path = self.hyperparameters["parameters_dir"] + file_path = self.datapath.paths[ + 'raw'] if 'raw' in self.datapath.paths.keys() else \ + self.datapath.paths['structured'] + + parameters, _ = get_protocol_parameters(file_path, parameters_path) + + parameters = parameters[self.hyperparameters["quantities"]] + parameters['cycle_index'] = int( + 0) # create a cycle index column for merging with other featurizers + parameters['diag_pos'] = int( + 0) # create a diag_pos column for merging with other featurizers + self.features = parameters \ No newline at end of file From 1e7a3ce6ab206ff1f9e7eaed0cba7cdd4088a014 Mon Sep 17 00:00:00 2001 From: ardunn Date: Tue, 29 Mar 2022 19:47:48 -0700 Subject: [PATCH 10/16] move intracell to its own modules, further refactor --- beep/features/all_cycles/hppc.py | 6 ------ beep/features/core.py | 17 ----------------- beep/features/intracell/__init__.py | 0 .../intracell_analysis.py | 0 .../intracell_analysisv2.py} | 0 .../intracell_losses.py | 8 ++++---- .../intracell_lossesv2.py} | 6 +++--- beep/features/matrix.py | 11 +++++++---- 8 files changed, 14 insertions(+), 34 deletions(-) delete mode 100644 beep/features/core.py create mode 100644 beep/features/intracell/__init__.py rename beep/features/{all_cycles => intracell}/intracell_analysis.py (100%) rename beep/features/{all_cycles/intracell_analysis_v2.py => intracell/intracell_analysisv2.py} (100%) rename beep/features/{all_cycles => intracell}/intracell_losses.py (97%) rename beep/features/{all_cycles/intracell_losses_v2.py => intracell/intracell_lossesv2.py} (98%) diff --git a/beep/features/all_cycles/hppc.py b/beep/features/all_cycles/hppc.py index 45849dd6..ec1eb9e9 100644 --- a/beep/features/all_cycles/hppc.py +++ b/beep/features/all_cycles/hppc.py @@ -1,13 +1,7 @@ -import numpy as np import pandas as pd -from scipy.stats import skew, kurtosis -from scipy.interpolate import interp1d from beep import PROTOCOL_PARAMETERS_DIR from beep.features import featurizer_helpers -from functools import reduce -from beep.utils.parameters_lookup import get_protocol_parameters - from beep.features.featurizer import BEEPAllCyclesFeaturizer diff --git a/beep/features/core.py b/beep/features/core.py deleted file mode 100644 index 3c0cb953..00000000 --- a/beep/features/core.py +++ /dev/null @@ -1,17 +0,0 @@ -import numpy as np -import pandas as pd -from scipy.stats import skew, kurtosis -from scipy.interpolate import interp1d - -from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers -from functools import reduce -from beep.utils.parameters_lookup import get_protocol_parameters - -from beep.features.base import BEEPFeaturizer, BEEPFeaturizationError - - - - - - diff --git a/beep/features/intracell/__init__.py b/beep/features/intracell/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/beep/features/all_cycles/intracell_analysis.py b/beep/features/intracell/intracell_analysis.py similarity index 100% rename from beep/features/all_cycles/intracell_analysis.py rename to 
beep/features/intracell/intracell_analysis.py diff --git a/beep/features/all_cycles/intracell_analysis_v2.py b/beep/features/intracell/intracell_analysisv2.py similarity index 100% rename from beep/features/all_cycles/intracell_analysis_v2.py rename to beep/features/intracell/intracell_analysisv2.py diff --git a/beep/features/all_cycles/intracell_losses.py b/beep/features/intracell/intracell_losses.py similarity index 97% rename from beep/features/all_cycles/intracell_losses.py rename to beep/features/intracell/intracell_losses.py index e8ce7746..84f5a004 100644 --- a/beep/features/all_cycles/intracell_losses.py +++ b/beep/features/intracell/intracell_losses.py @@ -4,14 +4,14 @@ from beep import PROTOCOL_PARAMETERS_DIR from beep.features import featurizer_helpers -from beep.features.base import BEEPFeaturizer -from beep.features.intracell_analysis import IntracellAnalysis +from beep.features.featurizer import BEEPAllCyclesFeaturizer +from beep.features.intracell.intracell_analysis import IntracellAnalysis DEFAULT_CELL_INFO_DIR = os.path.join(PROTOCOL_PARAMETERS_DIR, "intracell_info") -class IntracellCycles(BEEPFeaturizer): +class IntracellAllCycles(BEEPAllCyclesFeaturizer): """ Object corresponding to the fitted material parameters of the cell. Material parameters are determined by using high resolution half cell data to fit full cell dQdV curves. Rows @@ -131,7 +131,7 @@ def create_features(self): self.features = degradation_df -class IntracellFeatures(IntracellCycles): +class IntracellAllCyclesFeatures(IntracellAllCycles): """ Object corresponding to the fitted material parameters of the cell. Material parameters are determined by using high resolution half cell data to fit full cell dQdV curves. The diff --git a/beep/features/all_cycles/intracell_losses_v2.py b/beep/features/intracell/intracell_lossesv2.py similarity index 98% rename from beep/features/all_cycles/intracell_losses_v2.py rename to beep/features/intracell/intracell_lossesv2.py index 516a4cdc..d5c0cd46 100644 --- a/beep/features/all_cycles/intracell_losses_v2.py +++ b/beep/features/intracell/intracell_lossesv2.py @@ -4,13 +4,13 @@ from beep import PROTOCOL_PARAMETERS_DIR from beep.features import featurizer_helpers -from beep.features.base import BEEPFeaturizer -from beep.features.intracell_analysis_v2 import IntracellAnalysisV2 +from beep.features.featurizer import BEEPPerCycleFeaturizer +from beep.features.intracell.intracell_analysisv2 import IntracellAnalysisV2 DEFAULT_CELL_INFO_DIR = os.path.join(PROTOCOL_PARAMETERS_DIR, "intracell_info") -class IntracellCyclesV2(BEEPFeaturizer): +class IntracellCyclesV2(BEEPPerCycleFeaturizer): """ Object corresponding to the fitted material parameters of the cell. Material parameters are determined by using high resolution half cell data to fit full cell dQdV curves. 
Rows diff --git a/beep/features/matrix.py b/beep/features/matrix.py index 355899cb..cf338e62 100644 --- a/beep/features/matrix.py +++ b/beep/features/matrix.py @@ -140,11 +140,14 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # Create consistent scheme for naming features regardless of file # Only rename non-special column names df = copy.deepcopy(bf.features) - special_column_names = BEEPPerCycleFeaturizer.SPECIAL_COLUMNS if self.per_cycle else set() - consistent_column_names = [ - f"{c}{self.OP_DELIMITER}{feature_op_id}" for c in df.columns if c not in special_column_names - ] + consistent_column_names = [] + + for c in df.columns: + if c in special_column_names: + consistent_column_names.append(c) + else: + consistent_column_names.append(f"{c}{self.OP_DELIMITER}{feature_op_id}") df.columns = consistent_column_names # ensure cycle_index and diag_pos are integers From 2f1917723e95e251836da818bb9a7ff7133dc531 Mon Sep 17 00:00:00 2001 From: ardunn Date: Tue, 29 Mar 2022 20:02:11 -0700 Subject: [PATCH 11/16] wip fixing matrix for per-cycle featurizers merges --- beep/features/matrix.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/beep/features/matrix.py b/beep/features/matrix.py index cf338e62..ebe16152 100644 --- a/beep/features/matrix.py +++ b/beep/features/matrix.py @@ -154,31 +154,43 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): if self.per_cycle: for col in BEEPPerCycleFeaturizer.SPECIAL_COLUMNS: df[col] = df[col].astype(int) + df["filename"] = [fname] * df.shape[0] + + else: + df.index = [fname] * df.shape[0] + df.index.rename("filename", inplace=True) - df.index = [fname] * df.shape[0] - df.index.rename("filename", inplace=True) dfs_by_file[fname].append(df) blocks = [] # concat dfs by file across columns for filename, dfs in dfs_by_file.items(): - if self.per_cycle: + merge_keys = list(BEEPPerCycleFeaturizer.SPECIAL_COLUMNS) + \ + ["filename"] rows = pd.concat( dfs, axis=1, join="outer", - ignore_index=False, - keys=list(BEEPPerCycleFeaturizer.SPECIAL_COLUMNS) + ignore_index=True, + keys=merge_keys ) else: - rows = pd.concat(dfs, axis=1) rows = rows[sorted(rows.columns)] blocks.append(rows) + + # if self.per_cycle: + # self.matrix = pd.concat( + # blocks, + # join="outer", + # ignore_index=True + # axis=0 + # ) + # else: # concat all dfs for all files across rows self.matrix = pd.concat(blocks, axis=0) From d8781b530cbd970cc6509030cced7020dcc4d070 Mon Sep 17 00:00:00 2001 From: ardunn Date: Tue, 29 Mar 2022 20:36:05 -0700 Subject: [PATCH 12/16] bugfixing on merging values for feature matrix from per-cycler featurizers --- beep/features/matrix.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/beep/features/matrix.py b/beep/features/matrix.py index ebe16152..5e3c997a 100644 --- a/beep/features/matrix.py +++ b/beep/features/matrix.py @@ -1,13 +1,13 @@ import copy import hashlib -from typing import List +from typing import List, Union import pandas as pd from monty.json import MSONable, MontyDecoder from monty.serialization import loadfn, dumpfn -from beep.features.featurizer import BEEPFeaturizer, BEEPPerCycleFeaturizer +from beep.features.featurizer import BEEPFeaturizer, BEEPPerCycleFeaturizer, BEEPAllCyclesFeaturizer class BEEPFeatureMatrixError(BaseException): @@ -46,7 +46,10 @@ class BEEPFeatureMatrix(MSONable): OP_DELIMITER = "::" - def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): + def __init__( + self, + beepfeaturizers: 
List[Union[BEEPPerCycleFeaturizer, BEEPAllCyclesFeaturizer]] + ): if beepfeaturizers: bfs_types_per_cycle = [bf.PER_CYCLE for bf in beepfeaturizers] @@ -163,18 +166,20 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): dfs_by_file[fname].append(df) blocks = [] + self.dfs_by_file = dfs_by_file + # concat dfs by file across columns for filename, dfs in dfs_by_file.items(): if self.per_cycle: - merge_keys = list(BEEPPerCycleFeaturizer.SPECIAL_COLUMNS) + \ - ["filename"] - rows = pd.concat( - dfs, - axis=1, - join="outer", - ignore_index=True, - keys=merge_keys - ) + rows = pd.DataFrame() + for df in dfs: + rows = pd.merge( + rows, + df, + how="outer", + on=["filename", "cycle_index", "diag_pos"] + ) + rows.reset_index(inplace=True).sort_values(["cycle_index"], inplace=True) else: rows = pd.concat(dfs, axis=1) @@ -187,8 +192,9 @@ def __init__(self, beepfeaturizers: List[BEEPFeaturizer]): # self.matrix = pd.concat( # blocks, # join="outer", - # ignore_index=True - # axis=0 + # ignore_index=False, + # axis=1, + # keys=merge_keys # ) # else: # concat all dfs for all files across rows From 33af986b7387a000e9c0514a482166099ee6e9b7 Mon Sep 17 00:00:00 2001 From: ardunn Date: Tue, 29 Mar 2022 20:47:51 -0700 Subject: [PATCH 13/16] working matrix for both per-cycle and all-cycle featurizers --- beep/features/matrix.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/beep/features/matrix.py b/beep/features/matrix.py index 5e3c997a..5e145cae 100644 --- a/beep/features/matrix.py +++ b/beep/features/matrix.py @@ -167,19 +167,22 @@ def __init__( blocks = [] self.dfs_by_file = dfs_by_file + indexing_cols = ["filename"] + list(BEEPPerCycleFeaturizer.SPECIAL_COLUMNS) # concat dfs by file across columns for filename, dfs in dfs_by_file.items(): if self.per_cycle: - rows = pd.DataFrame() - for df in dfs: + rows = dfs[0] + for df in dfs[1:]: rows = pd.merge( rows, df, how="outer", - on=["filename", "cycle_index", "diag_pos"] + on=indexing_cols ) - rows.reset_index(inplace=True).sort_values(["cycle_index"], inplace=True) + rows = rows.reset_index().sort_values(["cycle_index"]) + feature_cols = sorted([f for f in rows.columns if f not in indexing_cols]) + rows = rows[indexing_cols + feature_cols] else: rows = pd.concat(dfs, axis=1) @@ -187,16 +190,6 @@ def __init__( blocks.append(rows) - - # if self.per_cycle: - # self.matrix = pd.concat( - # blocks, - # join="outer", - # ignore_index=False, - # axis=1, - # keys=merge_keys - # ) - # else: # concat all dfs for all files across rows self.matrix = pd.concat(blocks, axis=0) From baab63b0298331243d430037b8c323de874201f4 Mon Sep 17 00:00:00 2001 From: ardunn Date: Wed, 30 Mar 2022 13:43:26 -0700 Subject: [PATCH 14/16] refactor from "allcycles" to "earlycycles" as per @pasinger recommendation --- beep/features/all_cycles/__init__.py | 0 beep/features/early_cycles/__init__.py | 14 ++++++++++++++ .../{all_cycles => early_cycles}/delta_q.py | 4 ++-- beep/features/{all_cycles => early_cycles}/hppc.py | 4 ++-- .../{all_cycles => early_cycles}/summary.py | 4 ++-- .../{all_cycles => early_cycles}/targets.py | 0 beep/features/featurizer.py | 4 ++-- beep/features/intracell/intracell_losses.py | 4 ++-- beep/features/matrix.py | 8 ++++---- 9 files changed, 28 insertions(+), 14 deletions(-) delete mode 100644 beep/features/all_cycles/__init__.py create mode 100644 beep/features/early_cycles/__init__.py rename beep/features/{all_cycles => early_cycles}/delta_q.py (98%) rename beep/features/{all_cycles => early_cycles}/hppc.py (95%) 
rename beep/features/{all_cycles => early_cycles}/summary.py (99%) rename beep/features/{all_cycles => early_cycles}/targets.py (100%) diff --git a/beep/features/all_cycles/__init__.py b/beep/features/all_cycles/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/beep/features/early_cycles/__init__.py b/beep/features/early_cycles/__init__.py new file mode 100644 index 00000000..520baf44 --- /dev/null +++ b/beep/features/early_cycles/__init__.py @@ -0,0 +1,14 @@ +""" +Features for predicting cycling characteristics from early cycles. + +Typically these features are formed by looking at two or more early cycles, +assessing the differences between features of these cycles, then relating +that to the degradation characteristics of the battery. These features +can then be used with any cycler file with few numbers of cycles to predict +the total number of cycles before reaching certain degradation thresholds. +""" + +from beep.features.early_cycles.delta_q import DeltaQFastCharge +from beep.features.early_cycles.hppc import HPPCResistanceVoltage +from beep.features.early_cycles.summary import DiagnosticSummaryStats, CycleSummaryStats +from beep.features.early_cycles.targets import TrajectoryFastCharge, DiagnosticProperties \ No newline at end of file diff --git a/beep/features/all_cycles/delta_q.py b/beep/features/early_cycles/delta_q.py similarity index 98% rename from beep/features/all_cycles/delta_q.py rename to beep/features/early_cycles/delta_q.py index 6f8d2ed0..64feaac9 100644 --- a/beep/features/all_cycles/delta_q.py +++ b/beep/features/early_cycles/delta_q.py @@ -2,10 +2,10 @@ import pandas as pd from scipy.stats import skew, kurtosis -from beep.features.featurizer import BEEPAllCyclesFeaturizer +from beep.features.featurizer import BEEPEarlyCyclesFeaturizer -class DeltaQFastCharge(BEEPAllCyclesFeaturizer): +class DeltaQFastCharge(BEEPEarlyCyclesFeaturizer): """ Object corresponding to feature object. 
Includes constructors to create the features, object names and metadata attributes in the diff --git a/beep/features/all_cycles/hppc.py b/beep/features/early_cycles/hppc.py similarity index 95% rename from beep/features/all_cycles/hppc.py rename to beep/features/early_cycles/hppc.py index ec1eb9e9..9cba2b14 100644 --- a/beep/features/all_cycles/hppc.py +++ b/beep/features/early_cycles/hppc.py @@ -2,10 +2,10 @@ from beep import PROTOCOL_PARAMETERS_DIR from beep.features import featurizer_helpers -from beep.features.featurizer import BEEPAllCyclesFeaturizer +from beep.features.featurizer import BEEPEarlyCyclesFeaturizer -class HPPCResistanceVoltageAllCycles(BEEPAllCyclesFeaturizer): +class HPPCResistanceVoltage(BEEPEarlyCyclesFeaturizer): DEFAULT_HYPERPARAMETERS = { "test_time_filter_sec": 1000000, "cycle_index_filter": 6, diff --git a/beep/features/all_cycles/summary.py b/beep/features/early_cycles/summary.py similarity index 99% rename from beep/features/all_cycles/summary.py rename to beep/features/early_cycles/summary.py index 9e1c0e1a..dc6a99e4 100644 --- a/beep/features/all_cycles/summary.py +++ b/beep/features/early_cycles/summary.py @@ -3,10 +3,10 @@ from scipy.stats import skew, kurtosis from beep import PROTOCOL_PARAMETERS_DIR -from beep.features.featurizer import BEEPAllCyclesFeaturizer +from beep.features.featurizer import BEEPEarlyCyclesFeaturizer -class CycleSummaryStats(BEEPAllCyclesFeaturizer): +class CycleSummaryStats(BEEPEarlyCyclesFeaturizer): DEFAULT_HYPERPARAMETERS = { "cycle_comp_num": [10, 100], "statistics": ["var", "min", "mean", "skew", "kurtosis", "abs", diff --git a/beep/features/all_cycles/targets.py b/beep/features/early_cycles/targets.py similarity index 100% rename from beep/features/all_cycles/targets.py rename to beep/features/early_cycles/targets.py diff --git a/beep/features/featurizer.py b/beep/features/featurizer.py index 7a3e57cf..2fb1f181 100644 --- a/beep/features/featurizer.py +++ b/beep/features/featurizer.py @@ -192,13 +192,13 @@ def to_json_file(self, filename): dumpfn(d, filename) -class BEEPAllCyclesFeaturizer(BEEPFeaturizer): +class BEEPEarlyCyclesFeaturizer(BEEPFeaturizer): """Base class for featurizers that return a constant number of features for any number of cycles in a structured datapath. These features are typically used for early prediction. - A BEEPAllCyclesFeaturizer always returns the same number of features + A BEEPEarlyCyclesFeaturizer always returns the same number of features for files for datapaths with any number of samples. Thus, diff --git a/beep/features/intracell/intracell_losses.py b/beep/features/intracell/intracell_losses.py index 84f5a004..9a387cee 100644 --- a/beep/features/intracell/intracell_losses.py +++ b/beep/features/intracell/intracell_losses.py @@ -4,14 +4,14 @@ from beep import PROTOCOL_PARAMETERS_DIR from beep.features import featurizer_helpers -from beep.features.featurizer import BEEPAllCyclesFeaturizer +from beep.features.featurizer import BEEPEarlyCyclesFeaturizer from beep.features.intracell.intracell_analysis import IntracellAnalysis DEFAULT_CELL_INFO_DIR = os.path.join(PROTOCOL_PARAMETERS_DIR, "intracell_info") -class IntracellAllCycles(BEEPAllCyclesFeaturizer): +class IntracellAllCycles(BEEPEarlyCyclesFeaturizer): """ Object corresponding to the fitted material parameters of the cell. Material parameters are determined by using high resolution half cell data to fit full cell dQdV curves. 
Rows diff --git a/beep/features/matrix.py b/beep/features/matrix.py index 5e145cae..872c93e2 100644 --- a/beep/features/matrix.py +++ b/beep/features/matrix.py @@ -7,7 +7,7 @@ from monty.json import MSONable, MontyDecoder from monty.serialization import loadfn, dumpfn -from beep.features.featurizer import BEEPFeaturizer, BEEPPerCycleFeaturizer, BEEPAllCyclesFeaturizer +from beep.features.featurizer import BEEPFeaturizer, BEEPPerCycleFeaturizer, BEEPEarlyCyclesFeaturizer class BEEPFeatureMatrixError(BaseException): @@ -21,7 +21,7 @@ class BEEPFeatureMatrix(MSONable): The array may either be: - PER-CYCLER-RUN, using BEEPAllCyclesFeaturizer. + PER-CYCLER-RUN, using BEEPEarlyCyclesFeaturizer. One feature vector per cycler file, resulting in an array w. dimenions: (n battery cycler files) x (k features) @@ -39,7 +39,7 @@ class BEEPFeatureMatrix(MSONable): Args: beepfeaturizers ([BEEPFeaturizer]): A list of BEEPFeaturizer objects, - either ALL BEEPAllCyclesFeaturizer child objects OR ALL + either ALL BEEPEarlyCyclesFeaturizer child objects OR ALL BEEPPerCycleFeaturizer child objects. """ @@ -48,7 +48,7 @@ class BEEPFeatureMatrix(MSONable): def __init__( self, - beepfeaturizers: List[Union[BEEPPerCycleFeaturizer, BEEPAllCyclesFeaturizer]] + beepfeaturizers: List[Union[BEEPPerCycleFeaturizer, BEEPEarlyCyclesFeaturizer]] ): if beepfeaturizers: From 3dd9da5e05c455ab0249cca5078f2106fee3f2a3 Mon Sep 17 00:00:00 2001 From: ardunn Date: Wed, 30 Mar 2022 13:49:30 -0700 Subject: [PATCH 15/16] featurizer_helpers --> helper_functions --- beep/features/early_cycles/hppc.py | 12 +-- beep/features/early_cycles/targets.py | 6 +- ...turizer_helpers.py => helper_functions.py} | 0 beep/features/intracell/intracell_losses.py | 4 +- beep/features/intracell/intracell_lossesv2.py | 4 +- beep/features/per_cycle/diagnostic.py | 4 +- beep/features/per_cycle/hppc.py | 6 +- beep/features/per_cycle/protocol.py | 4 +- beep/features/tests/test_features.py | 74 +++++++++---------- 9 files changed, 57 insertions(+), 57 deletions(-) rename beep/features/{featurizer_helpers.py => helper_functions.py} (100%) diff --git a/beep/features/early_cycles/hppc.py b/beep/features/early_cycles/hppc.py index 9cba2b14..cbedb678 100644 --- a/beep/features/early_cycles/hppc.py +++ b/beep/features/early_cycles/hppc.py @@ -1,7 +1,7 @@ import pandas as pd from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers +from beep.features import helper_functions from beep.features.featurizer import BEEPEarlyCyclesFeaturizer @@ -15,7 +15,7 @@ class HPPCResistanceVoltage(BEEPEarlyCyclesFeaturizer): } def validate(self): - val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + val, msg = helper_functions.check_diagnostic_validation(self.datapath) if val: conditions = [] conditions.append( @@ -47,13 +47,13 @@ def create_features(self): ).filter(lambda x: ~x["test_time"].isnull().all()) # diffusion features - diffusion_features = featurizer_helpers.get_diffusion_early_features( + diffusion_features = helper_functions.get_diffusion_early_features( self.datapath, ) hppc_r = pd.DataFrame() # the 9 by 6 dataframe - df_dr = featurizer_helpers.get_dr_df( + df_dr = helper_functions.get_dr_df( self.datapath, self.hyperparameters["diag_pos"] ) # transform this dataframe to be 1 by 54 @@ -64,14 +64,14 @@ def create_features(self): hppc_r[name] = [df_dr[column][r]] # the variance of ocv features - hppc_ocv = featurizer_helpers.get_hppc_ocv( + hppc_ocv = helper_functions.get_hppc_ocv( self.datapath, 
self.hyperparameters["diag_pos"], parameters_path=self.hyperparameters["parameters_path"] ) # the v_diff features - v_diff = featurizer_helpers.get_v_diff( + v_diff = helper_functions.get_v_diff( self.datapath, self.hyperparameters["diag_pos"], self.hyperparameters["soc_window"], diff --git a/beep/features/early_cycles/targets.py b/beep/features/early_cycles/targets.py index 7419b3a9..aecd7e80 100644 --- a/beep/features/early_cycles/targets.py +++ b/beep/features/early_cycles/targets.py @@ -3,7 +3,7 @@ from scipy.interpolate import interp1d from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers +from beep.features import helper_functions from beep.features.featurizer import BEEPFeaturizer, BEEPFeaturizationError @@ -118,7 +118,7 @@ def validate(self): Returns: bool: True/False indication of ability to proceed with feature generation """ - return featurizer_helpers.check_diagnostic_validation(self.datapath) + return helper_functions.check_diagnostic_validation(self.datapath) def create_features(self): """ @@ -140,7 +140,7 @@ def create_features(self): X = pd.DataFrame() for quantity in self.hyperparameters["quantities"]: for cycle_type in cycle_types: - summary_diag_cycle_type = featurizer_helpers.get_fractional_quantity_remaining_nx( + summary_diag_cycle_type = helper_functions.get_fractional_quantity_remaining_nx( self.datapath, quantity, cycle_type, parameters_path=parameters_path ) diff --git a/beep/features/featurizer_helpers.py b/beep/features/helper_functions.py similarity index 100% rename from beep/features/featurizer_helpers.py rename to beep/features/helper_functions.py diff --git a/beep/features/intracell/intracell_losses.py b/beep/features/intracell/intracell_losses.py index 9a387cee..0ac1da92 100644 --- a/beep/features/intracell/intracell_losses.py +++ b/beep/features/intracell/intracell_losses.py @@ -3,7 +3,7 @@ import pandas as pd from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers +from beep.features import helper_functions from beep.features.featurizer import BEEPEarlyCyclesFeaturizer from beep.features.intracell.intracell_analysis import IntracellAnalysis @@ -52,7 +52,7 @@ def validate(self): Returns: bool: True/False indication of ability to proceed with feature generation """ - val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + val, msg = helper_functions.check_diagnostic_validation(self.datapath) if val: conditions = [] diff --git a/beep/features/intracell/intracell_lossesv2.py b/beep/features/intracell/intracell_lossesv2.py index d5c0cd46..fdad7202 100644 --- a/beep/features/intracell/intracell_lossesv2.py +++ b/beep/features/intracell/intracell_lossesv2.py @@ -3,7 +3,7 @@ import pandas as pd from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers +from beep.features import helper_functions from beep.features.featurizer import BEEPPerCycleFeaturizer from beep.features.intracell.intracell_analysisv2 import IntracellAnalysisV2 @@ -56,7 +56,7 @@ def validate(self): Returns: bool: True/False indication of ability to proceed with feature generation """ - val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + val, msg = helper_functions.check_diagnostic_validation(self.datapath) if val: conditions = [] diff --git a/beep/features/per_cycle/diagnostic.py b/beep/features/per_cycle/diagnostic.py index 9ebbca16..f4a1b0ef 100644 --- a/beep/features/per_cycle/diagnostic.py +++ b/beep/features/per_cycle/diagnostic.py @@ -2,7 +2,7 @@ import 
pandas as pd from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers +from beep.features import helper_functions from functools import reduce from beep.features.featurizer import BEEPPerCycleFeaturizer @@ -41,7 +41,7 @@ def validate(self): Returns: bool: True/False indication of ability to proceed with feature generation """ - return featurizer_helpers.check_diagnostic_validation(self.datapath) + return helper_functions.check_diagnostic_validation(self.datapath) def create_features(self): """ diff --git a/beep/features/per_cycle/hppc.py b/beep/features/per_cycle/hppc.py index 4f6408ba..d95f4c3c 100644 --- a/beep/features/per_cycle/hppc.py +++ b/beep/features/per_cycle/hppc.py @@ -1,7 +1,7 @@ import pandas as pd from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers +from beep.features import helper_functions from beep.features.featurizer import BEEPPerCycleFeaturizer @@ -16,7 +16,7 @@ class HPPCResistanceVoltagePerCycle(BEEPPerCycleFeaturizer): } def validate(self): - val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + val, msg = helper_functions.check_diagnostic_validation(self.datapath) if val: conditions = [] conditions.append( @@ -56,7 +56,7 @@ def create_features(self): # ) # hppc resistance features - hppc_resistance_features = featurizer_helpers.get_hppc_resistance_cycle_features( + hppc_resistance_features = helper_functions.get_hppc_resistance_cycle_features( self.datapath, ) diff --git a/beep/features/per_cycle/protocol.py b/beep/features/per_cycle/protocol.py index 23a73be3..63c78926 100644 --- a/beep/features/per_cycle/protocol.py +++ b/beep/features/per_cycle/protocol.py @@ -1,6 +1,6 @@ from beep import PROTOCOL_PARAMETERS_DIR -from beep.features import featurizer_helpers +from beep.features import helper_functions from beep.utils.parameters_lookup import get_protocol_parameters from beep.features.featurizer import BEEPPerCycleFeaturizer @@ -42,7 +42,7 @@ def validate(self): message = "datapath paths not set, unable to fetch charging protocol" return False, message else: - return featurizer_helpers.check_diagnostic_validation(self.datapath) + return helper_functions.check_diagnostic_validation(self.datapath) def create_features(self): """ diff --git a/beep/features/tests/test_features.py b/beep/features/tests/test_features.py index 59171edb..9423e331 100644 --- a/beep/features/tests/test_features.py +++ b/beep/features/tests/test_features.py @@ -29,7 +29,7 @@ from beep.structure.maccor import MaccorDatapath from beep.structure.cli import auto_load_processed, auto_load -from beep.features import featurizer_helpers +from beep.features import helper_functions from beep.utils import parameters_lookup from monty.serialization import dumpfn, loadfn from monty.tempfile import ScratchDir @@ -270,9 +270,9 @@ def test_get_fractional_quantity_remaining_nx(self): structured_datapath.structured_summary = structured_datapath.structured_summary[ ~structured_datapath.structured_summary.cycle_index.isin(structured_datapath.diagnostic_summary.cycle_index)] - sum_diag = featurizer_helpers.get_fractional_quantity_remaining_nx(structured_datapath, - metric="discharge_energy", - diagnostic_cycle_type="hppc") + sum_diag = helper_functions.get_fractional_quantity_remaining_nx(structured_datapath, + metric="discharge_energy", + diagnostic_cycle_type="hppc") # print(sum_diag["normalized_regular_throughput"]) self.assertEqual(len(sum_diag.index), 16) self.assertEqual(sum_diag.cycle_index.max(), 1507) @@ -284,9 +284,9 
@@ def test_get_fractional_quantity_remaining_nx(self): self.assertEqual(sum_diag['diagnostic_interval'].iloc[0], 100) self.assertEqual(sum_diag['epoch_time'].iloc[0], 1576641695) - sum_diag = featurizer_helpers.get_fractional_quantity_remaining_nx(structured_datapath, - metric="discharge_energy", - diagnostic_cycle_type="rpt_1C") + sum_diag = helper_functions.get_fractional_quantity_remaining_nx(structured_datapath, + metric="discharge_energy", + diagnostic_cycle_type="rpt_1C") self.assertEqual(len(sum_diag.index), 16) self.assertEqual(sum_diag.cycle_index.max(), 1509) self.assertEqual(np.around(sum_diag["initial_regular_throughput"].iloc[0], 3), np.around(237.001769, 3)) @@ -301,9 +301,9 @@ def test_get_fractional_quantity_remaining_nx(self): ) structured_datapath = auto_load_processed(processed_cycler_run_path_2) - sum_diag = featurizer_helpers.get_fractional_quantity_remaining_nx(structured_datapath, - metric="discharge_energy", - diagnostic_cycle_type="hppc") + sum_diag = helper_functions.get_fractional_quantity_remaining_nx(structured_datapath, + metric="discharge_energy", + diagnostic_cycle_type="hppc") self.assertEqual(len(sum_diag.index), 3) self.assertEqual(sum_diag.cycle_index.max(), 242) self.assertEqual(np.around(sum_diag["initial_regular_throughput"].iloc[0], 3), np.around(331.428, 3)) @@ -329,7 +329,7 @@ def test_get_v_diff(self): with ScratchDir("."): # processed_cycler_run_path_1 structured_datapath = auto_load_processed(processed_cycler_run_path_1) - v_vars_df = featurizer_helpers.get_v_diff(structured_datapath, 1, 8) + v_vars_df = helper_functions.get_v_diff(structured_datapath, 1, 8) # print(v_vars_df) self.assertEqual(np.round(v_vars_df.iloc[0]['var_v_diff'], decimals=8), np.round(0.00472705, decimals=8)) @@ -344,7 +344,7 @@ def test_get_v_diff(self): # processed_cycler_run_path_2 structured_datapath = auto_load_processed(processed_cycler_run_path_2) - v_vars_df = featurizer_helpers.get_v_diff(structured_datapath, 1, 8) + v_vars_df = helper_functions.get_v_diff(structured_datapath, 1, 8) # print(v_vars_df) self.assertEqual(np.round(v_vars_df.iloc[0]['var_v_diff'], decimals=8), np.round(2.664e-05, decimals=8)) @@ -359,7 +359,7 @@ def test_get_v_diff(self): # processed_cycler_run_path_3 structured_datapath = auto_load_processed(processed_cycler_run_path_3) - v_vars_df = featurizer_helpers.get_v_diff(structured_datapath, 1, 8) + v_vars_df = helper_functions.get_v_diff(structured_datapath, 1, 8) # print(v_vars_df) self.assertEqual(np.round(v_vars_df.iloc[0]['var_v_diff'], decimals=8), np.round(4.82e-06, decimals=8)) @@ -374,7 +374,7 @@ def test_get_v_diff(self): # processed_cycler_run_path_4 structured_datapath = auto_load_processed(processed_cycler_run_path_4) - v_vars_df = featurizer_helpers.get_v_diff(structured_datapath, 1, 8) + v_vars_df = helper_functions.get_v_diff(structured_datapath, 1, 8) # print(v_vars_df) self.assertEqual(np.round(v_vars_df.iloc[0]['var_v_diff'], decimals=8), np.round(9.71e-06, decimals=8)) @@ -392,7 +392,7 @@ def test_get_hppc_ocv(self): TEST_FILE_DIR, "PreDiag_000240_000227_truncated_structure.json" ) structured_datapath = auto_load_processed(structured_datapath_loc) - hppc_ocv_features = featurizer_helpers.get_hppc_ocv(structured_datapath, 1) + hppc_ocv_features = helper_functions.get_hppc_ocv(structured_datapath, 1) self.assertAlmostEqual(hppc_ocv_features['var_ocv'].iloc[0], 0.000016, 6) self.assertAlmostEqual(hppc_ocv_features['min_ocv'].iloc[0], -0.001291, 6) self.assertAlmostEqual(hppc_ocv_features['mean_ocv'].iloc[0], 0.002221, 6) @@ 
-425,9 +425,9 @@ def test_get_step_index(self): parameter_row["capacity_nominal"].iloc[0], 2) # print(step, median_crate, duration) - step_ind = featurizer_helpers.get_step_index(structured_datapath, - cycle_type="hppc", - diag_pos=0) + step_ind = helper_functions.get_step_index(structured_datapath, + cycle_type="hppc", + diag_pos=0) self.assertEqual(len(step_ind.values()), 6) print([step_ind["hppc_long_rest"], step_ind["hppc_discharge_pulse"], @@ -443,9 +443,9 @@ def test_get_step_index(self): 'hppc_charge_pulse': 14, 'hppc_discharge_to_next_soc': 15 }) - step_ind = featurizer_helpers.get_step_index(structured_datapath, - cycle_type="hppc", - diag_pos=1) + step_ind = helper_functions.get_step_index(structured_datapath, + cycle_type="hppc", + diag_pos=1) self.assertEqual(len(step_ind.values()), 6) self.assertEqual(step_ind, { 'hppc_charge_to_soc': 41, @@ -465,9 +465,9 @@ def test_get_step_index_2(self): _, protocol_name = os.path.split(structured_datapath.metadata.protocol) parameter_row, _ = parameters_lookup.get_protocol_parameters(protocol_name, parameters_path=parameters_path) - step_ind = featurizer_helpers.get_step_index(structured_datapath, - cycle_type="hppc", - diag_pos=0) + step_ind = helper_functions.get_step_index(structured_datapath, + cycle_type="hppc", + diag_pos=0) self.assertEqual(len(step_ind.values()), 7) self.assertEqual(step_ind, { @@ -479,9 +479,9 @@ def test_get_step_index_2(self): 'hppc_discharge_to_next_soc': 15, 'hppc_final_discharge': 17 }) - step_ind = featurizer_helpers.get_step_index(structured_datapath, - cycle_type="hppc", - diag_pos=1) + step_ind = helper_functions.get_step_index(structured_datapath, + cycle_type="hppc", + diag_pos=1) self.assertEqual(len(step_ind.values()), 7) self.assertEqual(step_ind, { 'hppc_charge_to_soc': 41, @@ -492,24 +492,24 @@ def test_get_step_index_2(self): 'hppc_discharge_to_next_soc': 47, 'hppc_final_discharge': 49 }) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="reset", diag_pos=0) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="reset", diag_pos=0) self.assertEqual(step_ind, {'reset_charge': 5, 'reset_discharge': 6}) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="reset", diag_pos=1) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="reset", diag_pos=1) self.assertEqual(step_ind, {'reset_charge': 38, 'reset_discharge': 39}) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="rpt_0.2C", diag_pos=0) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="rpt_0.2C", diag_pos=0) self.assertEqual(step_ind, {'rpt_0.2C_charge': 19, 'rpt_0.2C_discharge': 20}) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="rpt_0.2C", diag_pos=1) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="rpt_0.2C", diag_pos=1) self.assertEqual(step_ind, {'rpt_0.2C_charge': 51, 'rpt_0.2C_discharge': 52}) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="rpt_1C", diag_pos=0) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="rpt_1C", diag_pos=0) self.assertEqual(step_ind, {'rpt_1C_charge': 22, 'rpt_1C_discharge': 23}) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="rpt_1C", diag_pos=1) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="rpt_1C", diag_pos=1) self.assertEqual(step_ind, {'rpt_1C_charge': 54, 
'rpt_1C_discharge': 55}) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="rpt_2C", diag_pos=0) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="rpt_2C", diag_pos=0) self.assertEqual(step_ind, {'rpt_2C_charge': 25, 'rpt_2C_discharge': 26}) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="rpt_2C", diag_pos=1) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="rpt_2C", diag_pos=1) self.assertEqual(step_ind, {'rpt_2C_charge': 57, 'rpt_2C_discharge': 58}) def test_get_step_index_3(self): @@ -517,7 +517,7 @@ def test_get_step_index_3(self): TEST_FILE_DIR, "PredictionDiagnostics_000136_00002D_truncated_structure.json" ) structured_datapath = auto_load_processed(structured_datapath_loc) - step_ind = featurizer_helpers.get_step_index(structured_datapath, cycle_type="hppc", diag_pos=0) + step_ind = helper_functions.get_step_index(structured_datapath, cycle_type="hppc", diag_pos=0) self.assertEqual(len(step_ind.values()), 6) def test_get_diffusion_coeff(self): @@ -526,7 +526,7 @@ def test_get_diffusion_coeff(self): TEST_FILE_DIR, "PreDiag_000240_000227_truncated_structure.json" ) structured_datapath = auto_load_processed(structured_datapath_loc) - diffusion_df = featurizer_helpers.get_diffusion_coeff(structured_datapath, 1) + diffusion_df = helper_functions.get_diffusion_coeff(structured_datapath, 1) print(np.round(diffusion_df.iloc[0].to_list(), 3)) self.assertEqual(np.round(diffusion_df.iloc[0].to_list(), 3)[0], -0.016) self.assertEqual(np.round(diffusion_df.iloc[0].to_list(), 3)[5], -0.011) From d07c2ab028646b29a6d6e59163d8600bc261d2d2 Mon Sep 17 00:00:00 2001 From: ardunn Date: Wed, 30 Mar 2022 13:54:25 -0700 Subject: [PATCH 16/16] further refactoring --- beep/cmd.py | 7 ++++--- beep/features/early_cycles/hppc.py | 4 ++-- beep/features/early_cycles/summary.py | 7 ++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/beep/cmd.py b/beep/cmd.py index d6569056..f96cf476 100644 --- a/beep/cmd.py +++ b/beep/cmd.py @@ -42,12 +42,13 @@ from beep.structure.base import BEEPDatapath from beep.structure.cli import auto_load, auto_load_processed from beep.structure.validate import BEEPValidationError -from beep.features.base import ( +from beep.features.featurizer import ( BEEPFeaturizer, BEEPFeaturizationError, - BEEPFeatureMatrix, ) -from beep.features.core import ( +from beep.features.matrix import BEEPFeatureMatrix + +from beep.features.early_cycles import ( HPPCResistanceVoltageFeatures, DeltaQFastCharge, TrajectoryFastCharge, diff --git a/beep/features/early_cycles/hppc.py b/beep/features/early_cycles/hppc.py index cbedb678..23c864f9 100644 --- a/beep/features/early_cycles/hppc.py +++ b/beep/features/early_cycles/hppc.py @@ -5,7 +5,7 @@ from beep.features.featurizer import BEEPEarlyCyclesFeaturizer -class HPPCResistanceVoltage(BEEPEarlyCyclesFeaturizer): +class HPPCResistanceVoltageFeatures(BEEPEarlyCyclesFeaturizer): DEFAULT_HYPERPARAMETERS = { "test_time_filter_sec": 1000000, "cycle_index_filter": 6, @@ -47,7 +47,7 @@ def create_features(self): ).filter(lambda x: ~x["test_time"].isnull().all()) # diffusion features - diffusion_features = helper_functions.get_diffusion_early_features( + diffusion_features = helper_functions.get_diffusion_features( self.datapath, ) diff --git a/beep/features/early_cycles/summary.py b/beep/features/early_cycles/summary.py index dc6a99e4..e17ceea6 100644 --- a/beep/features/early_cycles/summary.py +++ 
b/beep/features/early_cycles/summary.py @@ -2,6 +2,7 @@ import pandas as pd from scipy.stats import skew, kurtosis +from beep.features import helper_functions from beep import PROTOCOL_PARAMETERS_DIR from beep.features.featurizer import BEEPEarlyCyclesFeaturizer @@ -208,7 +209,7 @@ def validate(self): Returns: bool: True/False indication of ability to proceed with feature generation """ - val, msg = featurizer_helpers.check_diagnostic_validation(self.datapath) + val, msg = helper_functions.check_diagnostic_validation(self.datapath) if val: df = self.datapath.diagnostic_summary df = df[ @@ -291,13 +292,13 @@ def create_features(self): cycles = diag_intrp.cycle_index[diag_intrp.cycle_type == self.hyperparameters[ "diagnostic_cycle_type"]].unique() - step_dict_0 = featurizer_helpers.get_step_index( + step_dict_0 = helper_functions.get_step_index( self.datapath, cycle_type=self.hyperparameters["diagnostic_cycle_type"], diag_pos=self.hyperparameters["diag_pos_list"][0], parameters_path=self.hyperparameters["parameters_path"] ) - step_dict_1 = featurizer_helpers.get_step_index( + step_dict_1 = helper_functions.get_step_index( self.datapath, cycle_type=self.hyperparameters["diagnostic_cycle_type"], diag_pos=self.hyperparameters["diag_pos_list"][1],