e-baumer · heroldn · Aug 10, 2022 · Aug 11, 2022 · Aug 12, 2022 · Aug 18, 2022
diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import scipy.stats as scs
 import matplotlib.pyplot as plt
+from IPython import embed
 
 from standard_precip.lmoments import distr
 
@@ -117,13 +118,32 @@ def cdf_to_ppf(self, data, params, p_zero):
             cdf = self.distrb.cdf(data, **params)
 
         # Apply inverse normal distribution
-        norm_ppf = scs.norm.ppf(cdf)
+        norm_ppf = scs.norm.ppf(cdf) # NH: find the inverse of the CDF, except for a normal distribution
         norm_ppf[np.isinf(norm_ppf)] = np.nan
 
+        #########################
+        # diagnostics
+        #########################
+        # On PPF's and CDF's: https://www.itl.nist.gov/div898/handbook/eda/section3/eda362.htm
+#        __import__("IPython").embed()
+        # back calculate precip value from normalised data (i.e. from the SPI value)
+#        norm_cdf = scs.norm.cdf(norm_ppf)
+#        original_data = scs.gamma.ppf(norm_cdf,a=params['a'],loc=params['loc'],scale=params['scale'])
+
+#        from matplotlib import pyplot as plt
+#        # plot the CDF
+#        plt.plot(np.sort(data),np.sort(cdf))
+#        cdf_norm = scs.norm.cdf(norm_ppf)
+#        # plot the CDF of the normalised function
+#        plt.plot(np.sort(norm_ppf),np.sort(cdf_norm))
+        #########################
+        # end diagnostics
+        #########################
+
         return norm_ppf
 
-    def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: str="M",
-                  scale: int=1, freq_col: str=None, fit_type: str='lmom', dist_type: str='gam',
+    def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: int, endyr: int, freq: str="M",
+                  scale: int=1, freq_col: str=None, fit_type: str='lmom', dist_type: str='gam', 
                   **dist_kwargs) -> pd.DataFrame:
         '''
         Calculate the index.
@@ -145,6 +165,12 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st
             List of columns with precipitation data. Each column is treated as a separate set of
             observations.
 
+        startyr: int
+			First year of baseline period.
+
+		endyr: int
+			Last year of baseline period.
+
         freq: str ["M", "W", "D"]
             The temporal frequency to calculate the index on. The day of year ("D") or week of year
             ("W") or month of year ("M") is derived from the date_col. If the user desires a custome
@@ -192,6 +218,8 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st
         df: pd.Dataframe
             Pandas dataframe with the calculated indicies for each precipitation column appended
             to the original dataframe.
+		df_params: pd.DataFrame
+			Pandas dataframe with the distribution parameters for each calendar month.
         '''
 
         # Check for duplicate dates
@@ -222,31 +250,38 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st
         freq_range = self._df_copy[self.freq_col].unique().tolist()
         # Loop over months
         dfs = []
+        dictpar = dict.fromkeys(freq_range)
         for p in precip_cols:
             dfs_p = pd.DataFrame()
             for j in freq_range:
                 precip_all = self._df_copy.loc[self._df_copy[self.freq_col]==j]
                 precip_single_df = precip_all.dropna().copy()
                 precip_single = precip_single_df[p].values
-                precip_sorted = np.sort(precip_single)[::-1]
+                precip_single_baseline = precip_single_df[p].loc[((precip_single_df[date_col].dt.year.values >=startyr) & (precip_single_df[date_col].dt.year.values <=endyr))].values
+                precip_sorted = np.sort(precip_single_baseline)[::-1] # why sorted?
 
                 # Fit distribution for particular series and month
                 params, p_zero = self.fit_distribution(
                     precip_sorted, dist_type, fit_type, **dist_kwargs
                 )
 
                 # Calculate SPI/SPEI
+                # NOTE: It's my understanding that p_zero does NOT need re-calculating based on the whole time series (as opposed to just 
+                # the base period above). ie. p_zero should represent the base period only, which is used to derive the parameters used in 
+                # the below CDF calcs. p_zero re-calculated using the whole time series would result in CDF's being calculated using 
+                # parameters from the baseline and a p_zero from the whole time series.
                 spi = self.cdf_to_ppf(precip_single, params, p_zero)
                 idx_col = f"{p}_calculated_index"
                 precip_single_df[idx_col] = spi
                 precip_single_df = precip_single_df[[date_col, idx_col]]
                 dfs_p = pd.concat([dfs_p, precip_single_df])
                 dfs_p = dfs_p.sort_values(date_col)
+                dictpar[j] = params
             dfs.append(dfs_p)
 
         df_all = reduce(
             lambda left, right: pd.merge(left, right, on=date_col, how='left'), dfs, self._df_copy
         )
         df_all = df_all.drop(columns=self.freq_col)
 
-        return df_all
+        return df_all, pd.DataFrame(dictpar).transpose()