From 3eecd0b5850b69e28cab41b47b3c7ea95e23c211 Mon Sep 17 00:00:00 2001
From: heroldn <nicholas.herold@climaterisk.com.au>
Date: Wed, 10 Aug 2022 13:29:40 +0930
Subject: [PATCH 1/4] return distribution parameters and allow base period

---
 standard_precip/base_sp.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py
index 3559db9..98113ea 100644
--- a/standard_precip/base_sp.py
+++ b/standard_precip/base_sp.py
@@ -122,8 +122,8 @@ def cdf_to_ppf(self, data, params, p_zero):
 
         return norm_ppf
 
-    def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: str="M",
-                  scale: int=1, freq_col: str=None, fit_type: str='lmom', dist_type: str='gam',
+    def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: int, endyr: int, freq: str="M",
+                  scale: int=1, freq_col: str=None, fit_type: str='lmom', dist_type: str='gam', 
                   **dist_kwargs) -> pd.DataFrame:
         '''
         Calculate the index.
@@ -145,6 +145,12 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st
             List of columns with precipitation data. Each column is treated as a separate set of
             observations.
 
+        startyr: int
+			First year of baseline period.
+
+		endyr: int
+			Last year of baseline period.
+
         freq: str ["M", "W", "D"]
             The temporal frequency to calculate the index on. The day of year ("D") or week of year
             ("W") or month of year ("M") is derived from the date_col. If the user desires a custome
@@ -228,7 +234,8 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st
                 precip_all = self._df_copy.loc[self._df_copy[self.freq_col]==j]
                 precip_single_df = precip_all.dropna().copy()
                 precip_single = precip_single_df[p].values
-                precip_sorted = np.sort(precip_single)[::-1]
+                precip_single_baseline = precip_single_df[p].loc[((precip_single_df[date_col].dt.year.values >=startyr) & (precip_single_df[date_col].dt.year.values <=endyr))].values
+                precip_sorted = np.sort(precip_single_baseline)[::-1] # why sorted?
 
                 # Fit distribution for particular series and month
                 params, p_zero = self.fit_distribution(
@@ -248,5 +255,6 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st
             lambda left, right: pd.merge(left, right, on=date_col, how='left'), dfs, self._df_copy
         )
         df_all = df_all.drop(columns=self.freq_col)
+#        df_all['params'] = params
 
-        return df_all
+        return df_all, params

From bf0b4211617e8a26a48803645283c1b47e51b8c5 Mon Sep 17 00:00:00 2001
From: heroldn <nicholas.herold@climaterisk.com.au>
Date: Thu, 11 Aug 2022 15:34:47 +0930
Subject: [PATCH 2/4] add some inline commented diagnostics

---
 standard_precip/base_sp.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py
index 98113ea..8a16ff4 100644
--- a/standard_precip/base_sp.py
+++ b/standard_precip/base_sp.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import scipy.stats as scs
 import matplotlib.pyplot as plt
+from IPython import embed
 
 from standard_precip.lmoments import distr
 
@@ -117,9 +118,19 @@ def cdf_to_ppf(self, data, params, p_zero):
             cdf = self.distrb.cdf(data, **params)
 
         # Apply inverse normal distribution
-        norm_ppf = scs.norm.ppf(cdf)
+        norm_ppf = scs.norm.ppf(cdf) # NH: find the inverse of the CDF except for a normal distribution
         norm_ppf[np.isinf(norm_ppf)] = np.nan
 
+        #########################
+        # diagnostics
+#        __import__("IPython").embed()
+#        from matplotlib import pyplot as plt
+#        # plot the CDF
+#        plt.plot(np.sort(data),np.sort(cdf))
+#        cdf_norm = scs.norm.cdf(norm_ppf)
+#        # plot the CDF of the normalised function
+#        plt.plot(np.sort(norm_ppf),np.sort(cdf_norm))
+
         return norm_ppf
 
     def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: int, endyr: int, freq: str="M",
@@ -243,6 +254,7 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr:
                 )
 
                 # Calculate SPI/SPEI
+				# TODO: p_zero needs recalculating for the whole precip time series? Currently it's calculated using the base period only.
                 spi = self.cdf_to_ppf(precip_single, params, p_zero)
                 idx_col = f"{p}_calculated_index"
                 precip_single_df[idx_col] = spi

From 081d66d8f551a5f80b1a252f8715d66e9ba3c058 Mon Sep 17 00:00:00 2001
From: heroldn <nicholas.herold@climaterisk.com.au>
Date: Fri, 12 Aug 2022 10:16:18 +0930
Subject: [PATCH 3/4] diagnostic lines and comments

---
 standard_precip/base_sp.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py
index 8a16ff4..d5e5606 100644
--- a/standard_precip/base_sp.py
+++ b/standard_precip/base_sp.py
@@ -118,18 +118,27 @@ def cdf_to_ppf(self, data, params, p_zero):
             cdf = self.distrb.cdf(data, **params)
 
         # Apply inverse normal distribution
-        norm_ppf = scs.norm.ppf(cdf) # NH: find the inverse of the CDF except for a normal distribution
+        norm_ppf = scs.norm.ppf(cdf) # NH: find the inverse of the CDF, except for a normal distribution
         norm_ppf[np.isinf(norm_ppf)] = np.nan
 
         #########################
         # diagnostics
+        #########################
+        # On PPF's and CDF's: https://www.itl.nist.gov/div898/handbook/eda/section3/eda362.htm
 #        __import__("IPython").embed()
+        # back calculate precip value from normalised data (i.e. from the SPI value)
+#        norm_cdf = scs.norm.cdf(norm_ppf)
+#        original_data = scs.gamma.ppf(norm_cdf,a=params['a'],loc=params['loc'],scale=params['scale'])
+
 #        from matplotlib import pyplot as plt
 #        # plot the CDF
 #        plt.plot(np.sort(data),np.sort(cdf))
 #        cdf_norm = scs.norm.cdf(norm_ppf)
 #        # plot the CDF of the normalised function
 #        plt.plot(np.sort(norm_ppf),np.sort(cdf_norm))
+        #########################
+        # end diagnostics
+        #########################
 
         return norm_ppf
 
@@ -254,7 +263,10 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr:
                 )
 
                 # Calculate SPI/SPEI
-				# TODO: p_zero needs recalculating for the whole precip time series? Currently it's calculated using the base period only.
+                # NOTE: It's my understanding that p_zero does NOT need re-calculating based on the whole time series (as opposed to just 
+                # the base period above). ie. p_zero should represent the base period only, which is used to derive the parameters used in 
+                # the below CDF calcs. p_zero re-calculated using the whole time series would result in CDF's being calculated using 
+                # parameters from the baseline and a p_zero from the whole time series.
                 spi = self.cdf_to_ppf(precip_single, params, p_zero)
                 idx_col = f"{p}_calculated_index"
                 precip_single_df[idx_col] = spi
@@ -267,6 +279,5 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr:
             lambda left, right: pd.merge(left, right, on=date_col, how='left'), dfs, self._df_copy
         )
         df_all = df_all.drop(columns=self.freq_col)
-#        df_all['params'] = params
 
         return df_all, params

From 58f265207df3ab91e8eb82121122a6c3975b64a5 Mon Sep 17 00:00:00 2001
From: heroldn <nicholas.herold@climaterisk.com.au>
Date: Thu, 18 Aug 2022 16:27:29 +0930
Subject: [PATCH 4/4] fix: return distribution parameters for all 12 calendar
 months

---
 standard_precip/base_sp.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py
index d5e5606..d2b27ac 100644
--- a/standard_precip/base_sp.py
+++ b/standard_precip/base_sp.py
@@ -218,6 +218,8 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr:
         df: pd.Dataframe
             Pandas dataframe with the calculated indicies for each precipitation column appended
             to the original dataframe.
+		df_params: pd.DataFrame
+			Pandas dataframe with the distribution parameters for each calendar month.
         '''
 
         # Check for duplicate dates
@@ -248,6 +250,7 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr:
         freq_range = self._df_copy[self.freq_col].unique().tolist()
         # Loop over months
         dfs = []
+        dictpar = dict.fromkeys(freq_range)
         for p in precip_cols:
             dfs_p = pd.DataFrame()
             for j in freq_range:
@@ -273,6 +276,7 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr:
                 precip_single_df = precip_single_df[[date_col, idx_col]]
                 dfs_p = pd.concat([dfs_p, precip_single_df])
                 dfs_p = dfs_p.sort_values(date_col)
+                dictpar[j] = params
             dfs.append(dfs_p)
 
         df_all = reduce(
@@ -280,4 +284,4 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr:
         )
         df_all = df_all.drop(columns=self.freq_col)
 
-        return df_all, params
+        return df_all, pd.DataFrame(dictpar).transpose()