From 3eecd0b5850b69e28cab41b47b3c7ea95e23c211 Mon Sep 17 00:00:00 2001 From: heroldn Date: Wed, 10 Aug 2022 13:29:40 +0930 Subject: [PATCH 1/4] return distribution parameters and allow base period --- standard_precip/base_sp.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py index 3559db9..98113ea 100644 --- a/standard_precip/base_sp.py +++ b/standard_precip/base_sp.py @@ -122,8 +122,8 @@ def cdf_to_ppf(self, data, params, p_zero): return norm_ppf - def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: str="M", - scale: int=1, freq_col: str=None, fit_type: str='lmom', dist_type: str='gam', + def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: int, endyr: int, freq: str="M", + scale: int=1, freq_col: str=None, fit_type: str='lmom', dist_type: str='gam', **dist_kwargs) -> pd.DataFrame: ''' Calculate the index. @@ -145,6 +145,12 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st List of columns with precipitation data. Each column is treated as a separate set of observations. + startyr: int + First year of baseline period. + + endyr: int + Last year of baseline period. + freq: str ["M", "W", "D"] The temporal frequency to calculate the index on. The day of year ("D") or week of year ("W") or month of year ("M") is derived from the date_col. If the user desires a custome @@ -228,7 +234,8 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st precip_all = self._df_copy.loc[self._df_copy[self.freq_col]==j] precip_single_df = precip_all.dropna().copy() precip_single = precip_single_df[p].values - precip_sorted = np.sort(precip_single)[::-1] + precip_single_baseline = precip_single_df[p].loc[((precip_single_df[date_col].dt.year.values >=startyr) & (precip_single_df[date_col].dt.year.values <=endyr))].values + precip_sorted = np.sort(precip_single_baseline)[::-1] # why sorted? # Fit distribution for particular series and month params, p_zero = self.fit_distribution( @@ -248,5 +255,6 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, freq: st lambda left, right: pd.merge(left, right, on=date_col, how='left'), dfs, self._df_copy ) df_all = df_all.drop(columns=self.freq_col) +# df_all['params'] = params - return df_all + return df_all, params From bf0b4211617e8a26a48803645283c1b47e51b8c5 Mon Sep 17 00:00:00 2001 From: heroldn Date: Thu, 11 Aug 2022 15:34:47 +0930 Subject: [PATCH 2/4] add some inline commented diagnostics --- standard_precip/base_sp.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py index 98113ea..8a16ff4 100644 --- a/standard_precip/base_sp.py +++ b/standard_precip/base_sp.py @@ -4,6 +4,7 @@ import pandas as pd import scipy.stats as scs import matplotlib.pyplot as plt +from IPython import embed from standard_precip.lmoments import distr @@ -117,9 +118,19 @@ def cdf_to_ppf(self, data, params, p_zero): cdf = self.distrb.cdf(data, **params) # Apply inverse normal distribution - norm_ppf = scs.norm.ppf(cdf) + norm_ppf = scs.norm.ppf(cdf) # NH: find the inverse of the CDF except for a normal distribution norm_ppf[np.isinf(norm_ppf)] = np.nan + ######################### + # diagnostics +# __import__("IPython").embed() +# from matplotlib import pyplot as plt +# # plot the CDF +# plt.plot(np.sort(data),np.sort(cdf)) +# cdf_norm = scs.norm.cdf(norm_ppf) +# # plot the CDF of the normalised function +# plt.plot(np.sort(norm_ppf),np.sort(cdf_norm)) + return norm_ppf def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: int, endyr: int, freq: str="M", @@ -243,6 +254,7 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: ) # Calculate SPI/SPEI + # TODO: p_zero needs recalculating for the whole precip time series? Currently it's calculated using the base period only. spi = self.cdf_to_ppf(precip_single, params, p_zero) idx_col = f"{p}_calculated_index" precip_single_df[idx_col] = spi From 081d66d8f551a5f80b1a252f8715d66e9ba3c058 Mon Sep 17 00:00:00 2001 From: heroldn Date: Fri, 12 Aug 2022 10:16:18 +0930 Subject: [PATCH 3/4] diagnostic lines and comments --- standard_precip/base_sp.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py index 8a16ff4..d5e5606 100644 --- a/standard_precip/base_sp.py +++ b/standard_precip/base_sp.py @@ -118,18 +118,27 @@ def cdf_to_ppf(self, data, params, p_zero): cdf = self.distrb.cdf(data, **params) # Apply inverse normal distribution - norm_ppf = scs.norm.ppf(cdf) # NH: find the inverse of the CDF except for a normal distribution + norm_ppf = scs.norm.ppf(cdf) # NH: find the inverse of the CDF, except for a normal distribution norm_ppf[np.isinf(norm_ppf)] = np.nan ######################### # diagnostics + ######################### + # On PPF's and CDF's: https://www.itl.nist.gov/div898/handbook/eda/section3/eda362.htm # __import__("IPython").embed() + # back calculate precip value from normalised data (i.e. from the SPI value) +# norm_cdf = scs.norm.cdf(norm_ppf) +# original_data = scs.gamma.ppf(norm_cdf,a=params['a'],loc=params['loc'],scale=params['scale']) + # from matplotlib import pyplot as plt # # plot the CDF # plt.plot(np.sort(data),np.sort(cdf)) # cdf_norm = scs.norm.cdf(norm_ppf) # # plot the CDF of the normalised function # plt.plot(np.sort(norm_ppf),np.sort(cdf_norm)) + ######################### + # end diagnostics + ######################### return norm_ppf @@ -254,7 +263,10 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: ) # Calculate SPI/SPEI - # TODO: p_zero needs recalculating for the whole precip time series? Currently it's calculated using the base period only. + # NOTE: It's my understanding that p_zero does NOT need re-calculating based on the whole time series (as opposed to just + # the base period above). ie. p_zero should represent the base period only, which is used to derive the parameters used in + # the below CDF calcs. p_zero re-calculated using the whole time series would result in CDF's being calculated using + # parameters from the baseline and a p_zero from the whole time series. spi = self.cdf_to_ppf(precip_single, params, p_zero) idx_col = f"{p}_calculated_index" precip_single_df[idx_col] = spi @@ -267,6 +279,5 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: lambda left, right: pd.merge(left, right, on=date_col, how='left'), dfs, self._df_copy ) df_all = df_all.drop(columns=self.freq_col) -# df_all['params'] = params return df_all, params From 58f265207df3ab91e8eb82121122a6c3975b64a5 Mon Sep 17 00:00:00 2001 From: heroldn Date: Thu, 18 Aug 2022 16:27:29 +0930 Subject: [PATCH 4/4] fix: return distribution parameters for all 12 calendar months --- standard_precip/base_sp.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/standard_precip/base_sp.py b/standard_precip/base_sp.py index d5e5606..d2b27ac 100644 --- a/standard_precip/base_sp.py +++ b/standard_precip/base_sp.py @@ -218,6 +218,8 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: df: pd.Dataframe Pandas dataframe with the calculated indicies for each precipitation column appended to the original dataframe. + df_params: pd.DataFrame + Pandas dataframe with the distribution parameters for each calendar month. ''' # Check for duplicate dates @@ -248,6 +250,7 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: freq_range = self._df_copy[self.freq_col].unique().tolist() # Loop over months dfs = [] + dictpar = dict.fromkeys(freq_range) for p in precip_cols: dfs_p = pd.DataFrame() for j in freq_range: @@ -273,6 +276,7 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: precip_single_df = precip_single_df[[date_col, idx_col]] dfs_p = pd.concat([dfs_p, precip_single_df]) dfs_p = dfs_p.sort_values(date_col) + dictpar[j] = params dfs.append(dfs_p) df_all = reduce( @@ -280,4 +284,4 @@ def calculate(self, df: pd.DataFrame, date_col: str, precip_cols: list, startyr: ) df_all = df_all.drop(columns=self.freq_col) - return df_all, params + return df_all, pd.DataFrame(dictpar).transpose()