From 655894f656a5db02867f0c6e6f003315fc81bc70 Mon Sep 17 00:00:00 2001 From: harisbal Date: Wed, 26 Apr 2017 23:29:22 +0100 Subject: [PATCH 1/9] Improve Pandas version Use vectorization to enhance the pandas version of the algorithm. Multi-indexed dataframes can be also used as input --- ipfn/ipfn.py | 97 ++++++++++++++++++++++------------------------------ 1 file changed, 41 insertions(+), 56 deletions(-) diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py index 9463096..4257e15 100755 --- a/ipfn/ipfn.py +++ b/ipfn/ipfn.py @@ -121,7 +121,7 @@ def ipfn_np(self, m, aggregates, dimensions, weight_col='total'): return m, max_conv - def ipfn_df(self, df, aggregates, dimensions, weight_col='total'): + def ipfn_df(self, df, aggregates, dimensions): """ Runs the ipfn method from a dataframe df, aggregates/marginals and the dimension(s) preserved. For example: @@ -153,74 +153,52 @@ def ipfn_df(self, df, aggregates, dimensions, weight_col='total'): print(df) print(df.groupby('age')['total'].sum(), xip)""" - steps = len(aggregates) - tables = [df] - for inc in range(steps-1): - tables.append(df.copy()) - original = df.copy() - - # Calculate the new weights for each dimension - inc = 0 - for features in dimensions: - if inc == (steps-1): - table_update = df - table_current = tables[inc] + aggrs = self.aggregates + dims = self.dimensions + factors = [] + index_names = df.index.names + + for k, d in enumerate(dims): + dfg = df.groupby(level=d).sum() + f = aggrs[k].div(dfg) + # Joining on multiindexes of not same length is not implemented + if len(d) > 1: + unstack_levels = [lvl for lvl in index_names if lvl not in d] + rem_index = [lvl for lvl in index_names if lvl in d] + df = (df.unstack(unstack_levels) + .multiply(f.reorder_levels(rem_index), axis=0) + .stack(unstack_levels) + .reorder_levels(index_names)) else: - table_update = tables[inc+1] - table_current = tables[inc] - - tmp = table_current.groupby(features)[weight_col].sum() - xijk = aggregates[inc] - - feat_l = [] - for feature in features: - feat_l.append(np.unique(table_current[feature])) - table_update.set_index(features, inplace=True) - table_current.set_index(features, inplace=True) - - for feature in product(*feat_l): - den = tmp.loc[feature] - # calculate new weight for this iteration - if den == 0: - table_update.loc[feature, weight_col] =\ - table_current.loc[feature, weight_col] *\ - xijk.loc[feature] - else: - table_update.loc[feature, weight_col] = \ - table_current.loc[feature, weight_col].astype(float) * \ - xijk.loc[feature]/den - - table_update.reset_index(inplace=True) - table_current.reset_index(inplace=True) - inc += 1 - feat_l = [] - - # Calculate the max convergence rate - max_conv = 0 - inc = 0 - for features in dimensions: - tmp = df.groupby(features)[weight_col].sum() - ori_ijk = aggregates[inc] - temp_conv = max(abs(tmp/ori_ijk - 1)) - if temp_conv > max_conv: - max_conv = temp_conv - inc += 1 + df = df.multiply(f, fill_value=0) + f = f.sub(1).abs().max() + factors.append(f) + # Check for convergence + max_conv = max(factors) + return df, max_conv def iteration(self): """ - Runs the ipfn algorithm. Automatically detects of working with numpy ndarray or pandas dataframes. + Runs the ipfn algorithm. Automatically detects of working with + numpy ndarray or pandas dataframes. """ i = 0 conv = self.conv_rate * 100 - m = self.original + m = self.original.copy() # If the original data input is in pandas DataFrame format if isinstance(self.original, pd.DataFrame): + # Add index + indexcols = list(set(x for l in self.dimensions for x in l)) + m.reset_index(inplace=True) + m.set_index(indexcols, inplace=True) + # Turn to series + m = m[self.weight_col] while i <= self.max_itr and conv > self.conv_rate: - m, conv = self.ipfn_df(m, self.aggregates, self.dimensions, self.weight_col) + m, conv = self.ipfn_df(m, self.aggregates, self.dimensions) i += 1 # print(i, conv) # If the original data input is in numpy format @@ -230,6 +208,13 @@ def iteration(self): m, conv = self.ipfn_np(m, self.aggregates, self.dimensions, self.weight_col) i += 1 # print(i, conv) + + if isinstance(m, pd.Series): + #Reset to dataframe + m.name = self.weight_col + m = m.reset_index() + m.set_index(self.original.index.names, inplace=True) + converged = 1 if i <= self.max_itr: print('ipfn converged') @@ -246,7 +231,6 @@ def iteration(self): print('wrong verbose input, return None') sys.exit(0) - if __name__ == '__main__': # Example 1, 2D using ipfn_np, @@ -429,6 +413,7 @@ def iteration(self): df['age'] = age_l df['total'] = m + df.set_index(['dma', 'size', 'age'], inplace=True) xipp = df.groupby('dma')['total'].sum() xpjp = df.groupby('size')['total'].sum() xppk = df.groupby('age')['total'].sum() From 99e62669b5d80cf3da4b7d5df55f9c1ae5973f25 Mon Sep 17 00:00:00 2001 From: harisbal Date: Mon, 16 Sep 2019 17:55:07 +0300 Subject: [PATCH 2/9] Simplified version now that pandas (0.25) can accept multi-index joins --- ipfn/ipfn.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py index 4257e15..941e11f 100755 --- a/ipfn/ipfn.py +++ b/ipfn/ipfn.py @@ -153,27 +153,25 @@ def ipfn_df(self, df, aggregates, dimensions): print(df) print(df.groupby('age')['total'].sum(), xip)""" - aggrs = self.aggregates - dims = self.dimensions + aggregates = self.aggregates + dimensions = self.dimensions factors = [] index_names = df.index.names - for k, d in enumerate(dims): + for k, d in enumerate(dimensions): dfg = df.groupby(level=d).sum() - f = aggrs[k].div(dfg) - # Joining on multiindexes of not same length is not implemented + f = aggregates[k].div(dfg) + # Requires pandas >= 0.25 if len(d) > 1: - unstack_levels = [lvl for lvl in index_names if lvl not in d] rem_index = [lvl for lvl in index_names if lvl in d] - df = (df.unstack(unstack_levels) - .multiply(f.reorder_levels(rem_index), axis=0) - .stack(unstack_levels) - .reorder_levels(index_names)) + df = (df.multiply(f.reorder_levels(rem_index), axis=0) + .reorder_levels(index_names)) else: df = df.multiply(f, fill_value=0) f = f.sub(1).abs().max() factors.append(f) + # Check for convergence max_conv = max(factors) From 8e09b16be691061ebb16247509ffe0a85039777c Mon Sep 17 00:00:00 2001 From: harisbal Date: Mon, 16 Sep 2019 18:20:40 +0300 Subject: [PATCH 3/9] Simplified version now that pandas (0.25) can accept multi-index joins --- ipfn/ipfn.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py index f070e1d..7e28802 100755 --- a/ipfn/ipfn.py +++ b/ipfn/ipfn.py @@ -213,36 +213,24 @@ def iteration(self): # print(i, conv) # If the original data input is in numpy format elif isinstance(self.original, np.ndarray): - ipfn_method = self.ipfn_np self.original = self.original.astype('float64') while i <= self.max_itr and conv > self.conv_rate: m, conv = self.ipfn_np(m, self.aggregates, self.dimensions, self.weight_col) i += 1 # print(i, conv) - if isinstance(m, pd.Series): - #Reset to dataframe - m.name = self.weight_col - m = m.reset_index() - m.set_index(self.original.index.names, inplace=True) - - converged = 1 + converged = True if i <= self.max_itr: - if (not conv > self.conv_rate) & (self.verbose > 1): - print('ipfn converged: convergence_rate below threshold') - elif not abs(conv - old_conv) > self.rate_tolerance: - print('ipfn converged: convergence_rate not updating or below rate_tolerance') + print('ipfn converged') else: print('Maximum iterations reached') - converged = 0 + converged = False # Handle the verbose if self.verbose == 0: return m elif self.verbose == 1: return m, converged - elif self.verbose == 2: - return m, converged, pd.DataFrame({'iteration': range(i), 'conv': conv_list}).set_index('iteration') else: print('wrong verbose input, return None') sys.exit(0) From 5e8e4b307583d7250f77af87f290534e7f54fba3 Mon Sep 17 00:00:00 2001 From: harisbal Date: Mon, 16 Sep 2019 18:28:42 +0300 Subject: [PATCH 4/9] Simplified version now that pandas (0.25) can accept multi-index joins --- tests/tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index b4e480b..bd5c03d 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -254,7 +254,7 @@ def test_pandas_3D(self): for marginal, vertical in marginals1D: features = marginal.index.tolist() for feature in features: - assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2) + assert round(df.groupby(vertical).sum().loc[feature], 2) == round(marginal.loc[feature], 2) m_inc += 1 marginals2D = [(xijp, ['dma', 'size']), (xpjk, ['size', 'age'])] @@ -262,5 +262,5 @@ def test_pandas_3D(self): for marginal, vertical in marginals2D: features = marginal.index.tolist() for feature in features: - assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2) + assert round(df.groupby(vertical).sum().loc[feature], 2) == round(marginal.loc[feature], 2) m_inc += 1 From 70409f7ba1cf4b0f3db9ee83607c32d13c52b62f Mon Sep 17 00:00:00 2001 From: harisbal Date: Mon, 16 Sep 2019 18:28:42 +0300 Subject: [PATCH 5/9] Simplified version now that pandas (0.25) can accept multi-index joins --- .gitignore | 6 ++++++ tests/tests.py | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f40a1e8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ + +\.idea/ + +ipfn/__pycache__/ + +tests/__pycache__/ diff --git a/tests/tests.py b/tests/tests.py index b4e480b..bd5c03d 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -254,7 +254,7 @@ def test_pandas_3D(self): for marginal, vertical in marginals1D: features = marginal.index.tolist() for feature in features: - assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2) + assert round(df.groupby(vertical).sum().loc[feature], 2) == round(marginal.loc[feature], 2) m_inc += 1 marginals2D = [(xijp, ['dma', 'size']), (xpjk, ['size', 'age'])] @@ -262,5 +262,5 @@ def test_pandas_3D(self): for marginal, vertical in marginals2D: features = marginal.index.tolist() for feature in features: - assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2) + assert round(df.groupby(vertical).sum().loc[feature], 2) == round(marginal.loc[feature], 2) m_inc += 1 From 4a548e485aa9e2606d74ac64c10484b8a7a5fd21 Mon Sep 17 00:00:00 2001 From: harisbal Date: Mon, 30 Sep 2019 01:36:53 +0300 Subject: [PATCH 6/9] Simplified version of pandas version. Requires pandas >=0.24 --- ipfn/ipfn.py | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py index 7e28802..07d9b5a 100755 --- a/ipfn/ipfn.py +++ b/ipfn/ipfn.py @@ -165,15 +165,13 @@ def ipfn_df(self, df, aggregates, dimensions): print(df) print(df.groupby('age')['total'].sum(), xip)""" - aggregates = self.aggregates - dimensions = self.dimensions factors = [] index_names = df.index.names for k, d in enumerate(dimensions): dfg = df.groupby(level=d).sum() f = aggregates[k].div(dfg) - # Requires pandas >= 0.25 + # Requires pandas >= 0.24 if len(d) > 1: rem_index = [lvl for lvl in index_names if lvl in d] df = (df.multiply(f.reorder_levels(rem_index), axis=0) @@ -191,36 +189,40 @@ def ipfn_df(self, df, aggregates, dimensions): def iteration(self): """ - Runs the ipfn algorithm. Automatically detects of working with - numpy ndarray or pandas dataframes. + Runs the ipfn algorithm. Automatically detects of working with numpy ndarray or pandas dataframes. """ + def _prepare_df_format(df): + # Add index + idxcols = list(set(x for l in self.dimensions for x in l)) + df = df.reset_index().set_index(idxcols) + # Turn to series + df = df[self.weight_col] + return df + i = 0 conv = self.conv_rate * 100 - m = self.original.copy() + conv_progress = [] + m = self.original # If the original data input is in pandas DataFrame format if isinstance(self.original, pd.DataFrame): - # Add index - indexcols = list(set(x for l in self.dimensions for x in l)) - m.reset_index(inplace=True) - m.set_index(indexcols, inplace=True) - # Turn to series - m = m[self.weight_col] - while i <= self.max_itr and conv > self.conv_rate: - m, conv = self.ipfn_df(m, self.aggregates, self.dimensions) - i += 1 - # print(i, conv) - # If the original data input is in numpy format + m = _prepare_df_format(m) + ipfn_method = self.ipfn_df elif isinstance(self.original, np.ndarray): + ipfn_method = self.ipfn_np self.original = self.original.astype('float64') - while i <= self.max_itr and conv > self.conv_rate: - m, conv = self.ipfn_np(m, self.aggregates, self.dimensions, self.weight_col) - i += 1 - # print(i, conv) + else: + print('Data input instance not recognized') + sys.exit(0) + + while (i <= self.max_itr) and (conv > self.conv_rate): + m, conv = ipfn_method(m, self.aggregates, self.dimensions) + conv_progress.append(conv) + i += 1 - converged = True if i <= self.max_itr: + converged = True print('ipfn converged') else: print('Maximum iterations reached') @@ -231,6 +233,11 @@ def iteration(self): return m elif self.verbose == 1: return m, converged + elif self.verbose == 2: + conv_progress = pd.DataFrame({'iteration': range(i), + 'convergence': conv_progress} + ).set_index('iteration') + return m, converged, conv_progress else: print('wrong verbose input, return None') sys.exit(0) From e14b1107d07e8d8ecf3b4bb0ece6d87bfb0bd56a Mon Sep 17 00:00:00 2001 From: harisbal Date: Tue, 31 Aug 2021 00:04:11 +0300 Subject: [PATCH 7/9] fixed numpy deprecation warning --- .vscode/settings.json | 3 +++ ipfn/ipfn.py | 10 +++++----- 2 files changed, 8 insertions(+), 5 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3516cb9 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.formatting.provider": "autopep8" +} \ No newline at end of file diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py index 4983c45..ee61f4c 100644 --- a/ipfn/ipfn.py +++ b/ipfn/ipfn.py @@ -85,16 +85,16 @@ def ipfn_np(self, m, aggregates, dimensions, weight_col="total"): inc = 0 for aggregate in aggregates: if not isinstance(aggregate, np.ndarray): - aggregate = np.array(aggregate).astype(np.float) + aggregate = np.array(aggregate).astype(float) aggregates[inc] = aggregate - elif aggregate.dtype not in [np.float, float]: - aggregate = aggregate.astype(np.float) + elif aggregate.dtype != float: + aggregate = aggregate.astype(float) aggregates[inc] = aggregate inc += 1 if not isinstance(m, np.ndarray): m = np.array(m) - elif m.dtype not in [np.float, float]: - m = m.astype(np.float) + elif m.dtype != float: + m = m.astype(float) steps = len(aggregates) dim = len(m.shape) From 5f75ba84d325d044a3b67fefd3da99baab69015f Mon Sep 17 00:00:00 2001 From: harisbal Date: Tue, 31 Aug 2021 00:08:40 +0300 Subject: [PATCH 8/9] add to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index f40a1e8..bcd3e54 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ ipfn/__pycache__/ tests/__pycache__/ + +.pytest_cache/ +.vscode/ \ No newline at end of file From 46ca2a15f0d3c2ce0380153ac30e6bb140cef82d Mon Sep 17 00:00:00 2001 From: harisbal Date: Tue, 31 Aug 2021 00:10:53 +0300 Subject: [PATCH 9/9] tmp --- .gitignore | 2 +- .vscode/settings.json | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.gitignore b/.gitignore index bcd3e54..0a7ed66 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,4 @@ ipfn/__pycache__/ tests/__pycache__/ .pytest_cache/ -.vscode/ \ No newline at end of file +.vscode/ diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 3516cb9..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.formatting.provider": "autopep8" -} \ No newline at end of file