From 655894f656a5db02867f0c6e6f003315fc81bc70 Mon Sep 17 00:00:00 2001
From: harisbal <hbsolutionsgr@hotmail.com>
Date: Wed, 26 Apr 2017 23:29:22 +0100
Subject: [PATCH 1/9] Improve Pandas version

Use vectorization to enhance the pandas version of the algorithm. Multi-indexed dataframes can also be used as input.
---
 ipfn/ipfn.py | 97 ++++++++++++++++++++++------------------------------
 1 file changed, 41 insertions(+), 56 deletions(-)

diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py
index 9463096..4257e15 100755
--- a/ipfn/ipfn.py
+++ b/ipfn/ipfn.py
@@ -121,7 +121,7 @@ def ipfn_np(self, m, aggregates, dimensions, weight_col='total'):
 
         return m, max_conv
 
-    def ipfn_df(self, df, aggregates, dimensions, weight_col='total'):
+    def ipfn_df(self, df, aggregates, dimensions):
         """
         Runs the ipfn method from a dataframe df, aggregates/marginals and the dimension(s) preserved.
         For example:
@@ -153,74 +153,52 @@ def ipfn_df(self, df, aggregates, dimensions, weight_col='total'):
         print(df)
         print(df.groupby('age')['total'].sum(), xip)"""
 
-        steps = len(aggregates)
-        tables = [df]
-        for inc in range(steps-1):
-            tables.append(df.copy())
-        original = df.copy()
-
-        # Calculate the new weights for each dimension
-        inc = 0
-        for features in dimensions:
-            if inc == (steps-1):
-                table_update = df
-                table_current = tables[inc]
+        aggrs = self.aggregates
+        dims = self.dimensions
+        factors = []
+        index_names = df.index.names
+
+        for k, d in enumerate(dims):
+            dfg = df.groupby(level=d).sum()
+            f = aggrs[k].div(dfg)
+            # Joining on multiindexes of not same length is not implemented
+            if len(d) > 1:
+                unstack_levels = [lvl for lvl in index_names if lvl not in d]
+                rem_index = [lvl for lvl in index_names if lvl in d]
+                df = (df.unstack(unstack_levels)
+                      .multiply(f.reorder_levels(rem_index), axis=0)
+                      .stack(unstack_levels)
+                      .reorder_levels(index_names))
             else:
-                table_update = tables[inc+1]
-                table_current = tables[inc]
-
-            tmp = table_current.groupby(features)[weight_col].sum()
-            xijk = aggregates[inc]
-
-            feat_l = []
-            for feature in features:
-                feat_l.append(np.unique(table_current[feature]))
-            table_update.set_index(features, inplace=True)
-            table_current.set_index(features, inplace=True)
-
-            for feature in product(*feat_l):
-                den = tmp.loc[feature]
-                # calculate new weight for this iteration
-                if den == 0:
-                    table_update.loc[feature, weight_col] =\
-                        table_current.loc[feature, weight_col] *\
-                        xijk.loc[feature]
-                else:
-                    table_update.loc[feature, weight_col] = \
-                        table_current.loc[feature, weight_col].astype(float) * \
-                        xijk.loc[feature]/den
-
-            table_update.reset_index(inplace=True)
-            table_current.reset_index(inplace=True)
-            inc += 1
-            feat_l = []
-
-        # Calculate the max convergence rate
-        max_conv = 0
-        inc = 0
-        for features in dimensions:
-            tmp = df.groupby(features)[weight_col].sum()
-            ori_ijk = aggregates[inc]
-            temp_conv = max(abs(tmp/ori_ijk - 1))
-            if temp_conv > max_conv:
-                max_conv = temp_conv
-            inc += 1
+                df = df.multiply(f, fill_value=0)
 
+            f = f.sub(1).abs().max()
+            factors.append(f)
+        # Check for convergence
+        max_conv = max(factors)
+        
         return df, max_conv
 
     def iteration(self):
         """
-        Runs the ipfn algorithm. Automatically detects of working with numpy ndarray or pandas dataframes.
+        Runs the ipfn algorithm. Automatically detects of working with
+        numpy ndarray or pandas dataframes.
         """
 
         i = 0
         conv = self.conv_rate * 100
-        m = self.original
+        m = self.original.copy()
 
         # If the original data input is in pandas DataFrame format
         if isinstance(self.original, pd.DataFrame):
+            # Add index
+            indexcols = list(set(x for l in self.dimensions for x in l))
+            m.reset_index(inplace=True)
+            m.set_index(indexcols, inplace=True)
+            # Turn to series
+            m = m[self.weight_col]
             while i <= self.max_itr and conv > self.conv_rate:
-                m, conv = self.ipfn_df(m, self.aggregates, self.dimensions, self.weight_col)
+                m, conv = self.ipfn_df(m, self.aggregates, self.dimensions)
                 i += 1
                 # print(i, conv)
         # If the original data input is in numpy format
@@ -230,6 +208,13 @@ def iteration(self):
                 m, conv = self.ipfn_np(m, self.aggregates, self.dimensions, self.weight_col)
                 i += 1
                 # print(i, conv)
+
+        if isinstance(m, pd.Series):
+		    #Reset to dataframe
+            m.name = self.weight_col
+            m = m.reset_index()
+            m.set_index(self.original.index.names, inplace=True)
+            
         converged = 1
         if i <= self.max_itr:
             print('ipfn converged')
@@ -246,7 +231,6 @@ def iteration(self):
             print('wrong verbose input, return None')
             sys.exit(0)
 
-
 if __name__ == '__main__':
 
     # Example 1, 2D using ipfn_np,
@@ -429,6 +413,7 @@ def iteration(self):
     df['age'] = age_l
     df['total'] = m
 
+    df.set_index(['dma', 'size', 'age'], inplace=True)
     xipp = df.groupby('dma')['total'].sum()
     xpjp = df.groupby('size')['total'].sum()
     xppk = df.groupby('age')['total'].sum()

From 99e62669b5d80cf3da4b7d5df55f9c1ae5973f25 Mon Sep 17 00:00:00 2001
From: harisbal <theoballisgr@gmail.com>
Date: Mon, 16 Sep 2019 17:55:07 +0300
Subject: [PATCH 2/9] Simplified version now that pandas (0.25) can accept
 multi-index joins

---
 ipfn/ipfn.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py
index 4257e15..941e11f 100755
--- a/ipfn/ipfn.py
+++ b/ipfn/ipfn.py
@@ -153,27 +153,25 @@ def ipfn_df(self, df, aggregates, dimensions):
         print(df)
         print(df.groupby('age')['total'].sum(), xip)"""
 
-        aggrs = self.aggregates
-        dims = self.dimensions
+        aggregates = self.aggregates
+        dimensions = self.dimensions
         factors = []
         index_names = df.index.names
 
-        for k, d in enumerate(dims):
+        for k, d in enumerate(dimensions):
             dfg = df.groupby(level=d).sum()
-            f = aggrs[k].div(dfg)
-            # Joining on multiindexes of not same length is not implemented
+            f = aggregates[k].div(dfg)
+            # Requires pandas >= 0.25
             if len(d) > 1:
-                unstack_levels = [lvl for lvl in index_names if lvl not in d]
                 rem_index = [lvl for lvl in index_names if lvl in d]
-                df = (df.unstack(unstack_levels)
-                      .multiply(f.reorder_levels(rem_index), axis=0)
-                      .stack(unstack_levels)
-                      .reorder_levels(index_names))
+                df = (df.multiply(f.reorder_levels(rem_index), axis=0)
+                        .reorder_levels(index_names))
             else:
                 df = df.multiply(f, fill_value=0)
 
             f = f.sub(1).abs().max()
             factors.append(f)
+
         # Check for convergence
         max_conv = max(factors)
         

From 8e09b16be691061ebb16247509ffe0a85039777c Mon Sep 17 00:00:00 2001
From: harisbal <theoballisgr@gmail.com>
Date: Mon, 16 Sep 2019 18:20:40 +0300
Subject: [PATCH 3/9] Simplified version now that pandas (0.25) can accept
 multi-index joins

---
 ipfn/ipfn.py | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py
index f070e1d..7e28802 100755
--- a/ipfn/ipfn.py
+++ b/ipfn/ipfn.py
@@ -213,36 +213,24 @@ def iteration(self):
                 # print(i, conv)
         # If the original data input is in numpy format
         elif isinstance(self.original, np.ndarray):
-            ipfn_method = self.ipfn_np
             self.original = self.original.astype('float64')
             while i <= self.max_itr and conv > self.conv_rate:
                 m, conv = self.ipfn_np(m, self.aggregates, self.dimensions, self.weight_col)
                 i += 1
                 # print(i, conv)
 
-        if isinstance(m, pd.Series):
-		    #Reset to dataframe
-            m.name = self.weight_col
-            m = m.reset_index()
-            m.set_index(self.original.index.names, inplace=True)
-
-        converged = 1
+        converged = True
         if i <= self.max_itr:
-            if (not conv > self.conv_rate) & (self.verbose > 1):
-                print('ipfn converged: convergence_rate below threshold')
-            elif not abs(conv - old_conv) > self.rate_tolerance:
-                print('ipfn converged: convergence_rate not updating or below rate_tolerance')
+            print('ipfn converged')
         else:
             print('Maximum iterations reached')
-            converged = 0
+            converged = False
 
         # Handle the verbose
         if self.verbose == 0:
             return m
         elif self.verbose == 1:
             return m, converged
-        elif self.verbose == 2:
-            return m, converged, pd.DataFrame({'iteration': range(i), 'conv': conv_list}).set_index('iteration')
         else:
             print('wrong verbose input, return None')
             sys.exit(0)

From 5e8e4b307583d7250f77af87f290534e7f54fba3 Mon Sep 17 00:00:00 2001
From: harisbal <theoballisgr@gmail.com>
Date: Mon, 16 Sep 2019 18:28:42 +0300
Subject: [PATCH 4/9] Simplified version now that pandas (0.25) can accept
 multi-index joins

---
 tests/tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tests.py b/tests/tests.py
index b4e480b..bd5c03d 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -254,7 +254,7 @@ def test_pandas_3D(self):
         for marginal, vertical in marginals1D:
             features = marginal.index.tolist()
             for feature in features:
-                assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2)
+                assert round(df.groupby(vertical).sum().loc[feature], 2) == round(marginal.loc[feature], 2)
             m_inc += 1
 
         marginals2D = [(xijp, ['dma', 'size']), (xpjk, ['size', 'age'])]
@@ -262,5 +262,5 @@ def test_pandas_3D(self):
         for marginal, vertical in marginals2D:
             features = marginal.index.tolist()
             for feature in features:
-                assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2)
+                assert round(df.groupby(vertical).sum().loc[feature], 2) == round(marginal.loc[feature], 2)
             m_inc += 1

From 70409f7ba1cf4b0f3db9ee83607c32d13c52b62f Mon Sep 17 00:00:00 2001
From: harisbal <theoballisgr@gmail.com>
Date: Mon, 16 Sep 2019 18:28:42 +0300
Subject: [PATCH 5/9] Simplified version now that pandas (0.25) can accept
 multi-index joins

---
 .gitignore     | 6 ++++++
 tests/tests.py | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f40a1e8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+
+\.idea/
+
+ipfn/__pycache__/
+
+tests/__pycache__/
diff --git a/tests/tests.py b/tests/tests.py
index b4e480b..bd5c03d 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -254,7 +254,7 @@ def test_pandas_3D(self):
         for marginal, vertical in marginals1D:
             features = marginal.index.tolist()
             for feature in features:
-                assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2)
+                assert round(df.groupby(vertical).sum().loc[feature], 2) == round(marginal.loc[feature], 2)
             m_inc += 1
 
         marginals2D = [(xijp, ['dma', 'size']), (xpjk, ['size', 'age'])]
@@ -262,5 +262,5 @@ def test_pandas_3D(self):
         for marginal, vertical in marginals2D:
             features = marginal.index.tolist()
             for feature in features:
-                assert round(df.groupby(vertical)['total'].sum().loc[feature], 2) == round(marginal.loc[feature], 2)
+                assert round(df.groupby(vertical).sum().loc[feature], 2) == round(marginal.loc[feature], 2)
             m_inc += 1

From 4a548e485aa9e2606d74ac64c10484b8a7a5fd21 Mon Sep 17 00:00:00 2001
From: harisbal <theoballisgr@gmail.com>
Date: Mon, 30 Sep 2019 01:36:53 +0300
Subject: [PATCH 6/9] Simplify the pandas implementation. Requires pandas
 >=0.24

---
 ipfn/ipfn.py | 51 +++++++++++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py
index 7e28802..07d9b5a 100755
--- a/ipfn/ipfn.py
+++ b/ipfn/ipfn.py
@@ -165,15 +165,13 @@ def ipfn_df(self, df, aggregates, dimensions):
         print(df)
         print(df.groupby('age')['total'].sum(), xip)"""
 
-        aggregates = self.aggregates
-        dimensions = self.dimensions
         factors = []
         index_names = df.index.names
 
         for k, d in enumerate(dimensions):
             dfg = df.groupby(level=d).sum()
             f = aggregates[k].div(dfg)
-            # Requires pandas >= 0.25
+            # Requires pandas >= 0.24
             if len(d) > 1:
                 rem_index = [lvl for lvl in index_names if lvl in d]
                 df = (df.multiply(f.reorder_levels(rem_index), axis=0)
@@ -191,36 +189,40 @@ def ipfn_df(self, df, aggregates, dimensions):
 
     def iteration(self):
         """
-        Runs the ipfn algorithm. Automatically detects of working with
-        numpy ndarray or pandas dataframes.
+        Runs the ipfn algorithm. Automatically detects of working with numpy ndarray or pandas dataframes.
         """
 
+        def _prepare_df_format(df):
+            # Add index
+            idxcols = list(set(x for l in self.dimensions for x in l))
+            df = df.reset_index().set_index(idxcols)
+            # Turn to series
+            df = df[self.weight_col]
+            return df
+
         i = 0
         conv = self.conv_rate * 100
-        m = self.original.copy()
+        conv_progress = []
+        m = self.original
 
         # If the original data input is in pandas DataFrame format
         if isinstance(self.original, pd.DataFrame):
-            # Add index
-            indexcols = list(set(x for l in self.dimensions for x in l))
-            m.reset_index(inplace=True)
-            m.set_index(indexcols, inplace=True)
-            # Turn to series
-            m = m[self.weight_col]
-            while i <= self.max_itr and conv > self.conv_rate:
-                m, conv = self.ipfn_df(m, self.aggregates, self.dimensions)
-                i += 1
-                # print(i, conv)
-        # If the original data input is in numpy format
+            m = _prepare_df_format(m)
+            ipfn_method = self.ipfn_df
         elif isinstance(self.original, np.ndarray):
+            ipfn_method = self.ipfn_np
             self.original = self.original.astype('float64')
-            while i <= self.max_itr and conv > self.conv_rate:
-                m, conv = self.ipfn_np(m, self.aggregates, self.dimensions, self.weight_col)
-                i += 1
-                # print(i, conv)
+        else:
+            print('Data input instance not recognized')
+            sys.exit(0)
+
+        while (i <= self.max_itr) and (conv > self.conv_rate):
+            m, conv = ipfn_method(m, self.aggregates, self.dimensions)
+            conv_progress.append(conv)
+            i += 1
 
-        converged = True
         if i <= self.max_itr:
+            converged = True
             print('ipfn converged')
         else:
             print('Maximum iterations reached')
@@ -231,6 +233,11 @@ def iteration(self):
             return m
         elif self.verbose == 1:
             return m, converged
+        elif self.verbose == 2:
+            conv_progress = pd.DataFrame({'iteration': range(i),
+                                          'convergence': conv_progress}
+                                         ).set_index('iteration')
+            return m, converged, conv_progress
         else:
             print('wrong verbose input, return None')
             sys.exit(0)

From e14b1107d07e8d8ecf3b4bb0ece6d87bfb0bd56a Mon Sep 17 00:00:00 2001
From: harisbal <theoballisgr@gmail.com>
Date: Tue, 31 Aug 2021 00:04:11 +0300
Subject: [PATCH 7/9] fixed numpy deprecation warning

---
 .vscode/settings.json |  3 +++
 ipfn/ipfn.py          | 10 +++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..3516cb9
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.formatting.provider": "autopep8"
+}
\ No newline at end of file
diff --git a/ipfn/ipfn.py b/ipfn/ipfn.py
index 4983c45..ee61f4c 100644
--- a/ipfn/ipfn.py
+++ b/ipfn/ipfn.py
@@ -85,16 +85,16 @@ def ipfn_np(self, m, aggregates, dimensions, weight_col="total"):
         inc = 0
         for aggregate in aggregates:
             if not isinstance(aggregate, np.ndarray):
-                aggregate = np.array(aggregate).astype(np.float)
+                aggregate = np.array(aggregate).astype(float)
                 aggregates[inc] = aggregate
-            elif aggregate.dtype not in [np.float, float]:
-                aggregate = aggregate.astype(np.float)
+            elif aggregate.dtype != float:
+                aggregate = aggregate.astype(float)
                 aggregates[inc] = aggregate
             inc += 1
         if not isinstance(m, np.ndarray):
             m = np.array(m)
-        elif m.dtype not in [np.float, float]:
-            m = m.astype(np.float)
+        elif m.dtype != float:
+            m = m.astype(float)
 
         steps = len(aggregates)
         dim = len(m.shape)

From 5f75ba84d325d044a3b67fefd3da99baab69015f Mon Sep 17 00:00:00 2001
From: harisbal <theoballisgr@gmail.com>
Date: Tue, 31 Aug 2021 00:08:40 +0300
Subject: [PATCH 8/9] add to gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index f40a1e8..bcd3e54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,6 @@
 ipfn/__pycache__/
 
 tests/__pycache__/
+
+.pytest_cache/
+.vscode/
\ No newline at end of file

From 46ca2a15f0d3c2ce0380153ac30e6bb140cef82d Mon Sep 17 00:00:00 2001
From: harisbal <theoballisgr@gmail.com>
Date: Tue, 31 Aug 2021 00:10:53 +0300
Subject: [PATCH 9/9] tmp

---
 .gitignore            | 2 +-
 .vscode/settings.json | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)
 delete mode 100644 .vscode/settings.json

diff --git a/.gitignore b/.gitignore
index bcd3e54..0a7ed66 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,4 @@ ipfn/__pycache__/
 tests/__pycache__/
 
 .pytest_cache/
-.vscode/
\ No newline at end of file
+.vscode/
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 3516cb9..0000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "python.formatting.provider": "autopep8"
-}
\ No newline at end of file