From 3727fc24dbc5b162af847eae6b30866ffb7758b5 Mon Sep 17 00:00:00 2001
From: Scott Sievert <scott@stsievert.com>
Date: Fri, 1 Sep 2017 15:06:15 -0500
Subject: [PATCH 1/4] ENH: implement growing batch size

---
 dask_glm/algorithms.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index b7bdbd0..74d517b 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -65,7 +65,8 @@ def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val,
 
 
 @normalize
-def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
+def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
+                     approx_grad=False, initial_batch_size=None, **kwargs):
     """
     Michael Grant's implementation of Gradient Descent.
 
@@ -80,6 +81,13 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
         Maximum allowed change from prior iteration required to
         declare convergence
     family : Family
+    approx_grad : bool
+        If True (default False), approximate the gradient with a subset of
+        examples in X and y. When the number of examples is very large this can
+        result in speed gains.
+    initial_batch_size : int
+        The initial batch size when approximating the gradient. Only used when
+        `approx_grad == True`. Defaults to `min(n // n_chunks, n // 100)`
 
     Returns
     -------
@@ -97,9 +105,23 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
     backtrackMult = firstBacktrackMult
     beta = np.zeros(p)
 
+    if approx_grad:
+        n_chunks = max(len(c) for c in X.chunks)
+        batch_size = (min(n // n_chunks, n // 100) + 1
+                      if initial_batch_size is None else initial_batch_size)
+        keep = {'X': X, 'y': y}
+
     for k in range(max_iter):
+        if approx_grad:
+            batch_size = min(1.1 * batch_size + 1, n)
+            i = np.random.permutation(n).astype(int)
+            batch = list(i[:int(batch_size)])
+            X, y = keep['X'][batch], keep['y'][batch]
+            Xbeta = X.dot(beta)
+            func = loglike(Xbeta, y)
+
         # how necessary is this recalculation?
-        if k % recalcRate == 0:
+        if not approx_grad and k % recalcRate == 0:
             Xbeta = X.dot(beta)
             func = loglike(Xbeta, y)
 

From a7d7826ad494df28d093989a6e276bd19caa3b79 Mon Sep 17 00:00:00 2001
From: Scott Sievert <scott@stsievert.com>
Date: Sat, 2 Sep 2017 08:28:44 -0500
Subject: [PATCH 2/4] Implementing masked arrays

---
 dask_glm/algorithms.py | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index 74d517b..3f2f885 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -84,7 +84,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
     approx_grad : bool
         If True (default False), approximate the gradient with a subset of
         examples in X and y. When the number of examples is very large this can
-        result in speed gains.
+        result in speed gains, and is described in :cite:`friedlander2012hybrid`.
     initial_batch_size : int
         The initial batch size when approximating the gradient. Only used when
         `approx_grad == True`. Defaults to `min(n // n_chunks, n // 100)`
@@ -92,6 +92,18 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
     Returns
     -------
     beta : array-like, shape (n_features,)
+
+    References
+    ----------
+    @article{friedlander2012hybrid,
+        Author = {Friedlander, Michael P and Schmidt, Mark},
+        Journal = {SIAM Journal on Scientific Computing},
+        Number = {3},
+        Pages = {A1380--A1405},
+        Title = {Hybrid deterministic-stochastic methods for data fitting},
+        Volume = {34},
+        Year = {2012}}
+
     """
 
     loglike, gradient = family.loglike, family.gradient
@@ -107,16 +119,33 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
 
     if approx_grad:
         n_chunks = max(len(c) for c in X.chunks)
-        batch_size = (min(n // n_chunks, n // 100) + 1
+        batch_size = (min(n // n_chunks, n // 100) + 2
                       if initial_batch_size is None else initial_batch_size)
         keep = {'X': X, 'y': y}
 
     for k in range(max_iter):
+        print(k)
         if approx_grad:
-            batch_size = min(1.1 * batch_size + 1, n)
-            i = np.random.permutation(n).astype(int)
-            batch = list(i[:int(batch_size)])
-            X, y = keep['X'][batch], keep['y'][batch]
+            batch_size = int(min(1.1 * batch_size + 1, n))
+
+            if k % recalcRate == 0:
+                i = np.random.permutation(n)
+                X = keep['X'][i]
+                y = keep['y'][i]
+            i = np.random.choice(n)
+            batch = np.random.permutation(n).astype(int)[:batch_size]
+            ma = np.zeros(n, dtype=bool)
+            ma[batch] = True
+            ma2 = ma.repeat(p).reshape(n, p)
+
+            #  idx = da.arange(n, chunks=1, dtype=int)
+            #  batch = da.ma.masked_array(idx, mask=ma)
+            X_full = da.ma.masked_array(keep['X'], mask=ma2)
+            y_full = da.ma.masked_array(keep['y'], mask=ma)
+            X = da.ma.getdata(X_full)[ma]
+            y = da.ma.getdata(y_full)[ma]
+            #batch = list(i[:int(batch_size)])
+            #  X, y = keep['X'][batch], keep['y'][batch]
             Xbeta = X.dot(beta)
             func = loglike(Xbeta, y)
 

From 1a8e6cc42797a103271e18bf791fba80924ff447 Mon Sep 17 00:00:00 2001
From: Scott Sievert <scott@stsievert.com>
Date: Sat, 2 Sep 2017 09:12:04 -0500
Subject: [PATCH 3/4] Implement slicing

---
 dask_glm/algorithms.py | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index 3f2f885..39e449d 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -124,28 +124,16 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
         keep = {'X': X, 'y': y}
 
     for k in range(max_iter):
-        print(k)
         if approx_grad:
-            batch_size = int(min(1.1 * batch_size + 1, n))
-
             if k % recalcRate == 0:
                 i = np.random.permutation(n)
-                X = keep['X'][i]
-                y = keep['y'][i]
-            i = np.random.choice(n)
-            batch = np.random.permutation(n).astype(int)[:batch_size]
-            ma = np.zeros(n, dtype=bool)
-            ma[batch] = True
-            ma2 = ma.repeat(p).reshape(n, p)
-
-            #  idx = da.arange(n, chunks=1, dtype=int)
-            #  batch = da.ma.masked_array(idx, mask=ma)
-            X_full = da.ma.masked_array(keep['X'], mask=ma2)
-            y_full = da.ma.masked_array(keep['y'], mask=ma)
-            X = da.ma.getdata(X_full)[ma]
-            y = da.ma.getdata(y_full)[ma]
-            #batch = list(i[:int(batch_size)])
-            #  X, y = keep['X'][batch], keep['y'][batch]
+                keep['X'] = keep['X'][i]
+                keep['y'] = keep['y'][i]
+            batch_size = int(min(1.1 * batch_size + 1, n - 1))
+            i = np.random.choice(n - batch_size)
+            X = keep['X'][i:i + batch_size]
+            y = keep['y'][i:i + batch_size]
+
             Xbeta = X.dot(beta)
             func = loglike(Xbeta, y)
 

From 04be6bdd5d09cbd254e387f2a12cba821681ecb9 Mon Sep 17 00:00:00 2001
From: Scott Sievert <scott@stsievert.com>
Date: Mon, 4 Sep 2017 10:36:00 -0500
Subject: [PATCH 4/4] Implement _comptue_batch with map_blocks

---
 dask_glm/algorithms.py | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py
index 39e449d..9f89f2c 100644
--- a/dask_glm/algorithms.py
+++ b/dask_glm/algorithms.py
@@ -64,9 +64,17 @@ def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val,
     return stepSize, beta, Xbeta, func
 
 
+def _compute_batch(chunk, batch_size=1, seed=42):
+    n_block = chunk.shape[0]
+    rnd = np.random.RandomState(seed)
+    i = rnd.permutation(n_block)
+    return chunk[i[:batch_size]]
+
+
 @normalize
 def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
-                     approx_grad=False, initial_batch_size=None, **kwargs):
+                     approx_grad=False, initial_batch_size=None, armijoMult=1e-1,
+                     seed=42, **kwargs):
     """
     Michael Grant's implementation of Gradient Descent.
 
@@ -110,7 +118,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
     n, p = X.shape
     firstBacktrackMult = 0.1
     nextBacktrackMult = 0.5
-    armijoMult = 0.1
+    armijoMult = 1e-1
     stepGrowth = 1.25
     stepSize = 1.0
     recalcRate = 10
@@ -119,26 +127,21 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic,
 
     if approx_grad:
         n_chunks = max(len(c) for c in X.chunks)
-        batch_size = (min(n // n_chunks, n // 100) + 2
-                      if initial_batch_size is None else initial_batch_size)
+        batch_size = n_chunks + 1 \
+                     if initial_batch_size is None else initial_batch_size
         keep = {'X': X, 'y': y}
 
     for k in range(max_iter):
         if approx_grad:
-            if k % recalcRate == 0:
-                i = np.random.permutation(n)
-                keep['X'] = keep['X'][i]
-                keep['y'] = keep['y'][i]
-            batch_size = int(min(1.1 * batch_size + 1, n - 1))
-            i = np.random.choice(n - batch_size)
-            X = keep['X'][i:i + batch_size]
-            y = keep['y'][i:i + batch_size]
-
-            Xbeta = X.dot(beta)
-            func = loglike(Xbeta, y)
+            block_batch_size = batch_size // n_chunks
+            kwargs = {'batch_size': block_batch_size, 'seed': k + seed}
+            X = keep['X'].map_blocks(_compute_batch, **kwargs)
+            y = keep['y'].map_blocks(_compute_batch, **kwargs)
+            #  Xbeta = X.dot(beta)
+            #  func = loglike(Xbeta, y)
 
         # how necessary is this recalculation?
-        if not approx_grad and k % recalcRate == 0:
+        if k % recalcRate == 0:
             Xbeta = X.dot(beta)
             func = loglike(Xbeta, y)