From 3727fc24dbc5b162af847eae6b30866ffb7758b5 Mon Sep 17 00:00:00 2001 From: Scott Sievert Date: Fri, 1 Sep 2017 15:06:15 -0500 Subject: [PATCH 1/4] ENH: implement growing batch size --- dask_glm/algorithms.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py index b7bdbd0..74d517b 100644 --- a/dask_glm/algorithms.py +++ b/dask_glm/algorithms.py @@ -65,7 +65,8 @@ def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val, @normalize -def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs): +def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, + approx_grad=False, initial_batch_size=None, **kwargs): """ Michael Grant's implementation of Gradient Descent. @@ -80,6 +81,13 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs): Maximum allowed change from prior iteration required to declare convergence family : Family + approx_grad : bool + If True (default False), approximate the gradient with a subset of + examples in X and y. When the number of examples is very large this can + result in speed gains. + initial_batch_size : int + The initial batch size when approximating the gradient. Only used when + `approx_grad == True`. Defaults to `min(n // n_chunks, n // 100)` Returns ------- @@ -97,9 +105,23 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs): backtrackMult = firstBacktrackMult beta = np.zeros(p) + if approx_grad: + n_chunks = max(len(c) for c in X.chunks) + batch_size = (min(n // n_chunks, n // 100) + 1 + if initial_batch_size is None else initial_batch_size) + keep = {'X': X, 'y': y} + for k in range(max_iter): + if approx_grad: + batch_size = min(1.1 * batch_size + 1, n) + i = np.random.permutation(n).astype(int) + batch = list(i[:int(batch_size)]) + X, y = keep['X'][batch], keep['y'][batch] + Xbeta = X.dot(beta) + func = loglike(Xbeta, y) + # how necessary is this recalculation? - if k % recalcRate == 0: + if not approx_grad and k % recalcRate == 0: Xbeta = X.dot(beta) func = loglike(Xbeta, y) From a7d7826ad494df28d093989a6e276bd19caa3b79 Mon Sep 17 00:00:00 2001 From: Scott Sievert Date: Sat, 2 Sep 2017 08:28:44 -0500 Subject: [PATCH 2/4] Implementing masked arrays --- dask_glm/algorithms.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py index 74d517b..3f2f885 100644 --- a/dask_glm/algorithms.py +++ b/dask_glm/algorithms.py @@ -84,7 +84,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, approx_grad : bool If True (default False), approximate the gradient with a subset of examples in X and y. When the number of examples is very large this can - result in speed gains. + result in speed gains, and is described in :cite:`friedlander2012hybrid`. initial_batch_size : int The initial batch size when approximating the gradient. Only used when `approx_grad == True`. Defaults to `min(n // n_chunks, n // 100)` @@ -92,6 +92,18 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, Returns ------- beta : array-like, shape (n_features,) + + References + ---------- + @article{friedlander2012hybrid, + Author = {Friedlander, Michael P and Schmidt, Mark}, + Journal = {SIAM Journal on Scientific Computing}, + Number = {3}, + Pages = {A1380--A1405}, + Title = {Hybrid deterministic-stochastic methods for data fitting}, + Volume = {34}, + Year = {2012}} + """ loglike, gradient = family.loglike, family.gradient @@ -107,16 +119,33 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, if approx_grad: n_chunks = max(len(c) for c in X.chunks) - batch_size = (min(n // n_chunks, n // 100) + 1 + batch_size = (min(n // n_chunks, n // 100) + 2 if initial_batch_size is None else initial_batch_size) keep = {'X': X, 'y': y} for k in range(max_iter): + print(k) if approx_grad: - batch_size = min(1.1 * batch_size + 1, n) - i = np.random.permutation(n).astype(int) - batch = list(i[:int(batch_size)]) - X, y = keep['X'][batch], keep['y'][batch] + batch_size = int(min(1.1 * batch_size + 1, n)) + + if k % recalcRate == 0: + i = np.random.permutation(n) + X = keep['X'][i] + y = keep['y'][i] + i = np.random.choice(n) + batch = np.random.permutation(n).astype(int)[:batch_size] + ma = np.zeros(n, dtype=bool) + ma[batch] = True + ma2 = ma.repeat(p).reshape(n, p) + + # idx = da.arange(n, chunks=1, dtype=int) + # batch = da.ma.masked_array(idx, mask=ma) + X_full = da.ma.masked_array(keep['X'], mask=ma2) + y_full = da.ma.masked_array(keep['y'], mask=ma) + X = da.ma.getdata(X_full)[ma] + y = da.ma.getdata(y_full)[ma] + #batch = list(i[:int(batch_size)]) + # X, y = keep['X'][batch], keep['y'][batch] Xbeta = X.dot(beta) func = loglike(Xbeta, y) From 1a8e6cc42797a103271e18bf791fba80924ff447 Mon Sep 17 00:00:00 2001 From: Scott Sievert Date: Sat, 2 Sep 2017 09:12:04 -0500 Subject: [PATCH 3/4] Implement slicing --- dask_glm/algorithms.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py index 3f2f885..39e449d 100644 --- a/dask_glm/algorithms.py +++ b/dask_glm/algorithms.py @@ -124,28 +124,16 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, keep = {'X': X, 'y': y} for k in range(max_iter): - print(k) if approx_grad: - batch_size = int(min(1.1 * batch_size + 1, n)) - if k % recalcRate == 0: i = np.random.permutation(n) - X = keep['X'][i] - y = keep['y'][i] - i = np.random.choice(n) - batch = np.random.permutation(n).astype(int)[:batch_size] - ma = np.zeros(n, dtype=bool) - ma[batch] = True - ma2 = ma.repeat(p).reshape(n, p) - - # idx = da.arange(n, chunks=1, dtype=int) - # batch = da.ma.masked_array(idx, mask=ma) - X_full = da.ma.masked_array(keep['X'], mask=ma2) - y_full = da.ma.masked_array(keep['y'], mask=ma) - X = da.ma.getdata(X_full)[ma] - y = da.ma.getdata(y_full)[ma] - #batch = list(i[:int(batch_size)]) - # X, y = keep['X'][batch], keep['y'][batch] + keep['X'] = keep['X'][i] + keep['y'] = keep['y'][i] + batch_size = int(min(1.1 * batch_size + 1, n - 1)) + i = np.random.choice(n - batch_size) + X = keep['X'][i:i + batch_size] + y = keep['y'][i:i + batch_size] + Xbeta = X.dot(beta) func = loglike(Xbeta, y) From 04be6bdd5d09cbd254e387f2a12cba821681ecb9 Mon Sep 17 00:00:00 2001 From: Scott Sievert Date: Mon, 4 Sep 2017 10:36:00 -0500 Subject: [PATCH 4/4] Implement _comptue_batch with map_blocks --- dask_glm/algorithms.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py index 39e449d..9f89f2c 100644 --- a/dask_glm/algorithms.py +++ b/dask_glm/algorithms.py @@ -64,9 +64,17 @@ def compute_stepsize_dask(beta, step, Xbeta, Xstep, y, curr_val, return stepSize, beta, Xbeta, func +def _compute_batch(chunk, batch_size=1, seed=42): + n_block = chunk.shape[0] + rnd = np.random.RandomState(seed) + i = rnd.permutation(n_block) + return chunk[i[:batch_size]] + + @normalize def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, - approx_grad=False, initial_batch_size=None, **kwargs): + approx_grad=False, initial_batch_size=None, armijoMult=1e-1, + seed=42, **kwargs): """ Michael Grant's implementation of Gradient Descent. @@ -110,7 +118,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, n, p = X.shape firstBacktrackMult = 0.1 nextBacktrackMult = 0.5 - armijoMult = 0.1 + armijoMult = 1e-1 stepGrowth = 1.25 stepSize = 1.0 recalcRate = 10 @@ -119,26 +127,21 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, if approx_grad: n_chunks = max(len(c) for c in X.chunks) - batch_size = (min(n // n_chunks, n // 100) + 2 - if initial_batch_size is None else initial_batch_size) + batch_size = n_chunks + 1 \ + if initial_batch_size is None else initial_batch_size keep = {'X': X, 'y': y} for k in range(max_iter): if approx_grad: - if k % recalcRate == 0: - i = np.random.permutation(n) - keep['X'] = keep['X'][i] - keep['y'] = keep['y'][i] - batch_size = int(min(1.1 * batch_size + 1, n - 1)) - i = np.random.choice(n - batch_size) - X = keep['X'][i:i + batch_size] - y = keep['y'][i:i + batch_size] - - Xbeta = X.dot(beta) - func = loglike(Xbeta, y) + block_batch_size = batch_size // n_chunks + kwargs = {'batch_size': block_batch_size, 'seed': k + seed} + X = keep['X'].map_blocks(_compute_batch, **kwargs) + y = keep['y'].map_blocks(_compute_batch, **kwargs) + # Xbeta = X.dot(beta) + # func = loglike(Xbeta, y) # how necessary is this recalculation? - if not approx_grad and k % recalcRate == 0: + if k % recalcRate == 0: Xbeta = X.dot(beta) func = loglike(Xbeta, y)