Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
183 commits
Select commit Hold shift + click to select a range
4a23b9a
First commit
mrocklin Nov 18, 2016
2c9e6b9
add travis badge to readme
mrocklin Nov 18, 2016
4954924
fix setup.py
mrocklin Nov 18, 2016
7d940eb
flake8
mrocklin Nov 18, 2016
52505fe
Update travis.yml (#1)
mrocklin Nov 18, 2016
0133780
Update travis.yml (#1)
mrocklin Nov 18, 2016
ebc74b7
Added backtracking line search tests.
cicdw Dec 13, 2016
b53a3c9
Reorganized code into base.py and models.py
cicdw Dec 13, 2016
401ebf4
Changed the algorithm API slightly. Series based tests still can fail.
cicdw Dec 20, 2016
885cde0
Refactored to standalone optimization algorithms for Logistic Regress…
cicdw Jan 10, 2017
ed681ff
Added proximal gradient method.
cicdw Jan 11, 2017
bee0d7a
Added a function for creating some logistic output.
cicdw Jan 11, 2017
5fc17a7
Edited some default settings.
cicdw Jan 12, 2017
f63c37e
Performance tweaks
mrocklin Jan 12, 2017
bfcf708
Separated out line search, used numba for log likelihood.
Jan 13, 2017
dfe55eb
use persist function from dask
mrocklin Jan 22, 2017
4d02925
Add initial implementation of logistic regression with l1 penalty
Jan 18, 2017
04247a8
Add dask bleeding edge version
Jan 25, 2017
611cd21
Add dask bleeding edge version
Jan 25, 2017
f788301
Merge branch 'master' of https://github.com/dask/dask-glm into hussai…
Jan 26, 2017
cb0f41e
Merge branch 'hussainsultan-master'
Jan 26, 2017
989ec80
Add flake8 compatibility
Jan 26, 2017
7950204
Remove `compute` from bfgs
Jan 26, 2017
be8fb6f
Merge pull request #2 from hussainsultan/master
cicdw Jan 26, 2017
b3c9a18
Support absence of Numba
mrocklin Jan 26, 2017
08926d4
optimize newton
mrocklin Jan 26, 2017
a38f5a1
optimize bfgs
mrocklin Jan 26, 2017
695ef81
Consolidated sigmoid functions.
Jan 26, 2017
53b8a2f
Removed duplicated gradient.py file and made sure old test still passes.
Jan 26, 2017
1f1bd90
squeeze y
mrocklin Jan 26, 2017
da4fb1a
Added LFBGS convergence checks and a high-level test for unregularize…
Jan 26, 2017
9bd1270
Added LFBGS convergence checks and a high-level test for unregularize…
Jan 26, 2017
488964b
Added high-level test for unregularized optimizers.
Jan 26, 2017
2edf995
Merge pull request #10 from mrocklin/dev
cicdw Jan 26, 2017
f873e65
flake8
mrocklin Jan 26, 2017
0107edc
Merged in upstream changes.
Jan 27, 2017
cbc45e7
Tried to get BFGS to pass tests, but still failing.
Jan 27, 2017
daded67
Clean up ADMM / BFGS tests.
cicdw Jan 27, 2017
2640a78
Merge pull request #15 from moody-marlin/admm
hussainsultan Jan 27, 2017
62a63e0
Remove files
Jan 27, 2017
c1e4bcf
Merge pull request #18 from hussainsultan/master
hussainsultan Jan 27, 2017
4a53a02
Add basic convergence tests (#20)
mrocklin Jan 30, 2017
bf54d48
Use persist in proximal grad (#21)
mrocklin Jan 31, 2017
ea60598
ADMM (#22)
cicdw Feb 1, 2017
78b782c
test correctness of admm (#24)
mrocklin Feb 2, 2017
74ba5e7
Added notes on sigmoid approximation.
inati Feb 5, 2017
477be3f
Parallelize stepsize computation (#25)
mrocklin Feb 13, 2017
9065460
Merge pull request #27 from inati/master
hussainsultan Feb 13, 2017
496ac79
Abstract away function / gradient calls in algorithms
cicdw Feb 18, 2017
a2cf89a
Update tests to run with refactor; fix newton
cicdw Feb 18, 2017
793310f
Add normal model functions (untested).
cicdw Feb 18, 2017
3e0ee95
Fix ADMM overwrite; add default args to local_update
cicdw Feb 20, 2017
ba5879c
Add notebook overviewing optimality concerns.
cicdw Feb 20, 2017
e721117
Remove verbosity from proximal_grad
cicdw Feb 20, 2017
50e86a9
Refactor to staticmethod classes holding each GLM family
cicdw Feb 21, 2017
326a8a9
Add coverage reports and config file for pytest-cov
cicdw Feb 21, 2017
9b8029d
Removed numba.jit for now
cicdw Feb 21, 2017
d4e8f73
Update algorithms to take in class rather than individual functions
cicdw Feb 21, 2017
2b24b19
Resolve merge conflicts
cicdw Feb 21, 2017
c8303a5
Update to reflect step-size changes that were overwritten and clean-up.
cicdw Feb 21, 2017
bc883aa
fix bfgs to return beta if stepsize is 0
cicdw Feb 21, 2017
2d2f75f
xfail any bfgs tests.
cicdw Feb 22, 2017
cd2ba1a
Add unregularized test for crude optimality
cicdw Feb 22, 2017
84c53ff
Add regularized tests, allow for families in proximal_grad
cicdw Feb 22, 2017
666bfab
Merge pull request #28 from moody-marlin/abstract_algos
cicdw Feb 22, 2017
422107d
Add regularizer classes for l1/l2.
cicdw Feb 22, 2017
98eacf1
Adjust proximal_grad to handle reg class; tests passing.
cicdw Feb 22, 2017
80643b4
flaked
cicdw Feb 23, 2017
faaed1e
Rename local admm functions; adjust admm tests to include Normal family.
cicdw Feb 23, 2017
75a0d86
Add conda environment .yml file.
cicdw Feb 23, 2017
8ca2bc8
Merge pull request #29 from moody-marlin/regularizer_class
cicdw Feb 23, 2017
31cae5b
Rename dask_glm.yml to environment.yml
cicdw Feb 23, 2017
ba18df4
Add test to ensure determinism
mrocklin Mar 21, 2017
d70731d
add distributed test
mrocklin Mar 21, 2017
2518af6
Change max_iter -> max_steps
mrocklin Mar 21, 2017
fd2399b
flake8
mrocklin Mar 21, 2017
e720572
Merge pull request #36 from mrocklin/deterministic
cicdw Mar 21, 2017
d41308d
Update scaling in admm convergence check; fix scipy calls. (#37)
cicdw Mar 22, 2017
b423c73
Relax requirements
mrocklin Apr 10, 2017
88854df
Merge pull request #39 from mrocklin/requirements
cicdw Apr 10, 2017
fe7b875
API: Implement scikit-learn compat API
TomAugspurger Apr 12, 2017
670d554
Merge pull request #40 from TomAugspurger/api
cicdw Apr 23, 2017
d8c6251
Support sparse arrays (#42)
mrocklin Apr 27, 2017
47c6baf
Update Logistic loglike to prevent overflow.
cicdw Apr 28, 2017
150bd7f
Spike out standardize decorator.
Apr 30, 2017
11c3ce5
Spike out normalize decorator with tests.
Apr 30, 2017
b2bbc73
Decorate the halls; distributed test fails.
Apr 30, 2017
5fa9b7a
Remove all prints; should be handled via warnings.
Apr 30, 2017
3f58be5
Add input scaling for non-intercept fits.
May 2, 2017
b0d1eed
Fix distributed determinism test with copy.
May 2, 2017
f413fed
Increase lambda to decrease test failures.
May 2, 2017
e79d642
Flaked
May 2, 2017
b251bfc
Add Poisson regression support (WIP). (#46)
mpancia May 3, 2017
b2b6f10
Elastic Net Regularizer (#49)
postelrich May 4, 2017
dfdb6aa
Merge upstream master.
May 5, 2017
ffaadb5
Decorator injects normalize kwarg.
May 5, 2017
0cff71c
Normalize raises if multiple constants detected.
May 5, 2017
192a647
Add comment on copy
May 5, 2017
d39d15f
Remove newton doc string for now
May 5, 2017
3833542
flaked
May 5, 2017
d5dd10e
L-BFGS solver based on scipy.optimize with L2 regularization (#50)
MLnick May 9, 2017
2600b7c
Merge in lbfgs.
May 10, 2017
e5733af
Remove level of depth from normalize decorator
May 10, 2017
8127186
Merge pull request #44 from moody-marlin/logistic-overflow
TomAugspurger May 10, 2017
2865eb1
DOC: Add module documentation
TomAugspurger May 2, 2017
91cbda4
BUG: Accept **kwargs in rest of algorithms
TomAugspurger May 10, 2017
ef0360a
Merge pull request #51 from TomAugspurger/docs
TomAugspurger May 11, 2017
d54fc61
DOC: Add extra requirements
TomAugspurger May 11, 2017
aae15bd
DOC: Add links to readthedocs
TomAugspurger May 16, 2017
0e9d6cc
Use setuptools scm
TomAugspurger May 22, 2017
d4312a0
Merge pull request #56 from TomAugspurger/setuptools-scm
TomAugspurger May 23, 2017
581cf9f
FIX : broken notebook + travis
agramfort Jul 13, 2017
46ca8f9
fix travis
agramfort Jul 13, 2017
c54ff85
fix travis
agramfort Jul 13, 2017
69c7e1b
copy
agramfort Jul 18, 2017
d2fdba2
DOC: Run all notebooks on RTD
TomAugspurger Jul 22, 2017
e844098
DOC: Bump timeout
TomAugspurger Jul 22, 2017
237828f
DOC: Math and fixed headings
TomAugspurger Jul 22, 2017
2a79dfa
Merge pull request #58 from agramfort/fix_notebook
TomAugspurger Jul 26, 2017
90de97c
CLN: Various cleanups in prep for a release today
TomAugspurger Oct 2, 2017
45156f1
Merge pull request #61 from TomAugspurger/release-prep
TomAugspurger Oct 2, 2017
6930c12
RLS: 0.1.0
TomAugspurger Oct 2, 2017
64e01eb
Update and normalize docstrings (#62)
mrocklin Oct 12, 2017
86a220b
Update to use dask.config.set and scheduler keyword (#72)
jrbourbeau Oct 24, 2018
d9bd394
RLS: 0.2.0
TomAugspurger Oct 24, 2018
8ae6a96
Fix some documentation typos (#71)
zdgriffith Nov 16, 2018
6f7f154
Add n_iter_ attribute to estimators
pentschev Mar 20, 2019
e2d7e19
Fix tests, include tests to check number of iterations
pentschev Mar 20, 2019
1664bee
Fix flake8 error on Python 2.7
pentschev Mar 20, 2019
34766b2
Add missing n_iter_ estimators docstring
pentschev Mar 22, 2019
83af95f
Fix newton number of iterations computation
pentschev Mar 25, 2019
0c51415
Scatter lbfgs current weights to workers
jdlesage Sep 10, 2019
6f9ecf5
Remove python2.7 tests in travis
jdlesage Sep 11, 2019
d6213b9
Merge remote-tracking branch 'jdlesage/remove_python2' into HEAD
jdlesage Sep 11, 2019
ef2b463
Merge pull request #81 from jdlesage/remove_python2
TomAugspurger Sep 11, 2019
251f9ac
Use current dask client instead of inject it
jdlesage Sep 13, 2019
31491e4
Fix huge typo in the array to send
jdlesage Sep 16, 2019
0e840cb
Change import of dask distributed
jdlesage Sep 16, 2019
5bfcb06
Merge pull request #80 from jdlesage/broadcast_weights_lbfgs
TomAugspurger Sep 16, 2019
62d61ee
Support sparse matrix
jdlesage Sep 18, 2019
a91ee8c
Add sparse to the lib to install by travis
jdlesage Sep 18, 2019
0ae6b6b
Use _meta to detect an array is sparse.
jdlesage Sep 20, 2019
eaf241e
Force sparse version >= 0.7.0
jdlesage Sep 23, 2019
a06b67e
Force sparse version also when creating conda env
jdlesage Sep 23, 2019
c42c832
Update environment.yml
jdlesage Sep 24, 2019
989bf56
list installed packages
TomAugspurger Sep 24, 2019
e8418ea
Update version of numpy
jdlesage Sep 24, 2019
490cc96
Test using python3.7 as numpy version is too old on python3.5
jdlesage Sep 25, 2019
c4a9bbc
utils is now private in sparse
jdlesage Sep 25, 2019
af0c1f7
Use public sparse utils method
jdlesage Sep 26, 2019
cf739de
Use base class SparseArray in dispatch
jdlesage Sep 27, 2019
34122dc
Add a unit test for DOK matrix as xfail
jdlesage Sep 27, 2019
64b4ff9
Fix linting
jdlesage Sep 27, 2019
db949cb
Merge pull request #82 from jdlesage/sparse_matrix
TomAugspurger Sep 30, 2019
5c15522
Merge dask-glm
TomAugspurger Oct 16, 2019
81fa118
moves
TomAugspurger Oct 16, 2019
622e058
moves
TomAugspurger Oct 16, 2019
b762dd4
moves
TomAugspurger Oct 16, 2019
7c9d804
fixups
TomAugspurger Oct 16, 2019
d9f3b6c
tests
TomAugspurger Oct 16, 2019
9ee9e43
fixups
TomAugspurger Oct 16, 2019
a7f94e9
fixups
TomAugspurger Oct 16, 2019
56b0ee7
fixups
TomAugspurger Oct 16, 2019
0f33e93
speedup poisson test
TomAugspurger Oct 16, 2019
f6cd65a
fixups
TomAugspurger Oct 17, 2019
7044353
Merge remote-tracking branch 'dask-glm-pentschev/n_iter-attribute' in…
TomAugspurger Oct 17, 2019
9714251
remove coverage
TomAugspurger Oct 17, 2019
6059f3c
remove configs
TomAugspurger Oct 17, 2019
84d32ab
remove dead directory
TomAugspurger Oct 17, 2019
42678e3
remove conf
TomAugspurger Oct 17, 2019
537d709
remove duplicated examples
TomAugspurger Oct 17, 2019
7621b55
remove index
TomAugspurger Oct 17, 2019
c6aa8e0
Merge remote-tracking branch 'upstream/master' into merge-dask-glm
TomAugspurger Oct 17, 2019
c5c2d23
fixup env
TomAugspurger Oct 17, 2019
c319c37
bump
TomAugspurger Oct 17, 2019
dedfe87
fixed merge conflict
TomAugspurger Oct 17, 2019
d8bdac9
handle n_iter
TomAugspurger Oct 17, 2019
fb327a3
bump for array_function
TomAugspurger Oct 17, 2019
e0a92a4
fixups
TomAugspurger Oct 17, 2019
5d476c0
Merge remote-tracking branch 'upstream/master' into merge-dask-glm
TomAugspurger Jun 23, 2020
0ab03bf
revert GLM changes
TomAugspurger Jun 24, 2020
cf2a075
maybe fixups
TomAugspurger Jun 24, 2020
57123d8
skip slow tests
TomAugspurger Jun 24, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,4 @@ docs/source/auto_examples/
docs/source/examples/mydask.png

dask-worker-space
.coverage
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,3 @@ repos:
rev: v4.3.21
hooks:
- id: isort

2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
THE POSSIBILITY OF SUCH DAMAGE.
2 changes: 1 addition & 1 deletion ci/environment-3.7.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies:
- multipledispatch >=0.4.9
- mypy
- numba
- numpy >=1.16.3
- numpy >=1.17.0
- numpydoc
- packaging
- pandas
Expand Down
1 change: 1 addition & 0 deletions ci/environment-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies:
- tornado
- toolz
- xgboost
- dask-xgboost
- zict
- pip
- dask
Expand Down
11 changes: 11 additions & 0 deletions dask_ml/_compat.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import contextlib
import importlib
import os
from collections.abc import Mapping # noqa
from typing import Any, List, Optional, Union
Expand All @@ -19,6 +20,7 @@
SK_024 = SK_VERSION >= packaging.version.parse("0.24.0.dev0")
DASK_240 = DASK_VERSION >= packaging.version.parse("2.4.0")
DASK_2130 = DASK_VERSION >= packaging.version.parse("2.13.0")
DASK_2200 = DASK_VERSION > packaging.version.parse("2.19.0") # TODO: update to >=
DISTRIBUTED_2_5_0 = DISTRIBUTED_VERSION > packaging.version.parse("2.5.0")
DISTRIBUTED_2_11_0 = DISTRIBUTED_VERSION > packaging.version.parse("2.10.0") # dev
WINDOWS = os.name == "nt"
Expand All @@ -40,6 +42,15 @@ def check_is_fitted(est, attributes: Optional[Union[str, List[str]]] = None):
return sklearn.utils.validation.check_is_fitted(est, *args)


def _import_sparse():
try:
return importlib.import_module("sparse")
except ImportError:
raise ImportError(
"This requires the optional 'sparse' library. Please install 'sparse'."
)


def _check_multimetric_scoring(estimator, scoring=None):
from sklearn.metrics._scorer import _check_multimetric_scoring

Expand Down
8 changes: 8 additions & 0 deletions dask_ml/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
from sklearn.base import BaseEstimator


def is_sparse(x):
    """Return ``True`` when *x* is a pydata/sparse ``SparseArray``.

    Safely returns ``False`` if the optional ``sparse`` package is not
    installed, so callers need not guard the import themselves.
    """
    try:
        import sparse
    except ImportError:
        return False
    return isinstance(x, sparse.SparseArray)


def copy_learned_attributes(from_estimator, to_estimator):
attrs = {k: v for k, v in vars(from_estimator).items() if k.endswith("_")}

Expand Down
21 changes: 21 additions & 0 deletions dask_ml/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

import dask_ml.utils

from . import _compat


def _check_axis_partitioning(chunks, n_features):
c = chunks[1][0]
Expand All @@ -30,6 +32,7 @@ def make_counts(
scale=1.0,
chunks=100,
random_state=None,
is_sparse=False,
):
"""
Generate a dummy dataset for modeling count data.
Expand Down Expand Up @@ -72,6 +75,11 @@ def make_counts(
z0 = X[:, informative_idx].dot(beta[informative_idx])
rate = da.exp(z0)
y = rng.poisson(rate, size=1, chunks=(chunks,))

if is_sparse:
sparse = _compat._import_sparse()
X = X.map_blocks(sparse.COO)

return X, y


Expand Down Expand Up @@ -218,6 +226,7 @@ def make_regression(
coef=False,
random_state=None,
chunks=None,
is_sparse=False,
):
"""
Generate a random regression problem.
Expand Down Expand Up @@ -334,6 +343,10 @@ def make_regression(

y_big = y_big.squeeze()

if is_sparse:
sparse = _compat._import_sparse()
X_big = X_big.map_blocks(sparse.COO)

if return_coef:
return X_big, y_big, coef
else:
Expand All @@ -357,6 +370,7 @@ def make_classification(
shuffle=True,
random_state=None,
chunks=None,
is_sparse=False,
):
chunks = da.core.normalize_chunks(chunks, (n_samples, n_features))
_check_axis_partitioning(chunks, n_features)
Expand All @@ -378,9 +392,16 @@ def make_classification(
y = rng.random(z0.shape, chunks=chunks[0]) < 1 / (1 + da.exp(-z0))
y = y.astype(int)

if is_sparse:
sparse = _compat._import_sparse()
X = X.map_blocks(sparse.COO)

return X, y


make_poisson = make_counts


def random_date(start, end):
delta = end - start
int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
Expand Down
Loading