diff --git a/.gitignore b/.gitignore index 1e539b7..1bed0ff 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,9 @@ MANIFEST docs/_build docs/_templates docs/*.log -.DS_Store +.DS_Store +.Rproj.user +*.o +*.so +.Rhistory diff --git a/include/blitzml/base/common.h b/include/blitzml/base/common.h index 9014809..a101066 100644 --- a/include/blitzml/base/common.h +++ b/include/blitzml/base/common.h @@ -17,24 +17,29 @@ namespace BlitzML { #define LIBRARY_API extern "C" #endif -typedef double value_t; +typedef double value_t; + +#ifdef BLITZML_R_WRAPPER +typedef uint32_t size_t; +#else typedef std::size_t size_t; -typedef int index_t; +#endif + +typedef uint32_t index_t; + typedef std::vector::iterator index_itr; typedef std::vector::iterator value_itr; typedef std::vector::const_iterator const_index_itr; typedef std::vector::const_iterator const_value_itr; - -void assert(bool okay, std::string error_message); +void assert_with_error_message(bool okay, std::string error_message); void warn_if(bool condition, std::string message); void print(const char* fmt, ...); void debug(const char* fmt, ...); - class ObjectiveValues { public: - ObjectiveValues() + ObjectiveValues() : dual_obj_(std::numeric_limits::min()), primal_obj_x_(std::numeric_limits::max()), primal_obj_y_(std::numeric_limits::max()), @@ -82,9 +87,9 @@ class Parameters { virtual ~Parameters() { debug("delete params"); } - private: + private: const value_t *values; - size_t count; + size_t count; Parameters(); }; diff --git a/r-package/.Rbuildignore b/r-package/.Rbuildignore new file mode 100644 index 0000000..e8e0505 --- /dev/null +++ b/r-package/.Rbuildignore @@ -0,0 +1,3 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^docs/* diff --git a/r-package/DESCRIPTION b/r-package/DESCRIPTION new file mode 100644 index 0000000..a29f9d0 --- /dev/null +++ b/r-package/DESCRIPTION @@ -0,0 +1,24 @@ +Package: blitzml +Type: Package +Title: Efficient Solver for Linear Models with Various Loss Functions and L1 penalty +Version: 0.1.1 +Authors@R: c( + person("Tyler", "Johnson", role = c("aut"), comment = "backend C++ solver"), + person("Dmitriy", "Selivanov", role = c("aut", "cre"), email = "selivanov.dmitriy@gmail.com", comment = "R package") + ) +Description: Efficient Solver for Linear Models with L1 penalty. +License: GPL (>= 2) +Encoding: UTF-8 +LazyData: true +SystemRequirements: GNU make +Depends: + R (>= 3.1.0), + methods +Imports: + Matrix, + R6, + Rcpp (>= 0.12.0), + futile.logger, + data.table +LinkingTo: Rcpp +RoxygenNote: 6.1.1 diff --git a/r-package/NAMESPACE b/r-package/NAMESPACE new file mode 100644 index 0000000..be5f960 --- /dev/null +++ b/r-package/NAMESPACE @@ -0,0 +1,12 @@ +# Generated by roxygen2: do not edit by hand + +export(BlitzMLLassoRegression) +export(auc) +import(Matrix) +import(methods) +importFrom(Rcpp,sourceCpp) +importFrom(futile.logger,flog.debug) +importFrom(futile.logger,flog.info) +importFrom(futile.logger,flog.trace) +importFrom(futile.logger,flog.warn) +useDynLib("blitzml", .registration=TRUE) diff --git a/r-package/R/RcppExports.R b/r-package/R/RcppExports.R new file mode 100644 index 0000000..63e6cbb --- /dev/null +++ b/r-package/R/RcppExports.R @@ -0,0 +1,51 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +BlitzML_new_sparse_dataset <- function(x, y) { + .Call(`_blitzml_BlitzML_new_sparse_dataset`, x, y) +} + +BlitzML_new_dense_dataset <- function(x, y) { + .Call(`_blitzml_BlitzML_new_dense_dataset`, x, y) +} + +BlitzML_new_solver <- function() { + .Call(`_blitzml_BlitzML_new_solver`) +} + +BlitzML_new_logreg_solver <- function() { + .Call(`_blitzml_BlitzML_new_logreg_solver`) +} + +BlitzML_new_linear_solver <- function() { + .Call(`_blitzml_BlitzML_new_linear_solver`) +} + +BlitzML_new_parameters <- function(params) { + .Call(`_blitzml_BlitzML_new_parameters`, params) +} + +BlitzML_solver_compute_max_l1_penalty <- function(xptr_solver, xptr_dataset, xptr_params) { + .Call(`_blitzml_BlitzML_solver_compute_max_l1_penalty`, xptr_solver, xptr_dataset, xptr_params) +} + +BlitzML_solve_problem <- function(xptr_solver, xptr_dataset, xptr_params, result, status_buffer, log_dir) { + invisible(.Call(`_blitzml_BlitzML_solve_problem`, xptr_solver, xptr_dataset, xptr_params, result, status_buffer, log_dir)) +} + +BlitzML_set_tolerance <- function(xptr_solver, value) { + invisible(.Call(`_blitzml_BlitzML_set_tolerance`, xptr_solver, value)) +} + +BlitzML_set_max_time <- function(xptr_solver, value) { + invisible(.Call(`_blitzml_BlitzML_set_max_time`, xptr_solver, value)) +} + +BlitzML_set_max_iterations <- function(xptr_solver, value) { + invisible(.Call(`_blitzml_BlitzML_set_max_iterations`, xptr_solver, value)) +} + +BlitzML_set_use_working_sets <- function(xptr_solver, value) { + invisible(.Call(`_blitzml_BlitzML_set_use_working_sets`, xptr_solver, value)) +} + diff --git a/r-package/R/dataset.R b/r-package/R/dataset.R new file mode 100644 index 0000000..33c6604 --- /dev/null +++ b/r-package/R/dataset.R @@ -0,0 +1,13 @@ +BlitzML_create_dataset = function(x, y) { + + if(!is.numeric(y)) stop("'y' should be numeric vector") + + if(!(inherits(x, "matrix") || inherits(x, "dgCMatrix"))) + stop("'x' should inherit from 'matrix' or 'dgCMatrix'") + + if(inherits(x, "dgCMatrix")) { + BlitzML_new_sparse_dataset(x, y) + } else { + BlitzML_new_dense_dataset(x, y) + } +} diff --git a/r-package/R/plot.R b/r-package/R/plot.R new file mode 100644 index 0000000..14d2cab --- /dev/null +++ b/r-package/R/plot.R @@ -0,0 +1,28 @@ +error.bars = function (x, upper, lower, width = 0.02, ...) { + xlim <- range(x) + barw <- diff(xlim) * width + graphics::segments(x, upper, x, lower, ...) + graphics::segments(x - barw, upper, x + barw, upper, ...) + graphics::segments(x - barw, lower, x + barw, lower, ...) + range(upper, lower) +} + +plot.cv.BlitzMLLassoRegression = function (x, ...) { + cvobj = x + xlab = "log10(lambda)" + + new.args = list(...) + if (length(new.args)) plot.args[names(new.args)] = new.args + plot.args = list(x = log10(cvobj$lambda), + y = cvobj$cvm, + ylim = range(cvobj$cvup, cvobj$cvlo), + xlab = xlab, + ylab = "auc", + type = "n") + do.call("plot", plot.args) + + error.bars(log10(cvobj$lambda), cvobj$cvup, + cvobj$cvlo, width = 0.01, col = "darkgrey") + graphics::points(log10(cvobj$lambda), cvobj$cvm, pch = 20, col = "red") + invisible() +} diff --git a/r-package/R/solver.R b/r-package/R/solver.R new file mode 100644 index 0000000..8abe7cf --- /dev/null +++ b/r-package/R/solver.R @@ -0,0 +1,259 @@ +#' @export +BlitzMLLassoRegression = R6::R6Class( + "BlitzMLLassoRegression", + public = list( + initialize = function(loss = c("squared", "logistic"), #c("squared", "huber", "logistic", "squared_hinge", "smoothed_hinge"), + lambda = "auto", + n_lambda = 20, + lambda_min_fraction = 0.0001, + lambda_max_fraction = 0.3, + include_bias_term = TRUE, + max_nonzero = Inf, + tol = 1e-3, + max_time = 60, + max_iter = 10000, + log_dir = tempdir()) { + + loss = match.arg(loss) + check_loss(loss) + + stopifnot( + is.numeric(lambda_min_fraction) && + is.numeric(lambda_max_fraction) && + is.numeric(n_lambda) && + is.numeric(tol) && + is.numeric(max_time) && + is.numeric(max_iter) && + is.numeric(max_nonzero) && + is.logical(include_bias_term) + ) + + stopifnot(is.numeric(lambda) || identical(lambda, "auto")) + if (is.numeric(lambda)) lambda = sort(lambda, decreasing = TRUE) + + private$n_lambda = n_lambda[[1]] + private$lambda_min_fraction = lambda_min_fraction[[1]] + private$lambda_max_fraction = lambda_max_fraction[[1]] + private$tol = tol + private$max_iter = max_iter + private$max_time = max_time + private$loss = loss + private$log_dir = log_dir + + private$params = list(lambda = lambda, + undefined_param = 0, + include_bias_term = as.numeric(include_bias_term), + loss_index = encode_loss(private$loss), + loss = private$loss, + max_nonzero = max_nonzero) + futile.logger::flog.debug("params : %s", paste(names(private$params), private$params, sep = "=", collapse = ", ")) + }, + fit = function(x, y, ...) { + stopifnot(is.numeric(y)) + if (inherits(x, "sparseMatrix")) { + x = as(x, "dgCMatrix") + } else { + if (!inherits(x, "matrix")) + stop("'x' should inherit from 'matrix' or 'Matrix::sparseMatrix'") + } + + stopifnot(nrow(x) == length(y)) + solver = BlitzML_create_solver(private$loss) + + # BlitzML requirement - positive classes encoded as 1 and negative as -1 + # FIXME this is for logistic solver only! + if (private$loss == "logistic") y = ifelse(y > 0, 1, -1) + + BlitzML_set_tolerance(solver, private$tol) + BlitzML_set_use_working_sets(solver, TRUE) + BlitzML_set_max_time(solver, private$max_time) + BlitzML_set_max_iterations(solver, private$max_iter) + + num_features = ncol(x) + num_examples = nrow(x) + # FIXME + # https://github.com/tbjohns/BlitzML/blob/93f80a83e206001817a0b1a0b81db6455ef44a87/python-package/_sparse_linear.py#L141 + result_buffer = numeric(num_features + 1 + num_examples + 2) + + dataset = BlitzML_create_dataset(x, y) + + if (is.null(self$lambda_seq)) { + if (identical(private$params$lambda, "auto")) { + self$lambda_seq = private$generate_lambda_seq(x, y) + } else { + self$lambda_seq = private$params$lambda + } + } + + res = list() + for (lambda in self$lambda_seq) { + status_buffer = raw(STATUS_BUFFER_SIZE) + start = Sys.time() + params = c(lambda = lambda, + # FIXME https://github.com/tbjohns/BlitzML/blob/93f80a83e206001817a0b1a0b81db6455ef44a87/python-package/_sparse_linear.py#L134 + undefined_param = private$params$undefined_param, + include_bias_term = private$params$include_bias_term, + loss_index = private$params$loss_index) + params = BlitzML_new_parameters(params) + BlitzML_solve_problem(solver, dataset, params, result_buffer, status_buffer, private$log_dir) + + coefs = result_buffer[1L:num_features] + bias = result_buffer[[num_features + 1L]] + + status = rawToChar(status_buffer) + time_spent = difftime(Sys.time(), start, "sec") + + if (status == 'Exceeded time limit') + flog.warn("haven't converged for lambda %f - exceeded time limit of %f sec", lambda, private$max_time) + + flog.trace("solved in %f sec for lambda %f, %d non-zero coef (status: '%s')", + time_spent, lambda, sum(coefs != 0), status) + + if (sum(coefs != 0) > private$params$max_nonzero) { + break() + } + # put bias followed coefficients and convert to sparse matrix (actually vector) + res[[length(res) + 1L]] = c(bias, coefs) + } + self$lambda_seq = self$lambda_seq[seq_along(res)] + cn = paste("lambda=", self$lambda_seq, sep = "") + res = do.call(cbind, res) + colnames(res) = cn + res = as(res, "CsparseMatrix") + + cn = colnames(x) + if (is.null(cn)) + cn = paste("V", seq_len(ncol(x)), sep = "") + rownames(res) = c("bias", cn) + self$coef = res + invisible(self$coef) + }, + predict = function(x, ...) { + # add first dummy column of ones in order to add biases via matrix multiplication + x = cbind(rep(1, nrow(x)), x) + res = x %*% self$coef + if (private$loss == "logistic") res = sigmoid(res) + as.matrix(res) + }, + cross_validate = function(x, y, + fold_id = NULL, + n_folds = 4, + callbacks = list(auc = blitzml::auc), + lambda = "auto", + cores = getOption("mc.cores", parallel::detectCores()), + ...) { + + if (identical(lambda, "auto")) { + private$params$lambda = lambda + self$lambda_seq = private$generate_lambda_seq(x, y) + } else { + self$lambda_seq = lambda + } + + if (.Platform$OS.type != "unix" && cores > 1) { + warning("Windows detected - setting mc.cores = 1") + cores = 1L + } + + lapply(callbacks, function(f) stopifnot(is.function(f) && length(formals(f)) == 2L)) + + if (is.null(fold_id)) { + i_neg = y == -1 + i_pos = which(!i_neg) + i_neg = which(i_neg) + fold_id = rep(0, nrow(x)) + fold_id[i_pos] = sample.int(n_folds, length(i_pos), replace = TRUE) + fold_id[i_neg] = sample.int(n_folds, length(i_neg), replace = TRUE) + } + + folds = unique(fold_id) + + res = parallel::mclapply(folds, function(fld) { + i = fold_id == fld + + x_train = x[i, , drop = FALSE] + y_train = y[i] + + x_test = x[!i, , drop = FALSE] + y_test = y[!i] + + coef = self$fit(x_train, y_train) + preds = self$predict(x_test) + flog.trace("executing callbacks") + scores = lapply(callbacks, function(f) { + vapply(1:ncol(preds), function(col) { + f(y_test, preds[, col]) + }, 0.0) + }) + ret = data.frame(fold = fld, scores, list(lambda = self$lambda_seq)) + # attr(ret, "coef") = coef + ret + }, mc.cores = cores, ...) + ret = do.call(rbind, res) + class(ret) = c("cv.BlitzMLLassoRegression", class(ret)) + # attr(ret, "coef") = lapply(res, function(x) attr(x, "coef")) + ret + }, + coef = NULL, + lambda_seq = NULL + ), + private = list( + generate_lambda_seq = function(x, y) { + # lambda_max = find_max_lambda(x, y, params = c(0, 0, private$params$include_bias_term, private$params$loss_index)) + lambda_max = find_max_lambda(x, y, private$params) + flog.info("found max lambda: %f", lambda_max) + if (lambda_max == 0) { + lambda_seq = 0 + } else { + lambda_seq = seq( + log10(private$lambda_max_fraction * lambda_max), + log10(private$lambda_min_fraction * lambda_max), + length.out = private$n_lambda + ) + lambda_seq = 10 ** lambda_seq + } + lambda_seq + }, + params = NULL, + loss = NULL, + log_dir = NULL, + lambda_min_fraction = NULL, + lambda_max_fraction = NULL, + n_lambda = NULL, + tol = NULL, + max_iter = NULL, + max_time = NULL + ) +) + +BlitzML_create_solver = function(loss) { + check_loss(loss) + switch(loss, + squared = BlitzML_new_linear_solver(), + huber = BlitzML_new_solver(), + logistic = BlitzML_new_logreg_solver(), + squared_hinge = BlitzML_new_solver(), + smoothed_hinge = BlitzML_new_solver(), + stop(sprintf("solver for '%s' loss function is not supported yet", loss)) + ) +} + +check_loss = function(loss) { + if (!(loss %in% IMPLEMENTED_LOSS_FUNCTIONS)) + stop(sprintf("only %s loss supported at the moment", paste(paste("'", IMPLEMENTED_LOSS_FUNCTIONS, "'", sep = ""), collapse = "/"))) + invisible(TRUE) +} + +encode_loss = function(x) { + match(x, AVAILABLE_LOSS_FUNCTIONS) - 1L # loss codes start from 0 in BlitzML +} + +find_max_lambda = function(x, y, params) { + solver = BlitzML_create_solver(params$loss) + dataset = BlitzML_create_dataset(x, y) + # https://github.com/tbjohns/BlitzML/blob/93f80a83e206001817a0b1a0b81db6455ef44a87/python-package/_sparse_linear.py#L42 + params_vec = c(0, 0, params$include_bias_term, params$loss_index) + params_vec = BlitzML_new_parameters(params_vec) + + BlitzML_solver_compute_max_l1_penalty(solver, dataset, params_vec) +} diff --git a/r-package/R/utils.R b/r-package/R/utils.R new file mode 100644 index 0000000..9c69b90 --- /dev/null +++ b/r-package/R/utils.R @@ -0,0 +1,13 @@ +sigmoid = function(x) { + 1 / (1 + exp(-x)) +} + +#' @export +auc = function (actual, predicted) { + actual = as.integer(actual > 0) + rprob = data.table::frank(predicted) + n1 = sum(actual) + n0 = length(actual) - n1 + u = sum(rprob[actual == 1]) - n1 * (n1 + 1)/2 + exp(log(u) - log(n1) - log(n0)) +} diff --git a/r-package/R/zzz.R b/r-package/R/zzz.R new file mode 100644 index 0000000..d08e973 --- /dev/null +++ b/r-package/R/zzz.R @@ -0,0 +1,10 @@ +#' @useDynLib "blitzml", .registration=TRUE +#' @importFrom Rcpp sourceCpp +#' @import methods +#' @import Matrix +#' @importFrom futile.logger flog.info flog.debug flog.trace flog.warn + +STATUS_BUFFER_SIZE = 64L + +AVAILABLE_LOSS_FUNCTIONS = c("squared", "huber", "logistic", "squared_hinge", "smoothed_hinge") +IMPLEMENTED_LOSS_FUNCTIONS = c("squared", "logistic") diff --git a/r-package/README.md b/r-package/README.md new file mode 100644 index 0000000..15ead0b --- /dev/null +++ b/r-package/README.md @@ -0,0 +1,56 @@ +# Work in progress + +## Installation + +```bash +git clone https://github.com/dselivanov/BlitzML.git +R CMD INSTALL r-package +``` + +- At the moment package supports only logistic regression on sparse matrices in CSC format. + +## Example + +Here is exmple on sentiment classification. We perform 4-fold cross-validation and track AUC: +```r +library(blitzml) +library(text2vec) +library(futile.logger) +flog.threshold(INFO, "blitzml") + +it = itoken(movie_review$review, tolower, word_tokenizer) +dtm = create_dtm(it, hash_vectorizer(2**14)) + +system.time({ + set.seed(1) + model = BlitzMLLassoRegression$new(loss = "logistic", + lambda = "auto", + tol = 1e-2, + use_working_sets = TRUE) + cv_stat = model$cross_validate(dtm, movie_review$sentiment, n_folds = 8, cores = 1) +}) + +#INFO [2018-05-16 20:55:51] found max lambda: 1699.562800 +# user system elapsed +# 3.935 0.117 4.091 +``` + +And visialization of lambda path: +```r +library(data.table) +library(ggplot2) + +setDT(cv_stat) +cv_stat = cv_stat[, .(cvm = mean(auc), sd = sd(auc)), keyby = .(lambda)] + +ggplot(data = cv_stat, aes(x = lambda, y = cvm, col = 'red')) + + geom_point(size = 2) + + geom_line(size = 1) + + ylab("auc") + + scale_color_discrete(guide=FALSE) + + geom_errorbar(aes(ymin = cvm - sd, ymax = cvm + sd) , width=.01, alpha = 0.3) + + scale_x_log10() + + theme_bw() +``` + +![cv](docs/img/1-cross-validation.png) diff --git a/r-package/blitzml.Rproj b/r-package/blitzml.Rproj new file mode 100644 index 0000000..270314b --- /dev/null +++ b/r-package/blitzml.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/r-package/docs/img/1-cross-validation.png b/r-package/docs/img/1-cross-validation.png new file mode 100644 index 0000000..4810c52 Binary files /dev/null and b/r-package/docs/img/1-cross-validation.png differ diff --git a/r-package/inst/include b/r-package/inst/include new file mode 120000 index 0000000..3611dd2 --- /dev/null +++ b/r-package/inst/include @@ -0,0 +1 @@ +../../include/ \ No newline at end of file diff --git a/r-package/src/Makevars b/r-package/src/Makevars new file mode 100644 index 0000000..18ba54a --- /dev/null +++ b/r-package/src/Makevars @@ -0,0 +1,16 @@ +CXX_STD = CXX11 + +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) + +# https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Writing-portable-packages +# potentially it can make sense to remove 'wildcard' +SOURCES=$(wildcard base/*.cpp dataset/*.cpp smooth_loss/*.cpp sparse_linear/*.cpp) + +OBJECTS = wrap_solver.o wrap_dataset.o RcppExports.o $(SOURCES:.cpp=.o) + +all: $(SHLIB) + +clean: + @rm -f $(OBJECTS) + +PKG_CXXFLAGS = -I../inst/include -DBLITZML_R_WRAPPER -DR_USE_C99_IN_CXX diff --git a/r-package/src/RcppExports.cpp b/r-package/src/RcppExports.cpp new file mode 100644 index 0000000..6ed291e --- /dev/null +++ b/r-package/src/RcppExports.cpp @@ -0,0 +1,165 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// BlitzML_new_sparse_dataset +SEXP BlitzML_new_sparse_dataset(const Rcpp::S4& x, const Rcpp::NumericVector& y); +RcppExport SEXP _blitzml_BlitzML_new_sparse_dataset(SEXP xSEXP, SEXP ySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::S4& >::type x(xSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type y(ySEXP); + rcpp_result_gen = Rcpp::wrap(BlitzML_new_sparse_dataset(x, y)); + return rcpp_result_gen; +END_RCPP +} +// BlitzML_new_dense_dataset +SEXP BlitzML_new_dense_dataset(const Rcpp::NumericMatrix& x, const Rcpp::NumericVector& y); +RcppExport SEXP _blitzml_BlitzML_new_dense_dataset(SEXP xSEXP, SEXP ySEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const Rcpp::NumericVector& >::type y(ySEXP); + rcpp_result_gen = Rcpp::wrap(BlitzML_new_dense_dataset(x, y)); + return rcpp_result_gen; +END_RCPP +} +// BlitzML_new_solver +SEXP BlitzML_new_solver(); +RcppExport SEXP _blitzml_BlitzML_new_solver() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(BlitzML_new_solver()); + return rcpp_result_gen; +END_RCPP +} +// BlitzML_new_logreg_solver +SEXP BlitzML_new_logreg_solver(); +RcppExport SEXP _blitzml_BlitzML_new_logreg_solver() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(BlitzML_new_logreg_solver()); + return rcpp_result_gen; +END_RCPP +} +// BlitzML_new_linear_solver +SEXP BlitzML_new_linear_solver(); +RcppExport SEXP _blitzml_BlitzML_new_linear_solver() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(BlitzML_new_linear_solver()); + return rcpp_result_gen; +END_RCPP +} +// BlitzML_new_parameters +SEXP BlitzML_new_parameters(const Rcpp::NumericVector params); +RcppExport SEXP _blitzml_BlitzML_new_parameters(SEXP paramsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::NumericVector >::type params(paramsSEXP); + rcpp_result_gen = Rcpp::wrap(BlitzML_new_parameters(params)); + return rcpp_result_gen; +END_RCPP +} +// BlitzML_solver_compute_max_l1_penalty +double BlitzML_solver_compute_max_l1_penalty(SEXP xptr_solver, SEXP xptr_dataset, SEXP xptr_params); +RcppExport SEXP _blitzml_BlitzML_solver_compute_max_l1_penalty(SEXP xptr_solverSEXP, SEXP xptr_datasetSEXP, SEXP xptr_paramsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xptr_solver(xptr_solverSEXP); + Rcpp::traits::input_parameter< SEXP >::type xptr_dataset(xptr_datasetSEXP); + Rcpp::traits::input_parameter< SEXP >::type xptr_params(xptr_paramsSEXP); + rcpp_result_gen = Rcpp::wrap(BlitzML_solver_compute_max_l1_penalty(xptr_solver, xptr_dataset, xptr_params)); + return rcpp_result_gen; +END_RCPP +} +// BlitzML_solve_problem +void BlitzML_solve_problem(SEXP xptr_solver, SEXP xptr_dataset, SEXP xptr_params, Rcpp::NumericVector& result, Rcpp::RawVector& status_buffer, const Rcpp::String& log_dir); +RcppExport SEXP _blitzml_BlitzML_solve_problem(SEXP xptr_solverSEXP, SEXP xptr_datasetSEXP, SEXP xptr_paramsSEXP, SEXP resultSEXP, SEXP status_bufferSEXP, SEXP log_dirSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xptr_solver(xptr_solverSEXP); + Rcpp::traits::input_parameter< SEXP >::type xptr_dataset(xptr_datasetSEXP); + Rcpp::traits::input_parameter< SEXP >::type xptr_params(xptr_paramsSEXP); + Rcpp::traits::input_parameter< Rcpp::NumericVector& >::type result(resultSEXP); + Rcpp::traits::input_parameter< Rcpp::RawVector& >::type status_buffer(status_bufferSEXP); + Rcpp::traits::input_parameter< const Rcpp::String& >::type log_dir(log_dirSEXP); + BlitzML_solve_problem(xptr_solver, xptr_dataset, xptr_params, result, status_buffer, log_dir); + return R_NilValue; +END_RCPP +} +// BlitzML_set_tolerance +void BlitzML_set_tolerance(SEXP xptr_solver, double value); +RcppExport SEXP _blitzml_BlitzML_set_tolerance(SEXP xptr_solverSEXP, SEXP valueSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xptr_solver(xptr_solverSEXP); + Rcpp::traits::input_parameter< double >::type value(valueSEXP); + BlitzML_set_tolerance(xptr_solver, value); + return R_NilValue; +END_RCPP +} +// BlitzML_set_max_time +void BlitzML_set_max_time(SEXP xptr_solver, double value); +RcppExport SEXP _blitzml_BlitzML_set_max_time(SEXP xptr_solverSEXP, SEXP valueSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xptr_solver(xptr_solverSEXP); + Rcpp::traits::input_parameter< double >::type value(valueSEXP); + BlitzML_set_max_time(xptr_solver, value); + return R_NilValue; +END_RCPP +} +// BlitzML_set_max_iterations +void BlitzML_set_max_iterations(SEXP xptr_solver, unsigned value); +RcppExport SEXP _blitzml_BlitzML_set_max_iterations(SEXP xptr_solverSEXP, SEXP valueSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xptr_solver(xptr_solverSEXP); + Rcpp::traits::input_parameter< unsigned >::type value(valueSEXP); + BlitzML_set_max_iterations(xptr_solver, value); + return R_NilValue; +END_RCPP +} +// BlitzML_set_use_working_sets +void BlitzML_set_use_working_sets(SEXP xptr_solver, bool value); +RcppExport SEXP _blitzml_BlitzML_set_use_working_sets(SEXP xptr_solverSEXP, SEXP valueSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xptr_solver(xptr_solverSEXP); + Rcpp::traits::input_parameter< bool >::type value(valueSEXP); + BlitzML_set_use_working_sets(xptr_solver, value); + return R_NilValue; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_blitzml_BlitzML_new_sparse_dataset", (DL_FUNC) &_blitzml_BlitzML_new_sparse_dataset, 2}, + {"_blitzml_BlitzML_new_dense_dataset", (DL_FUNC) &_blitzml_BlitzML_new_dense_dataset, 2}, + {"_blitzml_BlitzML_new_solver", (DL_FUNC) &_blitzml_BlitzML_new_solver, 0}, + {"_blitzml_BlitzML_new_logreg_solver", (DL_FUNC) &_blitzml_BlitzML_new_logreg_solver, 0}, + {"_blitzml_BlitzML_new_linear_solver", (DL_FUNC) &_blitzml_BlitzML_new_linear_solver, 0}, + {"_blitzml_BlitzML_new_parameters", (DL_FUNC) &_blitzml_BlitzML_new_parameters, 1}, + {"_blitzml_BlitzML_solver_compute_max_l1_penalty", (DL_FUNC) &_blitzml_BlitzML_solver_compute_max_l1_penalty, 3}, + {"_blitzml_BlitzML_solve_problem", (DL_FUNC) &_blitzml_BlitzML_solve_problem, 6}, + {"_blitzml_BlitzML_set_tolerance", (DL_FUNC) &_blitzml_BlitzML_set_tolerance, 2}, + {"_blitzml_BlitzML_set_max_time", (DL_FUNC) &_blitzml_BlitzML_set_max_time, 2}, + {"_blitzml_BlitzML_set_max_iterations", (DL_FUNC) &_blitzml_BlitzML_set_max_iterations, 2}, + {"_blitzml_BlitzML_set_use_working_sets", (DL_FUNC) &_blitzml_BlitzML_set_use_working_sets, 2}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_blitzml(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/r-package/src/base b/r-package/src/base new file mode 120000 index 0000000..1210551 --- /dev/null +++ b/r-package/src/base @@ -0,0 +1 @@ +../../src/base/ \ No newline at end of file diff --git a/r-package/src/dataset b/r-package/src/dataset new file mode 120000 index 0000000..c409859 --- /dev/null +++ b/r-package/src/dataset @@ -0,0 +1 @@ +../../src/dataset/ \ No newline at end of file diff --git a/r-package/src/mapped_csc_matrix.h b/r-package/src/mapped_csc_matrix.h new file mode 100644 index 0000000..f3b6931 --- /dev/null +++ b/r-package/src/mapped_csc_matrix.h @@ -0,0 +1,32 @@ +#include + +template< typename T> +class MappedCSC { +public: + MappedCSC(); + MappedCSC(std::uint32_t n_rows, + std::uint32_t n_cols, + uint32_t nnz, + std::uint32_t * row_indices, + std::uint32_t * col_ptrs, + T * values): + n_rows(n_rows), n_cols(n_cols), nnz(nnz), row_indices(row_indices), col_ptrs(col_ptrs), values(values) {}; + const std::uint32_t n_rows; + const std::uint32_t n_cols; + const uint32_t nnz; + const std::uint32_t * row_indices; + const std::uint32_t * col_ptrs; + T * values; +}; + +using dMappedCSC = MappedCSC; + +dMappedCSC extract_mapped_csc(Rcpp::S4 input) { + Rcpp::IntegerVector dim = input.slot("Dim"); + Rcpp::NumericVector values = input.slot("x"); + uint32_t nrows = dim[0]; + uint32_t ncols = dim[1]; + Rcpp::IntegerVector row_indices = input.slot("i"); + Rcpp::IntegerVector col_ptrs = input.slot("p"); + return dMappedCSC(nrows, ncols, values.length(), (uint32_t *)row_indices.begin(), (uint32_t *)col_ptrs.begin(), values.begin()); +} diff --git a/r-package/src/smooth_loss b/r-package/src/smooth_loss new file mode 120000 index 0000000..7a16780 --- /dev/null +++ b/r-package/src/smooth_loss @@ -0,0 +1 @@ +../../src/smooth_loss/ \ No newline at end of file diff --git a/r-package/src/sparse_linear b/r-package/src/sparse_linear new file mode 120000 index 0000000..591e05d --- /dev/null +++ b/r-package/src/sparse_linear @@ -0,0 +1 @@ +../../src/sparse_linear/ \ No newline at end of file diff --git a/r-package/src/wrap_dataset.cpp b/r-package/src/wrap_dataset.cpp new file mode 100644 index 0000000..3622cbb --- /dev/null +++ b/r-package/src/wrap_dataset.cpp @@ -0,0 +1,27 @@ +#include +#include "mapped_csc_matrix.h" +#include +#include + +// [[Rcpp::export]] +SEXP BlitzML_new_sparse_dataset(const Rcpp::S4 &x, + const Rcpp::NumericVector &y) { + dMappedCSC x_csc = extract_mapped_csc(x); + double *y_ptr = (double *)y.begin(); + BlitzML::SparseDataset* dataset = + new BlitzML::SparseDataset(x_csc.row_indices, x_csc.col_ptrs, x_csc.values, + x_csc.n_rows, x_csc.n_cols, x_csc.nnz, y_ptr, y.size()); + Rcpp::XPtr> ptr(dataset, true); + return ptr; +} + +// [[Rcpp::export]] +SEXP BlitzML_new_dense_dataset(const Rcpp::NumericMatrix &x, + const Rcpp::NumericVector &y) { + double *x_ptr = (double *)x.begin(); + double *y_ptr = (double *)y.begin(); + BlitzML::DenseDataset* dataset = + new BlitzML::DenseDataset(x_ptr, x.nrow(), x.ncol(), y_ptr, y.size()); + Rcpp::XPtr> ptr(dataset, true); + return ptr; +} diff --git a/r-package/src/wrap_solver.cpp b/r-package/src/wrap_solver.cpp new file mode 100644 index 0000000..ed0f469 --- /dev/null +++ b/r-package/src/wrap_solver.cpp @@ -0,0 +1,86 @@ +#include + +#include +#include +#include +#include +#include +#include + + +// [[Rcpp::export]] +SEXP BlitzML_new_solver() { + BlitzML::SparseLinearSolver* solver = new BlitzML::SparseLinearSolver(); + Rcpp::XPtr ptr(solver, true); + return ptr; +} + +// [[Rcpp::export]] +SEXP BlitzML_new_logreg_solver() { + BlitzML::SparseLogRegSolver* solver = new BlitzML::SparseLogRegSolver(); + Rcpp::XPtr ptr(solver, true); + return ptr; +} + +// [[Rcpp::export]] +SEXP BlitzML_new_linear_solver() { + BlitzML::LassoSolver* solver = new BlitzML::LassoSolver(); + Rcpp::XPtr ptr(solver, true); + return ptr; +} + +// [[Rcpp::export]] +SEXP BlitzML_new_parameters(const Rcpp::NumericVector params) { + BlitzML::Parameters* blitz_params = new BlitzML::Parameters((double*)params.begin(), params.size() ); + Rcpp::XPtr ptr(blitz_params, true); + return ptr; +} + +// [[Rcpp::export]] +double BlitzML_solver_compute_max_l1_penalty( + SEXP xptr_solver, + SEXP xptr_dataset, + SEXP xptr_params) { + Rcpp::XPtr< BlitzML::SparseLinearSolver > solver(xptr_solver); + Rcpp::XPtr< BlitzML::Dataset> data(xptr_dataset); + Rcpp::XPtr< BlitzML::Parameters > params(xptr_params); + return solver->compute_max_l1_penalty(data, params); +} + +// [[Rcpp::export]] +void BlitzML_solve_problem(SEXP xptr_solver, + SEXP xptr_dataset, + SEXP xptr_params, + Rcpp::NumericVector &result, + Rcpp::RawVector &status_buffer, + const Rcpp::String &log_dir) { + Rcpp::XPtr< BlitzML::SparseLinearSolver > solver(xptr_solver); + Rcpp::XPtr< BlitzML::SparseDataset > data(xptr_dataset); + Rcpp::XPtr< BlitzML::Parameters > params(xptr_params); + double *result_ptr = (double *)result.begin(); + char *buf = (char *)status_buffer.begin(); + solver->solve(data, params, result_ptr, buf, log_dir.get_cstring()); +} + +// [[Rcpp::export]] +void BlitzML_set_tolerance(SEXP xptr_solver, double value) { + Rcpp::XPtr< BlitzML::SparseLinearSolver > solver(xptr_solver); + solver->set_tolerance(value); +} +// [[Rcpp::export]] +void BlitzML_set_max_time(SEXP xptr_solver, double value) { + Rcpp::XPtr< BlitzML::SparseLinearSolver > solver(xptr_solver); + solver->set_max_time(value); +} + +// [[Rcpp::export]] +void BlitzML_set_max_iterations(SEXP xptr_solver, unsigned value) { + Rcpp::XPtr< BlitzML::SparseLinearSolver > solver(xptr_solver); + solver->set_max_iterations(value); +} + +// [[Rcpp::export]] +void BlitzML_set_use_working_sets(SEXP xptr_solver, bool value) { + Rcpp::XPtr< BlitzML::SparseLinearSolver > solver(xptr_solver); + solver->set_use_working_sets(value); +} diff --git a/src/base/common.cpp b/src/base/common.cpp index 95c1a64..9d29121 100644 --- a/src/base/common.cpp +++ b/src/base/common.cpp @@ -4,26 +4,39 @@ #include #include +#ifdef BLITZML_R_WRAPPER +#include +// https://cran.r-project.org/doc/manuals/r-release/R-exts.html#index-Rvprintf +#include +#define vprintf Rvprintf +#define printf Rprintf +#endif + using std::string; using std::cout; using std::cerr; using std::endl; - namespace BlitzML { -void assert(bool okay, string error_message) { +void assert_with_error_message(bool okay, string error_message) { if (!okay) { - cerr << "Program exited with error: " - << error_message - << endl; +#ifdef BLITZML_R_WRAPPER + Rcpp::Rcerr << "Program exited with error: " << error_message << endl; +#else + cerr << "Program exited with error: " << error_message << endl; +#endif throw error_message; } } void warn_if(bool condition, std::string message) { if (condition) { +#ifdef BLITZML_R_WRAPPER + Rcpp::Rcout << "Warning: " << message << endl; +#else cout << "Warning: " << message << endl; +#endif } } diff --git a/src/base/scheduler.cpp b/src/base/scheduler.cpp index 8d58b17..d64c1f0 100644 --- a/src/base/scheduler.cpp +++ b/src/base/scheduler.cpp @@ -27,7 +27,7 @@ value_t Scheduler::estimate_C_progress() { value_t ret = 1.; if (C_progress_estimates.size() > 0) { ret = median_last_k(C_progress_estimates, 2); - } + } if (ret == ret && ret >= 1.) { return ret; } else { @@ -42,16 +42,16 @@ void Scheduler::set_problem_sizes(const std::vector &problem_sizes) { } -void Scheduler::optimize_xi_and_epsilon(unsigned short &best_capsule_ind, +void Scheduler::optimize_xi_and_epsilon(unsigned short &best_capsule_ind, value_t &best_epsilon, double &best_time_limit) { - assert(gamma > 0, "in scheduler, must set gamma to be > 0"); - assert(xi_candidates.size() > 0, + assert_with_error_message(gamma > 0, "in scheduler, must set gamma to be > 0"); + assert_with_error_message(xi_candidates.size() > 0, "in scheduler, must have at least 1 xi candidate"); - assert(epsilon_candidates.size() > 0, + assert_with_error_message(epsilon_candidates.size() > 0, "in scheduler, must have at least 1 epsilon candidate"); - + value_t C_progress = estimate_C_progress(); double T_overhead = overhead_time_estimate(); @@ -98,7 +98,7 @@ void Scheduler::optimize_xi_and_epsilon(unsigned short &best_capsule_ind, } -void Scheduler::record_subproblem_size(value_t xi, +void Scheduler::record_subproblem_size(value_t xi, value_t subproblem_size) { chosen_xi = xi; chosen_subproblem_size = subproblem_size; @@ -133,7 +133,7 @@ void Scheduler::record_subproblem_progress(value_t new_Delta, //C_progress_est = 1.5 / chosen_xi; //} else { C_progress_est = (1 - new_Delta / current_Delta) / (1 - realized_epsilon) / chosen_xi; - //} + //} /* if (C_progress_est != C_progress_est || C_progress_est < 1) { @@ -151,7 +151,7 @@ value_t Scheduler::subproblem_time_complexity(value_t epsilon) { double Scheduler::overhead_time_estimate() { - assert(C_overhead_time_estimates.size() > 0, + assert_with_error_message(C_overhead_time_estimates.size() > 0, "cannot estimate overhead time before calling record_overhead_time"); return median_last_k(C_overhead_time_estimates, 5); } diff --git a/src/sparse_linear/logreg_solver.cpp b/src/sparse_linear/logreg_solver.cpp index 4a9c765..a65fd8c 100644 --- a/src/sparse_linear/logreg_solver.cpp +++ b/src/sparse_linear/logreg_solver.cpp @@ -108,11 +108,11 @@ void SparseLogRegSolver::update_bias(int max_newton_itr) { return; } - value_t exp_delta_total = 1.0; + value_t exp_delta_total = 1.0; if (is_vector_const(exp_Aomega)) { // Special case closed-form solution: // (this case occurs when we initialize model as all zeros) - exp_delta_total = (exp_Aomega[0] * num_positive_labels) / + exp_delta_total = (exp_Aomega[0] * num_positive_labels) / (num_examples - num_positive_labels); scale_vector(exp_Aomega, exp_delta_total); max_newton_itr = 0; @@ -148,7 +148,7 @@ void SparseLogRegSolver::update_bias(int max_newton_itr) { value_t change = log(exp_delta_total); bias += change; - + sum_x = 0.; for (index_t j = 0; j < num_examples; ++j) { Aomega[j] += change; @@ -167,7 +167,7 @@ void SparseLogRegSolver::perform_backtracking() { std::vector low_exp_Aomega = exp_Aomega; sum_x = 0.; - for (int j = 0; j < num_examples; ++j) { + for (index_t j = 0; j < num_examples; ++j) { exp_Aomega[j] = exp(Aomega[j] + Delta_Aomega[j] + Delta_bias); x[j] = compute_x_value(j); sum_x += x[j]; @@ -190,7 +190,7 @@ void SparseLogRegSolver::perform_backtracking() { low_exp_Aomega[j] < 1e15 && low_exp_Aomega[j] > 1e-15) { exp_Aomega[j] = sqrt(high_exp_Aomega[j] * low_exp_Aomega[j]); } else { - exp_Aomega[j] = exp(Aomega[j] + + exp_Aomega[j] = exp(Aomega[j] + step_size * (Delta_Aomega[j] + Delta_bias)); } x[j] = compute_x_value(j); @@ -215,8 +215,8 @@ void SparseLogRegSolver::perform_backtracking() { } } - for (const_index_itr ind = ws.begin_indices(); - ind != ws.end_indices(); + for (const_index_itr ind = ws.begin_indices(); + ind != ws.end_indices(); ++ind) { index_t i = ws.ith_member(*ind); omega[i] += step_size * Delta_omega[*ind]; @@ -245,7 +245,7 @@ value_t SparseLogRegSolver::compute_dual_obj() const { } -void SparseLogRegSolver::update_subproblem_obj_vals() { +void SparseLogRegSolver::update_subproblem_obj_vals() { obj_vals.set_dual_obj(compute_dual_obj()); value_t gap = 0.; @@ -256,7 +256,7 @@ void SparseLogRegSolver::update_subproblem_obj_vals() { if (log_kappa != log_kappa) { throw log_kappa; } - for (int j = 0; j < num_examples; ++j) { + for (index_t j = 0; j < num_examples; ++j) { value_t prob, mid_term, last_term; if (is_positive_label[j]) { prob = kappa_x / (1 + exp_Aomega[j]);