Changes from all commits
Commits
301 commits
acf68c1
Merge branch 'master' into testSVD
Dec 2, 2016
2dc0d7e
[SPARK-18324][ML][DOC] Update ML programming and migration guide for …
yanboliang Dec 3, 2016
a9cbfc4
[SPARK-18690][PYTHON][SQL] Backward compatibility of unbounded frames
zero323 Dec 3, 2016
c7c7265
[SPARK-18695] Bump master branch version to 2.2.0-SNAPSHOT
rxin Dec 3, 2016
7c33b0f
[SPARK-18362][SQL] Use TextFileFormat in implementation of CSVFileFormat
JoshRosen Dec 3, 2016
553aac5
[SPARK-18586][BUILD] netty-3.8.0.Final.jar has vulnerability CVE-2014…
srowen Dec 3, 2016
d1312fb
[SPARK-18685][TESTS] Fix URI and release resources after opening in t…
HyukjinKwon Dec 3, 2016
5761973
[SPARK-18638][BUILD] Upgrade sbt, Zinc, and Maven plugins
weiqingy Dec 3, 2016
4a3c096
[SPARK-18582][SQL] Whitelist LogicalPlan operators allowed in correla…
nsyca Dec 3, 2016
3477718
[SPARK-18081][ML][DOCS] Add user guide for Locality Sensitive Hashing…
Yunni Dec 4, 2016
edb0ad9
[MINOR][README] Correct Markdown link inside readme
linbojin Dec 4, 2016
e463678
[SPARK-18091][SQL] Deep if expressions cause Generated SpecificUnsafe…
Dec 4, 2016
d9eb4c7
[SPARK-18661][SQL] Creating a partitioned datasource table should not…
ericl Dec 4, 2016
62947a6
change the way of genearting random matrix.
Dec 5, 2016
b019b3a
[SPARK-18643][SPARKR] SparkR hangs at session start when installed as…
felixcheung Dec 5, 2016
e9730b7
[SPARK-18702][SQL] input_file_block_start and input_file_block_length
rxin Dec 5, 2016
bdfe7f6
[SPARK-18625][ML] OneVsRestModel should support setFeaturesCol and se…
zhengruifeng Dec 5, 2016
eb8dd68
[SPARK-18279][DOC][ML][SPARKR] Add R examples to ML programming guide.
yanboliang Dec 5, 2016
410b789
[MINOR][DOC] Use SparkR `TRUE` value and add default values for `Stru…
dongjoon-hyun Dec 5, 2016
2460128
[SPARK-18694][SS] Add StreamingQuery.explain and exception to Python …
zsxwing Dec 5, 2016
01a7d33
[SPARK-18711][SQL] should disable subexpression elimination for Lambd…
cloud-fan Dec 5, 2016
5a92dc7
[DOCS][MINOR] Update location of Spark YARN shuffle jar
nchammas Dec 5, 2016
18eaabb
[SPARK-18719] Add spark.ui.showConsoleProgress to configuration docs
nchammas Dec 5, 2016
3ba69b6
[SPARK-18634][PYSPARK][SQL] Corruption and Correctness issues with ex…
viirya Dec 6, 2016
1b2785c
[SPARK-18729][SS] Move DataFrame.collect out of synchronized block in…
zsxwing Dec 6, 2016
bb57bfe
[SPARK-18657][SPARK-18668] Make StreamingQuery.id persists across res…
tdas Dec 6, 2016
2398fde
[SPARK-18720][SQL][MINOR] Code Refactoring of withColumn
gatorsmile Dec 6, 2016
508de38
[SPARK-18555][SQL] DataFrameNaFunctions.fill miss up original values …
Dec 6, 2016
4af142f
[SPARK-18722][SS] Move no data rate limit from StreamExecution to Pro…
zsxwing Dec 6, 2016
772ddbe
[SPARK-18572][SQL] Add a method `listPartitionNames` to `ExternalCata…
Dec 6, 2016
b8c7b8d
[SPARK-18672][CORE] Close recordwriter in SparkHadoopMapReduceWriter …
HyukjinKwon Dec 6, 2016
7863c62
[SPARK-18721][SS] Fix ForeachSink with watermark + append
zsxwing Dec 6, 2016
381ef4e
[SPARK-18634][SQL][TRIVIAL] Touch-up Generate
hvanhovell Dec 6, 2016
05d416f
[SPARK-18740] Log spark.app.name in driver logs
peterableda Dec 6, 2016
cb1f10b
[SPARK-18714][SQL] Add a simple time function to SparkSession
rxin Dec 6, 2016
1ef6b29
[SPARK-18671][SS][TEST] Added tests to ensure stability of that all S…
tdas Dec 6, 2016
fac5b75
[SPARK-18374][ML] Incorrect words in StopWords/english.txt
YY-OnCall Dec 6, 2016
eeed38e
[SPARK-18744][CORE] Remove workaround for Netty memory leak
zsxwing Dec 6, 2016
bd9a4a5
[SPARK-18652][PYTHON] Include the example data and third-party licens…
lins05 Dec 6, 2016
7f31d37
[SPARK-18697][BUILD] Upgrade sbt plugins
weiqingy Dec 6, 2016
a8ced76
[SPARK-18171][MESOS] Show correct framework address in mesos master w…
lins05 Dec 6, 2016
81e5619
[SPARK-18662] Move resource managers to separate directory
foxish Dec 7, 2016
4cc8d89
Revert "[SPARK-18697][BUILD] Upgrade sbt plugins"
srowen Dec 7, 2016
539bb3c
[SPARK-18734][SS] Represent timestamp in StreamingQueryProgress as fo…
tdas Dec 7, 2016
01c7c6b
Update Spark documentation to provide information on how to create Ex…
c-sahuja Dec 7, 2016
08d6441
Closes stale & invalid pull requests.
rxin Dec 7, 2016
5c6bcdb
[SPARK-18671][SS][TEST-MAVEN] Follow up PR to fix test for Maven
tdas Dec 7, 2016
81efa90
change the way of generating the random matrix and store the transpos…
Dec 7, 2016
90b59d1
[SPARK-18686][SPARKR][ML] Several cleanup and improvements for spark.…
yanboliang Dec 7, 2016
b828027
[SPARK-18701][ML] Fix Poisson GLM failure due to wrong initialization
actuaryzhang Dec 7, 2016
79f5f28
[SPARK-18678][ML] Skewed reservoir sampling in SamplingUtils
srowen Dec 7, 2016
c496d03
[SPARK-18208][SHUFFLE] Executor OOM due to a growing LongArray in Byt…
Dec 7, 2016
f1fca81
[SPARK-17760][SQL] AnalysisException with dataframe pivot when groupB…
aray Dec 7, 2016
dbf3e29
[SPARK-18764][CORE] Add a warning log when skipping a corrupted file
zsxwing Dec 7, 2016
bb94f61
[SPARK-18762][WEBUI] Web UI should be http:4040 instead of https:4040
sarutak Dec 7, 2016
ff85dd7
modify spectralNormEst for generating random vector and normalizing t…
Dec 7, 2016
edc87e1
[SPARK-18588][TESTS] Fix flaky test: KafkaSourceStressForDontFailOnDa…
zsxwing Dec 7, 2016
70b2bf7
[SPARK-18754][SS] Rename recentProgresses to recentProgress
marmbrus Dec 7, 2016
bec0a92
[SPARK-18654][SQL] Remove unreachable patterns in makeRootConverter
Dec 8, 2016
aad1120
[SPARK-18633][ML][EXAMPLE] Add multiclass logistic regression summary…
wangmiao1981 Dec 8, 2016
9ab725e
[SPARK-18758][SS] StreamingQueryListener events from a StreamingQuery…
tdas Dec 8, 2016
8225361
[SPARK-18705][ML][DOC] Update user guide to reflect one pass solver f…
sethah Dec 8, 2016
9725549
[SPARK-18326][SPARKR][ML] Review SparkR ML wrappers API for 2.1
yanboliang Dec 8, 2016
330fda8
Close stale pull requests.
rxin Dec 8, 2016
b47b892
[SPARK-18774][CORE][SQL] Ignore non-existing files when ignoreCorrupt…
zsxwing Dec 8, 2016
9bf8f3c
[SPARK-18325][SPARKR][ML] SparkR ML wrappers example code and user guide
yanboliang Dec 8, 2016
7f3c778
[SPARK-18718][TESTS] Skip some test failures due to path length limit…
HyukjinKwon Dec 8, 2016
6a5a725
[SPARK-18667][PYSPARK][SQL] Change the way to group row in BatchEvalP…
viirya Dec 8, 2016
b44d1b8
[SPARK-18662][HOTFIX] Add new resource-managers directories to SparkL…
Dec 8, 2016
ed8869e
[SPARK-8617][WEBUI] HistoryServer: Include in-progress files during c…
seyfe Dec 8, 2016
3c68944
[SPARK-16589] [PYTHON] Chained cartesian produces incorrect number of…
aray Dec 8, 2016
c3d3a9d
[SPARK-18590][SPARKR] build R source package when making distribution
felixcheung Dec 8, 2016
26432df
[SPARK-18751][CORE] Fix deadlock when SparkContext.stop is called in …
zsxwing Dec 8, 2016
f43fb9b
Use different seed for generating random matrix and vectors in partia…
Dec 8, 2016
5f894d2
[SPARK-18760][SQL] Consistent format specification for FileFormats
rxin Dec 8, 2016
3261e25
Close stale PRs.
rxin Dec 8, 2016
202fcd2
[SPARK-18590][SPARKR] Change the R source build to Hadoop 2.6
shivaram Dec 8, 2016
4727395
Set different seed with different partition.
Dec 8, 2016
2198f30
seed to Long.
Dec 8, 2016
458fa33
[SPARK-18776][SS] Make Offset for FileStreamSource corrected formatte…
tdas Dec 9, 2016
4ac8b20
[SPARKR][PYSPARK] Fix R source package name to match Spark version. R…
shivaram Dec 9, 2016
86a9603
[SPARK-18349][SPARKR] Update R API documentation on ml model summary
wangmiao1981 Dec 9, 2016
9338aa4
[SPARK-18697][BUILD] Upgrade sbt plugins
weiqingy Dec 9, 2016
934035a
Copy the SparkR source package with LFTP
shivaram Dec 9, 2016
83e7f55
refactor the code - parallelization.
Dec 9, 2016
c074c96
Copy pyspark and SparkR packages to latest release dir too
felixcheung Dec 9, 2016
67587d9
[SPARK-18637][SQL] Stateful UDF should be considered as nondeterministic
Dec 9, 2016
b162cc0
[MINOR][CORE][SQL][DOCS] Typo fixes
jaceklaskowski Dec 9, 2016
fd48d80
[SPARK-17822][R] Make JVMObjectTracker a member variable of RBackend
mengxr Dec 9, 2016
be5fc6e
[MINOR][SPARKR] Fix SparkR regex in copy command
shivaram Dec 9, 2016
b08b500
[SPARK-18620][STREAMING][KINESIS] Flatten input rates in timeline for…
maropu Dec 9, 2016
d60ab5f
[SPARK-18745][SQL] Fix signed integer overflow due to toInt cast
kiszk Dec 9, 2016
cf33a86
[SPARK-4105] retry the fetch or stage if shuffle block is corrupt
Dec 9, 2016
d2493a2
[SPARK-18812][MLLIB] explain "Spark ML"
mengxr Dec 10, 2016
3e11d5b
[SPARK-18807][SPARKR] Should suppress output print for calls to JVM m…
felixcheung Dec 10, 2016
63c9159
[SPARK-18811] StreamSource resolution should happen in stream executi…
brkyvz Dec 10, 2016
fbbdedf
V in partailSVD now is BlockMatrix.
Dec 10, 2016
c517256
[SPARK-17460][SQL] Make sure sizeInBytes in Statistics will not overflow
huaxingao Dec 10, 2016
f3a3fed
[MINOR][DOCS] Remove Apache Spark Wiki address
dongjoon-hyun Dec 10, 2016
3a3e65a
[SPARK-18606][HISTORYSERVER] remove useless elements while searching
WangTaoTheTonic Dec 10, 2016
422a45c
[SPARK-18766][SQL] Push Down Filter Through BatchEvalPython (Python UDF)
gatorsmile Dec 10, 2016
1143248
[SPARK-3359][DOCS] Fix greater-than symbols in Javadoc to allow build…
michalsenkyr Dec 10, 2016
e094d01
[SPARK-18803][TESTS] Fix JarEntry-related & path-related test failure…
HyukjinKwon Dec 10, 2016
a29ee55
[SPARK-18815][SQL] Fix NPE when collecting column stats for string/bi…
Dec 11, 2016
9abd05b
[SQL][MINOR] simplify a test to fix the maven tests
cloud-fan Dec 11, 2016
f60ffe7
[SPARK-18809] KCL version to 1.6.2 on master
boneill42 Dec 11, 2016
c802ad8
[SPARK-18628][ML] Update Scala param and Python param to have quotes
krishnakalyan3 Dec 11, 2016
83a4289
[SPARK-18790][SS] Keep a general offset history of stream batches
Dec 12, 2016
70ffff2
[DOCS][MINOR] Clarify Where AccumulatorV2s are Displayed
Dec 12, 2016
586d198
[SPARK-15844][CORE] HistoryServer doesn't come up if spark.authentica…
steveloughran Dec 12, 2016
bf42c2d
[SPARK-16297][SQL] Fix mapping Microsoft SQLServer dialect
meknio Dec 12, 2016
476b34c
[SPARK-18752][HIVE] "isSrcLocal" value should be set from user query.
Dec 12, 2016
90abfd1
[SPARK-18681][SQL] Fix filtering to compatible with partition keys of…
wangyum Dec 12, 2016
8a51cfd
[SPARK-18810][SPARKR] SparkR install.spark does not work for RCs, sna…
felixcheung Dec 12, 2016
bc59951
[SPARK-18773][CORE] Make commons-crypto config translation consistent.
Dec 13, 2016
417e45c
[SPARK-18796][SS] StreamingQueryManager should not block when startin…
zsxwing Dec 13, 2016
2aa16d0
[SPARK-18797][SPARKR] Update spark.logit in sparkr-vignettes
wangmiao1981 Dec 13, 2016
46d30ac
[SPARK-18717][SQL] Make code generation for Scala Map work with immut…
aray Dec 13, 2016
096f868
[MINOR][CORE][SQL] Remove explicit RDD and Partition overrides
jaceklaskowski Dec 13, 2016
d53f18c
[SPARK-18675][SQL] CTAS for hive serde table should work for all hive…
cloud-fan Dec 13, 2016
fb3081d
[SPARK-13747][CORE] Fix potential ThreadLocal leaks in RPC when using…
zsxwing Dec 13, 2016
f280ccf
[SPARK-18835][SQL] Don't expose Guava types in the JavaTypeInference …
Dec 13, 2016
5572ccf
[SPARK-17932][SQL][FOLLOWUP] Change statement `SHOW TABLES EXTENDED` …
jiangxb1987 Dec 13, 2016
43298d1
[SPARK-18840][YARN] Avoid throw exception when getting token renewal …
jerryshao Dec 13, 2016
e57e393
[SPARK-18715][ML] Fix AIC calculations in Binomial GLM
actuaryzhang Dec 13, 2016
9e8a9d7
[SPARK-18471][MLLIB] In LBFGS, avoid sending huge vectors of 0
Dec 13, 2016
aebf44e
[SPARK-18816][WEB UI] Executors Logs column only ran visibility check…
ajbozarth Dec 13, 2016
c68fb42
[SPARK-18834][SS] Expose event time stats through StreamingQueryProgress
tdas Dec 13, 2016
594b14f
[SPARK-18793][SPARK-18794][R] add spark.randomForest/spark.gbt to vig…
mengxr Dec 14, 2016
ae5b2d3
[SPARK-18746][SQL] Add implicit encoder for BigDecimal, timestamp and…
weiqingy Dec 14, 2016
3ae63b8
[SPARK-18752][SQL] Follow-up: add scaladoc explaining isSrcLocal arg.
Dec 14, 2016
e104e55
[SPARK-18588][TESTS] Ignore KafkaSourceStressForDontFailOnDataLossSuite
zsxwing Dec 14, 2016
f2ddabf
[MINOR][SPARKR] fix kstest example error and add unit test
wangmiao1981 Dec 14, 2016
3e307b4
[SPARK-18566][SQL] remove OverwriteOptions
cloud-fan Dec 14, 2016
cccd643
[SPARK-18814][SQL] CheckAnalysis rejects TPCDS query 32
nsyca Dec 14, 2016
ac013ea
[SPARK-18846][SCHEDULER] Fix flakiness in SchedulerIntegrationSuite
squito Dec 14, 2016
ba4aab9
[SPARK-18730] Post Jenkins test report page instead of the full conso…
liancheng Dec 14, 2016
c6b8eb7
[SPARK-18842][TESTS][LAUNCHER] De-duplicate paths in classpaths in co…
HyukjinKwon Dec 14, 2016
169b9d7
[SPARK-18830][TESTS] Fix tests in PipedRDDSuite to pass on Windows
HyukjinKwon Dec 14, 2016
89ae26d
[SPARK-18753][SQL] Keep pushed-down null literal as a filter in Spark…
HyukjinKwon Dec 14, 2016
5d79947
[SPARK-18853][SQL] Project (UnaryNode) is way too aggressive in estim…
rxin Dec 14, 2016
1ac6567
[SPARK-18852][SS] StreamingQuery.lastProgress should be null when rec…
zsxwing Dec 14, 2016
7862742
[SPARK-18795][ML][SPARKR][DOC] Added KSTest section to SparkR vignettes
jkbradley Dec 14, 2016
ffdd1fc
[SPARK-18854][SQL] numberedTreeString and apply(i) inconsistent for s…
rxin Dec 15, 2016
3243885
[SPARK-18865][SPARKR] SparkR vignettes MLP and LDA updates
wangmiao1981 Dec 15, 2016
8db4d95
[SPARK-18703][SQL] Drop Staging Directories and Data Files After each…
gatorsmile Dec 15, 2016
d6f11a1
[SPARK-18856][SQL] non-empty partitioned table should not report zero…
cloud-fan Dec 15, 2016
5d510c6
[SPARK-18869][SQL] Add TreeNode.p that returns BaseType
rxin Dec 15, 2016
ec0eae4
[SPARK-18875][SPARKR][DOCS] Fix R API doc generation by adding `DESCR…
dongjoon-hyun Dec 15, 2016
7d858bc
[SPARK-18849][ML][SPARKR][DOC] vignettes final check update
felixcheung Dec 15, 2016
93cdb8a
[SPARK-8425][CORE] Application Level Blacklisting
squito Dec 15, 2016
01e14bf
[SPARK-17910][SQL] Allow users to update the comment of a column
jiangxb1987 Dec 15, 2016
4f7292c
[SPARK-18870] Disallowed Distinct Aggregations on Streaming Datasets
tdas Dec 15, 2016
68a6dc9
[SPARK-18826][SS] Add 'latestFirst' option to FileStreamSource
zsxwing Dec 15, 2016
0917c8e
[SPARK-18888] partitionBy in DataStreamWriter in Python throws _to_se…
brkyvz Dec 15, 2016
32ff964
[SPARK-8425][SCHEDULER][HOTFIX] fix scala 2.10 compile error
squito Dec 15, 2016
9c7f83b
[SPARK-18868][FLAKY-TEST] Deflake StreamingQueryListenerSuite: single…
brkyvz Dec 15, 2016
9634018
[MINOR] Only rename SparkR tar.gz if names mismatch
shivaram Dec 16, 2016
5a44f18
[MINOR] Handle fact that mv is different on linux, mac
shivaram Dec 16, 2016
172a52f
[SPARK-18892][SQL] Alias percentile_approx approx_percentile
rxin Dec 16, 2016
78062b8
[SPARK-18845][GRAPHX] PageRank has incorrect initialization value tha…
aray Dec 16, 2016
d7f3058
[SPARK-18850][SS] Make StreamExecution and progress classes serializable
zsxwing Dec 16, 2016
53ab8fb
[SPARK-18742][CORE] Clarify that user-defined BroadcastFactory is not…
Dec 16, 2016
dc2a4d4
[SPARK-18108][SQL] Fix a schema inconsistent bug that makes a parquet…
maropu Dec 16, 2016
836c95b
[SPARK-18723][DOC] Expanded programming guide information on wholeTex…
michalsenkyr Dec 16, 2016
f7a574a
[SPARK-18708][CORE] Improvement/improve docs in spark context file
Dec 16, 2016
ed84cd0
[MINOR][BUILD] Fix lint-check failures and javadoc8 break
HyukjinKwon Dec 16, 2016
1169db4
[SPARK-18897][SPARKR] Fix SparkR SQL Test to drop test table
dongjoon-hyun Dec 16, 2016
295db82
[SPARK-17769][CORE][SCHEDULER] Some FetchFailure refactoring
markhamstra Dec 16, 2016
4faa8a3
[SPARK-18904][SS][TESTS] Merge two FileStreamSourceSuite files
zsxwing Dec 16, 2016
2bc1c95
[SPARK-18895][TESTS] Fix resource-closing-related and path-related te…
HyukjinKwon Dec 17, 2016
6d2379b
[SPARK-18485][CORE] Underlying integer overflow when create ChunkedBy…
uncleGen Dec 17, 2016
38fd163
[SPARK-18849][ML][SPARKR][DOC] vignettes final check reorg
felixcheung Dec 17, 2016
c0c9e1d
[SPARK-18918][DOC] Missing </td> in Configuration page
gatorsmile Dec 18, 2016
1e5c51f
[SPARK-18827][CORE] Fix cannot read broadcast on disk
wangyum Dec 18, 2016
7db09ab
[SPARK-18356][ML] KMeans should cache RDD before training
ZakariaHili Dec 19, 2016
2448285
[SPARK-18700][SQL] Add StripedLock for each table's relation in cache
xuanyuanking Dec 19, 2016
7a75ee1
[SPARK-18921][SQL] check database existence with Hive.databaseExists …
cloud-fan Dec 19, 2016
70d495d
[SPARK-18624][SQL] Implicit cast ArrayType(InternalType)
jiangxb1987 Dec 19, 2016
4cb4941
[SPARK-18836][CORE] Serialize one copy of task metrics in DAGScheduler
shivaram Dec 19, 2016
5857b9a
[SPARK-18928] Check TaskContext.isInterrupted() in FileScanRDD, JDBCR…
JoshRosen Dec 20, 2016
fa829ce
[SPARK-18761][CORE] Introduce "task reaper" to oversee task killing i…
JoshRosen Dec 20, 2016
f923c84
[SPARK-18899][SPARK-18912][SPARK-18913][SQL] refactor the error check…
cloud-fan Dec 20, 2016
150d26c
Tiny style improvement.
rxin Dec 20, 2016
95c95b7
[SPARK-18281] [SQL] [PYSPARK] Remove timeout for reading data through…
viirya Dec 20, 2016
caed893
[SPARK-18927][SS] MemorySink for StructuredStreaming can't recover fr…
brkyvz Dec 20, 2016
047a9d9
[SPARK-18576][PYTHON] Add basic TaskContext information to PySpark
holdenk Dec 20, 2016
b2dd8ec
[SPARK-18900][FLAKY-TEST] StateStoreSuite.maintenance
brkyvz Dec 21, 2016
24c0c94
[SPARK-18949][SQL] Add recoverPartitions API to Catalog
gatorsmile Dec 21, 2016
ba4468b
[SPARK-18923][DOC][BUILD] Support skipping R/Python API docs
dongjoon-hyun Dec 21, 2016
b7650f1
[SPARK-18947][SQL] SQLContext.tableNames should not call Catalog.list…
cloud-fan Dec 21, 2016
1a64388
[SPARK-18951] Upgrade com.thoughtworks.paranamer/paranamer to 2.6
yhuai Dec 21, 2016
607a1e6
[SPARK-18894][SS] Fix event time watermark delay threshold specified …
tdas Dec 21, 2016
ccfe60a
[SPARK-18031][TESTS] Fix flaky test ExecutorAllocationManagerSuite.ba…
zsxwing Dec 21, 2016
078c71c
[SPARK-18954][TESTS] Fix flaky test: o.a.s.streaming.BasicOperationsS…
zsxwing Dec 21, 2016
354e936
[SPARK-18775][SQL] Limit the max number of records written per file
rxin Dec 21, 2016
95efc89
[SPARK-18588][SS][KAFKA] Create a new KafkaConsumer when error happen…
zsxwing Dec 21, 2016
afd9bc1
[SPARK-17807][CORE] split test-tags into test-JAR
ryan-williams Dec 22, 2016
83a6ace
[SPARK-18234][SS] Made update mode public
tdas Dec 22, 2016
b41ec99
[SPARK-18528][SQL] Fix a bug to initialise an iterator of aggregation…
maropu Dec 22, 2016
7e8994f
[SPARK-18903][SPARKR] Add API to get SparkUI URL
felixcheung Dec 22, 2016
afe3651
[FLAKY-TEST] InputStreamsSuite.socket input stream
brkyvz Dec 22, 2016
e1b43dc
[BUILD] make-distribution should find JAVA_HOME for non-RHEL systems
felixcheung Dec 22, 2016
ff7d82a
[SPARK-18908][SS] Creating StreamingQueryException should check if lo…
zsxwing Dec 22, 2016
7c5b7b3
[SQL] Minor readability improvement for partition handling code
rxin Dec 22, 2016
2e861df
[DOC] bucketing is applicable to all file-based data sources
rxin Dec 22, 2016
f489339
[SPARK-18953][CORE][WEB UI] Do now show the link to a dead worker on …
dongjoon-hyun Dec 22, 2016
4186aba
[SPARK-18922][TESTS] Fix more resource-closing-related and path-relat…
HyukjinKwon Dec 22, 2016
76622c6
[SPARK-16975][SQL][FOLLOWUP] Do not duplicately check file paths in d…
HyukjinKwon Dec 22, 2016
2615100
[SPARK-18973][SQL] Remove SortPartitions and RedistributeData
rxin Dec 22, 2016
31da755
[SPARK-18975][CORE] Add an API to remove SparkListener
jerryshao Dec 22, 2016
ce99f51
[SPARK-18537][WEB UI] Add a REST api to serve spark streaming informa…
Dec 22, 2016
4133880
refactor spectralNormEst.
Dec 22, 2016
a248ee4
initial base.
hl475 Aug 26, 2016
181f581
clean up the code and comments.
hl475 Sep 1, 2016
66aebbf
clean up the code and comments again.
hl475 Sep 2, 2016
3e6b820
modify the code to address Mark's comments.
hl475 Sep 4, 2016
7e54a9d
modify the code to address Mark's comments: partialSVD will do orthon…
hl475 Sep 5, 2016
4013d2f
modify tallSkinnySVD: do tallSkinnyQR twice instead of once to have p…
hl475 Sep 5, 2016
7c506f7
Add inline comments to RowMatrix.scala
hl475 Sep 6, 2016
8cee1d7
Use tallSkinnySVD for orthonormal in partialSVD; add isGram and ifTwi…
hl475 Sep 9, 2016
a7eaf2f
Add more inline comments to tallSkinnySVD.
hl475 Sep 10, 2016
cab070b
Modify the code according to Mark's comment: add parameter ifTwice to…
hl475 Sep 11, 2016
6867ec3
remove code duplication from lastStep; add unit tests of testing para…
hl475 Sep 12, 2016
427f10e
change value of rCond.
hl475 Sep 12, 2016
8607a05
Refactoring the code.
hl475 Sep 15, 2016
7eeaee1
add computeSVD to unit tests just for comparison.
hl475 Sep 15, 2016
bc770fe
refactoring the code.
hl475 Sep 16, 2016
ad3d2a9
refactoring the code and add another unit test of different combinati…
hl475 Sep 21, 2016
0605b28
refactoring the code to address Mark's comment.
hl475 Sep 22, 2016
a370f40
refactoring the code to pass the code style check.
hl475 Sep 30, 2016
79338d2
refactoring the test code and tobreeze/asbreeze
hl475 Oct 4, 2016
7283868
refactoring the unit test and adding comment to toBreeze v.s. asBreeze
hl475 Oct 5, 2016
d635684
add an svd example to spark
hl475 Oct 12, 2016
654a309
add two examples
hl475 Oct 22, 2016
c75866d
refactoring the test code and tobreeze/asbreeze
hl475 Oct 4, 2016
897affc
Refactoring the code:
hl475 Nov 13, 2016
d19a858
modify the code to address Mark's comments.
hl475 Nov 16, 2016
9ef0b95
modify inline comments in RowMatrix to address Mark's comments.
hl475 Nov 16, 2016
87129b5
add shuffle.scala
Dec 2, 2016
1581eb3
change the way of genearting random matrix.
Dec 5, 2016
fd20abb
change the way of generating the random matrix and store the transpos…
Dec 7, 2016
aca28ee
modify spectralNormEst for generating random vector and normalizing t…
Dec 7, 2016
188c42d
Use different seed for generating random matrix and vectors in partia…
Dec 8, 2016
4df7f29
Set different seed with different partition.
Dec 8, 2016
dada488
seed to Long.
Dec 8, 2016
c227a42
refactor the code - parallelization.
Dec 9, 2016
fe18b13
V in partailSVD now is BlockMatrix.
Dec 10, 2016
05c2e6d
refactor spectralNormEst.
Dec 22, 2016
4b46cbe
Merge branch 'testSVD' of https://github.com/hl475/svd into testSVD
Dec 22, 2016
2 changes: 1 addition & 1 deletion R/CRAN_RELEASE.md
Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple

First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control.

Note that while `check-cran.sh` is running `R CMD check`, it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release.
Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it does so with `--no-manual --no-vignettes`, which skips several vignette and PDF checks; it is therefore preferable to run `R CMD check` on a manually built source package before uploading a release. Also note that for the CRAN checks on PDF vignettes to succeed, the `qpdf` tool must be installed (to install it, e.g. `yum -q -y install qpdf`).
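The manual pre-upload check recommended above can be sketched as a short shell session. This is a hedged sketch, not part of the diff: the SparkR version string (2.1.0) and the `--as-cran` flag are illustrative assumptions, and the `command -v` guards simply let the sketch no-op on machines without R or `qpdf`.

```shell
# Minimal sketch of the manual pre-upload check, assuming a Spark checkout
# layout and a SparkR 2.1.0 tarball; R/qpdf invocations are guarded so the
# sketch degrades to a no-op where those tools are missing.
command -v qpdf >/dev/null || echo "qpdf missing: PDF vignette checks would fail"
if command -v R >/dev/null && [ -d R ]; then
  (
    cd R
    ./check-cran.sh                             # builds the tarball (skips manual/vignette checks)
    R CMD check --as-cran SparkR_2.1.0.tar.gz   # full check, including vignettes and the PDF manual
  )
fi
MANUAL_CHECK_SKETCH=done
```

The guards are a convenience for trying the flow anywhere; on a real release machine both R and `qpdf` would be present and the commands would run for real.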

To upload a release, we would need to update `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on the status of any `WARNING` (there should not be any) or `NOTE`. As part of `check-cran.sh` and the release process, the vignettes are built - make sure `SPARK_HOME` is set and the Spark jars are accessible.

19 changes: 18 additions & 1 deletion R/check-cran.sh
@@ -34,8 +34,9 @@ if [ ! -z "$R_HOME" ]
fi
R_SCRIPT_PATH="$(dirname $(which R))"
fi
echo "USING R_HOME = $R_HOME"
echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"

# Install the package (this is required for code in vignettes to run when building it later)
# Build the latest docs, but not vignettes, which is built with the package next
$FWDIR/create-docs.sh

@@ -82,4 +83,20 @@ else
# This will run tests and/or build vignettes, and require SPARK_HOME
SPARK_HOME="${SPARK_HOME}" "$R_SCRIPT_PATH/"R CMD check $CRAN_CHECK_OPTIONS SparkR_"$VERSION".tar.gz
fi

# Install source package to get it to generate vignettes rds files, etc.
if [ -n "$CLEAN_INSTALL" ]
then
echo "Removing lib path and installing from source package"
LIB_DIR="$FWDIR/lib"
rm -rf $LIB_DIR
mkdir -p $LIB_DIR
"$R_SCRIPT_PATH/"R CMD INSTALL SparkR_"$VERSION".tar.gz --library=$LIB_DIR

# Zip the SparkR package so that it can be distributed to worker nodes on YARN
pushd $LIB_DIR > /dev/null
jar cfM "$LIB_DIR/sparkr.zip" SparkR
popd > /dev/null
fi

popd > /dev/null
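The new `CLEAN_INSTALL` branch in the hunk above can be exercised as follows. This is a hedged sketch: any non-empty value triggers the branch (the script only tests `[ -n "$CLEAN_INSTALL" ]`), and the guard is only there so the sketch runs anywhere.

```shell
# Hedged sketch: trigger the new clean-install path, which wipes R/lib,
# reinstalls SparkR from the built tarball, and zips it with `jar cfM`
# for distribution to worker nodes on YARN.
if command -v R >/dev/null && [ -x R/check-cran.sh ]; then
  CLEAN_INSTALL=1 ./R/check-cran.sh
fi
CLEAN_INSTALL_SKETCH=done
```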
2 changes: 1 addition & 1 deletion R/install-dev.sh
@@ -46,7 +46,7 @@ if [ ! -z "$R_HOME" ]
fi
R_SCRIPT_PATH="$(dirname $(which R))"
fi
echo "USING R_HOME = $R_HOME"
echo "Using R_SCRIPT_PATH = ${R_SCRIPT_PATH}"

# Generate Rd files if devtools is installed
"$R_SCRIPT_PATH/"Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
3 changes: 3 additions & 0 deletions R/pkg/.Rbuildignore
@@ -1,5 +1,8 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.lintr$
^cran-comments\.md$
^NEWS\.md$
^README\.Rmd$
^src-native$
^html$
13 changes: 6 additions & 7 deletions R/pkg/DESCRIPTION
@@ -1,28 +1,27 @@
Package: SparkR
Type: Package
Title: R Frontend for Apache Spark
Version: 2.1.0
Date: 2016-11-06
Title: R Frontend for Apache Spark
Description: The SparkR package provides an R Frontend for Apache Spark.
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),
email = "shivaram@cs.berkeley.edu"),
person("Xiangrui", "Meng", role = "aut",
email = "meng@databricks.com"),
person("Felix", "Cheung", role = "aut",
email = "felixcheung@apache.org"),
person(family = "The Apache Software Foundation", role = c("aut", "cph")))
License: Apache License (== 2.0)
URL: http://www.apache.org/ http://spark.apache.org/
BugReports: http://spark.apache.org/contributing.html
Depends:
R (>= 3.0),
methods
Suggests:
knitr,
rmarkdown,
testthat,
e1071,
survival,
knitr,
rmarkdown
Description: The SparkR package provides an R frontend for Apache Spark.
License: Apache License (== 2.0)
survival
Collate:
'schema.R'
'generics.R'
3 changes: 2 additions & 1 deletion R/pkg/NAMESPACE
@@ -3,7 +3,7 @@
importFrom("methods", "setGeneric", "setMethod", "setOldClass")
importFrom("methods", "is", "new", "signature", "show")
importFrom("stats", "gaussian", "setNames")
importFrom("utils", "download.file", "object.size", "packageVersion", "untar")
importFrom("utils", "download.file", "object.size", "packageVersion", "tail", "untar")

# Disable native libraries till we figure out how to package it
# See SPARKR-7839
@@ -16,6 +16,7 @@ export("sparkR.stop")
export("sparkR.session.stop")
export("sparkR.conf")
export("sparkR.version")
export("sparkR.uiWebUrl")
export("print.jobj")

export("sparkR.newJObject")
7 changes: 4 additions & 3 deletions R/pkg/R/SQLContext.R
@@ -634,7 +634,7 @@ tableNames <- function(x, ...) {
cacheTable.default <- function(tableName) {
sparkSession <- getSparkSession()
catalog <- callJMethod(sparkSession, "catalog")
callJMethod(catalog, "cacheTable", tableName)
invisible(callJMethod(catalog, "cacheTable", tableName))
}

cacheTable <- function(x, ...) {
@@ -663,7 +663,7 @@ cacheTable <- function(x, ...) {
uncacheTable.default <- function(tableName) {
sparkSession <- getSparkSession()
catalog <- callJMethod(sparkSession, "catalog")
callJMethod(catalog, "uncacheTable", tableName)
invisible(callJMethod(catalog, "uncacheTable", tableName))
}

uncacheTable <- function(x, ...) {
@@ -686,7 +686,7 @@ uncacheTable <- function(x, ...) {
clearCache.default <- function() {
sparkSession <- getSparkSession()
catalog <- callJMethod(sparkSession, "catalog")
callJMethod(catalog, "clearCache")
invisible(callJMethod(catalog, "clearCache"))
}

clearCache <- function() {
@@ -730,6 +730,7 @@ dropTempTable <- function(x, ...) {
#' If the view has been cached before, then it will also be uncached.
#'
#' @param viewName the name of the view to be dropped.
#' @return TRUE if the view is dropped successfully, FALSE otherwise.
#' @rdname dropTempView
#' @name dropTempView
#' @export
6 changes: 3 additions & 3 deletions R/pkg/R/context.R
@@ -87,8 +87,8 @@ objectFile <- function(sc, path, minPartitions = NULL) {
#' in the list are split into \code{numSlices} slices and distributed to nodes
#' in the cluster.
#'
#' If size of serialized slices is larger than spark.r.maxAllocationLimit or (200MB), the function
#' will write it to disk and send the file name to JVM. Also to make sure each slice is not
#' If size of serialized slices is larger than spark.r.maxAllocationLimit or (200MB), the function
#' will write it to disk and send the file name to JVM. Also to make sure each slice is not
#' larger than that limit, number of slices may be increased.
#'
#' @param sc SparkContext to use
@@ -379,5 +379,5 @@ spark.lapply <- function(list, func) {
#' @note setLogLevel since 2.0.0
setLogLevel <- function(level) {
sc <- getSparkContext()
callJMethod(sc, "setLogLevel", level)
invisible(callJMethod(sc, "setLogLevel", level))
}
38 changes: 27 additions & 11 deletions R/pkg/R/install.R
@@ -79,19 +79,28 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,
dir.create(localDir, recursive = TRUE)
}

packageLocalDir <- file.path(localDir, packageName)

if (overwrite) {
message(paste0("Overwrite = TRUE: download and overwrite the tar file",
"and Spark package directory if they exist."))
}

releaseUrl <- Sys.getenv("SPARKR_RELEASE_DOWNLOAD_URL")
if (releaseUrl != "") {
packageName <- basenameSansExtFromUrl(releaseUrl)
}

packageLocalDir <- file.path(localDir, packageName)

# can use dir.exists(packageLocalDir) under R 3.2.0 or later
if (!is.na(file.info(packageLocalDir)$isdir) && !overwrite) {
fmt <- "%s for Hadoop %s found, with SPARK_HOME set to %s"
msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion),
packageLocalDir)
message(msg)
if (releaseUrl != "") {
message(paste(packageName, "found, setting SPARK_HOME to", packageLocalDir))
} else {
fmt <- "%s for Hadoop %s found, setting SPARK_HOME to %s"
msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion),
packageLocalDir)
message(msg)
}
Sys.setenv(SPARK_HOME = packageLocalDir)
return(invisible(packageLocalDir))
} else {
@@ -104,7 +113,12 @@ install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL,
if (tarExists && !overwrite) {
message("tar file found.")
} else {
robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath)
if (releaseUrl != "") {
message("Downloading from alternate URL:\n- ", releaseUrl)
downloadUrl(releaseUrl, packageLocalPath, paste0("Fetch failed from ", releaseUrl))
} else {
robustDownloadTar(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath)
}
}

message(sprintf("Installing to %s", localDir))
@@ -182,16 +196,18 @@ getPreferredMirror <- function(version, packageName) {
}

directDownloadTar <- function(mirrorUrl, version, hadoopVersion, packageName, packageLocalPath) {
packageRemotePath <- paste0(
file.path(mirrorUrl, version, packageName), ".tgz")
packageRemotePath <- paste0(file.path(mirrorUrl, version, packageName), ".tgz")
fmt <- "Downloading %s for Hadoop %s from:\n- %s"
msg <- sprintf(fmt, version, ifelse(hadoopVersion == "without", "Free build", hadoopVersion),
packageRemotePath)
message(msg)
downloadUrl(packageRemotePath, packageLocalPath, paste0("Fetch failed from ", mirrorUrl))
}

isFail <- tryCatch(download.file(packageRemotePath, packageLocalPath),
downloadUrl <- function(remotePath, localPath, errorMessage) {
isFail <- tryCatch(download.file(remotePath, localPath),
error = function(e) {
message(sprintf("Fetch failed from %s", mirrorUrl))
message(errorMessage)
print(e)
TRUE
})
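The `SPARKR_RELEASE_DOWNLOAD_URL` branch added to `install.R` can be driven from the shell. This is a hedged sketch: the URL below is a placeholder assumption, not a real release location, and the guard lets the sketch no-op without R.

```shell
# Hedged sketch: point install.spark() at an explicit tarball via the new
# SPARKR_RELEASE_DOWNLOAD_URL environment variable (placeholder URL), so
# the download skips mirror resolution entirely.
export SPARKR_RELEASE_DOWNLOAD_URL="https://example.org/spark/spark-2.1.0-bin-hadoop2.7.tgz"
if command -v Rscript >/dev/null; then
  Rscript -e 'SparkR::install.spark()' || true   # uses the alternate URL path
fi
ALT_URL_SET="${SPARKR_RELEASE_DOWNLOAD_URL:+yes}"
```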