From d849ac6f62a004f4b08dc896deeec9b9dac51121 Mon Sep 17 00:00:00 2001 From: Kanishka Misra Date: Thu, 22 Mar 2018 11:14:50 -0400 Subject: [PATCH 1/7] Added multi_scale.R to project documents separated by pairwise distances to a k dimensional space --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/multi_scale.R | 55 ++++++++++++++++++++++++++++++++++++++++++++++ man/multi_scale.Rd | 44 +++++++++++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 R/multi_scale.R create mode 100644 man/multi_scale.Rd diff --git a/DESCRIPTION b/DESCRIPTION index e428f59..c6a161b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,4 +39,4 @@ Suggests: ggplot2, maps, irlba -RoxygenNote: 6.0.1 +RoxygenNote: 6.0.1.9000 diff --git a/NAMESPACE b/NAMESPACE index e41bb8c..8e93aec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(multi_scale) export(pairwise_cor) export(pairwise_cor_) export(pairwise_count) diff --git a/R/multi_scale.R b/R/multi_scale.R new file mode 100644 index 0000000..e1ad9f3 --- /dev/null +++ b/R/multi_scale.R @@ -0,0 +1,55 @@ +#' Multidimensional Scaling of documents separated by a distance measure. +#' +#' Given a tbl or data frame of pairwise distances between documents, scale each document +#' to a *k* dimensional space that ensures the distance between all documents is maintained. +#' **NOTE:** Doesnt work when the pairwise distance tbl is formed using \code{upper = FALSE}. 
+#' +#' @param tbl Table obtained by running a pairwise distance method \code{pairwise_delta} or \code{pairwise_dist} +#' @param item1 first item +#' @param item1 second item +#' @param value Value +#' @param k number of dimensions, defaults to 2 +#' +#' @examples +#' +#' library(janeaustenr) +#' library(dplyr) +#' library(tidyr) +#' library(tidytext) +#' library(tibble) +#' +#' # closest documents in terms of 1000 most frequent words +#' austen_delta <- austen_books() %>% +#' unnest_tokens(word, text) %>% +#' count(book, word) %>% +#' pairwise_delta(book, word, n) +#' +#' austen_delta +#' +#' austen_delta %>% +#' multi_scale(item1, item2, delta) +#' +#' @export + +multi_scale <- function(tbl, item1, item2, value, k = 2) { + multi_scale_(tbl, + col_name(substitute(item1)), + col_name(substitute(item2)), + col_name(substitute(value)), + k = k) +} + + +multi_scale_ <- function(tbl, item1, item2, value, k = 2) { + tbl_matrix <- tbl %>% + spread(item2, col_name(value), fill = 0) %>% + as.data.frame() %>% + remove_rownames() %>% + column_to_rownames("item1") %>% + as.matrix() + + cmdscale(tbl_matrix, k = k) %>% + as.data.frame() %>% + rownames_to_column("item") %>% + as.tibble() +} diff --git a/man/multi_scale.Rd b/man/multi_scale.Rd new file mode 100644 index 0000000..5d51afb --- /dev/null +++ b/man/multi_scale.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multi_scale.R +\name{multi_scale} +\alias{multi_scale} +\title{Multidimensional Scaling of documents separated by a distance measure.} +\usage{ +multi_scale(tbl, item1, item2, value, k = 2) +} +\arguments{ +\item{tbl}{Table obtained by running a pairwise distance method \code{pairwise_delta} or \code{pairwise_dist}} + +\item{item1}{first item} + +\item{value}{Value} + +\item{k}{number of dimensions, defaults to 2} + +\item{item1}{second item} +} +\description{ +Given a tbl or data frame of pairwise distances between documents, scale each document +to a *k* 
dimensional space that ensures the distance between all documents is maintained. +**NOTE:** Doesnt work when the pairwise distance tbl is formed using \code{upper = FALSE}. +} +\examples{ + +library(janeaustenr) +library(dplyr) +library(tidyr) +library(tidytext) +library(tibble) + +# closest documents in terms of 1000 most frequent words +austen_delta <- austen_books() \%>\% + unnest_tokens(word, text) \%>\% + count(book, word) \%>\% + pairwise_delta(book, word, n) + +austen_delta + +austen_delta \%>\% + multi_scale(item1, item2, delta) + +} From 5a7cb5b3727839276443e6c138c7507b9c8e6bdf Mon Sep 17 00:00:00 2001 From: Kanishka Misra Date: Wed, 6 Jun 2018 23:44:55 -0500 Subject: [PATCH 2/7] Added multidimensional scaling and solved issue #13 --- DESCRIPTION | 2 +- R/multi_scale.R | 10 +++++----- R/pairwise_cor.R | 20 ++++++++++---------- R/widely.R | 9 ++++++--- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c6a161b..e428f59 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,4 +39,4 @@ Suggests: ggplot2, maps, irlba -RoxygenNote: 6.0.1.9000 +RoxygenNote: 6.0.1 diff --git a/R/multi_scale.R b/R/multi_scale.R index e1ad9f3..dfd320f 100644 --- a/R/multi_scale.R +++ b/R/multi_scale.R @@ -42,14 +42,14 @@ multi_scale <- function(tbl, item1, item2, value, k = 2) { multi_scale_ <- function(tbl, item1, item2, value, k = 2) { tbl_matrix <- tbl %>% - spread(item2, col_name(value), fill = 0) %>% + tidyr::spread(item2, col_name(value), fill = 0) %>% as.data.frame() %>% - remove_rownames() %>% - column_to_rownames("item1") %>% + tibble::remove_rownames() %>% + tibble::column_to_rownames("item1") %>% as.matrix() - cmdscale(tbl_matrix, k = k) %>% + stats::cmdscale(tbl_matrix, k = k) %>% as.data.frame() %>% - rownames_to_column("item") %>% + tibble::rownames_to_column("item") %>% as.tibble() } diff --git a/R/pairwise_cor.R b/R/pairwise_cor.R index 8b238cc..5a3e8a1 100644 --- a/R/pairwise_cor.R +++ b/R/pairwise_cor.R @@ -37,8 +37,8 @@ #' 
#' @export pairwise_cor <- function(tbl, item, feature, value, - method = c("pearson", "kendall", "spearman"), - use = "everything", ...) { + method = c("pearson", "kendall", "spearman"), + use = "everything", ...) { if (missing(value)) { tbl$..value <- 1 val <- "..value" @@ -47,19 +47,19 @@ pairwise_cor <- function(tbl, item, feature, value, } pairwise_cor_(tbl, - col_name(substitute(item)), - col_name(substitute(feature)), - val, - method = method, use = use, ...) + col_name(substitute(item)), + col_name(substitute(feature)), + val, + method = method, use = use, ...) } #' @rdname pairwise_cor #' @export pairwise_cor_ <- function(tbl, item, feature, value, - method = c("pearson", "kendall", "spearman"), - use = "everything", - ...) { + method = c("pearson", "kendall", "spearman"), + use = "everything", + ...) { method <- match.arg(method) sparse <- (method == "pearson" & use == "everything") @@ -68,7 +68,7 @@ pairwise_cor_ <- function(tbl, item, feature, value, } else { function(x) stats::cor(t(x), method = method, use = use) } - cor_func <- squarely_(f, sparse = sparse, ...) + cor_func <- squarely_(f, sparse = sparse, fill_value = NA, ...) tbl %>% ungroup() %>% diff --git a/R/widely.R b/R/widely.R index 01c4497..a929460 100644 --- a/R/widely.R +++ b/R/widely.R @@ -52,6 +52,8 @@ widely <- function(.f, sparse = FALSE, maximum_size = 1e7) { function(tbl, row, column, value, ...) { + + inner_func <- widely_(.f, sort = sort, sparse = sparse, @@ -71,7 +73,7 @@ widely <- function(.f, widely_ <- function(.f, sort = FALSE, sparse = FALSE, - maximum_size = 1e7) { + maximum_size = 1e7, fill_value = 0) { f <- function(tbl, row, column, value, ...) 
{ if (inherits(tbl, "grouped_df")) { # perform within each group, then restore groups @@ -87,7 +89,7 @@ widely_ <- function(.f, if (!sparse) { if (!is.null(maximum_size)) { matrix_size <- (length(unique(tbl[[row]])) * - length(unique(tbl[[column]]))) + length(unique(tbl[[column]]))) if (matrix_size > maximum_size) { stop("Size of acast matrix, ", matrix_size, " will be too large. Set maximum_size = NULL to avoid ", @@ -98,7 +100,8 @@ widely_ <- function(.f, form <- stats::as.formula(paste(row, column, sep = " ~ ")) - input <- reshape2::acast(tbl, form, value.var = value, fill = 0) + input <- reshape2::acast(tbl, form, value.var = value, fill = fill_value) + } else { input <- tidytext::cast_sparse_(tbl, row, column, value) } From 88818cf2538663b56064a96008caca01bd917617 Mon Sep 17 00:00:00 2001 From: Kanishka Misra Date: Thu, 7 Jun 2018 00:13:36 -0500 Subject: [PATCH 3/7] Fixed tibble call to make Travic CI happy --- R/multi_scale.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/multi_scale.R b/R/multi_scale.R index dfd320f..80d1b15 100644 --- a/R/multi_scale.R +++ b/R/multi_scale.R @@ -51,5 +51,5 @@ multi_scale_ <- function(tbl, item1, item2, value, k = 2) { stats::cmdscale(tbl_matrix, k = k) %>% as.data.frame() %>% tibble::rownames_to_column("item") %>% - as.tibble() + tibble::as_tibble() } From 6955c29caf2b816f72afe9e6f82ae7f593919429 Mon Sep 17 00:00:00 2001 From: Kanishka Misra Date: Fri, 8 Jun 2018 17:39:12 -0500 Subject: [PATCH 4/7] Fixed documentation and widely inconsistencies in new fill_value --- R/multi_scale.R | 8 +++++++- R/widely.R | 10 +++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/R/multi_scale.R b/R/multi_scale.R index 80d1b15..0a0dff3 100644 --- a/R/multi_scale.R +++ b/R/multi_scale.R @@ -6,10 +6,16 @@ #' #' @param tbl Table obtained by running a pairwise distance method \code{pairwise_delta} or \code{pairwise_dist} #' @param item1 first item -#' @param item1 second item +#' @param item2 second item 
#' @param value Value #' @param k number of dimensions, defaults to 2 #' +#' @return Returns a tibble with one row per item and the following columns: +#' \item{item}{Column to store documents or items separated by various distances as used prior to calling \code{multi_scale()}} +#' \item{V1}{First Dimension} +#' \item{V2}{Second Dimension} +#' \item{...}{Other Dimensions as specified by k's value} +#' #' @examples #' #' library(janeaustenr) diff --git a/R/widely.R b/R/widely.R index a929460..4fbb37f 100644 --- a/R/widely.R +++ b/R/widely.R @@ -11,6 +11,7 @@ #' non-sparse matrix to be created. Set to NULL to allow any size #' matrix. #' @param sparse Whether to cast to a sparse matrix +#' @param fill_value value to be used to replace NAs when converted from long to wide format. #' #' @return Returns a function that takes at least four arguments: #' \item{tbl}{A table} @@ -50,14 +51,16 @@ widely <- function(.f, sort = FALSE, sparse = FALSE, - maximum_size = 1e7) { + maximum_size = 1e7, + fill_value = 0) { function(tbl, row, column, value, ...) { inner_func <- widely_(.f, sort = sort, sparse = sparse, - maximum_size = maximum_size) + maximum_size = maximum_size, + fill_value = fill_value) inner_func(tbl, col_name(substitute(row)), @@ -73,7 +76,8 @@ widely <- function(.f, widely_ <- function(.f, sort = FALSE, sparse = FALSE, - maximum_size = 1e7, fill_value = 0) { + maximum_size = 1e7, + fill_value = 0) { f <- function(tbl, row, column, value, ...) { if (inherits(tbl, "grouped_df")) { # perform within each group, then restore groups From b265dbc3eba36e9c2a94143589aa425252aa1526 Mon Sep 17 00:00:00 2001 From: Kanishka Misra Date: Fri, 8 Jun 2018 18:08:46 -0500 Subject: [PATCH 5/7] Fixed documentation again! 
--- man/multi_scale.Rd | 11 +++++++++-- man/widely.Rd | 8 ++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/man/multi_scale.Rd b/man/multi_scale.Rd index 5d51afb..ab446ae 100644 --- a/man/multi_scale.Rd +++ b/man/multi_scale.Rd @@ -11,11 +11,18 @@ multi_scale(tbl, item1, item2, value, k = 2) \item{item1}{first item} +\item{item2}{second item} + \item{value}{Value} \item{k}{number of dimensions, defaults to 2} - -\item{item1}{second item} +} +\value{ +Returns a tibble with one row per item and the following columns: + \item{item}{Column to store documents or items separated by various distances as used prior to calling \code{multi_scale()}} + \item{V1}{First Dimension} + \item{V2}{Second Dimension} + \item{...}{Other Dimensions as specified by k's value} } \description{ Given a tbl or data frame of pairwise distances between documents, scale each document diff --git a/man/widely.Rd b/man/widely.Rd index 573cf7b..dc8d25f 100644 --- a/man/widely.Rd +++ b/man/widely.Rd @@ -6,9 +6,11 @@ \title{Adverb for functions that operate on matrices in "wide" format} \usage{ -widely(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) +widely(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07, + fill_value = 0) -widely_(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) +widely_(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07, + fill_value = 0) } \arguments{ \item{.f}{Function being wrapped} @@ -20,6 +22,8 @@ widely_(.f, sort = FALSE, sparse = FALSE, maximum_size = 1e+07) \item{maximum_size}{To prevent crashing, a maximum size of a non-sparse matrix to be created. 
Set to NULL to allow any size matrix.} + +\item{fill_value}{value to be used to replace NAs when converted from long to wide format.} } \value{ Returns a function that takes at least four arguments: From ed2ed0a8e732cbbbdf49b5d202d7910d5a1b146b Mon Sep 17 00:00:00 2001 From: Kanishka Misra Date: Fri, 8 Jun 2018 18:40:29 -0500 Subject: [PATCH 6/7] Fixed inconsistencies in Examples for multi_scale() --- R/multi_scale.R | 1 - man/multi_scale.Rd | 1 - 2 files changed, 2 deletions(-) diff --git a/R/multi_scale.R b/R/multi_scale.R index 0a0dff3..88d0199 100644 --- a/R/multi_scale.R +++ b/R/multi_scale.R @@ -22,7 +22,6 @@ #' library(dplyr) #' library(tidyr) #' library(tidytext) -#' library(tibble) #' #' # closest documents in terms of 1000 most frequent words #' austen_delta <- austen_books() %>% diff --git a/man/multi_scale.Rd b/man/multi_scale.Rd index ab446ae..e3f1785 100644 --- a/man/multi_scale.Rd +++ b/man/multi_scale.Rd @@ -35,7 +35,6 @@ library(janeaustenr) library(dplyr) library(tidyr) library(tidytext) -library(tibble) # closest documents in terms of 1000 most frequent words austen_delta <- austen_books() \%>\% From ca14818f7799b3e2f7d2efd342a3d0a688e3ea3b Mon Sep 17 00:00:00 2001 From: Kanishka Misra Date: Mon, 30 Jul 2018 18:57:37 -0500 Subject: [PATCH 7/7] fixed :: warning from tibble --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e428f59..f9c9654 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,8 @@ Imports: purrr, Matrix, broom, - methods + methods, + tibble Suggests: ggraph, igraph,