From b9e58e88c7ecfbaeb2a2f33f88c7ae36f38dad86 Mon Sep 17 00:00:00 2001 From: chainsawriot Date: Sat, 11 Jul 2020 17:33:41 +0200 Subject: [PATCH] Ported ngramrr here. Remove ngramrr dependency, add tau to DESCRIPTION --- DESCRIPTION | 7 +- R/ngrams.R | 68 ++++++++++++- man/DictionaryGI.Rd | 4 +- man/DictionaryHE.Rd | 4 +- man/DictionaryLM.Rd | 4 +- man/SentimentAnalysis.Rd | 1 - man/SentimentDictionaryWeighted.Rd | 8 +- man/analyzeSentiment.Rd | 72 ++++++++++---- man/countWords.Rd | 54 ++++++++--- man/enetEstimation.Rd | 8 +- man/generateDictionary.Rd | 106 +++++++++++++++------ man/lassoEstimation.Rd | 8 +- man/plot.SentimentDictionaryWeighted.Rd | 3 +- man/plotSentiment.Rd | 9 +- man/plotSentimentResponse.Rd | 9 +- man/predict.SentimentDictionaryWeighted.Rd | 10 +- man/preprocessCorpus.Rd | 9 +- man/ridgeEstimation.Rd | 8 +- man/spikeslabEstimation.Rd | 8 +- man/toDocumentTermMatrix.Rd | 12 ++- 20 files changed, 320 insertions(+), 92 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c9c496b..36b9787 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,8 @@ Date: 2019-03-25 Authors@R: c(person("Stefan", "Feuerriegel", email="sentiment@sfeuerriegel.com", role=c("aut", "cre")), person("Nicolas", "Proellochs", email="nicolas.proellochs@is.uni-freiburg.de", - role=c("aut"))) + role=c("aut")), + person("Chung-hong", "Chan", email = "chainsawtiney@gmail.com", role = c("ctb"), comment = c(ORCID = "0000-0002-6232-7530"))) Description: Performs a sentiment analysis of textual contents in R. This implementation utilizes various existing dictionaries, such as Harvard IV, or finance-specific dictionaries. Furthermore, it can also create customized dictionaries. 
The latter @@ -20,7 +21,7 @@ Depends: Imports: tm (>= 0.6), qdapDictionaries, - ngramrr (>= 0.1), + tau, moments, stringdist, glmnet, @@ -34,5 +35,5 @@ Suggests: XML, mgcv LazyData: true -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.1 VignetteBuilder: knitr diff --git a/R/ngrams.R b/R/ngrams.R index c425a94..b7e2db2 100644 --- a/R/ngrams.R +++ b/R/ngrams.R @@ -1,3 +1,67 @@ +### ported from ngramrr + +taungram <- function(text, n = 1, tolower = FALSE, split = "[[:space:]]+", ...) { + r <- tau::textcnt(text, method = 'string', n = n, tolower = tolower, split = split, ...) + return(Reduce(c, sapply(1:length(r), function(x) rep(names(r[x]), r[x])))) +} + +tauchar <- function(text, n = 1, tolower = FALSE, split = "[[:space:]]+", rmEOL = FALSE, ngmin = 1 , ...) { + r <- tau::textcnt(text, method = 'ngram', n = n, tolower = tolower, split = split, ...) + g <- unlist(sapply(1:length(r), function(x) rep(names(r[x]), r[x]))) + if (rmEOL) { + g <- g[grep("_", g, invert = TRUE)] + } + if (ngmin > 1 & ngmin <= n) { + g <- Filter(function(x) nchar(x) >= ngmin, g) + } + return(g) +} + +# General purpose n-gram tokenizer +# +# A non-Java based n-gram tokenizer to be used with the tm package. Support both character and word n-gram. +# +# @param x input string. +# @param char logical, using character n-gram. char = FALSE denotes word n-gram. 
+# @param ngmin integer, minimum order of n-gram +# @param ngmax integer, maximum order of n-gram +# @param rmEOL logical, remove ngrams with EOL character +# @return vector of n-grams +# @examples +# require(tm) +# +# nirvana <- c("hello hello hello how low", "hello hello hello how low", +# "hello hello hello how low", "hello hello hello", +# "with the lights out", "it's less dangerous", "here we are now", "entertain us", +# "i feel stupid", "and contagious", "here we are now", "entertain us", +# "a mulatto", "an albino", "a mosquito", "my libido", "yeah", "hey yay") +# +# ngramrr(nirvana[1], ngmax = 3) +# ngramrr(nirvana[1], ngmax = 3, char = TRUE) +# nirvanacor <- Corpus(VectorSource(nirvana)) +# TermDocumentMatrix(nirvanacor, control = list(tokenize = function(x) ngramrr(x, ngmax =3))) +# +# # Character ngram +# +# TermDocumentMatrix(nirvanacor, control = list(tokenize = +# function(x) ngramrr(x, char = TRUE, ngmax =3), wordLengths = c(1, Inf))) +ngramrr <- function(x, char = FALSE, ngmin = 1, ngmax = 2, rmEOL = TRUE) { + if (ngmin > ngmax) { + stop("ngmax must be higher than or equal to ngmin") + } + y <- paste(x, collapse = " ") # tm's TermDocumentMatrix may pass a multi-element vector; collapse into one string
+ if (char) { + return(tauchar(y, n = ngmax, rmEOL = rmEOL, ngmin = ngmin)) + } + sentencelength <- length(unlist(strsplit(y, split = " "))) + if (sentencelength > ngmax) { + return(Reduce(c, Map(function(n) taungram(y, n), seq(from = ngmin, to = ngmax)))) + } else { + return(Reduce(c, Map(function(n) taungram(y, n), seq(from = ngmin, to = sentencelength )))) + } +} + + rep_gram <- function(text, n) { r <- stringdist::qgrams(text, q=n) g <- unlist(sapply(1:length(r), function(x) rep(colnames(r)[x], r[x]))) @@ -47,11 +111,11 @@ ngram_tokenize <- function(x, char=FALSE, ngmin=1, ngmax=3) { if (!is.logical(char)) { stop("Customized routine only supports char grams") } - + y <- paste(x, collapse=" ") # hint from ngramrr package if (char) { return(rep_grams(y, ngmin = ngmin, ngmax = ngmax)) } else { - return(ngramrr::ngramrr(x, char=char, ngmin=ngmin, ngmax=ngmax)) + return(ngramrr(x, char=char, ngmin=ngmin, ngmax=ngmax)) } } diff --git a/man/DictionaryGI.Rd b/man/DictionaryGI.Rd index 5602a1f..a1a218a 100644 --- a/man/DictionaryGI.Rd +++ b/man/DictionaryGI.Rd @@ -5,7 +5,9 @@ \alias{DictionaryGI} \title{Dictionary with opinionated words from the Harvard-IV dictionary as used in the General Inquirer software} -\format{A list with different terms according to Henry} +\format{ +A list with different terms according to Henry +} \source{ \url{http://www.wjh.harvard.edu/~inquirer/} } diff --git a/man/DictionaryHE.Rd b/man/DictionaryHE.Rd index beb0dde..613d937 100644 --- a/man/DictionaryHE.Rd +++ b/man/DictionaryHE.Rd @@ -4,7 +4,9 @@ \name{DictionaryHE} \alias{DictionaryHE} \title{Dictionary with opinionated words from Henry's Financial dictionary} -\format{A list with different wordlists according to Henry} +\format{ +A list with different wordlists according to Henry +} \usage{ data(DictionaryHE) } diff --git a/man/DictionaryLM.Rd b/man/DictionaryLM.Rd index 760579f..1ee6f1e 100644 --- a/man/DictionaryLM.Rd +++ b/man/DictionaryLM.Rd @@ -4,7 +4,9 @@ \name{DictionaryLM} 
\alias{DictionaryLM} \title{Dictionary with opinionated words from Loughran-McDonald Financial dictionary} -\format{A list with different terms according to Loughran-McDonald} +\format{ +A list with different terms according to Loughran-McDonald +} \source{ \url{http://www3.nd.edu/~mcdonald/Word_Lists.html} } diff --git a/man/SentimentAnalysis.Rd b/man/SentimentAnalysis.Rd index 43c332c..5a58765 100644 --- a/man/SentimentAnalysis.Rd +++ b/man/SentimentAnalysis.Rd @@ -3,7 +3,6 @@ \docType{package} \name{SentimentAnalysis} \alias{SentimentAnalysis} -\alias{SentimentAnalysis-package} \title{SentimentAnalysis: A package for analyzing sentiment of texts} \description{ The \code{SentimentAnalysis} package provides routines to quickly measure diff --git a/man/SentimentDictionaryWeighted.Rd b/man/SentimentDictionaryWeighted.Rd index f1434d0..5fcb293 100644 --- a/man/SentimentDictionaryWeighted.Rd +++ b/man/SentimentDictionaryWeighted.Rd @@ -7,8 +7,12 @@ \url{http://dx.doi.org/10.2139/ssrn.2522884} } \usage{ -SentimentDictionaryWeighted(words, scores, idf = rep(1, length(words)), - intercept = 0) +SentimentDictionaryWeighted( + words, + scores, + idf = rep(1, length(words)), + intercept = 0 +) } \arguments{ \item{words}{is collection (vector) of different words as strings} diff --git a/man/analyzeSentiment.Rd b/man/analyzeSentiment.Rd index 3e45081..199f438 100644 --- a/man/analyzeSentiment.Rd +++ b/man/analyzeSentiment.Rd @@ -9,29 +9,65 @@ \alias{analyzeSentiment.DocumentTermMatrix} \title{Sentiment analysis} \usage{ -analyzeSentiment(x, language = "english", aggregate = NULL, - rules = defaultSentimentRules(), removeStopwords = TRUE, - stemming = TRUE, ...) +analyzeSentiment( + x, + language = "english", + aggregate = NULL, + rules = defaultSentimentRules(), + removeStopwords = TRUE, + stemming = TRUE, + ... +) -\method{analyzeSentiment}{Corpus}(x, language = "english", - aggregate = NULL, rules = defaultSentimentRules(), - removeStopwords = TRUE, stemming = TRUE, ...) 
+\method{analyzeSentiment}{Corpus}( + x, + language = "english", + aggregate = NULL, + rules = defaultSentimentRules(), + removeStopwords = TRUE, + stemming = TRUE, + ... +) -\method{analyzeSentiment}{character}(x, language = "english", - aggregate = NULL, rules = defaultSentimentRules(), - removeStopwords = TRUE, stemming = TRUE, ...) +\method{analyzeSentiment}{character}( + x, + language = "english", + aggregate = NULL, + rules = defaultSentimentRules(), + removeStopwords = TRUE, + stemming = TRUE, + ... +) -\method{analyzeSentiment}{data.frame}(x, language = "english", - aggregate = NULL, rules = defaultSentimentRules(), - removeStopwords = TRUE, stemming = TRUE, ...) +\method{analyzeSentiment}{data.frame}( + x, + language = "english", + aggregate = NULL, + rules = defaultSentimentRules(), + removeStopwords = TRUE, + stemming = TRUE, + ... +) -\method{analyzeSentiment}{TermDocumentMatrix}(x, language = "english", - aggregate = NULL, rules = defaultSentimentRules(), - removeStopwords = TRUE, stemming = TRUE, ...) +\method{analyzeSentiment}{TermDocumentMatrix}( + x, + language = "english", + aggregate = NULL, + rules = defaultSentimentRules(), + removeStopwords = TRUE, + stemming = TRUE, + ... +) -\method{analyzeSentiment}{DocumentTermMatrix}(x, language = "english", - aggregate = NULL, rules = defaultSentimentRules(), - removeStopwords = TRUE, stemming = TRUE, ...) +\method{analyzeSentiment}{DocumentTermMatrix}( + x, + language = "english", + aggregate = NULL, + rules = defaultSentimentRules(), + removeStopwords = TRUE, + stemming = TRUE, + ... +) } \arguments{ \item{x}{A vector of characters, a \code{data.frame}, an object of type diff --git a/man/countWords.Rd b/man/countWords.Rd index 2915ab0..053a91f 100644 --- a/man/countWords.Rd +++ b/man/countWords.Rd @@ -9,23 +9,53 @@ \alias{countWords.DocumentTermMatrix} \title{Count words} \usage{ -countWords(x, aggregate = NULL, removeStopwords = TRUE, - language = "english", ...) 
+countWords( + x, + aggregate = NULL, + removeStopwords = TRUE, + language = "english", + ... +) -\method{countWords}{Corpus}(x, aggregate = NULL, - removeStopwords = TRUE, language = "english", ...) +\method{countWords}{Corpus}( + x, + aggregate = NULL, + removeStopwords = TRUE, + language = "english", + ... +) -\method{countWords}{character}(x, aggregate = NULL, - removeStopwords = TRUE, language = "english", ...) +\method{countWords}{character}( + x, + aggregate = NULL, + removeStopwords = TRUE, + language = "english", + ... +) -\method{countWords}{data.frame}(x, aggregate = NULL, - removeStopwords = TRUE, language = "english", ...) +\method{countWords}{data.frame}( + x, + aggregate = NULL, + removeStopwords = TRUE, + language = "english", + ... +) -\method{countWords}{TermDocumentMatrix}(x, aggregate = NULL, - removeStopwords = TRUE, language = "english", ...) +\method{countWords}{TermDocumentMatrix}( + x, + aggregate = NULL, + removeStopwords = TRUE, + language = "english", + ... +) -\method{countWords}{DocumentTermMatrix}(x, aggregate = NULL, - removeStopwords = TRUE, language = "english", ...) +\method{countWords}{DocumentTermMatrix}( + x, + aggregate = NULL, + removeStopwords = TRUE, + language = "english", + ... +) } \arguments{ \item{x}{A vector of characters, a \code{data.frame}, an object of type diff --git a/man/enetEstimation.Rd b/man/enetEstimation.Rd index d7e34da..078eaaa 100644 --- a/man/enetEstimation.Rd +++ b/man/enetEstimation.Rd @@ -4,8 +4,12 @@ \alias{enetEstimation} \title{Elastic net estimation} \usage{ -enetEstimation(x, response, control = list(alpha = 0.5, s = "lambda.min", - family = "gaussian", grouped = FALSE), ...) +enetEstimation( + x, + response, + control = list(alpha = 0.5, s = "lambda.min", family = "gaussian", grouped = FALSE), + ... 
+) } \arguments{ \item{x}{An object of type \code{\link[tm]{DocumentTermMatrix}}.} diff --git a/man/generateDictionary.Rd b/man/generateDictionary.Rd index a8ad349..d706252 100644 --- a/man/generateDictionary.Rd +++ b/man/generateDictionary.Rd @@ -12,35 +12,83 @@ \url{https://dx.doi.org/10.2139/ssrn.2522884} } \usage{ -generateDictionary(x, response, language = "english", - modelType = "lasso", filterTerms = NULL, control = list(), - minWordLength = 3, sparsity = 0.9, weighting = function(x) - tm::weightTfIdf(x, normalize = FALSE), ...) - -\method{generateDictionary}{Corpus}(x, response, language = "english", - modelType = "lasso", filterTerms = NULL, control = list(), - minWordLength = 3, sparsity = 0.9, weighting = function(x) - tm::weightTfIdf(x, normalize = FALSE), ...) - -\method{generateDictionary}{character}(x, response, language = "english", - modelType = "lasso", filterTerms = NULL, control = list(), - minWordLength = 3, sparsity = 0.9, weighting = function(x) - tm::weightTfIdf(x, normalize = FALSE), ...) - -\method{generateDictionary}{data.frame}(x, response, - language = "english", modelType = "lasso", filterTerms = NULL, - control = list(), minWordLength = 3, sparsity = 0.9, - weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), ...) - -\method{generateDictionary}{TermDocumentMatrix}(x, response, - language = "english", modelType = "lasso", filterTerms = NULL, - control = list(), minWordLength = 3, sparsity = 0.9, - weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), ...) - -\method{generateDictionary}{DocumentTermMatrix}(x, response, - language = "english", modelType = "lasso", filterTerms = NULL, - control = list(), minWordLength = 3, sparsity = 0.9, - weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), ...) 
+generateDictionary( + x, + response, + language = "english", + modelType = "lasso", + filterTerms = NULL, + control = list(), + minWordLength = 3, + sparsity = 0.9, + weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), + ... +) + +\method{generateDictionary}{Corpus}( + x, + response, + language = "english", + modelType = "lasso", + filterTerms = NULL, + control = list(), + minWordLength = 3, + sparsity = 0.9, + weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), + ... +) + +\method{generateDictionary}{character}( + x, + response, + language = "english", + modelType = "lasso", + filterTerms = NULL, + control = list(), + minWordLength = 3, + sparsity = 0.9, + weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), + ... +) + +\method{generateDictionary}{data.frame}( + x, + response, + language = "english", + modelType = "lasso", + filterTerms = NULL, + control = list(), + minWordLength = 3, + sparsity = 0.9, + weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), + ... +) + +\method{generateDictionary}{TermDocumentMatrix}( + x, + response, + language = "english", + modelType = "lasso", + filterTerms = NULL, + control = list(), + minWordLength = 3, + sparsity = 0.9, + weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), + ... +) + +\method{generateDictionary}{DocumentTermMatrix}( + x, + response, + language = "english", + modelType = "lasso", + filterTerms = NULL, + control = list(), + minWordLength = 3, + sparsity = 0.9, + weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), + ... +) } \arguments{ \item{x}{A vector of characters, a \code{data.frame}, an object of type diff --git a/man/lassoEstimation.Rd b/man/lassoEstimation.Rd index 60efe1d..2917aa5 100644 --- a/man/lassoEstimation.Rd +++ b/man/lassoEstimation.Rd @@ -4,8 +4,12 @@ \alias{lassoEstimation} \title{Lasso estimation} \usage{ -lassoEstimation(x, response, control = list(alpha = 1, s = "lambda.min", - family = "gaussian", grouped = FALSE), ...) 
+lassoEstimation( + x, + response, + control = list(alpha = 1, s = "lambda.min", family = "gaussian", grouped = FALSE), + ... +) } \arguments{ \item{x}{An object of type \code{\link[tm]{DocumentTermMatrix}}.} diff --git a/man/plot.SentimentDictionaryWeighted.Rd b/man/plot.SentimentDictionaryWeighted.Rd index 9a2e2c0..d5d8244 100644 --- a/man/plot.SentimentDictionaryWeighted.Rd +++ b/man/plot.SentimentDictionaryWeighted.Rd @@ -4,8 +4,7 @@ \alias{plot.SentimentDictionaryWeighted} \title{KDE plot of estimated coefficients} \usage{ -\method{plot}{SentimentDictionaryWeighted}(x, color = "gray60", - theme = ggplot2::theme_bw(), ...) +\method{plot}{SentimentDictionaryWeighted}(x, color = "gray60", theme = ggplot2::theme_bw(), ...) } \arguments{ \item{x}{Dictionary of class \code{\link{SentimentDictionaryWeighted}}} diff --git a/man/plotSentiment.Rd b/man/plotSentiment.Rd index 496c83f..a4d110d 100644 --- a/man/plotSentiment.Rd +++ b/man/plotSentiment.Rd @@ -4,8 +4,13 @@ \alias{plotSentiment} \title{Line plot with sentiment scores} \usage{ -plotSentiment(sentiment, x = NULL, cumsum = FALSE, xlab = "", - ylab = "Sentiment") +plotSentiment( + sentiment, + x = NULL, + cumsum = FALSE, + xlab = "", + ylab = "Sentiment" +) } \arguments{ \item{sentiment}{\code{data.frame} or numeric vector with sentiment scores} diff --git a/man/plotSentimentResponse.Rd b/man/plotSentimentResponse.Rd index 77f87dd..0b7f7c2 100644 --- a/man/plotSentimentResponse.Rd +++ b/man/plotSentimentResponse.Rd @@ -4,8 +4,13 @@ \alias{plotSentimentResponse} \title{Scatterplot with trend line between sentiment and response} \usage{ -plotSentimentResponse(sentiment, response, smoothing = "gam", - xlab = "Sentiment", ylab = "Response") +plotSentimentResponse( + sentiment, + response, + smoothing = "gam", + xlab = "Sentiment", + ylab = "Response" +) } \arguments{ \item{sentiment}{\code{data.frame} with sentiment scores} diff --git a/man/predict.SentimentDictionaryWeighted.Rd 
b/man/predict.SentimentDictionaryWeighted.Rd index a5e111a..2352da5 100644 --- a/man/predict.SentimentDictionaryWeighted.Rd +++ b/man/predict.SentimentDictionaryWeighted.Rd @@ -4,9 +4,13 @@ \alias{predict.SentimentDictionaryWeighted} \title{Prediction for given dictionary} \usage{ -\method{predict}{SentimentDictionaryWeighted}(object, newdata = NULL, - language = "english", weighting = function(x) tm::weightTfIdf(x, - normalize = FALSE), ...) +\method{predict}{SentimentDictionaryWeighted}( + object, + newdata = NULL, + language = "english", + weighting = function(x) tm::weightTfIdf(x, normalize = FALSE), + ... +) } \arguments{ \item{object}{Dictionary of class \code{\link{SentimentDictionaryWeighted}}.} diff --git a/man/preprocessCorpus.Rd b/man/preprocessCorpus.Rd index e0e76bf..85eeb38 100644 --- a/man/preprocessCorpus.Rd +++ b/man/preprocessCorpus.Rd @@ -4,8 +4,13 @@ \alias{preprocessCorpus} \title{Default preprocessing of corpus} \usage{ -preprocessCorpus(corpus, language = "english", stemming = TRUE, - verbose = FALSE, removeStopwords = TRUE) +preprocessCorpus( + corpus, + language = "english", + stemming = TRUE, + verbose = FALSE, + removeStopwords = TRUE +) } \arguments{ \item{corpus}{\code{\link[tm]{Corpus}} object which should be processed} diff --git a/man/ridgeEstimation.Rd b/man/ridgeEstimation.Rd index b1d5b89..4776e6e 100644 --- a/man/ridgeEstimation.Rd +++ b/man/ridgeEstimation.Rd @@ -4,8 +4,12 @@ \alias{ridgeEstimation} \title{Ridge estimation} \usage{ -ridgeEstimation(x, response, control = list(s = "lambda.min", family = - "gaussian", grouped = FALSE), ...) +ridgeEstimation( + x, + response, + control = list(s = "lambda.min", family = "gaussian", grouped = FALSE), + ... 
+) } \arguments{ \item{x}{An object of type \code{\link[tm]{DocumentTermMatrix}}.} diff --git a/man/spikeslabEstimation.Rd b/man/spikeslabEstimation.Rd index 7fff566..6fe8915 100644 --- a/man/spikeslabEstimation.Rd +++ b/man/spikeslabEstimation.Rd @@ -4,8 +4,12 @@ \alias{spikeslabEstimation} \title{Spike-and-slab estimation} \usage{ -spikeslabEstimation(x, response, control = list(n.iter1 = 500, n.iter2 = - 500), ...) +spikeslabEstimation( + x, + response, + control = list(n.iter1 = 500, n.iter2 = 500), + ... +) } \arguments{ \item{x}{An object of type \code{\link[tm]{DocumentTermMatrix}}.} diff --git a/man/toDocumentTermMatrix.Rd b/man/toDocumentTermMatrix.Rd index dcac6fe..8fd4796 100644 --- a/man/toDocumentTermMatrix.Rd +++ b/man/toDocumentTermMatrix.Rd @@ -4,9 +4,15 @@ \alias{toDocumentTermMatrix} \title{Default preprocessing of corpus and conversion to document-term matrix} \usage{ -toDocumentTermMatrix(x, language = "english", minWordLength = 3, - sparsity = NULL, removeStopwords = TRUE, stemming = TRUE, - weighting = function(x) tm::weightTfIdf(x, normalize = FALSE)) +toDocumentTermMatrix( + x, + language = "english", + minWordLength = 3, + sparsity = NULL, + removeStopwords = TRUE, + stemming = TRUE, + weighting = function(x) tm::weightTfIdf(x, normalize = FALSE) +) } \arguments{ \item{x}{\code{\link[tm]{Corpus}} object which should be processed}