From 63f480f50c71622ae3ea14b787d757a67fd76879 Mon Sep 17 00:00:00 2001 From: David Fuhry Date: Wed, 15 Jan 2020 21:00:37 +0100 Subject: [PATCH 1/2] added encoding argument to TextReuseCorpus and TextReuseTextDocument --- DESCRIPTION | 4 ++-- NEWS.md | 4 ++++ R/TextReuseCorpus.R | 6 +++++- R/TextReuseTextDocument.R | 8 +++++--- man/TextReuseCorpus.Rd | 21 +++++++++++++++++---- man/TextReuseTextDocument.Rd | 19 +++++++++++++++---- 6 files changed, 48 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 41e21a1..fb604ac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: textreuse Type: Package Title: Detect Text Reuse and Document Similarity -Version: 0.1.4.9000 +Version: 0.1.5.9000 Date: 2016-11-28 Authors@R: c(person("Lincoln", "Mullen", role = c("aut", "cre"), email = "lincoln@lincolnmullen.com", comment = c(ORCID = "0000-0001-5103-6917"))) @@ -33,4 +33,4 @@ Suggests: rmarkdown (>= 0.8), covr LinkingTo: BH, Rcpp, RcppProgress -RoxygenNote: 6.0.1 +RoxygenNote: 7.0.2 diff --git a/NEWS.md b/NEWS.md index 0f20261..2c4adf9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# textreuse 0.1.5 + +- Added encoding argument to allow reading UTF-8 files on Windows + # textreuse 0.1.4 - Preventative maintenance release to avoid failing tests when new version of diff --git a/R/TextReuseCorpus.R b/R/TextReuseCorpus.R index 69c56f7..3a5dc16 100644 --- a/R/TextReuseCorpus.R +++ b/R/TextReuseCorpus.R @@ -45,6 +45,7 @@ #' @param keep_text Should the text be saved in the documents that are returned #' or discarded? #' @param skip_short Should short documents be skipped? (See details.) +#' @param encoding Encoding to be used when reading files. #' #' @seealso \link[=TextReuseTextDocument-accessors]{Accessors for TextReuse #' objects}. @@ -65,7 +66,8 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(), minhash_func = NULL, keep_tokens = FALSE, keep_text = TRUE, - skip_short = TRUE) { + skip_short = TRUE, + encoding = "unknown") { if (!is.null(tokenizer)) { assert_that(is.function(tokenizer), @@ -115,6 +117,7 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(), keep_tokens = keep_tokens, keep_text = keep_text, skip_short = skip_short, + encoding = encoding, meta = list(id = names(text)[i], tokenizer = tokenizer_name, hash_func = hash_func_name, @@ -151,6 +154,7 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(), keep_tokens = keep_tokens, keep_text = keep_text, skip_short = skip_short, + encoding = encoding, meta = list(tokenizer = tokenizer_name, hash_func = hash_func_name, minhash_func = minhash_func_name)) diff --git a/R/TextReuseTextDocument.R b/R/TextReuseTextDocument.R index 33d931d..fc96bcb 100644 --- a/R/TextReuseTextDocument.R +++ b/R/TextReuseTextDocument.R @@ -24,7 +24,8 @@ #' @param keep_text Should the text be saved in the document that is returned or #' discarded? #' @param skip_short Should short documents be skipped? (See details.) -#' +#' @param encoding Encoding to be used when reading files. +#' #' @details This constructor function follows a three-step process. It reads in #' the text, either from a file or from memory. It then tokenizes that text. #' Then it hashes the tokens. Most of the comparison functions in this package @@ -67,14 +68,15 @@ TextReuseTextDocument <- function(text, file = NULL, meta = list(), minhash_func = NULL, keep_tokens = FALSE, keep_text = TRUE, - skip_short = TRUE) { + skip_short = TRUE, + encoding = "unknown") { if (!missing(text)) assert_that(has_id(meta)) if (!is.null(file)) { assert_that(missing(text), is.readable(file)) - text <- as_string(readLines(file)) + text <- as_string(readLines(file, encoding = encoding)) } assert_that(is.character(text)) diff --git a/man/TextReuseCorpus.Rd b/man/TextReuseCorpus.Rd index 6416753..5ef2e84 100644 --- a/man/TextReuseCorpus.Rd +++ b/man/TextReuseCorpus.Rd @@ -6,10 +6,21 @@ \alias{skipped} \title{TextReuseCorpus} \usage{ -TextReuseCorpus(paths, dir = NULL, text = NULL, meta = list(), - progress = interactive(), tokenizer = tokenize_ngrams, ..., - hash_func = hash_string, minhash_func = NULL, keep_tokens = FALSE, - keep_text = TRUE, skip_short = TRUE) +TextReuseCorpus( + paths, + dir = NULL, + text = NULL, + meta = list(), + progress = interactive(), + tokenizer = tokenize_ngrams, + ..., + hash_func = hash_string, + minhash_func = NULL, + keep_tokens = FALSE, + keep_text = TRUE, + skip_short = TRUE, + encoding = "unknown" +) is.TextReuseCorpus(x) @@ -47,6 +58,8 @@ or discarded?} \item{skip_short}{Should short documents be skipped? (See details.)} +\item{encoding}{Encoding to be used when reading files.} + \item{x}{An R object to check.} } \description{ diff --git a/man/TextReuseTextDocument.Rd b/man/TextReuseTextDocument.Rd index 10c03c6..3ce7f84 100644 --- a/man/TextReuseTextDocument.Rd +++ b/man/TextReuseTextDocument.Rd @@ -9,10 +9,19 @@ \alias{has_minhashes} \title{TextReuseTextDocument} \usage{ -TextReuseTextDocument(text, file = NULL, meta = list(), - tokenizer = tokenize_ngrams, ..., hash_func = hash_string, - minhash_func = NULL, keep_tokens = FALSE, keep_text = TRUE, - skip_short = TRUE) +TextReuseTextDocument( + text, + file = NULL, + meta = list(), + tokenizer = tokenize_ngrams, + ..., + hash_func = hash_string, + minhash_func = NULL, + keep_tokens = FALSE, + keep_text = TRUE, + skip_short = TRUE, + encoding = "unknown" +) is.TextReuseTextDocument(x) @@ -56,6 +65,8 @@ discarded?} \item{skip_short}{Should short documents be skipped? (See details.)} +\item{encoding}{Encoding to be used when reading files.} + \item{x}{An R object to check.} } \value{ From 5d68a16feba5e09822b0094337fb698c42967800 Mon Sep 17 00:00:00 2001 From: David Fuhry Date: Wed, 15 Jan 2020 21:10:23 +0100 Subject: [PATCH 2/2] reverted roxygennote version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index fb604ac..ea7aa93 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,4 +33,4 @@ Suggests: rmarkdown (>= 0.8), covr LinkingTo: BH, Rcpp, RcppProgress -RoxygenNote: 7.0.2 +RoxygenNote: 6.0.1