diff --git a/DESCRIPTION b/DESCRIPTION index 41e21a1..ea7aa93 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: textreuse Type: Package Title: Detect Text Reuse and Document Similarity -Version: 0.1.4.9000 +Version: 0.1.5.9000 Date: 2016-11-28 Authors@R: c(person("Lincoln", "Mullen", role = c("aut", "cre"), email = "lincoln@lincolnmullen.com", comment = c(ORCID = "0000-0001-5103-6917"))) diff --git a/NEWS.md b/NEWS.md index 0f20261..2c4adf9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# textreuse 0.1.5 + +- Added encoding argument to allow reading UTF-8 files on Windows + # textreuse 0.1.4 - Preventative maintenance release to avoid failing tests when new version of diff --git a/R/TextReuseCorpus.R b/R/TextReuseCorpus.R index 69c56f7..3a5dc16 100644 --- a/R/TextReuseCorpus.R +++ b/R/TextReuseCorpus.R @@ -45,6 +45,7 @@ #' @param keep_text Should the text be saved in the documents that are returned #' or discarded? #' @param skip_short Should short documents be skipped? (See details.) +#' @param encoding Encoding to be used when reading files. #' #' @seealso \link[=TextReuseTextDocument-accessors]{Accessors for TextReuse #' objects}. @@ -65,7 +66,8 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(), minhash_func = NULL, keep_tokens = FALSE, keep_text = TRUE, - skip_short = TRUE) { + skip_short = TRUE, + encoding = "unknown") { if (!is.null(tokenizer)) { assert_that(is.function(tokenizer), @@ -115,6 +117,7 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(), keep_tokens = keep_tokens, keep_text = keep_text, skip_short = skip_short, + encoding = encoding, meta = list(id = names(text)[i], tokenizer = tokenizer_name, hash_func = hash_func_name, @@ -151,6 +154,7 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(), keep_tokens = keep_tokens, keep_text = keep_text, skip_short = skip_short, + encoding = encoding, meta = list(tokenizer = tokenizer_name, hash_func = hash_func_name, minhash_func = minhash_func_name)) diff --git a/R/TextReuseTextDocument.R b/R/TextReuseTextDocument.R index 33d931d..fc96bcb 100644 --- a/R/TextReuseTextDocument.R +++ b/R/TextReuseTextDocument.R @@ -24,7 +24,8 @@ #' @param keep_text Should the text be saved in the document that is returned or #' discarded? #' @param skip_short Should short documents be skipped? (See details.) -#' +#' @param encoding Encoding to be used when reading files. +#' #' @details This constructor function follows a three-step process. It reads in #' the text, either from a file or from memory. It then tokenizes that text. #' Then it hashes the tokens. Most of the comparison functions in this package @@ -67,14 +68,15 @@ TextReuseTextDocument <- function(text, file = NULL, meta = list(), minhash_func = NULL, keep_tokens = FALSE, keep_text = TRUE, - skip_short = TRUE) { + skip_short = TRUE, + encoding = "unknown") { if (!missing(text)) assert_that(has_id(meta)) if (!is.null(file)) { assert_that(missing(text), is.readable(file)) - text <- as_string(readLines(file)) + text <- as_string(readLines(file, encoding = encoding)) } assert_that(is.character(text)) diff --git a/man/TextReuseCorpus.Rd b/man/TextReuseCorpus.Rd index 6416753..5ef2e84 100644 --- a/man/TextReuseCorpus.Rd +++ b/man/TextReuseCorpus.Rd @@ -6,10 +6,21 @@ \alias{skipped} \title{TextReuseCorpus} \usage{ -TextReuseCorpus(paths, dir = NULL, text = NULL, meta = list(), - progress = interactive(), tokenizer = tokenize_ngrams, ..., - hash_func = hash_string, minhash_func = NULL, keep_tokens = FALSE, - keep_text = TRUE, skip_short = TRUE) +TextReuseCorpus( + paths, + dir = NULL, + text = NULL, + meta = list(), + progress = interactive(), + tokenizer = tokenize_ngrams, + ..., + hash_func = hash_string, + minhash_func = NULL, + keep_tokens = FALSE, + keep_text = TRUE, + skip_short = TRUE, + encoding = "unknown" +) is.TextReuseCorpus(x) @@ -47,6 +58,8 @@ or discarded?} \item{skip_short}{Should short documents be skipped? (See details.)} +\item{encoding}{Encoding to be used when reading files.} + \item{x}{An R object to check.} } \description{ diff --git a/man/TextReuseTextDocument.Rd b/man/TextReuseTextDocument.Rd index 10c03c6..3ce7f84 100644 --- a/man/TextReuseTextDocument.Rd +++ b/man/TextReuseTextDocument.Rd @@ -9,10 +9,19 @@ \alias{has_minhashes} \title{TextReuseTextDocument} \usage{ -TextReuseTextDocument(text, file = NULL, meta = list(), - tokenizer = tokenize_ngrams, ..., hash_func = hash_string, - minhash_func = NULL, keep_tokens = FALSE, keep_text = TRUE, - skip_short = TRUE) +TextReuseTextDocument( + text, + file = NULL, + meta = list(), + tokenizer = tokenize_ngrams, + ..., + hash_func = hash_string, + minhash_func = NULL, + keep_tokens = FALSE, + keep_text = TRUE, + skip_short = TRUE, + encoding = "unknown" +) is.TextReuseTextDocument(x) @@ -56,6 +65,8 @@ discarded?} \item{skip_short}{Should short documents be skipped? (See details.)} +\item{encoding}{Encoding to be used when reading files.} + \item{x}{An R object to check.} } \value{