2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
 Package: textreuse
 Type: Package
 Title: Detect Text Reuse and Document Similarity
-Version: 0.1.4.9000
+Version: 0.1.5.9000
 Date: 2016-11-28
 Authors@R: c(person("Lincoln", "Mullen", role = c("aut", "cre"),
     email = "lincoln@lincolnmullen.com", comment = c(ORCID = "0000-0001-5103-6917")))
4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,3 +1,7 @@
+# textreuse 0.1.5
+
+- Added encoding argument to allow reading UTF-8 files on Windows
+
 # textreuse 0.1.4
 
 - Preventative maintenance release to avoid failing tests when new version of
6 changes: 5 additions & 1 deletion R/TextReuseCorpus.R
@@ -45,6 +45,7 @@
 #' @param keep_text Should the text be saved in the documents that are returned
 #'   or discarded?
 #' @param skip_short Should short documents be skipped? (See details.)
+#' @param encoding Encoding to be used when reading files.
 #'
 #' @seealso \link[=TextReuseTextDocument-accessors]{Accessors for TextReuse
 #'   objects}.
@@ -65,7 +66,8 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(),
                             minhash_func = NULL,
                             keep_tokens = FALSE,
                             keep_text = TRUE,
-                            skip_short = TRUE) {
+                            skip_short = TRUE,
+                            encoding = "unknown") {
 
   if (!is.null(tokenizer)) {
     assert_that(is.function(tokenizer),
@@ -115,6 +117,7 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(),
                                  keep_tokens = keep_tokens,
                                  keep_text = keep_text,
                                  skip_short = skip_short,
+                                 encoding = encoding,
                                  meta = list(id = names(text)[i],
                                              tokenizer = tokenizer_name,
                                              hash_func = hash_func_name,
@@ -151,6 +154,7 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(),
                              keep_tokens = keep_tokens,
                              keep_text = keep_text,
                              skip_short = skip_short,
+                             encoding = encoding,
                              meta = list(tokenizer = tokenizer_name,
                                          hash_func = hash_func_name,
                                          minhash_func = minhash_func_name))
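At the corpus level this change only threads `encoding` through to each `TextReuseTextDocument()` call. A minimal usage sketch of the new argument (the directory name and its contents are hypothetical, and the defaults for tokenizing and hashing are assumed):

```r
library(textreuse)

# Hypothetical directory of UTF-8 encoded plain-text files. Without
# encoding = "UTF-8", non-ASCII characters in these files could be
# misread on Windows, where the native encoding is not UTF-8.
corpus <- TextReuseCorpus(dir = "corpus-utf8/", encoding = "UTF-8")
```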
8 changes: 5 additions & 3 deletions R/TextReuseTextDocument.R
@@ -24,7 +24,8 @@
 #' @param keep_text Should the text be saved in the document that is returned or
 #'   discarded?
 #' @param skip_short Should short documents be skipped? (See details.)
-#'
+#' @param encoding Encoding to be used when reading files.
+#'
 #' @details This constructor function follows a three-step process. It reads in
 #'   the text, either from a file or from memory. It then tokenizes that text.
 #'   Then it hashes the tokens. Most of the comparison functions in this package
@@ -67,14 +68,15 @@ TextReuseTextDocument <- function(text, file = NULL, meta = list(),
                                   minhash_func = NULL,
                                   keep_tokens = FALSE,
                                   keep_text = TRUE,
-                                  skip_short = TRUE) {
+                                  skip_short = TRUE,
+                                  encoding = "unknown") {
 
   if (!missing(text)) assert_that(has_id(meta))
 
   if (!is.null(file)) {
     assert_that(missing(text),
                 is.readable(file))
-    text <- as_string(readLines(file))
+    text <- as_string(readLines(file, encoding = encoding))
   }
 
   assert_that(is.character(text))
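At the document level the new argument is passed straight to `readLines()`, which marks the input as UTF-8 rather than assuming the platform's native encoding. A hedged round-trip sketch (the sample text is invented, and the text is made long enough that the default `skip_short` check for n-gram tokenizing should not skip it):

```r
library(textreuse)

# Write a small UTF-8 file; its accented characters could be garbled
# if read back on Windows with the default native encoding.
path <- tempfile(fileext = ".txt")
con <- file(path, open = "w", encoding = "UTF-8")
writeLines(paste("Grüße aus Köln: this invented sample document contains",
                 "enough words to be tokenized into n-grams."), con)
close(con)

# encoding = "UTF-8" tells readLines() how the bytes should be marked
doc <- TextReuseTextDocument(file = path, encoding = "UTF-8")
```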
21 changes: 17 additions & 4 deletions man/TextReuseCorpus.Rd

19 changes: 15 additions & 4 deletions man/TextReuseTextDocument.Rd