From 63f480f50c71622ae3ea14b787d757a67fd76879 Mon Sep 17 00:00:00 2001
From: David Fuhry <david@129a-records.de>
Date: Wed, 15 Jan 2020 21:00:37 +0100
Subject: [PATCH 1/2] Add encoding argument to TextReuseCorpus and
 TextReuseTextDocument

---
 DESCRIPTION                  |  4 ++--
 NEWS.md                      |  4 ++++
 R/TextReuseCorpus.R          |  6 +++++-
 R/TextReuseTextDocument.R    |  8 +++++---
 man/TextReuseCorpus.Rd       | 21 +++++++++++++++++----
 man/TextReuseTextDocument.Rd | 19 +++++++++++++++----
 6 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 41e21a1..fb604ac 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: textreuse
 Type: Package
 Title: Detect Text Reuse and Document Similarity
-Version: 0.1.4.9000
+Version: 0.1.5.9000
 Date: 2016-11-28
 Authors@R: c(person("Lincoln", "Mullen", role = c("aut", "cre"),
     email = "lincoln@lincolnmullen.com", comment = c(ORCID = "0000-0001-5103-6917")))
@@ -33,4 +33,4 @@ Suggests:
     rmarkdown (>= 0.8),
     covr
 LinkingTo: BH, Rcpp, RcppProgress
-RoxygenNote: 6.0.1
+RoxygenNote: 7.0.2
diff --git a/NEWS.md b/NEWS.md
index 0f20261..2c4adf9 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,7 @@
+# textreuse 0.1.5
+
+- Added encoding argument to allow reading UTF-8 files on Windows
+
 # textreuse 0.1.4
 
 - Preventative maintenance release to avoid failing tests when new version of
diff --git a/R/TextReuseCorpus.R b/R/TextReuseCorpus.R
index 69c56f7..3a5dc16 100644
--- a/R/TextReuseCorpus.R
+++ b/R/TextReuseCorpus.R
@@ -45,6 +45,7 @@
 #' @param keep_text Should the text be saved in the documents that are returned
 #'   or discarded?
 #' @param skip_short Should short documents be skipped? (See details.)
+#' @param encoding Encoding to be used when reading files.
 #'
 #' @seealso \link[=TextReuseTextDocument-accessors]{Accessors for TextReuse
 #'   objects}.
@@ -65,7 +66,8 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(),
                             minhash_func = NULL,
                             keep_tokens = FALSE,
                             keep_text = TRUE,
-                            skip_short = TRUE) {
+                            skip_short = TRUE,
+                            encoding = "unknown") {
 
   if (!is.null(tokenizer)) {
     assert_that(is.function(tokenizer),
@@ -115,6 +117,7 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(),
                                  keep_tokens = keep_tokens,
                                  keep_text = keep_text,
                                  skip_short = skip_short,
+                                 encoding = encoding,
                                  meta = list(id = names(text)[i],
                                              tokenizer = tokenizer_name,
                                              hash_func = hash_func_name,
@@ -151,6 +154,7 @@ TextReuseCorpus <- function(paths, dir = NULL, text = NULL, meta = list(),
                                  keep_tokens = keep_tokens,
                                  keep_text = keep_text,
                                  skip_short = skip_short,
+                                 encoding = encoding,
                                  meta = list(tokenizer = tokenizer_name,
                                              hash_func = hash_func_name,
                                              minhash_func = minhash_func_name))
diff --git a/R/TextReuseTextDocument.R b/R/TextReuseTextDocument.R
index 33d931d..fc96bcb 100644
--- a/R/TextReuseTextDocument.R
+++ b/R/TextReuseTextDocument.R
@@ -24,7 +24,8 @@
 #' @param keep_text Should the text be saved in the document that is returned or
 #'   discarded?
 #' @param skip_short Should short documents be skipped? (See details.)
-#'
+#' @param encoding Encoding to be used when reading files.
+#'
 #' @details This constructor function follows a three-step process. It reads in
 #'   the text, either from a file or from memory. It then tokenizes that text.
 #'   Then it hashes the tokens. Most of the comparison functions in this package
@@ -67,14 +68,15 @@ TextReuseTextDocument <- function(text, file = NULL, meta = list(),
                                   minhash_func = NULL,
                                   keep_tokens = FALSE,
                                   keep_text = TRUE,
-                                  skip_short = TRUE) {
+                                  skip_short = TRUE,
+                                  encoding = "unknown") {
 
   if (!missing(text)) assert_that(has_id(meta))
 
   if (!is.null(file)) {
     assert_that(missing(text),
                 is.readable(file))
-    text <- as_string(readLines(file))
+    text <- as_string(readLines(file, encoding = encoding))
   }
 
   assert_that(is.character(text))
diff --git a/man/TextReuseCorpus.Rd b/man/TextReuseCorpus.Rd
index 6416753..5ef2e84 100644
--- a/man/TextReuseCorpus.Rd
+++ b/man/TextReuseCorpus.Rd
@@ -6,10 +6,21 @@
 \alias{skipped}
 \title{TextReuseCorpus}
 \usage{
-TextReuseCorpus(paths, dir = NULL, text = NULL, meta = list(),
-  progress = interactive(), tokenizer = tokenize_ngrams, ...,
-  hash_func = hash_string, minhash_func = NULL, keep_tokens = FALSE,
-  keep_text = TRUE, skip_short = TRUE)
+TextReuseCorpus(
+  paths,
+  dir = NULL,
+  text = NULL,
+  meta = list(),
+  progress = interactive(),
+  tokenizer = tokenize_ngrams,
+  ...,
+  hash_func = hash_string,
+  minhash_func = NULL,
+  keep_tokens = FALSE,
+  keep_text = TRUE,
+  skip_short = TRUE,
+  encoding = "unknown"
+)
 
 is.TextReuseCorpus(x)
 
@@ -47,6 +58,8 @@ or discarded?}
 
 \item{skip_short}{Should short documents be skipped? (See details.)}
 
+\item{encoding}{Encoding to be used when reading files.}
+
 \item{x}{An R object to check.}
 }
 \description{
diff --git a/man/TextReuseTextDocument.Rd b/man/TextReuseTextDocument.Rd
index 10c03c6..3ce7f84 100644
--- a/man/TextReuseTextDocument.Rd
+++ b/man/TextReuseTextDocument.Rd
@@ -9,10 +9,19 @@
 \alias{has_minhashes}
 \title{TextReuseTextDocument}
 \usage{
-TextReuseTextDocument(text, file = NULL, meta = list(),
-  tokenizer = tokenize_ngrams, ..., hash_func = hash_string,
-  minhash_func = NULL, keep_tokens = FALSE, keep_text = TRUE,
-  skip_short = TRUE)
+TextReuseTextDocument(
+  text,
+  file = NULL,
+  meta = list(),
+  tokenizer = tokenize_ngrams,
+  ...,
+  hash_func = hash_string,
+  minhash_func = NULL,
+  keep_tokens = FALSE,
+  keep_text = TRUE,
+  skip_short = TRUE,
+  encoding = "unknown"
+)
 
 is.TextReuseTextDocument(x)
 
@@ -56,6 +65,8 @@ discarded?}
 
 \item{skip_short}{Should short documents be skipped? (See details.)}
 
+\item{encoding}{Encoding to be used when reading files.}
+
 \item{x}{An R object to check.}
 }
 \value{

From 5d68a16feba5e09822b0094337fb698c42967800 Mon Sep 17 00:00:00 2001
From: David Fuhry <david@129a-records.de>
Date: Wed, 15 Jan 2020 21:10:23 +0100
Subject: [PATCH 2/2] Revert RoxygenNote version

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fb604ac..ea7aa93 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -33,4 +33,4 @@ Suggests:
     rmarkdown (>= 0.8),
     covr
 LinkingTo: BH, Rcpp, RcppProgress
-RoxygenNote: 7.0.2
+RoxygenNote: 6.0.1