diff --git a/pom-common.xml b/pom-common.xml
index 3e39c2a..63c659f 100755
--- a/pom-common.xml
+++ b/pom-common.xml
@@ -164,17 +164,17 @@
org.apache.tika
tika-parsers
- 1.14
+ 1.15
org.apache.tika
tika-core
- 1.14
+ 1.15
org.apache.pdfbox
pdfbox
- 1.8.1
+ 2.0.6
diff --git a/src/java/edu/stanford/muse/datacache/Blob.java b/src/java/edu/stanford/muse/datacache/Blob.java
index 19150d0..e952541 100755
--- a/src/java/edu/stanford/muse/datacache/Blob.java
+++ b/src/java/edu/stanford/muse/datacache/Blob.java
@@ -19,12 +19,14 @@
import edu.stanford.muse.util.Util;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
@@ -132,9 +134,15 @@ public Pair getContent(BlobStore store)
try {
// skip mp3 files, tika has trouble with it and hangs
if (!Util.nullOrEmpty(this.filename) && !this.filename.toLowerCase().endsWith(".mp3"))
- parser.parse(stream, handler, metadata, context);
-
- String[] names = metadata.names();
+ try {
+ parser.parse(stream, handler, metadata, context);
+ } catch (Exception e) {
+ log.error(e.getMessage(), e);
+ log.error(filename);
+ throw new RuntimeException(e.getMessage(), e);
+ }
+
+ String[] names = metadata.names();
//Arrays.sort(names);
for (String name : names) {
// some metadata tags are problematic and result in large hex strings... ignore them. (caused memory problems with Henry's archive)
diff --git a/src/java/edu/stanford/muse/datacache/BlobStore.java b/src/java/edu/stanford/muse/datacache/BlobStore.java
index 960d22b..44c8daf 100755
--- a/src/java/edu/stanford/muse/datacache/BlobStore.java
+++ b/src/java/edu/stanford/muse/datacache/BlobStore.java
@@ -468,7 +468,7 @@ public void generate_thumbnail(Blob b) throws IOException {
tnFilename = tmp_filename.substring(0, tmp_filename.length() - ".pdf".length()); // strip the ".pdf"
tnFilename += "1.png";
String[] args = new String[]{"-imageType", "png", "-startPage", "1", "-endPage", "1", tmp_filename};
- org.apache.pdfbox.PDFToImage.main(args);
+ org.apache.pdfbox.tools.PDFToImage.main(args);
log.info("Saving PDF thumbnail to " + tnFilename);
filename = filename + ".png"; // make sure the suffix for the thumbnail is named with a .png suffix in the cache
} catch (Throwable e) {
diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index ebff296..e89a7ef 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -17,6 +17,7 @@
import com.sun.mail.imap.IMAPFolder;
import edu.stanford.muse.datacache.Blob;
+import edu.stanford.muse.email.json.ArchiveSaver;
import edu.stanford.muse.index.*;
import edu.stanford.muse.util.EmailUtils;
import edu.stanford.muse.util.JSONUtils;
@@ -24,6 +25,7 @@
import edu.stanford.muse.webapp.HTMLUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.james.mime4j.codec.DecoderUtil;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@@ -472,12 +474,18 @@ private List processMessagePart(int messageNum, Message m, Part p, List<
String content;
String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8");
try {
- // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us
- if (FORCED_ENCODING != null) {
+ if (type.contains("charset=")) {
byte b[] = Util.getBytesFromStream(p.getInputStream());
- content = new String(b, FORCED_ENCODING);
- } else
- content = (String) p.getContent();
+ content = new String(b, type.substring(type.indexOf("charset=") + "charset=".length()));
+ } else {
+ // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us
+ if (FORCED_ENCODING != null) {
+ byte b[] = Util.getBytesFromStream(p.getInputStream());
+ content = new String(b, FORCED_ENCODING);
+ } else {
+ content = (String) p.getContent();
+ }
+ }
} catch (UnsupportedEncodingException uee) {
dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion");
// a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers.
@@ -496,6 +504,7 @@ private List processMessagePart(int messageNum, Message m, Part p, List<
// rfc822 mime type is for embedded mbox format or some such (appears for things like
// forwarded messages). the content appears to be just a multipart.
Object o = p.getContent();
+ System.setProperty("mail.mime.multipart.allowempty", "true");
if (o instanceof Multipart) {
Multipart allParts = (Multipart) o;
if (p.isMimeType("multipart/alternative")) {
@@ -582,6 +591,9 @@ private void handleAttachments(int idx, Message m, Part p, List textList
String filename = null;
try {
filename = p.getFileName();
+ if (filename != null) {
+ filename = DecoderUtil.decodeEncodedWords(filename, null);
+ }
} catch (Exception e) {
// seen this happen with:
// Folders__gmail-sent Message #12185 Expected ';', got "Message"
@@ -1089,7 +1101,17 @@ private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset
}
if (contents == null)
- contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList);
+ try {
+ contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList);
+ } catch (Exception e) {
+ log.error(e.getMessage(), e);
+ try {
+ log.error("MessageId: " + originalMessage.getMessageID());
+ } catch (MessagingException e1) {
+ log.error(e.getMessage(), e);
+ }
+ throw e;
+ }
// if mm is not prefetched, it is the same as original_mm
// will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version
@@ -1272,14 +1294,22 @@ public void run() {
// this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages()
// so we process in batches
//TODO: Ideally, should cap on buffer size rather than on number of messages.
- final int BATCH = 10000;
+ int nMessagesperbathc = 10000;
+ long maxMemory = Runtime.getRuntime().maxMemory();
+ if (maxMemory <= 4294967296L ) { nMessagesperbathc = 100; }
+ else {
+ if (maxMemory<= 8294967296L) { nMessagesperbathc = 1000; }
+ }
+ final int BATCH = nMessagesperbathc; //gradual decrease of batch size due to memory size
int nbatches = nMessages / BATCH;
nMessagesProcessedSuccess = 0;
long st = System.currentTimeMillis();
int b;
for (b = 0; b < nbatches + 1; b++) {
begin_msg_index = b * BATCH + 1;
- end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1;
+ end_msg_index = Math.min((b + 1) * BATCH, nMessages);
+ log.info("begin_msg_index: " + begin_msg_index);
+ log.info("end_msg_index: " + end_msg_index);
log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages);
Message[] messages = openFolderAndGetMessages();
currentStatus = JSONUtils.getStatusJSON("");
@@ -1349,6 +1379,7 @@ public void run() {
}
log.info("Read #" + nMessages + " messages in in " + (System.currentTimeMillis() - st) + "ms");
}
+ new ArchiveSaver(archive.archiveTitle).save(archive);
} catch (Throwable t) {
if (t instanceof OutOfMemoryError)
this.mayHaveRunOutOfMemory = true;
diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
new file mode 100644
index 0000000..9fd3abf
--- /dev/null
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -0,0 +1,107 @@
+package edu.stanford.muse.email.json;
+
+import edu.stanford.muse.index.Archive;
+import edu.stanford.muse.index.Document;
+import edu.stanford.muse.index.EmailDocument;
+import edu.stanford.muse.util.Util;
+
+import javax.mail.Address;
+import javax.mail.internet.InternetAddress;
+import java.io.*;
+import java.util.Base64;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Exports a summary of every message in an {@link Archive} as JSON files under
+ * {@code <user.home>/epadd-data/<base64(archiveName)>/}.
+ * <p>
+ * Two files are written: {@code email-names.json} (through {@link EmailNameAgregator#save(String)})
+ * and {@code archive.json}, a JSON array holding one {@link Email#toJson()} object per message.
+ * <p>
+ * Created by sunchise on 04.06.17.
+ */
+public class ArchiveSaver {
+
+    private final String archiveName;
+
+    /**
+     * @param archiveName archive title; its Base64 encoding is used as the output folder name
+     */
+    public ArchiveSaver(String archiveName) {
+        this.archiveName = archiveName;
+    }
+
+    /**
+     * Writes {@code email-names.json} and {@code archive.json} for all documents of the archive.
+     * Messages whose generated JSON fails {@link Email#check()} are left out of the array.
+     *
+     * @param archive archive whose documents are exported; every document is cast to
+     *                {@link EmailDocument}
+     * @throws RuntimeException wrapping the {@link IOException} if a file cannot be created or written
+     */
+    public void save(Archive archive) {
+        // NOTE(review): the standard Base64 alphabet contains '/', which is a path separator, and
+        // getBytes() uses the platform charset. Kept as is because other code may derive the same
+        // folder name; consider Base64.getUrlEncoder() and an explicit charset.
+        String folderName = new String(Base64.getEncoder().encode(archiveName.getBytes()));
+        String rootPath = System.getProperty("user.home") + File.separator + "epadd-data";
+        ensureDirectory(new File(rootPath));
+        String folderPath = rootPath + File.separator + folderName;
+        ensureDirectory(new File(folderPath));
+        File file = prepareOutputFile(folderPath + File.separator + "archive.json");
+
+        List<Document> allDocs = archive.getAllDocs();
+        EmailNameAgregator emailNameAgregator = new EmailNameAgregator(allDocs);
+        emailNameAgregator.save(folderPath + File.separator + "email-names.json");
+
+        try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) {
+            append(stream, "[");
+            // The separator is written only in front of an element that is really emitted, so a
+            // rejected message can never leave a dangling comma (e.g. "[{...},]").
+            boolean wroteAny = false;
+            for (Document doc : allDocs) {
+                Email email = toEmail((EmailDocument) doc, emailNameAgregator);
+                if (!email.check()) {
+                    continue;
+                }
+                if (wroteAny) {
+                    append(stream, ",");
+                }
+                append(stream, email.toJson());
+                wroteAny = true;
+            }
+            append(stream, "]");
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+    }
+
+    /** Creates the directory if it does not exist yet; a failure surfaces later when the file is created. */
+    private static void ensureDirectory(File folder) {
+        if (!folder.exists()) {
+            folder.mkdir();
+        }
+    }
+
+    /** Replaces any existing file at {@code fileName} with a new empty one and returns it. */
+    private static File prepareOutputFile(String fileName) {
+        File file = new File(fileName);
+        if (file.exists()) {
+            file.delete();
+        }
+        try {
+            file.createNewFile();
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+        return file;
+    }
+
+    /**
+     * Converts one document into its JSON model. The sender name is looked up in {@code names}
+     * only when the document has a non-empty {@code from} array; the sent flag is always
+     * {@code true}. Addresses are cast to {@link InternetAddress}.
+     */
+    private static Email toEmail(EmailDocument emailDocument, EmailNameAgregator names) {
+        String messageID = Util.hash(emailDocument.getSignature());
+        boolean hasFrom = emailDocument.from != null && emailDocument.from.length != 0;
+        String fromName = hasFrom ? names.getName(emailDocument.getFromEmailAddress()) : null;
+        Email email = new Email(messageID,
+                emailDocument.date,
+                true,
+                emailDocument.getSubject(),
+                fromName,
+                emailDocument.getFromEmailAddress());
+        if (emailDocument.cc != null) {
+            for (Address address : emailDocument.cc) {
+                InternetAddress internetAddress = (InternetAddress) address;
+                email.addCc(names.getName(internetAddress.getAddress()), internetAddress.getAddress());
+            }
+        }
+        // The JSON model has no bcc field, so bcc recipients are exported together with cc.
+        if (emailDocument.bcc != null) {
+            for (Address address : emailDocument.bcc) {
+                InternetAddress internetAddress = (InternetAddress) address;
+                email.addCc(names.getName(internetAddress.getAddress()), internetAddress.getAddress());
+            }
+        }
+        // "to" recipients go through addTo (they were previously registered with addCc).
+        if (emailDocument.to != null) {
+            for (Address address : emailDocument.to) {
+                InternetAddress internetAddress = (InternetAddress) address;
+                email.addTo(names.getName(internetAddress.getAddress()), internetAddress.getAddress());
+            }
+        }
+        return email;
+    }
+
+    /** Appends the trimmed string to the writer. */
+    private void append(Writer stream, String string) throws IOException {
+        string = string.trim();
+        stream.append(string);
+    }
+
+}
diff --git a/src/java/edu/stanford/muse/email/json/Email.java b/src/java/edu/stanford/muse/email/json/Email.java
new file mode 100644
index 0000000..414845c
--- /dev/null
+++ b/src/java/edu/stanford/muse/email/json/Email.java
@@ -0,0 +1,192 @@
+package edu.stanford.muse.email.json;
+
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.mail.Address;
+import javax.mail.internet.InternetAddress;
+import java.io.*;
+import java.util.Calendar;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashSet;
+/*
+{
+ "emailId": 3,
+ "dateField": "1496222800",
+ "isSent": true,
+ "toField": [
+ [
+ "Александр Игоревич",
+ "Александр Игоревич"
+ ]
+ ],
+ "ccField": [
+ [
+ "ccPlaceholder",
+ "ccPlaceholder"
+ ]
+ ],
+ "fromField": [
+ "WWF России",
+ "WWF России"
+ ],
+ "subject": "Барс по имени Крюк"
+ }
+ */
+
+
+/**
+ * One e-mail message reduced to the fields that are exported as JSON: id, date, sent flag,
+ * sender, "to" and "cc" recipients, and subject.
+ * <p>
+ * {@link #toJson()} assembles the JSON text by hand and caches it; adding a recipient drops
+ * the cache.
+ */
+public class Email {
+    private static final Logger log = LoggerFactory.getLogger(Email.class);
+
+    private final String id;
+
+    private final Date date;
+
+    private final boolean isSent;
+
+    private final Collection<EmailAddress> to = new HashSet<>();
+
+    private final Collection<EmailAddress> cc = new HashSet<>();
+
+    private final EmailAddress from;
+
+    private final String subject;
+
+    /** Cached result of {@link #toJson()}; reset to {@code null} whenever a recipient is added. */
+    private String toJson;
+
+    /**
+     * @param id      message identifier, written as a JSON string
+     * @param date    message date; {@code null} or anything before 11 September 1999 is replaced
+     *                by that floor date
+     * @param isSent  value of the {@code isSent} field
+     * @param from    sender, or {@code null} to emit a placeholder sender
+     * @param subject subject line, may be {@code null}
+     */
+    public Email(String id, Date date, boolean isSent, EmailAddress from, String subject) {
+        this.id = id;
+        // Only year, month and day are set, so the floor keeps the current time of day.
+        Calendar calendar = Calendar.getInstance();
+        calendar.set(Calendar.YEAR, 1999);
+        calendar.set(Calendar.MONTH, Calendar.SEPTEMBER);
+        calendar.set(Calendar.DAY_OF_MONTH, 11);
+        Date minDate = calendar.getTime();
+        if (date == null || minDate.compareTo(date) > 0) {
+            date = minDate;
+        }
+        this.date = date;
+        this.isSent = isSent;
+        this.from = from;
+        this.subject = subject;
+    }
+
+
+    /** Same as the {@code String}-id constructor, with the id given as an {@code int}. */
+    public Email(int id, Date date, boolean isSent, String subject, String fromName, String fromEmail) {
+        this(String.valueOf(id), date, isSent, subject, fromName, fromEmail);
+    }
+
+    /** Builds the sender from {@code fromName}/{@code fromEmail} and delegates to the main constructor. */
+    public Email(String id, Date date, boolean isSent, String subject, String fromName, String fromEmail) {
+        this(id, date, isSent, new EmailAddress(fromName, fromEmail), subject);
+    }
+
+    /** Adds a "to" recipient and invalidates the cached JSON. */
+    public void addTo(EmailAddress emailAddress) {
+        toJson = null;
+        to.add(emailAddress);
+    }
+
+    /** Adds a "to" recipient given as name and address. */
+    public void addTo(String name, String email) {
+        toJson = null;
+        addTo(new EmailAddress(name, email));
+    }
+
+    /** Adds a "cc" recipient and invalidates the cached JSON. */
+    public void addCc(EmailAddress emailAddress) {
+        toJson = null;
+        // Goes into the cc set; it used to be added to the "to" set by mistake.
+        cc.add(emailAddress);
+    }
+
+    /** Adds a "cc" recipient given as name and address. */
+    public void addCc(String name, String email) {
+        toJson = null;
+        addCc(new EmailAddress(name, email));
+    }
+
+    /**
+     * Returns the JSON object text for this message, building and caching it on first use.
+     * An empty cc set and a {@code null} sender are written as placeholder addresses; a missing
+     * subject is written as {@code "Without subject"}.
+     */
+    public String toJson() {
+        if (toJson == null) {
+            StringBuilder stream = new StringBuilder();
+            stream.append("{");
+            stream.append("\"emailId\": \"").append(id).append("\",");
+            stream.append("\"dateField\": ").append(date.getTime() / 1000).append(",");
+            stream.append("\"isSent\": ").append(isSent).append(",");
+            stream.append("\"toField\": [");
+            stream.append(joinAddresses(to));
+            stream.append("],");
+            stream.append("\"ccField\": [");
+            if (cc.isEmpty()) {
+                stream.append(new EmailAddress("ccPlaceholder", "ccPlaceholder").toJson());
+            } else {
+                stream.append(joinAddresses(cc));
+            }
+            stream.append("],");
+            stream.append("\"fromField\": ");
+            if (from == null) {
+                stream.append(new EmailAddress("fromPlaceholder", "fromPlaceholder").toJson());
+            } else {
+                stream.append(from.toJson());
+            }
+            stream.append(",");
+            stream.append("\"subject\": \"");
+            String formatedSubject = subject == null ? "Without subject" : String.valueOf(subject).replaceAll("\"", "'").replace("Subject: ", "").trim();
+            if ("null".equals(formatedSubject)) {
+                formatedSubject = "Without subject";
+            }
+            append(stream, formatedSubject).append("\"");
+            stream.append("}");
+            toJson = stream.toString();
+        }
+        return toJson;
+    }
+
+    /**
+     * Parses the generated JSON with {@link JSONObject} to verify it is well formed.
+     *
+     * @return {@code true} if parsing succeeds; {@code false} (after logging) otherwise
+     */
+    public boolean check() {
+        try {
+            new JSONObject(toJson());
+        } catch (JSONException e) {
+            log.error("Not right format of json\n\n" + toJson + "\n\n" + e.getMessage());
+            return false;
+        }
+        return true;
+    }
+
+    /** Comma-joins the JSON form of every address; empty string for an empty collection. */
+    private static String joinAddresses(Collection<EmailAddress> addresses) {
+        StringBuilder joined = new StringBuilder();
+        for (EmailAddress address : addresses) {
+            if (joined.length() > 0) {
+                joined.append(",");
+            }
+            joined.append(address.toJson());
+        }
+        return joined.toString();
+    }
+
+    /** Display name plus address, serialized as the two-element JSON array {@code ["name","email"]}. */
+    public static class EmailAddress {
+        private final String name;
+        private final String email;
+
+        public EmailAddress(String name, String email) {
+            this.name = name;
+            this.email = email;
+        }
+
+        /** Name with double quotes replaced by single quotes; falls back to {@link #getEmail()} when there is no name. */
+        public String getName() {
+            return name == null ? getEmail() : name.replaceAll("\"", "'");
+        }
+
+        /** Address with double quotes replaced by single quotes; empty string when there is no address. */
+        public String getEmail() {
+            return email == null ? "" : email.replaceAll("\"", "'");
+        }
+
+        public String toJson() {
+            StringBuilder stream = new StringBuilder();
+            append(stream, "[");
+            append(stream, "\"" + getName() + "\"");
+            append(stream, ",");
+            append(stream, "\"" + getEmail() + "\"");
+            append(stream, "]");
+            return stream.toString();
+        }
+
+        /** Value equality on name and address, so the recipient sets hold each address once. */
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (!(o instanceof EmailAddress)) {
+                return false;
+            }
+            EmailAddress other = (EmailAddress) o;
+            boolean sameName = name == null ? other.name == null : name.equals(other.name);
+            boolean sameEmail = email == null ? other.email == null : email.equals(other.email);
+            return sameName && sameEmail;
+        }
+
+        @Override
+        public int hashCode() {
+            int result = name == null ? 0 : name.hashCode();
+            return 31 * result + (email == null ? 0 : email.hashCode());
+        }
+    }
+
+    /**
+     * Sanitizes {@code string} and appends it to {@code stream}: whitespace is collapsed to single
+     * spaces, backslashes are doubled, spaces next to a double quote are removed, and every
+     * character outside the allowed set (word characters, Cyrillic letters, common punctuation)
+     * is dropped.
+     */
+    private static StringBuilder append(StringBuilder stream, String string) {
+        string = string.replaceAll("\\s", " ");
+        string = string.replaceAll("\\n", " ");
+        string = string.replaceAll("\\\\", "\\\\\\\\");
+        string = string.replaceAll("\\r", " ");
+        string = string.replaceAll(" {2,}", " ");
+        string = string.replaceAll("\" ", "\"");
+        string = string.replaceAll(" \"", "\"");
+        string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", "");
+        string = string.trim();
+        stream.append(string);
+        return stream;
+    }
+}
diff --git a/src/java/edu/stanford/muse/email/json/EmailInfo.java b/src/java/edu/stanford/muse/email/json/EmailInfo.java
new file mode 100644
index 0000000..5af3927
--- /dev/null
+++ b/src/java/edu/stanford/muse/email/json/EmailInfo.java
@@ -0,0 +1,56 @@
+package edu.stanford.muse.email.json;
+
+import java.io.Serializable;
+
+/**
+ * Immutable, serializable holder for the fields of one exported e-mail record.
+ * The array arguments are stored and returned as given (no copies are made).
+ * <p>
+ * Created by sunchise on 03.06.17.
+ */
+public class EmailInfo implements Serializable {
+
+    private final int emailId;
+    private final String dateField;
+    private final boolean isSent;
+    private final String[][] toField;
+    private final String[][] ccField;
+    private final Object[] fromField;
+    private final String subject;
+
+    /**
+     * Stores every argument in the field of the same name.
+     */
+    public EmailInfo(int emailId, String dateField, boolean isSent, String[][] toField, String[][] ccField, Object[] fromField, String subject) {
+        this.subject = subject;
+        this.fromField = fromField;
+        this.ccField = ccField;
+        this.toField = toField;
+        this.isSent = isSent;
+        this.dateField = dateField;
+        this.emailId = emailId;
+    }
+
+    /** @return the record id */
+    public int getEmailId() {
+        return this.emailId;
+    }
+
+    /** @return the date as the string passed to the constructor */
+    public String getDateField() {
+        return this.dateField;
+    }
+
+    /** @return the sent flag */
+    public boolean isSent() {
+        return this.isSent;
+    }
+
+    /** @return the "to" recipients array */
+    public String[][] getToField() {
+        return this.toField;
+    }
+
+    /** @return the "cc" recipients array */
+    public String[][] getCcField() {
+        return this.ccField;
+    }
+
+    /** @return the sender array */
+    public Object[] getFromField() {
+        return this.fromField;
+    }
+
+    /** @return the subject */
+    public String getSubject() {
+        return this.subject;
+    }
+}
diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
new file mode 100644
index 0000000..1090ec9
--- /dev/null
+++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
@@ -0,0 +1,138 @@
+package edu.stanford.muse.email.json;
+
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
+import edu.stanford.muse.index.Document;
+import edu.stanford.muse.index.EmailDocument;
+import org.json.JSONObject;
+
+import javax.mail.Address;
+import javax.mail.internet.InternetAddress;
+import java.io.*;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class EmailNameAgregator {
+ private List allDocs;
+ final Map emailNameMap = new HashMap<>();
+
+ /**
+  * Builds the address-to-name map by scanning the recipients of the given documents.
+  * Equivalent to the two-argument constructor with no file name.
+  */
+ public EmailNameAgregator(List<Document> allDocs) {
+     this(allDocs, null);
+ }
+
+
+ public EmailNameAgregator(List allDocs, String fileName) {
+ this.allDocs = allDocs;
+ if (fileName == null) {
+ init();
+ } else {
+ File file = new File(fileName);
+ if (file.exists()) {
+ load(fileName);
+ } else {
+ init();
+ }
+ }
+ }
+
+ private void init() {
+ emailNameMap.clear();
+ allDocs.forEach(document -> {
+ EmailDocument emailDocument = (EmailDocument) document;
+ if (emailDocument.to != null) {
+ for (Address address : emailDocument.to) {
+ appendToEmailNameMap(emailNameMap, (InternetAddress) address);
+ }
+ }
+ if (emailDocument.cc != null) {
+ for (Address address : emailDocument.cc) {
+ appendToEmailNameMap(emailNameMap, (InternetAddress) address);
+ }
+ }
+ if (emailDocument.bcc != null) {
+ for (Address address : emailDocument.bcc) {
+ appendToEmailNameMap(emailNameMap, (InternetAddress) address);
+ }
+ }
+ });
+ }
+
+ /** Returns the name recorded for {@code email}, or {@code null} when the map has no entry for it. */
+ public String getName(String email) {
+     final String recordedName = emailNameMap.get(email);
+     return recordedName;
+ }
+
+
+ private void appendToEmailNameMap(Map emailNameMap, InternetAddress internetAddress) {
+ String email = internetAddress.getAddress();
+ String personal = internetAddress.getPersonal();
+ if (personal == null) {
+ return;
+ }
+ personal = removeWildChars(personal);
+ String name = emailNameMap.get(email);
+ if (name != null) {
+ if (name.length() < personal.length()) {
+ if (personal.contains(" ") || (!name.contains(" "))) {
+ emailNameMap.put(email, personal);
+ }
+ } else if (!name.contains(" ") && personal.contains(" ")) {
+ emailNameMap.put(email, personal);
+ } else if (name.contains(" ") && personal.contains(" ")) {
+ int nameWordsCount = name.split(" ").length;
+ int personalWordsCount = personal.split(" ").length;
+ if (personalWordsCount < 4 && personalWordsCount < nameWordsCount) {
+ emailNameMap.put(email, personal);
+ }
+ }
+ } else {
+ emailNameMap.put(email, personal);
+ }
+ }
+
+ /**
+  * Normalizes a personal name: whitespace becomes single spaces, backslashes are doubled,
+  * spaces next to a double quote are removed, characters outside the allowed set are dropped,
+  * one trailing apostrophe is cut off, and the result is trimmed.
+  */
+ private String removeWildChars(String string) {
+     String cleaned = string
+             .replaceAll("\\s", " ")
+             .replaceAll("\\n", " ")
+             .replaceAll("\\\\", "\\\\\\\\")
+             .replaceAll("\\r", " ")
+             .replaceAll(" {2,}", " ")
+             .replaceAll("\" ", "\"")
+             .replaceAll(" \"", "\"")
+             .replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", "");
+     if (cleaned.endsWith("'")) {
+         cleaned = cleaned.substring(0, cleaned.length() - 1);
+     }
+     return cleaned.trim();
+ }
+
+ /**
+  * Writes the address-to-name map as a JSON object to {@code fileName}, deleting any file
+  * that is already there first.
+  *
+  * @throws RuntimeException wrapping the {@link IOException} if the file cannot be written
+  */
+ public void save(String fileName) {
+     final File target = new File(fileName);
+     if (target.exists()) {
+         target.delete();
+     }
+     final JSONObject namesAsJson = new JSONObject(emailNameMap);
+     try (Writer out = new FileWriter(target)) {
+         namesAsJson.write(out);
+         out.close();
+     } catch (IOException ioe) {
+         throw new RuntimeException(ioe.getMessage(), ioe);
+     }
+ }
+
+ public void load(String fileName) {
+ File file = new File(fileName);
+ if (!file.exists()) {
+ file.delete();
+ }
+ try (FileReader fileReader = new FileReader(file)) {
+ Map tempMap = new Gson().fromJson(fileReader, new TypeToken