From a477d8d3aed29dc28cc50433c92b0b21cf0450f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= <arostov83@gmail.com>
Date: Fri, 5 May 2017 02:33:52 +0300
Subject: [PATCH 01/33] Honor Content-Type charset for text parts; render decoded address names

---
 .../stanford/muse/email/EmailFetcherThread.java  | 16 +++++++++++-----
 .../edu/stanford/muse/webapp/EmailRenderer.java  |  2 +-
 2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index ebff296..e09caab 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -472,12 +472,18 @@ private List<String> processMessagePart(int messageNum, Message m, Part p, List<
             String content;
             String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8");
             try {
-                // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us
-                if (FORCED_ENCODING != null) {
+                String charset = null; // charset declared in Content-Type; when absent we fall back to FORCED_ENCODING, then to p.getContent
+                try { // ContentType parses the header, so quotes, letter case and trailing parameters (e.g. ; name="x.txt") are handled
+                    charset = new javax.mail.internet.ContentType(type).getParameter("charset");
+                } catch (javax.mail.internet.ParseException pe) {
+                    log.debug("Ignoring unparseable Content-Type: " + type);
+                }
+                if (!Util.nullOrEmpty(charset) || FORCED_ENCODING != null) {
                     byte b[] = Util.getBytesFromStream(p.getInputStream());
-                    content = new String(b, FORCED_ENCODING);
-                } else
-                    content = (String) p.getContent();
+                    content = new String(b, Util.nullOrEmpty(charset) ? FORCED_ENCODING : javax.mail.internet.MimeUtility.javaCharset(charset));
+                } else {
+                    content = (String) p.getContent(); // no charset known at all: let JavaMail decode
+                }
             } catch (UnsupportedEncodingException uee) {
                 dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion");
                 // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers.
diff --git a/src/java/edu/stanford/muse/webapp/EmailRenderer.java b/src/java/edu/stanford/muse/webapp/EmailRenderer.java
index 6ffa3b2..f413e3b 100755
--- a/src/java/edu/stanford/muse/webapp/EmailRenderer.java
+++ b/src/java/edu/stanford/muse/webapp/EmailRenderer.java
@@ -168,7 +168,7 @@ public static String formatAddressesAsHTML(Address addrs[], AddressBook addressB
 				InternetAddress ia = (InternetAddress) a;
 				Pair<String, String> p = JSPHelper.getNameAndURL((InternetAddress) a, addressBook);
 				String url = p.getSecond();
-				String str = ia.toString();
+				String str = ia.toUnicodeString(); // decoded personal name plus address, keeping the "Name <addr>" layout and quoting that toString() produced
                 String addr = ia.getAddress();
                 boolean match = false;
                 if(str!=null) {

From dcb6f6f812e8f0dbd4e94c7549ebf4831238a8aa Mon Sep 17 00:00:00 2001
From: Gleb Suvorov <suvorov.gleb@gmail.com>
Date: Fri, 5 May 2017 02:52:47 +0300
Subject: [PATCH 02/33] cyrillic encoding fix

---
 .../muse/email/EmailFetcherThread.java        | 2891 +++++++++--------
 .../stanford/muse/webapp/EmailRenderer.java   | 1072 +++---
 2 files changed, 1984 insertions(+), 1979 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index ebff296..89945fa 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -1,1443 +1,1448 @@
-/*
- * Copyright (C) 2012 The Stanford MobiSocial Laboratory
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- * http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package edu.stanford.muse.email;
-
-import com.sun.mail.imap.IMAPFolder;
-import edu.stanford.muse.datacache.Blob;
-import edu.stanford.muse.index.*;
-import edu.stanford.muse.util.EmailUtils;
-import edu.stanford.muse.util.JSONUtils;
-import edu.stanford.muse.util.Util;
-import edu.stanford.muse.webapp.HTMLUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.json.JSONArray;
-import org.json.JSONException;
-import org.json.JSONObject;
-import org.jsoup.Jsoup;
-
-import javax.activation.DataHandler;
-import javax.activation.DataSource;
-import javax.mail.*;
-import javax.mail.internet.AddressException;
-import javax.mail.internet.InternetAddress;
-import javax.mail.internet.MimeMessage;
-import java.io.*;
-import java.security.GeneralSecurityException;
-import java.util.*;
-
-class EmailFetcherStats implements Cloneable, Serializable {
-    private final static long serialVersionUID = 1L;
-
-    int nTotalMessages;            // total # of messages to process
-    int nMessagesAdded;            // running total messages newly added to the archive
-    int nMessagesAlreadyPresent;    // running messages that were already present
-    int nErrors = 0;
-    int nMessagesFiltered = 0;
-
-    public void merge(EmailFetcherStats other) {
-        this.nMessagesAdded += other.nMessagesAdded;
-        this.nMessagesAlreadyPresent += other.nMessagesAlreadyPresent;
-        this.nMessagesFiltered += other.nMessagesFiltered;
-        this.nErrors += other.nErrors;
-        this.nTotalMessages += other.nTotalMessages;
-    }
-
-    public String toString() {
-        return Util.fieldsToString(this);
-    }
-}
-
-/**
- * Important class for importing email.
- * implements an email fetcher for a range of message #s within a single folder.
- * In contrast, MTEmailFetcher is responsible for an entire email account, including multiple folders.
- * and MuseEmailFetcher is responsible for multiple accounts (but for a single user)
- * email fetcher stats is associated with a single email fetcher
- */
-public class EmailFetcherThread implements Runnable, Serializable {
-    private final static long serialVersionUID = 1L;
-
-    public static final int IMAP_PREFETCH_BUFSIZE = 20 * 1024 * 1024;
-    /* used for buffering imap prefetch data -- necessary for good imap performance*/
-    public static final String FORCED_ENCODING = "UTF-8";
-
-    public static Log log = LogFactory.getLog(EmailFetcherThread.class);
-
-    // set up INVALID_DATE
-    public static Date INVALID_DATE; // like 0xdeadbeef
-
-    static {
-        Calendar c = new GregorianCalendar();
-        c.set(Calendar.YEAR, 1960);
-        c.set(Calendar.DAY_OF_MONTH, 1);
-        c.set(Calendar.MONTH, Calendar.JANUARY);
-        c.set(Calendar.HOUR_OF_DAY, 0);
-        c.set(Calendar.MINUTE, 0);
-        c.set(Calendar.SECOND, 0);
-        c.set(Calendar.MILLISECOND, 0);
-        INVALID_DATE = c.getTime();
-    }
-
-    private FetchConfig fetchConfig;
-    private boolean mayHaveRunOutOfMemory = false;
-    private FolderInfo fetchedFolderInfo;
-    transient Folder folder;
-    boolean use_uid_if_available;
-
-    protected int threadID;
-    protected EmailStore emailStore;
-
-    protected boolean isCancelled;
-
-    public static boolean verbose = false;
-    public static boolean debug = false;
-
-    // notes: begin_msg_index is always correct. end_msg_index = -1  means nMessages in folder.
-    // note: msg # begin_msg_index will be processed. msg # end_msg_index will not be processed.
-    protected int begin_msg_index = 0, end_msg_index = -1;
-
-    EmailFetcherStats stats = new EmailFetcherStats();
-    String currentStatus;
-
-
-    int totalMessagesInFetch, messagesCompletedInFetch;                        // this fetcher may be part of a bigger fetch operation. we need to track the progress of the bigger fetch in order to track progress accurately.
-
-    public int getTotalMessagesInFetch() {
-        return totalMessagesInFetch;
-    }
-
-    public void setTotalMessagesInFetch(int totalMessagesInFetch) {
-        this.totalMessagesInFetch = totalMessagesInFetch;
-    }
-
-    public int getMessagesCompletedInFetch() {
-        return messagesCompletedInFetch;
-    }
-
-    public void setMessagesCompletedInFetch(int messagesCompletedInFetch) {
-        this.messagesCompletedInFetch = messagesCompletedInFetch;
-    }
-
-    // stats
-    int nMessagesProcessedSuccess, nUncachedMessagesProcessed, nMessagesCached; // running count of # of messages processed successfully
-    int nErrors = 0;
-
-    public void cancel() {
-        isCancelled = true;
-    }
-
-    public void setFetchConfig(FetchConfig fc) {
-        this.fetchConfig = fc;
-    }
-
-    public int getThreadID() {
-        return threadID;
-    }
-
-    public void setThreadID(int threadID) {
-        this.threadID = threadID;
-    }
-
-    public int getNMessagesProcessed() {
-        return nMessagesProcessedSuccess;
-    }
-
-    public int getNUncachedMessagesProcessed() {
-        return nUncachedMessagesProcessed;
-    }
-
-    protected String folder_name() {
-        return fetchedFolderInfo.longName;
-    }
-
-    protected String email_source() {
-        return fetchedFolderInfo.accountKey;
-    }
-
-    public boolean mayHaveRunOutOfMemory() {
-        return mayHaveRunOutOfMemory;
-    }
-
-    //	private String folderPrefix; // prefix for folder files
-    transient Store store;                                        // we don't really need this serialized across sessions
-
-    transient Archive archive;
-    Collection<String> dataErrors = new LinkedHashSet<String>();    // log of input data errors
-
-    Date prevDate = null;
-
-	/*
-     * // comment out unused constructors, so it's cleaner/easier to trace the
-	 * setting member fields.
-	 * public EmailFetcherThread() { super(); }
-	 * 
-	 * public EmailFetcherThread(EmailStore store, String folder_name)
-	 * {
-	 * this.emailStore = store;
-	 * this.folder_name = folder_name;
-	 * }
-	 */
-
-    public EmailFetcherThread(EmailStore store, FolderInfo fi, int begin_msg_index, int end_msg_index) {
-        this.emailStore = store;
-        this.fetchedFolderInfo = fi;
-        stats.nTotalMessages = end_msg_index - begin_msg_index;
-        this.begin_msg_index = begin_msg_index;
-        this.end_msg_index = end_msg_index;
-    }
-
-    public void setArchive(Archive a) {
-        archive = a;
-    }
-
-    public Archive getArchive() {
-        return archive;
-    }
-
-    /**
-     * merges results with another email fetcher. does some lightweight work
-     * including updating stats. consider removing this and simplifying in the
-     * future
-     */
-    public void merge(EmailFetcherThread other) {
-        verify();
-        if (other != null) {
-            other.verify();
-
-            // TOFIX: we should eliminate duplicates
-            dataErrors.addAll(other.dataErrors);
-            stats.merge(other.stats);
-
-            nMessagesProcessedSuccess += other.nMessagesProcessedSuccess;
-            nErrors += other.nErrors;
-            mayHaveRunOutOfMemory |= other.mayHaveRunOutOfMemory;
-        }
-        verify();
-    }
-
-    /**
-     * intern a bunch of addrs, to save memory
-     *
-     * @throws UnsupportedEncodingException
-     */
-    private static void internAddressList(Address[] addrs) throws UnsupportedEncodingException {
-        if (addrs == null)
-            return;
-
-        for (Address a : addrs) {
-            if (a instanceof InternetAddress) {
-                InternetAddress ia = (InternetAddress) a;
-                String address = ia.getAddress(), personal = ia.getPersonal();
-                if (address != null)
-                    ia.setAddress(InternTable.intern(address));
-                if (personal != null)
-                    ia.setPersonal(InternTable.intern(personal));
-            }
-        }
-    }
-
-    /**
-     * Key method for importing email: converts a javamail obj. to our own data structure (EmailDocument)
-     */
-    //public EmailDocument convertToEmailDocument(MimeMessage m, int num, String url) throws MessagingException, IOException
-    private EmailDocument convertToEmailDocument(MimeMessage m, String id) throws MessagingException, IOException {
-        // get the date.
-        // prevDate is a hack for the cases where the message is lacking an explicit Date: header. e.g.
-        //		From hangal Sun Jun 10 13:46:46 2001
-        //		To: ewatkins@stanford.edu
-        //		Subject: Re: return value bugs
-        // though the date is on the From separator line, the mbox provider fails to parse it and provide it to us.
-        // so as a hack, we will assign such messages the same date as the previous one this fetcher has seen! ;-)
-        // update: having the exact same date causes the message to be considered a duplicate, so just increment
-        // the timestamp it by 1 millisecond!
-        // a better fix would be to improve the parsing in the provider
-
-        boolean hackyDate = false;
-        Date d = m.getSentDate();
-        if (d == null)
-            d = m.getReceivedDate();
-        if (d == null) {
-            if (prevDate != null) {
-                long newTime = prevDate.getTime() + 1L; // added +1 so that this email is not considered the same object as the prev. one if they are in the same thread
-                d = new Date(newTime);
-                dataErrors.add("No date for message id:" + id + ": " + EmailUtils.formatMessageHeader(m) + " assigned approximate date");
-            } else {
-                d = INVALID_DATE; // wrong, but what can we do... :-(
-                dataErrors.add("No date for message id:" + id + ": " + EmailUtils.formatMessageHeader(m) + " assigned deliberately invalid date");
-            }
-            hackyDate = true;
-        } else {
-            Calendar c = new GregorianCalendar();
-            c.setTime(d);
-            int yy = c.get(Calendar.YEAR);
-            if (yy < 1960 || yy > 2020) {
-                dataErrors.add("Probably bad date: " + Util.formatDate(c) + " message: " + EmailUtils.formatMessageHeader(m));
-                hackyDate = true;
-            }
-        }
-
-        if (hackyDate && prevDate != null) {
-            long newTime = prevDate.getTime() + 1L; // added +1 so that this email is not considered the same object as the prev. one if they are in the same thread
-            d = new Date(newTime);
-            Util.ASSERT(!d.equals(prevDate));
-        }
-
-        Calendar c = new GregorianCalendar();
-        c.setTime(d != null ? d : new Date());
-
-        prevDate = d;
-
-        Address to[] = null, cc[] = null, bcc[] = null;
-        Address[] from = null;
-        try {
-            // 			allrecip = m.getAllRecipients(); // turns out to be too expensive because it looks for newsgroup headers for imap
-            // assemble to, cc, bcc into a list and copy it into allrecip
-            List<Address> list = new ArrayList<Address>();
-            from = m.getFrom();
-            to = m.getRecipients(Message.RecipientType.TO);
-            if (to != null)
-                list.addAll(Arrays.asList(to));
-            cc = m.getRecipients(Message.RecipientType.CC);
-            if (cc != null)
-                list.addAll(Arrays.asList(cc));
-            bcc = m.getRecipients(Message.RecipientType.BCC);
-            if (bcc != null)
-                list.addAll(Arrays.asList(bcc));
-
-            // intern the strings in these addresses to save memory cos they are repeated often in a large archive
-            internAddressList(from);
-            internAddressList(to);
-            internAddressList(cc);
-            internAddressList(bcc);
-        } catch (AddressException ae) {
-            String s = "Bad address in folder " + folder_name() + " message id" + id + " " + ae;
-            dataErrors.add(s);
-        }
-
-        // take a deep breath. This object is going to live longer than most of us.
-        EmailDocument ed = new EmailDocument(id, email_source(), folder_name(), to, cc, bcc, from, m.getSubject(), m.getMessageID(), c.getTime());
-
-        String[] headers = m.getHeader("List-Post");
-        if (headers != null && headers.length > 0) {
-            // trim the headers because they usually look like: "<mailto:prpl-devel@lists.stanford.edu>"
-            ed.sentToMailingLists = new String[headers.length];
-            int i = 0;
-            for (String header : headers) {
-                header = header.trim();
-                header = header.toLowerCase();
-
-                if (header.startsWith("<") && header.endsWith(">"))
-                    header = header.substring(1, header.length() - 1);
-                if (header.startsWith("mailto:") && !"mailto:".equals(header)) // defensive check in case header == "mailto:"
-                    header = header.substring(("mailto:").length());
-                ed.sentToMailingLists[i++] = header;
-            }
-        }
-        if (hackyDate) {
-            String s = "Guessed date " + Util.formatDate(c) + " for message id: " + id + ": " + ed.getHeader();
-            dataErrors.add(s);
-            ed.hackyDate = true;
-        }
-
-        // check if the message has attachments.
-        // if it does and we're not downloading attachments, then we mark the ed as such.
-        // otherwise we had a problem where a message header (and maybe text) was downloaded but without attachments in one run
-        // but in a subsequent run where attachments were needed, we thought the message was already cached and there was no
-        // need to recompute it, leaving the attachments field in this ed incorrect.
-        List<String> attachmentNames = getAttachmentNames(m, m);
-        if (!Util.nullOrEmpty(attachmentNames)) {
-            ed.attachmentsYetToBeDownloaded = true; // will set it to false later if attachments really were downloaded (not sure why)
-            //			log.info ("added " + attachmentNames.size() + " attachments to message: " + ed);
-        }
-        return ed;
-    }
-
-    /*
-     * we try to get the attachment names cheaply, i.e. without having to
-     * process the whole message
-     */
-    private List<String> getAttachmentNames(MimeMessage m, Part p) throws MessagingException, IOException {
-        List<String> result = new ArrayList<String>();
-        try {
-            if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) {
-                if (p.isMimeType("multipart/alternative"))
-                    return result; // ignore alternative's because real attachments don't have alternatives
-                DataHandler dh = p.getDataHandler();
-                DataSource ds = dh.getDataSource();
-                if (ds instanceof MultipartDataSource) {
-                    MultipartDataSource mpds = (MultipartDataSource) ds;
-                    for (int i = 0; i < mpds.getCount(); i++)
-                        result.addAll(getAttachmentNames(m, mpds.getBodyPart(i)));
-                } else {
-                    String name = ds.getName();
-                    if (!Util.nullOrEmpty(name))
-                        result.add(name);
-                }
-            } else {
-                String filename = p.getFileName();
-                if (filename != null)
-                    result.add(filename);
-            }
-        } catch (Exception e) {
-            // sometimes we see javax.mail.MessagingException: Unable to load BODYSTRUCTURE
-            // in this case, just ignore, not much we can do i guess.
-            Util.print_exception(e, log);
-        }
-        return result;
-    }
-
-    //	public void setEmailCache (DocCache cache)
-    //	{
-    //		this.cache = cache;
-    //	}
-
-    /**
-     * this method returns the text content of the message as a list of strings
-     * // each element of the list could be the content of a multipart message
-     * // m is the top level subject
-     * // p is the specific part that we are processing (p could be == m)
-     * also sets up names of attachments (though it will not download the
-     * attachment unless downloadAttachments is true)
-     */
-    private List<String> processMessagePart(int messageNum, Message m, Part p, List<Blob> attachmentsList) throws MessagingException, IOException {
-        List<String> list = new ArrayList<String>(); // return list
-        if (p == null) {
-            dataErrors.add("part is null: " + folder_name() + " idx " + messageNum);
-            return list;
-        }
-
-        if (p == m && p.isMimeType("text/html")) {
-            /*
-            String s = "top level part is html! message:" + m.getSubject() + " " + m.getDescription();
-            dataErrors.add(s);
-            */
-            // we don't normally expect the top-level part to have content-type text/html
-            // but we saw this happen on some sample archives pst -> emailchemy. so allow it and handle it by parsing the html
-            String html = (String) p.getContent();
-            String text = Util.unescapeHTML(html);
-            org.jsoup.nodes.Document doc = Jsoup.parse(text);
-
-            StringBuilder sb = new StringBuilder();
-            HTMLUtils.extractTextFromHTML(doc.body(), sb);
-            list.add(sb.toString());
-            return list;
-        }
-
-        if (p.isMimeType("text/plain")) {
-            //make sure, p is not wrongly labelled as plain text.
-            Enumeration headers = p.getAllHeaders();
-            boolean dirty = false;
-            if (headers != null)
-                while (headers.hasMoreElements()) {
-                    Header h = (Header) headers.nextElement();
-                    String name = h.getName();
-                    String value = h.getValue();
-                    if (name != null && value != null) {
-                        if (name.equals("Content-transfer-encoding") && value.equals("base64")) {
-                            dirty = true;
-                            break;
-                        }
-                    }
-                }
-            String fname = p.getFileName();
-            if (fname != null) {
-                int idx = fname.lastIndexOf('.');
-                if ((idx < fname.length()) && (idx >= 0)) {
-                    String extension = fname.substring(idx);
-                    //anything extension other than .txt is suspicious.
-                    if (!extension.equals(".txt"))
-                        dirty = true;
-                }
-            }
-            if (dirty) {
-                dataErrors.add("Dirty message part, has conflicting message part headers."  + folder_name() + " Message# " + messageNum);
-                return list;
-            }
-
-            log.debug("Message part with content type text/plain");
-            String content;
-            String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8");
-            try {
-                // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us
-                if (FORCED_ENCODING != null) {
-                    byte b[] = Util.getBytesFromStream(p.getInputStream());
-                    content = new String(b, FORCED_ENCODING);
-                } else
-                    content = (String) p.getContent();
-            } catch (UnsupportedEncodingException uee) {
-                dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion");
-                // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers.
-                // we're using the workaround suggested on this page: http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4304013
-                // though it may be better to consider official support for utf-7 or other encodings.
-
-                // TOFIX: I get an exception for utfutf8-encoding which has a base64 encoding embedded on it.
-                // Unsupported encoding: gmail-sent Message #10477 type text/plain; charset=x-utf8utf8; name="newyorker.txt",
-                // the hack below doesn't work for it.
-                ByteArrayOutputStream bao = new ByteArrayOutputStream();
-                p.writeTo(bao);
-                content = bao.toString();
-            }
-            list.add(content);
-        } else if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) {
-            // rfc822 mime type is for embedded mbox format or some such (appears for things like
-            // forwarded messages). the content appears to be just a multipart.
-            Object o = p.getContent();
-            if (o instanceof Multipart) {
-                Multipart allParts = (Multipart) o;
-                if (p.isMimeType("multipart/alternative")) {
-                    // this is an alternative mime type. v common case to have text and html alternatives
-                    // so just process the text part if there is one, and avoid fetching the alternatives.
-                    // useful esp. because many ordinary messages are alternative: text and html and we don't want to fetch the html.
-                    // revisit in future we want to retain the html alternative for display purposes
-                    Part[] parts = new Part[allParts.getCount()];
-                    for (int i = 0; i < parts.length; i++)
-                        parts[i] = allParts.getBodyPart(i);
-
-                    for (int i = 0; i < parts.length; i++) {
-                        Part thisPart = parts[i];
-                        if (thisPart.isMimeType("text/plain")) {
-                            // common case, return quickly
-                            list.add((String) thisPart.getContent());
-                            log.debug("Multipart/alternative with content type text/plain");
-                            return list;
-                        }
-                    }
-
-                    // no text part, let's look for an html part. this happens for html parts.
-                    for (int i = 0; i < allParts.getCount(); i++) {
-                        Part thisPart = parts[i];
-                        if (thisPart.isMimeType("text/html")) {
-                            // common case, return quickly
-                            String html = (String) thisPart.getContent();
-                            String text = Util.unescapeHTML(html);
-                            org.jsoup.nodes.Document doc = Jsoup.parse(text);
-
-                            StringBuilder sb = new StringBuilder();
-                            HTMLUtils.extractTextFromHTML(doc.body(), sb);
-                            list.add(sb.toString());
-
-                            log.debug("Multipart/alternative with content type text/html");
-                            return list;
-                        }
-                    }
-
-                    // no text or html part. hmmm... blindly process the first part only
-                    if (allParts.getCount() >= 1)
-                        list.addAll(processMessagePart(messageNum, m, allParts.getBodyPart(0), attachmentsList));
-                } else {
-                    // process it like a regular multipart
-                    for (int i = 0; i < allParts.getCount(); i++) {
-                        BodyPart bp = allParts.getBodyPart(i);
-                        list.addAll(processMessagePart(messageNum, m, bp, attachmentsList));
-                    }
-                }
-            } else if (o instanceof Part)
-                list.addAll(processMessagePart(messageNum, m, (Part) o, attachmentsList));
-            else
-                dataErrors.add("Unhandled part content, " + folder_name() + " Message #" + messageNum + "Java type: " + o.getClass() + " Content-Type: " + p.getContentType());
-        } else {
-            try {
-                // do attachments only if downloadAttachments is set.
-                // some apps do not need attachments, so this saves some time.
-                // however, it seems like a lot of time is taken in imap prefetch, which gets attachments too?
-                if (fetchConfig.downloadAttachments)
-                    handleAttachments(messageNum, m, p, list, attachmentsList);
-            } catch (Exception e) {
-                dataErrors.add("Ignoring attachment for " + folder_name() + " Message #" + messageNum + ": " + Util.stackTrace(e));
-            }
-        }
-
-        return list;
-    }
-
-    /**
-     * recursively processes attachments, fetching and saving it if needed
-     * parses the given part p, and adds it to hte attachmentsList.
-     * in some cases, like a text/html type without a filename, we instead append it to the textlist
-     * @throws MessagingException
-     */
-    private void handleAttachments(int idx, Message m, Part p, List<String> textList, List<Blob> attachmentsList) throws MessagingException {
-        String ct = null;
-        if (!(m instanceof MimeMessage)) {
-            Exception e = new IllegalArgumentException("Not a MIME message!");
-            e.fillInStackTrace();
-            log.warn(Util.stackTrace(e));
-            return;
-        }
-
-        String filename = null;
-        try {
-            filename = p.getFileName();
-        } catch (Exception e) {
-            // seen this happen with:
-            // Folders__gmail-sent Message #12185 Expected ';', got "Message"
-            // javax.mail.internet.ParseException: Expected ';', got "Message"
-
-            dataErrors.add("Unable to read attachment name: " + folder_name() + " Message# " + idx);
-            return;
-        }
-
-        String sanitizedFName = Util.sanitizeFolderName(emailStore.getAccountID() + "." + folder_name());
-        if (filename == null) {
-            String tempFname = sanitizedFName + "." + idx;
-            dataErrors.add("attachment filename is null for " + sanitizedFName + " Message#" + idx + " assigning it the name: " + tempFname);
-            if (p.isMimeType("text/html")) {
-                try {
-                    log.info("Turning message " + sanitizedFName + " Message#" + idx + " into text although it is an attachment");
-                    String html = (String) p.getContent();
-                    String text = Util.unescapeHTML(html);
-                    org.jsoup.nodes.Document doc = Jsoup.parse(text);
-
-                    StringBuilder sb = new StringBuilder();
-                    HTMLUtils.extractTextFromHTML(doc.body(), sb);
-                    textList.add(sb.toString());
-                    return;
-                } catch (Exception e) {
-                    Util.print_exception("Error reading contents of text/html multipart without a filename!", e, log);
-                    return;
-                }
-            }
-            filename = tempFname;
-        }
-
-        // Replacing any of the disallowed filename characters (\/:*?"<>|&) to _
-        // (note: & causes problems with URLs for serveAttachment etc, so it's also replaced)
-        String newFilename = Util.sanitizeFileName(filename);
-
-        // Updating filename if it's changed after sanitizing.
-        if (!newFilename.equals(filename)) {
-            log.info("Filename changed from " + filename + " to " + newFilename);
-            filename = newFilename;
-        }
-
-        try {
-            ct = p.getContentType();
-            if (filename.indexOf(".") < 0) // no ext in filename... let's fix it if possible
-            {
-                // Using startsWith instead of equals because sometimes the ct has crud beyond the image/jpeg;...crud....
-                // Below are the most common file types, more type can be added if needed
-
-                // Most common APPLICATION TYPE
-                if (ct.startsWith("application/pdf"))
-                    filename = filename + ".pdf";
-                if (ct.startsWith("application/zip"))
-                    filename = filename + ",zip";
-                // Most common IMAGE TYPE
-                if (ct.startsWith("image/jpeg"))
-                    filename = filename + ".jpg";
-                if (ct.startsWith("image/gif"))
-                    filename = filename + ".gif";
-                if (ct.startsWith("image/png"))
-                    filename = filename + ".png";
-                // Most Common VIDEO TYPE
-                if (ct.startsWith("video/x-ms-wmv"))
-                    filename = filename + ".wmv";
-                // Most Common AUDIO TYPE
-                if (ct.startsWith("audio/mpeg"))
-                    filename = filename + ".mp3";
-                if (ct.startsWith("audio/mp4"))
-                    filename = filename + ".mp4";
-                // Most Common TEXT TYPE
-                if (ct.startsWith("text/html"))
-                    filename = filename + ".html";
-                // Windows Office
-                if (ct.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) //Word
-                    filename = filename + ".docx";
-                if (ct.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) //Excel
-                    filename = filename + ".xlsx";
-                if (ct.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) //PowerPoint
-                    filename = filename + ".pptx";
-            }
-            // retain only up to first semi-colon; often ct is something like text/plain; name="filename"' we don't want to log the filename
-            int x = ct.indexOf(";");
-            if (x >= 0)
-                ct = ct.substring(0, x);
-            log.info("Attachment content type: " + ct + " filename = " + Util.blurKeepingExtension(filename));
-        } catch (Exception pex) {
-            dataErrors.add("Can't read CONTENT-TYPE: " + ct + " filename:" + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\n Exception: " + pex + "\n" + Util.stackTrace(pex));
-            return;
-        }
-
-        //	    if (filename == null && !p.isMimeType("text/html") && !p.isMimeType("message/partial")) // expected not to have a filename with mime type text/html
-        //	    	log.warn ("Attachment filename is null: " + Util.stackTrace());
-
-
-        boolean success = true;
-        // the size passed in here is the part size, which is not really the binary blob size.
-        // when we read the stream below in blobStore.add(), we'll set it again to the binary blob size
-        Blob b = new EmailAttachmentBlob(filename, p.getSize(), (MimeMessage) m, p);
-
-        if (fetchConfig.downloadAttachments) {
-            // this containment check is only on the basis of file name and size currently,
-            // not on the actual hash
-            if (archive.getBlobStore().contains(b)) {
-                log.debug("Cache hit! " + b);
-            } else {
-                try {
-                    if (filename.endsWith(".tif"))
-                        log.info("Fetching attachment..." + Util.blurKeepingExtension(filename));
-
-                    // performance critical! use large buffer! currently 256KB
-                    // stream will be closed by callee
-
-                    long start = System.currentTimeMillis();
-                    long nBytes = archive.getBlobStore().add(b, new BufferedInputStream(p.getInputStream(), 256 * 1024));
-                    long end = System.currentTimeMillis();
-                    if (nBytes != -1) {
-                        long diff = end - start;
-                        String s = "attachment size " + nBytes + " bytes, fetched in " + diff + " millis";
-                        if (diff > 0)
-                            s += " (" + (nBytes / diff) + " KB/s)";
-                        log.info(s);
-                    }
-
-                    Util.ASSERT(archive.getBlobStore().contains(b));
-
-                } catch (IOException ioe) {
-                    success = false;
-                    dataErrors.add("WARNING: Unable to fetch attachment: filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\nException: " + ioe);
-                    ioe.printStackTrace(System.out);
-                }
-            }
-
-            if (success) {
-                attachmentsList.add(b);
-
-                /// generate thumbnail only if not already cached
-                try {
-                    archive.getBlobStore().generate_thumbnail(b); // supplement
-                } catch (IOException ioe) {
-                    log.warn("failed to create thumbnail, filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\nException: " + ioe);
-                    ioe.printStackTrace(System.out);
-                }
-            }
-        }
-    }
-
-    @SuppressWarnings("unused")
-    private static String processLastReceived(String header) {
-        header = header.toLowerCase();
-        StringTokenizer st = new StringTokenizer(header, " \t()[]");
-        String x = st.nextToken();
-        if (!x.equals("from")) {
-            log.warn("Warning: unrecognized header: " + header);
-            return null;
-        }
-
-        while (st.hasMoreTokens()) {
-            String s = st.nextToken();
-            if (Character.isDigit(s.charAt(0))) {
-                log.warn("IP address: " + s);
-                return s;
-            }
-        }
-        return null;
-    }
-
-    public void verify() {
-    }
-
-    public void finish() {
-        currentStatus = JSONUtils.getStatusJSON("Verifying email headers...");
-        currentStatus = JSONUtils.getStatusJSON("");
-    }
-
-    /**
-     * prepare a status json with up to N_TEASERS teasers from the most recent
-     * emails, starting backwards from idx. specifically ask for ArrayList as
-     * List.get() can be costly otherwise.
-     */
-    private static String getStatusJSONWithTeasers(String message, int pctComplete, long secsElapsed, long secsRemaining, ArrayList<EmailDocument> emails, int N_TEASERS) {
-        JSONObject json = new JSONObject();
-        try {
-            json.put("pctComplete", pctComplete);
-            json.put("message", message);
-            json.put("secsElapsed", secsElapsed);
-            json.put("secsRemaining", secsRemaining);
-            if (!Util.nullOrEmpty(emails)) {
-                JSONArray arr = new JSONArray();
-                int idx_end = emails.size();
-                int idx_start = idx_end - N_TEASERS;
-                if (idx_start < 0)
-                    idx_start = 0;
-                for (int i = idx_start, j = 0; i < idx_end; i++) {
-                    EmailDocument email = emails.get(i);
-                    if (email != null) {
-                        String subject = email.description;
-                        if (!Util.nullOrEmpty(subject))
-                            arr.put(j++, subject);
-                    }
-                }
-                json.put("teasers", arr);
-            }
-        } catch (JSONException jsone) {
-            try {
-                json.put("error", jsone.toString());
-            } catch (Exception e) {
-                Util.report_exception(e);
-            }
-        }
-        return json.toString();
-    }
-
-    /**
-     * best effort to prefetch messages for messages[startMsgIdx] onwards, up to
-     * the IMAP_PREFETCH_BUFSIZE
-     * return List<String> if bodyTextOnly is true, otherwise List<MimeMessage>
-     */
-    private List<?> do_imap_prefetch(Message[] messages, int startMsgIdx, Folder folder, boolean bodyTextOnly) {
-        // its perfectly ok for correctness for this method to do nothing and return null
-        List<?> prefetchedMessages = null;
-        try {
-
-            if (IMAP_PREFETCH_BUFSIZE > 0 && folder instanceof IMAPFolder) {
-                int prefetch_messages_size = 0;
-
-                int start_message_num = messages[startMsgIdx].getMessageNumber();
-                int end_message_num = start_message_num;
-
-                List<Integer> messageNums = new ArrayList<Integer>();
-
-                // figure out message num range to fetch. if anything is unusual -- bad content type, non-consec. msg nums etc -- break out.
-                // non consec. message numbers are a problem because they cause a very long imap command string, which we found was returning an "invalid command" response.
-                int prev_message_num = -1;
-                for (int msgIdx = startMsgIdx; msgIdx < messages.length; msgIdx++) {
-                    if (bodyTextOnly) {
-                        String contentType = messages[msgIdx].getContentType().toLowerCase();
-                        if (!contentType.startsWith("multipart/") && !contentType.startsWith("text/plain")) {
-                            log.info("Warn: message idx" + msgIdx + " msg#" + messages[msgIdx].getMessageNumber() + " has unexpected content type " + contentType);
-                            break;
-                        }
-                    }
-
-                    // check if sequence is as expected
-                    int next_message_num = messages[msgIdx].getMessageNumber(); // may be better to switch this to uid and prefetcher uses uid fetch
-                    if (next_message_num != prev_message_num + 1 && prev_message_num != -1)
-                        break;
-
-                    // if this message would push prefetch size beyond the buf size, break out, not including this message
-                    if (prefetch_messages_size + messages[msgIdx].getSize() >= IMAP_PREFETCH_BUFSIZE)
-                        break;
-                    prev_message_num = next_message_num;
-                    prefetch_messages_size += messages[msgIdx].getSize();
-                    messageNums.add(next_message_num);
-                }
-
-                if (messageNums.size() == 0)
-                    return null;
-
-                // now we prefetch messages from start_message_num to end_message_num
-                long startMillis = System.currentTimeMillis();
-                log.info("prefetching " + messageNums.size() + " messages");
-                ImapPrefetcher prefetcher = bodyTextOnly ? new TextOnlyImapPrefetcher(((ImapPopEmailStore) emailStore).session, messageNums) : new ImapPrefetcher(((ImapPopEmailStore) emailStore).session, messageNums);
-                prefetchedMessages = (List<?>) ((IMAPFolder) folder).doCommand(prefetcher); // start_message_num, end_message_num));
-                long elapsedMillis = System.currentTimeMillis() - startMillis;
-                long kb_per_sec = prefetch_messages_size / elapsedMillis;
-                log.info("prefetched " + messageNums.size() + " messages in " + Util.blur(folder.getName()) + " [" + start_message_num + ":" + end_message_num + "], " + Util.commatize(prefetch_messages_size / 1024) + "KB in " + Util.commatize(elapsedMillis) + "ms (" + Util.commatize(kb_per_sec) + " KB/sec)");
-            }
-        } catch (Exception e) {
-            Util.print_exception(e, log);
-        }
-        return prefetchedMessages;
-    }
-
-    private void fetchHeaders(Message[] messages) throws MessagingException {
-        // fetch headers (don't do it for mbox folders, waste of time)
-        // this is an essential perf. step so that we fetch the headers in bulk.
-        // otherwise it takes a long time to fetch header info one at a time for each message
-        if (!(emailStore instanceof MboxEmailStore)) {
-            long startTimeMillis = System.currentTimeMillis();
-            currentStatus = JSONUtils.getStatusJSON("Reading headers from " + folder.getName() + "...");
-            FetchProfile fp = new FetchProfile();
-            fp.add(FetchProfile.Item.ENVELOPE);
-            fp.add(FetchProfile.Item.CONTENT_INFO);
-            fp.add(UIDFolder.FetchProfileItem.UID); // important, otherwise reading UIDs takes a long time later
-            fp.add("List-Post");
-            folder.fetch(messages, fp);
-            long endTimeMillis = System.currentTimeMillis();
-            log.info("Done fetching headers: " + Util.commatize(endTimeMillis - startTimeMillis) + "ms");
-        }
-    }
-
-    private void fetchHeaders(int nMessages) throws MessagingException {
-        // fetch headers (don't do it for mbox folders, waste of time)
-        // this is an essential perf. step so that we fetch the headers in bulk.
-        // otherwise it takes a long time to fetch header info one at a time for each message
-        if (!(emailStore instanceof MboxEmailStore)) {
-            long startTimeMillis = System.currentTimeMillis();
-            currentStatus = JSONUtils.getStatusJSON("Reading headers from " + folder.getName() + "...");
-            FetchProfile fp = new FetchProfile();
-            fp.add(FetchProfile.Item.ENVELOPE);
-            fp.add(FetchProfile.Item.CONTENT_INFO);
-            fp.add(UIDFolder.FetchProfileItem.UID); // important, otherwise reading UIDs takes a long time later
-            fp.add("List-Post");
-            for (int i = 0; i < nMessages; i++) {
-                Message[] messages = new Message[]{folder.getMessage(i)};
-                folder.fetch(messages, fp);
-            }
-            long endTimeMillis = System.currentTimeMillis();
-            log.info("Done fetching headers: " + Util.commatize(endTimeMillis - startTimeMillis) + "ms");
-        }
-    }
-
-    private Message[] removeMessagesAlreadyInArchive(Archive archive, Message[] messages) {
-        // early out for the common case that we have an empty archive
-        if (archive.getAllDocs().size() == 0)
-            return messages;
-
-        List<Message> resultList = new ArrayList<Message>();
-        for (int i = 0; i < messages.length; i++) {
-            //int idx = messages[i].getMessageNumber();
-            Message m = messages[i];
-            MimeMessage mm = (MimeMessage) m;
-            try {
-                EmailDocument ed = convertToEmailDocument(mm, "dummy"); // id doesn't really matter here
-                if (archive.containsDoc(ed)) {
-                    stats.nMessagesAlreadyPresent++;
-                    dataErrors.add("Duplicate message: " + ed); // note: report.jsp depends on this exact string
-                    continue;
-                }
-            } catch (Exception e) {
-                Util.print_exception(e, log);
-            }
-            resultList.add(mm);
-            messages[i] = null; // no harm explicitly nulling out messages
-        }
-        Message[] resultArray = resultList.toArray(new Message[0]);
-        return resultArray;
-    }
-
-    /**
-     * Make few post checks on the content and returns true if the message looks
-     * ok
-     */
-    private boolean messageLooksOk(String content) {
-        if (content == null)
-            //let others handle it.
-            return true;
-        String[] lines = content.split("\n");
-        int badlines = 0;
-        if (lines.length > 50)
-            for (String line : lines) {
-                if (!line.contains(" "))
-                    badlines++;
-                else
-                    badlines = 0;
-                if (badlines > 50)
-                    return false;
-            }
-        return true;
-    }
-
-    //keep track of the total time elapsed in fetching messages across batches
-    static long fetchStartTime = System.currentTimeMillis();
-
-    /**
-     * fetch given message idx's in given folder -- @performance critical
-     *
-     * @param offset - the original offset of the first message in the messages array, important to initialize
-     *               for proper assignment of unique id or doc Id
-     */
-    //private void fetchUncachedMessages(String sanitizedFName, Folder folder, DocCache cache, List<Integer> msgIdxs) throws MessagingException, FileNotFoundException, IOException, GeneralSecurityException {
-    private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset, int totalMessages) throws MessagingException, IOException, GeneralSecurityException {
-        //mark the processing of new batch
-        if (offset == 0)
-            fetchStartTime = System.currentTimeMillis();
-
-        currentStatus = JSONUtils.getStatusJSON((emailStore instanceof MboxEmailStore) ? "Parsing " + folder.getName() + " (can take a while)..." : "Reading " + folder.getName() + "...");
-
-        // bulk fetch of all message headers
-        int n = messages.length;
-
-        // eliminate any messages the archive already has
-        messages = removeMessagesAlreadyInArchive(archive, messages);
-
-        log.info(n - messages.length + " message(s) already in the archive");
-
-        ArrayList<EmailDocument> emails = new ArrayList<EmailDocument>();
-
-        // for performance, we need to do bulk prefetches, instead of fetching 1 message at a time
-        // prefetchedMessages will be a temp cache of prefetched messages
-        int first_i_prefetched = -1, last_i_prefetched = -1;
-        List<?> prefetchedMessages = null; // the type of this can be either list<string> if text only, otherwise list<mimemmessage>
-
-        long highestUID = archive.getLastUIDForFolder(fetchedFolderInfo.accountKey, fetchedFolderInfo.longName);
-        long lastAssignedUID = highestUID;
-        boolean bodyTextOnly = !fetchConfig.downloadAttachments;
-        try {
-            archive.openForWrite();
-            for (int i = 0; i < messages.length; i++) {
-                // critical step: (thanks, yourkit!)
-                // null out the ref to the previous message, otherwise it stays in memory, and the heap effectively needs to be as big as the size of all messages
-                if (i > 0)
-                    messages[i - 1] = null;
-
-                if (isCancelled)
-                    break;
-
-                Message m = messages[i];
-                MimeMessage mm = (MimeMessage) m;
-
-                if (i >= last_i_prefetched) {
-                    // critical perf. step: do a bulk imap prefetch
-                    // the prefetch will fetch as many messages as possible up to a max buffer size, and return the messages prefetched
-                    // last_i_prefetched tracks what is the last index into idxs that we have prefetched.
-                    // when we run out of prefetched messages, we do another bulk prefetch
-
-                    prefetchedMessages = do_imap_prefetch(messages, i, folder, bodyTextOnly);
-                    if (prefetchedMessages != null) {
-                        first_i_prefetched = i;
-                        last_i_prefetched = i + prefetchedMessages.size();
-                    }
-                }
-
-                int pctDone = ((i + offset) * 100) / totalMessages;
-                long elapsedMillis = System.currentTimeMillis() - fetchStartTime;
-                long unprocessedSecs = Util.getUnprocessedMessage(i + offset, totalMessages, elapsedMillis);
-                int N_TEASERS = 50; // 50 ok here, because it takes a long time to fetch and process messages, so teaser computation is relatively not expensive
-                int nTriesForThisMessage = 0;
-                currentStatus = getStatusJSONWithTeasers("Reading " + Util.commatize(totalMessages) + " messages from " + folder.getName() + "...", pctDone, elapsedMillis / 1000, unprocessedSecs, emails, N_TEASERS);
-
-                int messageNum = mm.getMessageNumber();
-
-                try {
-                    long unique_id;
-
-                    // if we have uid, that's even better
-                    // don't use uid's for mbox, it has a bug and always gives -1
-                    // see http://james.apache.org/server/rfclist/imap4/rfc2060.txt for uid spec
-                    if (folder instanceof UIDFolder && !(emailStore instanceof MboxEmailStore)) {
-                        long uid = ((UIDFolder) folder).getUID(m);
-                        unique_id = uid;
-                    } else
-                        unique_id = lastAssignedUID + 1 + i + offset; // +1 since i starts from 0 (but lastAssignedUID can be -1 -- is that safe? -sgh)
-
-                    if (unique_id > highestUID)
-                        highestUID = unique_id;
-
-                    String unique_id_as_string = Long.toString(unique_id);
-
-                    // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive
-                    // not a serious perf. concern now, but revisit if needed
-                    EmailDocument ed = convertToEmailDocument(mm, unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc.
-                    // need to check this again, because there might be duplicates such within the set we are currently processing.
-                    if (archive.containsDoc(ed)) {
-                        stats.nMessagesAlreadyPresent++;
-                        dataErrors.add("Duplicate message: " + ed); // note: report.jsp depends on this specific string
-                        continue;
-                    }
-
-                    MimeMessage originalMessage = mm; // this is the mm that has all the headers etc.
-                    List<Blob> attachmentsList = new ArrayList<Blob>();
-
-                    // if we already have it prefetched, use the prefetched version
-                    List<String> contents = null;
-
-                    if (first_i_prefetched >= 0 && prefetchedMessages != null) {
-                        if (!fetchConfig.downloadAttachments) {
-                            // text only means the prefetchedMessages are stored directly as a list of strings
-                            String content = (String) prefetchedMessages.get(i - first_i_prefetched); // note: this_mm only has the prefetched content, but not the headers
-                            contents = new ArrayList<String>();
-
-                            try {
-                                // a special for yahoo which routinely uses quoted-printable. content looks like  =0A0D.... = etc.
-                                if (mm.isMimeType("multipart/alternative")) {
-                                    Multipart mm_mp = (Multipart) mm.getContent();
-                                    Part p0 = mm_mp.getBodyPart(0);
-                                    if (p0 instanceof com.sun.mail.imap.IMAPBodyPart) {
-                                        String encoding = ((com.sun.mail.imap.IMAPBodyPart) p0).getEncoding();
-                                        if ("quoted-printable".equals(encoding)) {
-                                            content = new String(Util.getBytesFromStream(javax.mail.internet.MimeUtility.decode(new java.io.ByteArrayInputStream(content.getBytes()), "quoted-printable")));
-                                        }
-                                    }
-                                }
-                            } catch (Exception e) {
-                                Util.print_exception("Error trying to parse encoding of multipart", e, log);
-                            }
-
-                            contents.add(content);
-                        } else {
-                            // subtle issue here: the contentType of the prefetchedMessage needs to be be set to the original_mm's content-type.
-                            // this was found for cases where the original message is multipart-alternative with a text and html part.
-                            // if we don't set prefetchedMessage's content type, it gets a mime type of text/plain and a body = the entire multipart including both parts.
-                            // found on sgh's sent mail w/subject: "text to add in help" from  Fri, 7 Jun 2013
-                            MimeMessage prefetchedMessage = (MimeMessage) prefetchedMessages.get(i - first_i_prefetched);
-                            String contentTypeHeaders[] = originalMessage.getHeader("Content-Type");
-                            String contentTypeHeader = null;
-                            if (contentTypeHeaders != null && contentTypeHeaders.length == 1)
-                                contentTypeHeader = contentTypeHeaders[0];
-
-                            if (!Util.nullOrEmpty(contentTypeHeader)) // we do care about body structure, hang on to it
-                                prefetchedMessage.setHeader("Content-Type", contentTypeHeader);
-                            mm = prefetchedMessage;
-                        }
-                        prefetchedMessages.set(i - first_i_prefetched, null); // null out to save memory
-                    }
-
-                    if (contents == null)
-                        contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList);
-
-                    // if mm is not prefetched, it is the same as original_mm
-                    // will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version
-                    // even when prefetched, the processMessagePart is somewhat expensive because the attachments have to be extracted etc.
-
-                    // we could overlap processMessagePart with do_imap_prefetch by prefetching in a separate thread, since prefetch is network limited.
-                    // but profiling shows processMessagePart takes only 1/4th the time of do_imap_prefetch so overlapping would be a relatively small gain.
-                    // not worth the effort right now.
-                    ed.attachments = attachmentsList;
-                    if (fetchConfig.downloadAttachments)
-                        ed.attachmentsYetToBeDownloaded = false; // we've already downloaded our attachments
-
-                    // concat all the contents parts
-                    StringBuilder sb = new StringBuilder();
-                    for (String s : contents) {
-                        sb.append(s);
-                        sb.append("\n");
-                    }
-
-                    String contentStr = sb.toString();
-                    if (!messageLooksOk(contentStr)) {
-                        dataErrors.add("Skipping message as it seems to have very long words: " + ed);
-                        continue;
-                    }
-                    contentStr = IndexUtils.normalizeNewlines(contentStr); // just get rid of \r's
-
-                    archive.addDoc(ed, contentStr);
-
-                    List<LinkInfo> linkList = new ArrayList<LinkInfo>();
-                    // linkList might be used only for slant
-                    IndexUtils.populateDocLinks(ed, contentStr, linkList, true);
-                    ed.links = linkList;
-                    stats.nMessagesAdded++;
-                } catch (Exception ex) {
-                    // sometimes we get unexpected folder closed, so try again
-                    boolean retry = false;
-                    if (ex instanceof javax.mail.FolderClosedException) {
-                        log.warn("Oops, thread " + threadID + " got the folder closed in its face! " + ex.getMessage());
-
-                        // sometimes we get this exception about folder closed
-                        // retry up to 3 times, then give up
-                        if (nTriesForThisMessage < 3) {
-                            retry = true;
-                            log.info("Re-opening email store; attempt #" + (nTriesForThisMessage + 1) + " for message " + i);
-                            nTriesForThisMessage++;
-                            messages = openFolderAndGetMessages();
-                            fetchHeaders(messages);
-                            --i; // adjust the message index n try again
-                        }
-                    }
-
-                    if (!retry) {
-                        // we sometimes see UnsupportedEncodingException with x-utf8utf8 mime type and ParseException
-                        // nothing much can be done, just create a dummy doc and add it to the cache
-                        nErrors++;
-                        stats.nErrors++;
-                        EmailDocument ed = new EmailDocument(Integer.toString(messageNum));
-                        log.warn("Exception reading message from " + folder_name() + " Message #" + messageNum + " " + ex.getMessage() + "\n" + Util.stackTrace(ex));
-
-                        ed.setErrorString(Util.stackTrace(ex));
-                    }
-                }
-            }
-        } catch (Throwable t) {
-            Util.print_exception(t, log);
-        } finally {
-            //				if (cancelled && false) // TODO: disable for now as currently only indexes are rolled back and allDocs/blobs are not rolled back in sync yet
-            //					archive.rollbackIndexWrites();
-            //				else
-            currentStatus = JSONUtils.getStatusJSON("Saving archive...");
-            archive.close();
-        }
-
-        fetchedFolderInfo.lastSeenUID = highestUID;
-        log.info("at end of fetch, folder info is " + fetchedFolderInfo);
-
-        log.info("emailfetcher thread completed, archive has " + archive.getAllDocs().size() + " docs");
-    }
-
-    public FolderInfo getFetchedFolderInfo() {
-        return fetchedFolderInfo;
-    }
-
-    private int openFolderAndGetMessageCount() throws MessagingException {
-        folder = null;
-
-        store = emailStore.connect();
-        folder = emailStore.get_folder(store, folder_name());
-        if (folder != null)
-            return folder.getMessageCount();
-        else
-            return 0;
-    }
-
-    /**
-     * Comment by @vihari
-     * Not sure what uid id and folder are,I think this code should be more predictable
-     * The params begin idx and end idx are used for both uid filtering and Mbox message indexing.
-     * does not make sense
-     */
-    private Message[] openFolderAndGetMessages() throws MessagingException {
-        if (folder == null)
-            openFolderAndGetMessageCount();
-
-        Message[] messages = null;
-        if (folder == null)
-            return messages;
-
-        String descr = emailStore.getAccountID() + ":" + folder;
-        boolean haveUID = false;
-        int count = folder.getMessageCount();
-        use_uid_if_available = (begin_msg_index == 1 && end_msg_index == count + 1);
-        log.info("use_uid_if_available is set to " + use_uid_if_available);
-
-        if (fetchConfig.filter != null && fetchConfig.filter.isActive()) {
-            log.info("Issuing server side filters for " + fetchConfig.filter);
-            boolean useReceivedDateTerms = descr.indexOf("yahoo.com") >= 0;
-            messages = folder.search(fetchConfig.filter.convertToSearchTerm(useReceivedDateTerms));
-        } else {
-            // mbox provider claims to provide UIDFolder but the uids are bogus so we treat mboemailstore folders as not uidfolders
-            boolean is_uid_folder = (folder instanceof UIDFolder) && !(emailStore instanceof MboxEmailStore);
-
-            if (use_uid_if_available && is_uid_folder) {
-                // for uidfolders, we want to update the last seen uid in the FolderInfo
-                long uid = archive.getLastUIDForFolder(emailStore.getAccountID(), folder_name());
-                if (uid > 0) {
-                    messages = ((UIDFolder) folder).getMessagesByUID(uid + 1, UIDFolder.LASTUID);
-                    log.info("Archive has already seen this folder: " + descr + " will only fetch messages from uid " + uid + " onwards, " + messages.length + " messages will be incrementally fetched");
-                    haveUID = true;
-                } else
-                    log.info(descr + " is a UIDFolder but not seen before");
-            } else
-                log.info(descr + " is not a UIDFolder");
-
-            if (!haveUID) {
-                log.info("All " + count + " messages in " + descr + " will be fetched");
-                //messages = folder.getMessages();
-
-                if (begin_msg_index > 0 && end_msg_index > 0) {
-                    // we have to use only specified messages
-                    // if there are 8 messages, count = 8, end_msg_index will be 9
-                    if (end_msg_index > count + 1)
-                        log.warn("Warning: bad end_msg_index " + end_msg_index + " count = " + count); // use the full messages
-                    else {
-                        int nMessages = end_msg_index - begin_msg_index;
-                        Message[] newMessages = new Message[nMessages];
-                        for (int i = 0; i < end_msg_index - begin_msg_index; i++)
-                            newMessages[i] = folder.getMessage(begin_msg_index + i);//messages[begin_msg_index - 1 + i]; // -1 cos messages array is indexed from 0, but begin_msg_index from 1
-                        log.info("total # of messages: " + count + " reduced # of messages: " + newMessages.length);
-                        messages = newMessages;
-                    }
-                }
-            }
-        }
-
-        return messages;
-    }
-
-    /**
-     * main fetch+index method
-     * The assumptions that the heap is big enough to enough to fit all the messages i the folder is not scalable for larger archive.
-     * Instead, we process each message individually.
-     * fetchHeaders may be penalised due to multiple requests of fetch?
-     * In order to make indexing of large archives possible, fetch of NON-MBOXEmailstrore formats is penalised. It is possible to avoid this by handling MBox and IMAP/POP formats differently.
-     */
-    public void run() {
-        currentStatus = JSONUtils.getStatusJSON("Starting to process " + folder_name());
-
-        isCancelled = false;
-        Thread.currentThread().setName("EmailFetcher");
-        nErrors = 0;
-        //Message[] messages = null;
-        // use_uid is set only if we are reading the whole folder. otherwise we won't use it, and we won't update the highest UID seen for the folder in the archive.
-        try {
-            //			long t1 = System.currentTimeMillis();
-            int nMessages = openFolderAndGetMessageCount();
-            log.info("Total number of messages: " + nMessages);
-
-            if (emailStore instanceof MboxEmailStore) {
-                // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages()
-                // so we process in batches
-                //TODO: Ideally, should cap on buffer size rather than on number of messages.
-                final int BATCH = 10000;
-                int nbatches = nMessages / BATCH;
-                nMessagesProcessedSuccess = 0;
-                long st = System.currentTimeMillis();
-                int b;
-                for (b = 0; b < nbatches + 1; b++) {
-                    begin_msg_index = b * BATCH + 1;
-                    end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1;
-                    log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages);
-                    Message[] messages = openFolderAndGetMessages();
-                    currentStatus = JSONUtils.getStatusJSON("");
-                    if (isCancelled)
-                        return;
-
-                    if (messages.length > 0) {
-                        try {
-                            if (fetchConfig.downloadMessages) {
-                                log.info(nMessages + " messages will be fetched for indexing");
-                                fetchAndIndexMessages(folder, messages, begin_msg_index, nMessages);
-                            } else {
-                                // this is for memory test screening mode.
-                                // we create a dummy archive without any real contents
-                                for (int i = 0; i < nMessages; i++) {
-                                    String unique_id_as_string = Long.toString(i);
-
-                                    // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive
-                                    // not a serious perf. concern now, but revisit if needed
-                                    EmailDocument ed = convertToEmailDocument((MimeMessage) messages[i], unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc.
-                                    archive.addDocWithoutContents(ed);
-                                }
-                            }
-                        } catch (Exception e) {
-                            log.error("Exception trying to fetch messages, results will be incomplete! " + e + "\n" + Util.stackTrace(e));
-                        }
-                    }
-                    log.info("Fetch stats for this fetcher thread: " + stats);
-                }
-                log.info("Read #" + nMessages + " messages in #" + b + " batches of size: " + BATCH + " in " + (System.currentTimeMillis() - st) + "ms");
-            } else {
-                // IMAP etc are pretty efficient with lazily populating message objects, so unlike mbox, its ok to use openFolderAndGetMessages() on the entire folder.
-                // remember to init the begin/end_msg_index before calling openFolderAndGetMessages
-                begin_msg_index = 1;
-                end_msg_index = nMessages + 1;
-                nMessagesProcessedSuccess = 0;
-                Message[] messages = openFolderAndGetMessages();
-
-                long st = System.currentTimeMillis();
-                currentStatus = JSONUtils.getStatusJSON("");
-                if (isCancelled)
-                    return;
-
-                if (messages.length > 0) {
-                    try {
-                        fetchHeaders(messages); // always fetch headers
-                        if (fetchConfig.downloadMessages) {
-                            log.info(nMessages + " messages will be fetched for indexing");
-                            //we process all the messages together here unlike the case of mstor
-                            //hence the begin index is always 0
-                            fetchAndIndexMessages(folder, messages, 0, messages.length);
-                        } else {
-                            // this is for memory test screening mode.
-                            // we create a dummy archive without any real contents
-                            for (int i = 0; i < nMessages && i < messages.length; i++) {
-                                String unique_id_as_string = Long.toString(i);
-
-                                // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive
-                                // not a serious perf. concern now, but revisit if needed
-                                EmailDocument ed = convertToEmailDocument((MimeMessage) messages[i], unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc.
-                                archive.addDocWithoutContents(ed);
-                            }
-                        }
-                    } catch (Exception e) {
-                        Util.print_exception("Exception trying to fetch messages, results will be incomplete! ", e, log);
-                    }
-                }
-                log.info("Read #" + nMessages + " messages in  in " + (System.currentTimeMillis() - st) + "ms");
-            }
-        } catch (Throwable t) {
-            if (t instanceof OutOfMemoryError)
-                this.mayHaveRunOutOfMemory = true;
-            // this is important, because there could be an out of memory etc over here.
-            Util.print_exception(t, log);
-        } finally {
-            try {
-                if (folder != null)
-                    folder.close(false);
-                if (store != null)
-                    store.close();
-            } catch (Exception e) {
-                Util.print_exception(e);
-            }
-        }
-    }
-
-	/*
-	 * code for handling other kinds of headers, e.g. to find location of the
-	 * message -- not used right now, but may use in the future.
-	 * public void processHeaders(MimeMessage m) throws Exception
-	 * {
-	 * Address[] froms = m.getFrom();
-	 * if (froms == null)
-	 * return;
-	 * InternetAddress a = (InternetAddress) froms[0];
-	 * ContactInfo ci = addressBook.getContactInfoForAddress(a);
-	 * Enumeration<Header> e = (Enumeration<Header>) m.getAllHeaders();
-	 * String lastReceivedHeader = null;
-	 * while (e.hasMoreElements())
-	 * {
-	 * Header h = e.nextElement();
-	 * String n = h.getName();
-	 * String v = h.getValue();
-	 * // log.info ("header: " + n + " = " + n);
-	 * String s = n.toLowerCase();
-	 * if ("x-mailer".equals(s) || "user-agent".equals(s))
-	 * {
-	 * log.warn (m.getFrom()[0] + " --> " + n + " " + v);
-	 * ci.addMailer(v);
-	 * }
-	 * if ("x-originating-ip".equals(s) || "x-yahoo-post-ip".equals(s))
-	 * {
-	 * log.warn (m.getFrom()[0] + " --> " + n + " " + v);
-	 * ci.addIPAddr(v);
-	 * }
-	 * if ("x-yahoo-profile".equals(s))
-	 * log.warn (m.getFrom()[0] + " --> " + n + " " + v);
-	 * if ("message-id".equals(s))
-	 * {
-	 * log.warn("messageID = " + v);
-	 * ci.addMessageID(v);
-	 * }
-	 * if ("received".equals(s) || "x-received".equals(s))
-	 * {
-	 * lastReceivedHeader = v;
-	 * }
-	 * }
-	 * 
-	 * // sometimes the headers have an extra ctrl-m at the end, strip it if
-	 * this is the case.
-	 * if (lastReceivedHeader != null && lastReceivedHeader.endsWith("\r"))
-	 * lastReceivedHeader = lastReceivedHeader.substring(0,
-	 * lastReceivedHeader.length()-1);
-	 * 
-	 * ci.addLastReceivedHeader(lastReceivedHeader);
-	 * 
-	 * String from = froms[0].toString();
-	 * 
-	 * log.info (from + " lastReceived " + lastReceivedHeader);
-	 * if (lastReceivedHeader == null)
-	 * log.warn ("WARNING: " + from + " --> no received header!?");
-	 * else
-	 * {
-	 * String ipAddrStr = processLastReceived(lastReceivedHeader);
-	 * if (ipAddrStr != null)
-	 * {
-	 * byte[] ipAddrBytes = Util.parseIPAddress(ipAddrStr);
-	 * if (ipAddrBytes != null)
-	 * {
-	 * // InetAddress ipAddr = InetAddress.getByAddress(ipAddrBytes);
-	 * // log.info ("Received: " + locationService.lookupLocation(ipAddr));
-	 * }
-	 * }
-	 * }
-	 * }
-	 */
-
-    public String toString() {
-        return Util.fieldsToString(this);
-    }
-}
+/*
+ * Copyright (C) 2012 The Stanford MobiSocial Laboratory
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.stanford.muse.email;
+
+import com.sun.mail.imap.IMAPFolder;
+import edu.stanford.muse.datacache.Blob;
+import edu.stanford.muse.index.*;
+import edu.stanford.muse.util.EmailUtils;
+import edu.stanford.muse.util.JSONUtils;
+import edu.stanford.muse.util.Util;
+import edu.stanford.muse.webapp.HTMLUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+
+import javax.activation.DataHandler;
+import javax.activation.DataSource;
+import javax.mail.*;
+import javax.mail.internet.AddressException;
+import javax.mail.internet.InternetAddress;
+import javax.mail.internet.MimeMessage;
+import java.io.*;
+import java.security.GeneralSecurityException;
+import java.util.*;
+
+class EmailFetcherStats implements Cloneable, Serializable {
+    private final static long serialVersionUID = 1L;
+
+    int nTotalMessages;            // total # of messages to process
+    int nMessagesAdded;            // running total messages newly added to the archive
+    int nMessagesAlreadyPresent;    // running messages that were already present
+    int nErrors = 0;
+    int nMessagesFiltered = 0;
+
+    public void merge(EmailFetcherStats other) {
+        this.nMessagesAdded += other.nMessagesAdded;
+        this.nMessagesAlreadyPresent += other.nMessagesAlreadyPresent;
+        this.nMessagesFiltered += other.nMessagesFiltered;
+        this.nErrors += other.nErrors;
+        this.nTotalMessages += other.nTotalMessages;
+    }
+
+    public String toString() {
+        return Util.fieldsToString(this);
+    }
+}
+
+/**
+ * Important class for importing email.
+ * implements an email fetcher for a range of message #s within a single folder.
+ * In contrast, MTEmailFetcher is responsible for an entire email account, including multiple folders.
+ * and MuseEmailFetcher is responsible for multiple accounts (but for a single user)
+ * email fetcher stats is associated with a single email fetcher
+ */
+public class EmailFetcherThread implements Runnable, Serializable {
+    private final static long serialVersionUID = 1L;
+
+    public static final int IMAP_PREFETCH_BUFSIZE = 20 * 1024 * 1024;
+    /* used for buffering imap prefetch data -- necessary for good imap performance*/
+    public static final String FORCED_ENCODING = "UTF-8";
+
+    public static Log log = LogFactory.getLog(EmailFetcherThread.class);
+
+    // set up INVALID_DATE
+    public static Date INVALID_DATE; // like 0xdeadbeef
+
+    static {
+        Calendar c = new GregorianCalendar();
+        c.set(Calendar.YEAR, 1960);
+        c.set(Calendar.DAY_OF_MONTH, 1);
+        c.set(Calendar.MONTH, Calendar.JANUARY);
+        c.set(Calendar.HOUR_OF_DAY, 0);
+        c.set(Calendar.MINUTE, 0);
+        c.set(Calendar.SECOND, 0);
+        c.set(Calendar.MILLISECOND, 0);
+        INVALID_DATE = c.getTime();
+    }
+
+    private FetchConfig fetchConfig;
+    private boolean mayHaveRunOutOfMemory = false;
+    private FolderInfo fetchedFolderInfo;
+    transient Folder folder;
+    boolean use_uid_if_available;
+
+    protected int threadID;
+    protected EmailStore emailStore;
+
+    protected boolean isCancelled;
+
+    public static boolean verbose = false;
+    public static boolean debug = false;
+
+    // notes: begin_msg_index is always correct. end_msg_index = -1  means nMessages in folder.
+    // note: msg # begin_msg_index will be processed. msg # end_msg_index will not be processed.
+    protected int begin_msg_index = 0, end_msg_index = -1;
+
+    EmailFetcherStats stats = new EmailFetcherStats();
+    String currentStatus;
+
+
+    int totalMessagesInFetch, messagesCompletedInFetch;                        // this fetcher may be part of a bigger fetch operation. we need to track the progress of the bigger fetch in order to track progress accurately.
+
+    public int getTotalMessagesInFetch() {
+        return totalMessagesInFetch;
+    }
+
+    public void setTotalMessagesInFetch(int totalMessagesInFetch) {
+        this.totalMessagesInFetch = totalMessagesInFetch;
+    }
+
+    public int getMessagesCompletedInFetch() {
+        return messagesCompletedInFetch;
+    }
+
+    public void setMessagesCompletedInFetch(int messagesCompletedInFetch) {
+        this.messagesCompletedInFetch = messagesCompletedInFetch;
+    }
+
+    // stats
+    int nMessagesProcessedSuccess, nUncachedMessagesProcessed, nMessagesCached; // running count of # of messages processed successfully
+    int nErrors = 0;
+
+    public void cancel() {
+        isCancelled = true;
+    }
+
+    public void setFetchConfig(FetchConfig fc) {
+        this.fetchConfig = fc;
+    }
+
+    public int getThreadID() {
+        return threadID;
+    }
+
+    public void setThreadID(int threadID) {
+        this.threadID = threadID;
+    }
+
+    public int getNMessagesProcessed() {
+        return nMessagesProcessedSuccess;
+    }
+
+    public int getNUncachedMessagesProcessed() {
+        return nUncachedMessagesProcessed;
+    }
+
+    protected String folder_name() {
+        return fetchedFolderInfo.longName;
+    }
+
+    protected String email_source() {
+        return fetchedFolderInfo.accountKey;
+    }
+
+    public boolean mayHaveRunOutOfMemory() {
+        return mayHaveRunOutOfMemory;
+    }
+
+    //	private String folderPrefix; // prefix for folder files
+    transient Store store;                                        // we don't really need this serialized across sessions
+
+    transient Archive archive;
+    Collection<String> dataErrors = new LinkedHashSet<String>();    // log of input data errors
+
+    Date prevDate = null;
+
+	/*
+     * // comment out unused constructors, so it's cleaner/easier to trace the
+	 * setting member fields.
+	 * public EmailFetcherThread() { super(); }
+	 * 
+	 * public EmailFetcherThread(EmailStore store, String folder_name)
+	 * {
+	 * this.emailStore = store;
+	 * this.folder_name = folder_name;
+	 * }
+	 */
+
+    public EmailFetcherThread(EmailStore store, FolderInfo fi, int begin_msg_index, int end_msg_index) {
+        this.emailStore = store;
+        this.fetchedFolderInfo = fi;
+        stats.nTotalMessages = end_msg_index - begin_msg_index;
+        this.begin_msg_index = begin_msg_index;
+        this.end_msg_index = end_msg_index;
+    }
+
+    public void setArchive(Archive a) {
+        archive = a;
+    }
+
+    public Archive getArchive() {
+        return archive;
+    }
+
+    /**
+     * merges results with another email fetcher. does some lightweight work
+     * including updating stats. consider removing this and simplifying in the
+     * future
+     */
+    public void merge(EmailFetcherThread other) {
+        verify();
+        if (other != null) {
+            other.verify();
+
+            // TOFIX: we should eliminate duplicates
+            dataErrors.addAll(other.dataErrors);
+            stats.merge(other.stats);
+
+            nMessagesProcessedSuccess += other.nMessagesProcessedSuccess;
+            nErrors += other.nErrors;
+            mayHaveRunOutOfMemory |= other.mayHaveRunOutOfMemory;
+        }
+        verify();
+    }
+
+    /**
+     * intern a bunch of addrs, to save memory
+     *
+     * @throws UnsupportedEncodingException
+     */
+    private static void internAddressList(Address[] addrs) throws UnsupportedEncodingException {
+        if (addrs == null)
+            return;
+
+        for (Address a : addrs) {
+            if (a instanceof InternetAddress) {
+                InternetAddress ia = (InternetAddress) a;
+                String address = ia.getAddress(), personal = ia.getPersonal();
+                if (address != null)
+                    ia.setAddress(InternTable.intern(address));
+                if (personal != null)
+                    ia.setPersonal(InternTable.intern(personal));
+            }
+        }
+    }
+
+    /**
+     * Key method for importing email: converts a javamail obj. to our own data structure (EmailDocument)
+     */
+    //public EmailDocument convertToEmailDocument(MimeMessage m, int num, String url) throws MessagingException, IOException
+    private EmailDocument convertToEmailDocument(MimeMessage m, String id) throws MessagingException, IOException {
+        // get the date.
+        // prevDate is a hack for the cases where the message is lacking an explicit Date: header. e.g.
+        //		From hangal Sun Jun 10 13:46:46 2001
+        //		To: ewatkins@stanford.edu
+        //		Subject: Re: return value bugs
+        // though the date is on the From separator line, the mbox provider fails to parse it and provide it to us.
+        // so as a hack, we will assign such messages the same date as the previous one this fetcher has seen! ;-)
+        // update: having the exact same date causes the message to be considered a duplicate, so just increment
+        // the timestamp it by 1 millisecond!
+        // a better fix would be to improve the parsing in the provider
+
+        boolean hackyDate = false;
+        Date d = m.getSentDate();
+        if (d == null)
+            d = m.getReceivedDate();
+        if (d == null) {
+            if (prevDate != null) {
+                long newTime = prevDate.getTime() + 1L; // added +1 so that this email is not considered the same object as the prev. one if they are in the same thread
+                d = new Date(newTime);
+                dataErrors.add("No date for message id:" + id + ": " + EmailUtils.formatMessageHeader(m) + " assigned approximate date");
+            } else {
+                d = INVALID_DATE; // wrong, but what can we do... :-(
+                dataErrors.add("No date for message id:" + id + ": " + EmailUtils.formatMessageHeader(m) + " assigned deliberately invalid date");
+            }
+            hackyDate = true;
+        } else {
+            Calendar c = new GregorianCalendar();
+            c.setTime(d);
+            int yy = c.get(Calendar.YEAR);
+            if (yy < 1960 || yy > 2020) {
+                dataErrors.add("Probably bad date: " + Util.formatDate(c) + " message: " + EmailUtils.formatMessageHeader(m));
+                hackyDate = true;
+            }
+        }
+
+        if (hackyDate && prevDate != null) {
+            long newTime = prevDate.getTime() + 1L; // added +1 so that this email is not considered the same object as the prev. one if they are in the same thread
+            d = new Date(newTime);
+            Util.ASSERT(!d.equals(prevDate));
+        }
+
+        Calendar c = new GregorianCalendar();
+        c.setTime(d != null ? d : new Date());
+
+        prevDate = d;
+
+        Address to[] = null, cc[] = null, bcc[] = null;
+        Address[] from = null;
+        try {
+            // 			allrecip = m.getAllRecipients(); // turns out to be too expensive because it looks for newsgroup headers for imap
+            // assemble to, cc, bcc into a list and copy it into allrecip
+            List<Address> list = new ArrayList<Address>();
+            from = m.getFrom();
+            to = m.getRecipients(Message.RecipientType.TO);
+            if (to != null)
+                list.addAll(Arrays.asList(to));
+            cc = m.getRecipients(Message.RecipientType.CC);
+            if (cc != null)
+                list.addAll(Arrays.asList(cc));
+            bcc = m.getRecipients(Message.RecipientType.BCC);
+            if (bcc != null)
+                list.addAll(Arrays.asList(bcc));
+
+            // intern the strings in these addresses to save memory cos they are repeated often in a large archive
+            internAddressList(from);
+            internAddressList(to);
+            internAddressList(cc);
+            internAddressList(bcc);
+        } catch (AddressException ae) {
+            String s = "Bad address in folder " + folder_name() + " message id" + id + " " + ae;
+            dataErrors.add(s);
+        }
+
+        // take a deep breath. This object is going to live longer than most of us.
+        EmailDocument ed = new EmailDocument(id, email_source(), folder_name(), to, cc, bcc, from, m.getSubject(), m.getMessageID(), c.getTime());
+
+        String[] headers = m.getHeader("List-Post");
+        if (headers != null && headers.length > 0) {
+            // trim the headers because they usually look like: "<mailto:prpl-devel@lists.stanford.edu>"
+            ed.sentToMailingLists = new String[headers.length];
+            int i = 0;
+            for (String header : headers) {
+                header = header.trim();
+                header = header.toLowerCase();
+
+                if (header.startsWith("<") && header.endsWith(">"))
+                    header = header.substring(1, header.length() - 1);
+                if (header.startsWith("mailto:") && !"mailto:".equals(header)) // defensive check in case header == "mailto:"
+                    header = header.substring(("mailto:").length());
+                ed.sentToMailingLists[i++] = header;
+            }
+        }
+        if (hackyDate) {
+            String s = "Guessed date " + Util.formatDate(c) + " for message id: " + id + ": " + ed.getHeader();
+            dataErrors.add(s);
+            ed.hackyDate = true;
+        }
+
+        // check if the message has attachments.
+        // if it does and we're not downloading attachments, then we mark the ed as such.
+        // otherwise we had a problem where a message header (and maybe text) was downloaded but without attachments in one run
+        // but in a subsequent run where attachments were needed, we thought the message was already cached and there was no
+        // need to recompute it, leaving the attachments field in this ed incorrect.
+        List<String> attachmentNames = getAttachmentNames(m, m);
+        if (!Util.nullOrEmpty(attachmentNames)) {
+            ed.attachmentsYetToBeDownloaded = true; // will set it to false later if attachments really were downloaded (not sure why)
+            //			log.info ("added " + attachmentNames.size() + " attachments to message: " + ed);
+        }
+        return ed;
+    }
+
+    /*
+     * we try to get the attachment names cheaply, i.e. without having to
+     * process the whole message
+     */
+    private List<String> getAttachmentNames(MimeMessage m, Part p) throws MessagingException, IOException {
+        List<String> result = new ArrayList<String>();
+        try {
+            if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) {
+                if (p.isMimeType("multipart/alternative"))
+                    return result; // ignore alternative's because real attachments don't have alternatives
+                DataHandler dh = p.getDataHandler();
+                DataSource ds = dh.getDataSource();
+                if (ds instanceof MultipartDataSource) {
+                    MultipartDataSource mpds = (MultipartDataSource) ds;
+                    for (int i = 0; i < mpds.getCount(); i++)
+                        result.addAll(getAttachmentNames(m, mpds.getBodyPart(i)));
+                } else {
+                    String name = ds.getName();
+                    if (!Util.nullOrEmpty(name))
+                        result.add(name);
+                }
+            } else {
+                String filename = p.getFileName();
+                if (filename != null)
+                    result.add(filename);
+            }
+        } catch (Exception e) {
+            // sometimes we see javax.mail.MessagingException: Unable to load BODYSTRUCTURE
+            // in this case, just ignore, not much we can do i guess.
+            Util.print_exception(e, log);
+        }
+        return result;
+    }
+
+    //	public void setEmailCache (DocCache cache)
+    //	{
+    //		this.cache = cache;
+    //	}
+
+    /**
+     * this method returns the text content of the message as a list of strings
+     * // each element of the list could be the content of a multipart message
+     * // m is the top level subject
+     * // p is the specific part that we are processing (p could be == m)
+     * also sets up names of attachments (though it will not download the
+     * attachment unless downloadAttachments is true)
+     */
+    private List<String> processMessagePart(int messageNum, Message m, Part p, List<Blob> attachmentsList) throws MessagingException, IOException {
+        List<String> list = new ArrayList<String>(); // return list
+        if (p == null) {
+            dataErrors.add("part is null: " + folder_name() + " idx " + messageNum);
+            return list;
+        }
+
+        if (p == m && p.isMimeType("text/html")) {
+            /*
+            String s = "top level part is html! message:" + m.getSubject() + " " + m.getDescription();
+            dataErrors.add(s);
+            */
+            // we don't normally expect the top-level part to have content-type text/html
+            // but we saw this happen on some sample archives pst -> emailchemy. so allow it and handle it by parsing the html
+            String html = (String) p.getContent();
+            String text = Util.unescapeHTML(html);
+            org.jsoup.nodes.Document doc = Jsoup.parse(text);
+
+            StringBuilder sb = new StringBuilder();
+            HTMLUtils.extractTextFromHTML(doc.body(), sb);
+            list.add(sb.toString());
+            return list;
+        }
+
+        if (p.isMimeType("text/plain")) {
+            //make sure, p is not wrongly labelled as plain text.
+            Enumeration headers = p.getAllHeaders();
+            boolean dirty = false;
+            if (headers != null)
+                while (headers.hasMoreElements()) {
+                    Header h = (Header) headers.nextElement();
+                    String name = h.getName();
+                    String value = h.getValue();
+                    if (name != null && value != null) {
+                        if (name.equals("Content-transfer-encoding") && value.equals("base64")) {
+                            dirty = true;
+                            break;
+                        }
+                    }
+                }
+            String fname = p.getFileName();
+            if (fname != null) {
+                int idx = fname.lastIndexOf('.');
+                if ((idx < fname.length()) && (idx >= 0)) {
+                    String extension = fname.substring(idx);
+                    //anything extension other than .txt is suspicious.
+                    if (!extension.equals(".txt"))
+                        dirty = true;
+                }
+            }
+            if (dirty) {
+                dataErrors.add("Dirty message part, has conflicting message part headers."  + folder_name() + " Message# " + messageNum);
+                return list;
+            }
+
+            log.debug("Message part with content type text/plain");
+            String content;
+            String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8");
+            try {
+                if (type.contains("charset=")) {
+                    byte b[] = Util.getBytesFromStream(p.getInputStream());
+                    content = new String(b, type.substring(type.indexOf("charset=") + "charset=".length()));
+                } else {
+                    // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us
+                    if (FORCED_ENCODING != null) {
+                        byte b[] = Util.getBytesFromStream(p.getInputStream());
+                        content = new String(b, FORCED_ENCODING);
+                    } else
+                        content = (String) p.getContent();
+                }
+            } catch (UnsupportedEncodingException uee) {
+                dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion");
+                // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers.
+                // we're using the workaround suggested on this page: http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4304013
+                // though it may be better to consider official support for utf-7 or other encodings.
+
+                // TOFIX: I get an exception for utfutf8-encoding which has a base64 encoding embedded on it.
+                // Unsupported encoding: gmail-sent Message #10477 type text/plain; charset=x-utf8utf8; name="newyorker.txt",
+                // the hack below doesn't work for it.
+                ByteArrayOutputStream bao = new ByteArrayOutputStream();
+                p.writeTo(bao);
+                content = bao.toString();
+            }
+            list.add(content);
+        } else if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) {
+            // rfc822 mime type is for embedded mbox format or some such (appears for things like
+            // forwarded messages). the content appears to be just a multipart.
+            Object o = p.getContent();
+            if (o instanceof Multipart) {
+                Multipart allParts = (Multipart) o;
+                if (p.isMimeType("multipart/alternative")) {
+                    // this is an alternative mime type. v common case to have text and html alternatives
+                    // so just process the text part if there is one, and avoid fetching the alternatives.
+                    // useful esp. because many ordinary messages are alternative: text and html and we don't want to fetch the html.
+                    // revisit in future we want to retain the html alternative for display purposes
+                    Part[] parts = new Part[allParts.getCount()];
+                    for (int i = 0; i < parts.length; i++)
+                        parts[i] = allParts.getBodyPart(i);
+
+                    for (int i = 0; i < parts.length; i++) {
+                        Part thisPart = parts[i];
+                        if (thisPart.isMimeType("text/plain")) {
+                            // common case, return quickly
+                            list.add((String) thisPart.getContent());
+                            log.debug("Multipart/alternative with content type text/plain");
+                            return list;
+                        }
+                    }
+
+                    // no text part, let's look for an html part. this happens for html parts.
+                    for (int i = 0; i < allParts.getCount(); i++) {
+                        Part thisPart = parts[i];
+                        if (thisPart.isMimeType("text/html")) {
+                            // common case, return quickly
+                            String html = (String) thisPart.getContent();
+                            String text = Util.unescapeHTML(html);
+                            org.jsoup.nodes.Document doc = Jsoup.parse(text);
+
+                            StringBuilder sb = new StringBuilder();
+                            HTMLUtils.extractTextFromHTML(doc.body(), sb);
+                            list.add(sb.toString());
+
+                            log.debug("Multipart/alternative with content type text/html");
+                            return list;
+                        }
+                    }
+
+                    // no text or html part. hmmm... blindly process the first part only
+                    if (allParts.getCount() >= 1)
+                        list.addAll(processMessagePart(messageNum, m, allParts.getBodyPart(0), attachmentsList));
+                } else {
+                    // process it like a regular multipart
+                    for (int i = 0; i < allParts.getCount(); i++) {
+                        BodyPart bp = allParts.getBodyPart(i);
+                        list.addAll(processMessagePart(messageNum, m, bp, attachmentsList));
+                    }
+                }
+            } else if (o instanceof Part)
+                list.addAll(processMessagePart(messageNum, m, (Part) o, attachmentsList));
+            else
+                dataErrors.add("Unhandled part content, " + folder_name() + " Message #" + messageNum + "Java type: " + o.getClass() + " Content-Type: " + p.getContentType());
+        } else {
+            try {
+                // do attachments only if downloadAttachments is set.
+                // some apps do not need attachments, so this saves some time.
+                // however, it seems like a lot of time is taken in imap prefetch, which gets attachments too?
+                if (fetchConfig.downloadAttachments)
+                    handleAttachments(messageNum, m, p, list, attachmentsList);
+            } catch (Exception e) {
+                dataErrors.add("Ignoring attachment for " + folder_name() + " Message #" + messageNum + ": " + Util.stackTrace(e));
+            }
+        }
+
+        return list;
+    }
+
+    /**
+     * Processes a single attachment part: determines its filename, stores its content in the
+     * archive's blob store and adds the resulting blob to attachmentsList.
+     * In one case -- a text/html part without a filename -- the text extracted from the html
+     * is appended to textList instead and no blob is created.
+     * Most problems with an individual part are recorded in dataErrors or logged, and the part is skipped.
+     *
+     * @param idx message number, used in diagnostics and to synthesize a name for nameless parts
+     * @param m the message containing the part; if it is not a MimeMessage the part is ignored
+     * @param p the part to handle
+     * @param textList receives the text of a filename-less text/html part
+     * @param attachmentsList receives the blob when fetchConfig.downloadAttachments is set and the content is available in the blob store
+     * @throws MessagingException if the part or the message cannot be read
+     */
+    private void handleAttachments(int idx, Message m, Part p, List<String> textList, List<Blob> attachmentsList) throws MessagingException {
+        String ct = null;
+        if (!(m instanceof MimeMessage)) {
+            Exception e = new IllegalArgumentException("Not a MIME message!");
+            e.fillInStackTrace();
+            log.warn(Util.stackTrace(e));
+            return;
+        }
+
+        String filename = null;
+        try {
+            filename = p.getFileName();
+        } catch (Exception e) {
+            // seen this happen with:
+            // Folders__gmail-sent Message #12185 Expected ';', got "Message"
+            // javax.mail.internet.ParseException: Expected ';', got "Message"
+
+            dataErrors.add("Unable to read attachment name: " + folder_name() + " Message# " + idx);
+            return;
+        }
+
+        String sanitizedFName = Util.sanitizeFolderName(emailStore.getAccountID() + "." + folder_name());
+        if (filename == null) {
+            String tempFname = sanitizedFName + "." + idx;
+            dataErrors.add("attachment filename is null for " + sanitizedFName + " Message#" + idx + " assigning it the name: " + tempFname);
+            if (p.isMimeType("text/html")) {
+                // a nameless html part is treated as message text, not as an attachment
+                try {
+                    log.info("Turning message " + sanitizedFName + " Message#" + idx + " into text although it is an attachment");
+                    String html = (String) p.getContent();
+                    String text = Util.unescapeHTML(html);
+                    org.jsoup.nodes.Document doc = Jsoup.parse(text);
+
+                    StringBuilder sb = new StringBuilder();
+                    HTMLUtils.extractTextFromHTML(doc.body(), sb);
+                    textList.add(sb.toString());
+                    return;
+                } catch (Exception e) {
+                    Util.print_exception("Error reading contents of text/html multipart without a filename!", e, log);
+                    return;
+                }
+            }
+            filename = tempFname;
+        }
+
+        // Replacing any of the disallowed filename characters (\/:*?"<>|&) to _
+        // (note: & causes problems with URLs for serveAttachment etc, so it's also replaced)
+        String newFilename = Util.sanitizeFileName(filename);
+
+        // Updating filename if it's changed after sanitizing.
+        if (!newFilename.equals(filename)) {
+            log.info("Filename changed from " + filename + " to " + newFilename);
+            filename = newFilename;
+        }
+
+        try {
+            ct = p.getContentType();
+            if (filename.indexOf(".") < 0) // no ext in filename... let's fix it if possible
+            {
+                // Using startsWith instead of equals because sometimes the ct has crud beyond the image/jpeg;...crud....
+                // Below are the most common file types, more type can be added if needed
+
+                // Most common APPLICATION TYPE
+                if (ct.startsWith("application/pdf"))
+                    filename = filename + ".pdf";
+                if (ct.startsWith("application/zip"))
+                    filename = filename + ".zip"; // was ",zip", which produced a name without a usable extension
+                // Most common IMAGE TYPE
+                if (ct.startsWith("image/jpeg"))
+                    filename = filename + ".jpg";
+                if (ct.startsWith("image/gif"))
+                    filename = filename + ".gif";
+                if (ct.startsWith("image/png"))
+                    filename = filename + ".png";
+                // Most Common VIDEO TYPE
+                if (ct.startsWith("video/x-ms-wmv"))
+                    filename = filename + ".wmv";
+                // Most Common AUDIO TYPE
+                if (ct.startsWith("audio/mpeg"))
+                    filename = filename + ".mp3";
+                if (ct.startsWith("audio/mp4"))
+                    filename = filename + ".mp4";
+                // Most Common TEXT TYPE
+                if (ct.startsWith("text/html"))
+                    filename = filename + ".html";
+                // Windows Office
+                if (ct.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) //Word
+                    filename = filename + ".docx";
+                if (ct.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) //Excel
+                    filename = filename + ".xlsx";
+                if (ct.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) //PowerPoint
+                    filename = filename + ".pptx";
+            }
+            // retain only up to first semi-colon; often ct is something like text/plain; name="filename"' we don't want to log the filename
+            int x = ct.indexOf(";");
+            if (x >= 0)
+                ct = ct.substring(0, x);
+            log.info("Attachment content type: " + ct + " filename = " + Util.blurKeepingExtension(filename));
+        } catch (Exception pex) {
+            // the sent date is concatenated directly (not via toString()) so that a missing date cannot throw from inside this handler
+            dataErrors.add("Can't read CONTENT-TYPE: " + ct + " filename:" + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate() + "\n Exception: " + pex + "\n" + Util.stackTrace(pex));
+            return;
+        }
+
+        boolean success = true;
+        // the size passed in here is the part size, which is not really the binary blob size.
+        // when we read the stream below in blobStore.add(), we'll set it again to the binary blob size
+        Blob b = new EmailAttachmentBlob(filename, p.getSize(), (MimeMessage) m, p);
+
+        if (fetchConfig.downloadAttachments) {
+            // this containment check is only on the basis of file name and size currently,
+            // not on the actual hash
+            if (archive.getBlobStore().contains(b)) {
+                log.debug("Cache hit! " + b);
+            } else {
+                try {
+                    if (filename.endsWith(".tif"))
+                        log.info("Fetching attachment..." + Util.blurKeepingExtension(filename));
+
+                    // performance critical! use large buffer! currently 256KB
+                    // stream will be closed by callee
+
+                    long start = System.currentTimeMillis();
+                    long nBytes = archive.getBlobStore().add(b, new BufferedInputStream(p.getInputStream(), 256 * 1024));
+                    long end = System.currentTimeMillis();
+                    if (nBytes != -1) {
+                        long diff = end - start;
+                        String s = "attachment size " + nBytes + " bytes, fetched in " + diff + " millis";
+                        // bytes per millisecond is roughly KB per second; skipped when the fetch took under 1ms
+                        if (diff > 0)
+                            s += " (" + (nBytes / diff) + " KB/s)";
+                        log.info(s);
+                    }
+
+                    Util.ASSERT(archive.getBlobStore().contains(b));
+
+                } catch (IOException ioe) {
+                    success = false;
+                    dataErrors.add("WARNING: Unable to fetch attachment: filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate() + "\nException: " + ioe);
+                    ioe.printStackTrace(System.out);
+                }
+            }
+
+            if (success) {
+                attachmentsList.add(b);
+
+                /// generate thumbnail only if not already cached
+                try {
+                    archive.getBlobStore().generate_thumbnail(b); // supplement
+                } catch (IOException ioe) {
+                    log.warn("failed to create thumbnail, filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate() + "\nException: " + ioe);
+                    ioe.printStackTrace(System.out);
+                }
+            }
+        }
+    }
+
+    /**
+     * Scans a header value whose first token is "from" and returns the first later token
+     * that starts with a digit (logged as an IP address).
+     * The header is lower-cased and split on spaces, tabs, parentheses and square brackets.
+     *
+     * @param header the raw header value; may be null
+     * @return the first token starting with a digit, or null if the header is null, has no
+     *         tokens, does not start with "from", or contains no such token
+     */
+    @SuppressWarnings("unused")
+    private static String processLastReceived(String header) {
+        if (header == null)
+            return null;
+
+        // fixed locale: with the default locale, "FROM" would not lower-case to "from" under e.g. a Turkish locale
+        header = header.toLowerCase(java.util.Locale.ROOT);
+        StringTokenizer st = new StringTokenizer(header, " \t()[]");
+        // an empty or delimiter-only header has no first token at all; treat it as unrecognized instead of throwing
+        if (!st.hasMoreTokens() || !st.nextToken().equals("from")) {
+            log.warn("Warning: unrecognized header: " + header);
+            return null;
+        }
+
+        while (st.hasMoreTokens()) {
+            String s = st.nextToken();
+            // StringTokenizer never returns empty tokens, so charAt(0) is safe
+            if (Character.isDigit(s.charAt(0))) {
+                log.warn("IP address: " + s);
+                return s;
+            }
+        }
+        return null;
+    }
+
+    /** No-op: this implementation performs no verification. */
+    public void verify() {
+    }
+
+    /**
+     * Sets currentStatus to a "Verifying email headers..." status and then
+     * immediately to a status with an empty message.
+     */
+    public void finish() {
+        // NOTE(review): nothing happens between the two assignments, so the first status is overwritten right away
+        currentStatus = JSONUtils.getStatusJSON("Verifying email headers...");
+        currentStatus = JSONUtils.getStatusJSON("");
+    }
+
+    /**
+     * Builds a status JSON string carrying the progress fields plus, when emails is
+     * non-empty, a "teasers" array with the descriptions of up to N_TEASERS emails
+     * taken from the end of the list. An ArrayList is required explicitly because
+     * List.get() can be costly otherwise.
+     * If building the JSON fails, the failure is stored under the "error" key instead.
+     */
+    private static String getStatusJSONWithTeasers(String message, int pctComplete, long secsElapsed, long secsRemaining, ArrayList<EmailDocument> emails, int N_TEASERS) {
+        JSONObject status = new JSONObject();
+        try {
+            status.put("pctComplete", pctComplete);
+            status.put("message", message);
+            status.put("secsElapsed", secsElapsed);
+            status.put("secsRemaining", secsRemaining);
+
+            if (!Util.nullOrEmpty(emails)) {
+                JSONArray teasers = new JSONArray();
+                int last = emails.size();
+                // window of the most recent N_TEASERS entries, clamped to the start of the list
+                int first = Math.max(last - N_TEASERS, 0);
+                int slot = 0;
+                for (int k = first; k < last; k++) {
+                    EmailDocument doc = emails.get(k);
+                    if (doc == null)
+                        continue;
+                    String teaser = doc.description;
+                    if (Util.nullOrEmpty(teaser))
+                        continue;
+                    // slots are filled densely, skipping emails without a description
+                    teasers.put(slot, teaser);
+                    slot++;
+                }
+                status.put("teasers", teasers);
+            }
+        } catch (JSONException jsone) {
+            try {
+                status.put("error", jsone.toString());
+            } catch (Exception e) {
+                Util.report_exception(e);
+            }
+        }
+        return status.toString();
+    }
+
+    /**
+     * Best effort to prefetch messages from messages[startMsgIdx] onwards, up to
+     * IMAP_PREFETCH_BUFSIZE bytes in total. Only a run of consecutive message numbers is
+     * prefetched; the run also stops at the first message with an unexpected content type
+     * when bodyTextOnly is set.
+     *
+     * @param messages the messages being processed
+     * @param startMsgIdx index into messages of the first message to prefetch
+     * @param folder the folder the messages belong to; nothing is prefetched unless it is an IMAPFolder
+     * @param bodyTextOnly if true a TextOnlyImapPrefetcher is used, otherwise an ImapPrefetcher
+     * @return List<String> if bodyTextOnly is true, otherwise List<MimeMessage>; null if nothing
+     *         was prefetched or the prefetch failed (the failure is logged, not thrown)
+     */
+    private List<?> do_imap_prefetch(Message[] messages, int startMsgIdx, Folder folder, boolean bodyTextOnly) {
+        // its perfectly ok for correctness for this method to do nothing and return null
+        List<?> prefetchedMessages = null;
+        try {
+
+            if (IMAP_PREFETCH_BUFSIZE > 0 && folder instanceof IMAPFolder) {
+                int prefetch_messages_size = 0;
+
+                int start_message_num = messages[startMsgIdx].getMessageNumber();
+                int end_message_num = start_message_num;
+
+                List<Integer> messageNums = new ArrayList<Integer>();
+
+                // figure out message num range to fetch. if anything is unusual -- bad content type, non-consec. msg nums etc -- break out.
+                // non consec. message numbers are a problem because they cause a very long imap command string, which we found was returning an "invalid command" response.
+                int prev_message_num = -1;
+                for (int msgIdx = startMsgIdx; msgIdx < messages.length; msgIdx++) {
+                    if (bodyTextOnly) {
+                        String contentType = messages[msgIdx].getContentType().toLowerCase();
+                        if (!contentType.startsWith("multipart/") && !contentType.startsWith("text/plain")) {
+                            log.info("Warn: message idx" + msgIdx + " msg#" + messages[msgIdx].getMessageNumber() + " has unexpected content type " + contentType);
+                            break;
+                        }
+                    }
+
+                    // check if sequence is as expected
+                    int next_message_num = messages[msgIdx].getMessageNumber(); // may be better to switch this to uid and prefetcher uses uid fetch
+                    if (next_message_num != prev_message_num + 1 && prev_message_num != -1)
+                        break;
+
+                    // if this message would push prefetch size beyond the buf size, break out, not including this message
+                    if (prefetch_messages_size + messages[msgIdx].getSize() >= IMAP_PREFETCH_BUFSIZE)
+                        break;
+                    prev_message_num = next_message_num;
+                    // track the last message actually included, so the range logged below is accurate
+                    end_message_num = next_message_num;
+                    prefetch_messages_size += messages[msgIdx].getSize();
+                    messageNums.add(next_message_num);
+                }
+
+                if (messageNums.size() == 0)
+                    return null;
+
+                // now we prefetch messages from start_message_num to end_message_num
+                long startMillis = System.currentTimeMillis();
+                log.info("prefetching " + messageNums.size() + " messages");
+                ImapPrefetcher prefetcher = bodyTextOnly ? new TextOnlyImapPrefetcher(((ImapPopEmailStore) emailStore).session, messageNums) : new ImapPrefetcher(((ImapPopEmailStore) emailStore).session, messageNums);
+                prefetchedMessages = (List<?>) ((IMAPFolder) folder).doCommand(prefetcher); // start_message_num, end_message_num));
+                long elapsedMillis = System.currentTimeMillis() - startMillis;
+                // bytes per millisecond is roughly KB per second; clamp to 1ms so a very fast prefetch cannot divide by zero
+                long kb_per_sec = prefetch_messages_size / Math.max(elapsedMillis, 1);
+                log.info("prefetched " + messageNums.size() + " messages in " + Util.blur(folder.getName()) + " [" + start_message_num + ":" + end_message_num + "], " + Util.commatize(prefetch_messages_size / 1024) + "KB in " + Util.commatize(elapsedMillis) + "ms (" + Util.commatize(kb_per_sec) + " KB/sec)");
+            }
+        } catch (Exception e) {
+            Util.print_exception(e, log);
+        }
+        return prefetchedMessages;
+    }
+
+    /**
+     * Fetches envelope, content info, UID and the List-Post header for all the given
+     * messages in one bulk request, and logs how long it took.
+     * Does nothing when emailStore is an MboxEmailStore.
+     *
+     * @param messages the messages whose headers should be fetched
+     * @throws MessagingException if the folder fails to fetch
+     */
+    private void fetchHeaders(Message[] messages) throws MessagingException {
+        // don't do it for mbox folders, waste of time
+        if (emailStore instanceof MboxEmailStore)
+            return;
+
+        // essential perf. step: headers are fetched in bulk,
+        // otherwise reading header info one message at a time takes a long time
+        long t0 = System.currentTimeMillis();
+        currentStatus = JSONUtils.getStatusJSON("Reading headers from " + folder.getName() + "...");
+
+        FetchProfile profile = new FetchProfile();
+        profile.add(FetchProfile.Item.ENVELOPE);
+        profile.add(FetchProfile.Item.CONTENT_INFO);
+        profile.add(UIDFolder.FetchProfileItem.UID); // important, otherwise reading UIDs takes a long time later
+        profile.add("List-Post");
+        folder.fetch(messages, profile);
+
+        long elapsedMillis = System.currentTimeMillis() - t0;
+        log.info("Done fetching headers: " + Util.commatize(elapsedMillis) + "ms");
+    }
+
+    /**
+     * Fetches envelope, content info, UID and the List-Post header for the messages
+     * numbered 1 to nMessages in the folder, one message per fetch request, and logs
+     * how long it took. Does nothing when emailStore is an MboxEmailStore.
+     *
+     * @param nMessages number of messages to fetch headers for
+     * @throws MessagingException if a message cannot be obtained or fetched
+     */
+    private void fetchHeaders(int nMessages) throws MessagingException {
+        // fetch headers (don't do it for mbox folders, waste of time)
+        if (!(emailStore instanceof MboxEmailStore)) {
+            long startTimeMillis = System.currentTimeMillis();
+            currentStatus = JSONUtils.getStatusJSON("Reading headers from " + folder.getName() + "...");
+            FetchProfile fp = new FetchProfile();
+            fp.add(FetchProfile.Item.ENVELOPE);
+            fp.add(FetchProfile.Item.CONTENT_INFO);
+            fp.add(UIDFolder.FetchProfileItem.UID); // important, otherwise reading UIDs takes a long time later
+            fp.add("List-Post");
+            // JavaMail message numbers start at 1, not 0: starting at 0 is out of range and would also miss message nMessages
+            for (int msgNum = 1; msgNum <= nMessages; msgNum++) {
+                Message[] messages = new Message[]{folder.getMessage(msgNum)};
+                folder.fetch(messages, fp);
+            }
+            long endTimeMillis = System.currentTimeMillis();
+            log.info("Done fetching headers: " + Util.commatize(endTimeMillis - startTimeMillis) + "ms");
+        }
+    }
+
+    /**
+     * Returns the messages that the archive does not already contain.
+     * Each duplicate bumps stats.nMessagesAlreadyPresent and is recorded in dataErrors.
+     * A message that cannot be converted for the check is kept (the exception is logged).
+     * Entries of the input array that are kept are set to null in that array.
+     *
+     * @param archive the archive to check against
+     * @param messages the candidate messages; returned unchanged if the archive has no docs
+     * @return an array with the messages not found in the archive
+     */
+    private Message[] removeMessagesAlreadyInArchive(Archive archive, Message[] messages) {
+        // common case of an empty archive: nothing can be a duplicate
+        if (archive.getAllDocs().size() == 0)
+            return messages;
+
+        List<Message> kept = new ArrayList<Message>();
+        for (int pos = 0; pos < messages.length; pos++) {
+            MimeMessage mime = (MimeMessage) messages[pos];
+
+            boolean duplicate = false;
+            try {
+                EmailDocument doc = convertToEmailDocument(mime, "dummy"); // id doesn't really matter here
+                if (archive.containsDoc(doc)) {
+                    stats.nMessagesAlreadyPresent++;
+                    dataErrors.add("Duplicate message: " + doc); // note: report.jsp depends on this exact string
+                    // set last, so a failure while recording the duplicate still keeps the message
+                    duplicate = true;
+                }
+            } catch (Exception e) {
+                Util.print_exception(e, log);
+            }
+
+            if (!duplicate) {
+                kept.add(mime);
+                messages[pos] = null; // no harm explicitly nulling out messages
+            }
+        }
+        return kept.toArray(new Message[0]);
+    }
+
+    /**
+     * Post-check on message content. Content with more than 50 lines is rejected
+     * when it contains a run of more than 50 consecutive lines without a space.
+     *
+     * @param content the message text; null is accepted and left for others to handle
+     * @return false if such a run is found, true otherwise
+     */
+    private boolean messageLooksOk(String content) {
+        if (content == null)
+            return true;
+
+        String[] rows = content.split("\n");
+        // short content is never inspected
+        if (rows.length <= 50)
+            return true;
+
+        int spacelessRun = 0;
+        for (String row : rows) {
+            // a line with a space resets the run, a line without one extends it
+            spacelessRun = (row.indexOf(' ') < 0) ? spacelessRun + 1 : 0;
+            if (spacelessRun > 50)
+                return false;
+        }
+        return true;
+    }
+
+    // keeps track of the total time elapsed in fetching messages across batches;
+    // reset in fetchAndIndexMessages whenever a batch starts at offset 0.
+    // NOTE(review): static and mutable, so the value is shared by every instance of this class -- confirm that is intended
+    static long fetchStartTime = System.currentTimeMillis();
+
+    /**
+     * fetch given message idx's in given folder -- @performance critical
+     *
+     * @param offset - the original offset of the first message in the messages array, important to initialize
+     *               for proper assignment of unique id or doc Id
+     */
+    //private void fetchUncachedMessages(String sanitizedFName, Folder folder, DocCache cache, List<Integer> msgIdxs) throws MessagingException, FileNotFoundException, IOException, GeneralSecurityException {
+    private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset, int totalMessages) throws MessagingException, IOException, GeneralSecurityException {
+        //mark the processing of new batch
+        if (offset == 0)
+            fetchStartTime = System.currentTimeMillis();
+
+        currentStatus = JSONUtils.getStatusJSON((emailStore instanceof MboxEmailStore) ? "Parsing " + folder.getName() + " (can take a while)..." : "Reading " + folder.getName() + "...");
+
+        // bulk fetch of all message headers
+        int n = messages.length;
+
+        // eliminate any messages the archive already has
+        messages = removeMessagesAlreadyInArchive(archive, messages);
+
+        log.info(n - messages.length + " message(s) already in the archive");
+
+        ArrayList<EmailDocument> emails = new ArrayList<EmailDocument>();
+
+        // for performance, we need to do bulk prefetches, instead of fetching 1 message at a time
+        // prefetchedMessages will be a temp cache of prefetched messages
+        int first_i_prefetched = -1, last_i_prefetched = -1;
+        List<?> prefetchedMessages = null; // the type of this can be either list<string> if text only, otherwise list<mimemmessage>
+
+        long highestUID = archive.getLastUIDForFolder(fetchedFolderInfo.accountKey, fetchedFolderInfo.longName);
+        long lastAssignedUID = highestUID;
+        boolean bodyTextOnly = !fetchConfig.downloadAttachments;
+        try {
+            archive.openForWrite();
+            for (int i = 0; i < messages.length; i++) {
+                // critical step: (thanks, yourkit!)
+                // null out the ref to the previous message, otherwise it stays in memory, and the heap effectively needs to be as big as the size of all messages
+                if (i > 0)
+                    messages[i - 1] = null;
+
+                if (isCancelled)
+                    break;
+
+                Message m = messages[i];
+                MimeMessage mm = (MimeMessage) m;
+
+                if (i >= last_i_prefetched) {
+                    // critical perf. step: do a bulk imap prefetch
+                    // the prefetch will fetch as many messages as possible up to a max buffer size, and return the messages prefetched
+                    // last_i_prefetched tracks what is the last index into idxs that we have prefetched.
+                    // when we run out of prefetched messages, we do another bulk prefetch
+
+                    prefetchedMessages = do_imap_prefetch(messages, i, folder, bodyTextOnly);
+                    if (prefetchedMessages != null) {
+                        first_i_prefetched = i;
+                        last_i_prefetched = i + prefetchedMessages.size();
+                    }
+                }
+
+                int pctDone = ((i + offset) * 100) / totalMessages;
+                long elapsedMillis = System.currentTimeMillis() - fetchStartTime;
+                long unprocessedSecs = Util.getUnprocessedMessage(i + offset, totalMessages, elapsedMillis);
+                int N_TEASERS = 50; // 50 ok here, because it takes a long time to fetch and process messages, so teaser computation is relatively not expensive
+                int nTriesForThisMessage = 0;
+                currentStatus = getStatusJSONWithTeasers("Reading " + Util.commatize(totalMessages) + " messages from " + folder.getName() + "...", pctDone, elapsedMillis / 1000, unprocessedSecs, emails, N_TEASERS);
+
+                int messageNum = mm.getMessageNumber();
+
+                try {
+                    long unique_id;
+
+                    // if we have uid, that's even better
+                    // don't use uid's for mbox, it has a bug and always gives -1
+                    // see http://james.apache.org/server/rfclist/imap4/rfc2060.txt for uid spec
+                    if (folder instanceof UIDFolder && !(emailStore instanceof MboxEmailStore)) {
+                        long uid = ((UIDFolder) folder).getUID(m);
+                        unique_id = uid;
+                    } else
+                        unique_id = lastAssignedUID + 1 + i + offset; // +1 since i starts from 0 (but lastAssignedUID can be -1 -- is that safe? -sgh)
+
+                    if (unique_id > highestUID)
+                        highestUID = unique_id;
+
+                    String unique_id_as_string = Long.toString(unique_id);
+
+                    // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive
+                    // not a serious perf. concern now, but revisit if needed
+                    EmailDocument ed = convertToEmailDocument(mm, unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc.
+                    // need to check this again, because there might be duplicates such within the set we are currently processing.
+                    if (archive.containsDoc(ed)) {
+                        stats.nMessagesAlreadyPresent++;
+                        dataErrors.add("Duplicate message: " + ed); // note: report.jsp depends on this specific string
+                        continue;
+                    }
+
+                    MimeMessage originalMessage = mm; // this is the mm that has all the headers etc.
+                    List<Blob> attachmentsList = new ArrayList<Blob>();
+
+                    // if we already have it prefetched, use the prefetched version
+                    List<String> contents = null;
+
+                    if (first_i_prefetched >= 0 && prefetchedMessages != null) {
+                        if (!fetchConfig.downloadAttachments) {
+                            // text only means the prefetchedMessages are stored directly as a list of strings
+                            String content = (String) prefetchedMessages.get(i - first_i_prefetched); // note: this_mm only has the prefetched content, but not the headers
+                            contents = new ArrayList<String>();
+
+                            try {
+                                // a special for yahoo which routinely uses quoted-printable. content looks like  =0A0D.... = etc.
+                                if (mm.isMimeType("multipart/alternative")) {
+                                    Multipart mm_mp = (Multipart) mm.getContent();
+                                    Part p0 = mm_mp.getBodyPart(0);
+                                    if (p0 instanceof com.sun.mail.imap.IMAPBodyPart) {
+                                        String encoding = ((com.sun.mail.imap.IMAPBodyPart) p0).getEncoding();
+                                        if ("quoted-printable".equals(encoding)) {
+                                            content = new String(Util.getBytesFromStream(javax.mail.internet.MimeUtility.decode(new java.io.ByteArrayInputStream(content.getBytes()), "quoted-printable")));
+                                        }
+                                    }
+                                }
+                            } catch (Exception e) {
+                                Util.print_exception("Error trying to parse encoding of multipart", e, log);
+                            }
+
+                            contents.add(content);
+                        } else {
+                            // subtle issue here: the contentType of the prefetchedMessage needs to be be set to the original_mm's content-type.
+                            // this was found for cases where the original message is multipart-alternative with a text and html part.
+                            // if we don't set prefetchedMessage's content type, it gets a mime type of text/plain and a body = the entire multipart including both parts.
+                            // found on sgh's sent mail w/subject: "text to add in help" from  Fri, 7 Jun 2013
+                            MimeMessage prefetchedMessage = (MimeMessage) prefetchedMessages.get(i - first_i_prefetched);
+                            String contentTypeHeaders[] = originalMessage.getHeader("Content-Type");
+                            String contentTypeHeader = null;
+                            if (contentTypeHeaders != null && contentTypeHeaders.length == 1)
+                                contentTypeHeader = contentTypeHeaders[0];
+
+                            if (!Util.nullOrEmpty(contentTypeHeader)) // we do care about body structure, hang on to it
+                                prefetchedMessage.setHeader("Content-Type", contentTypeHeader);
+                            mm = prefetchedMessage;
+                        }
+                        prefetchedMessages.set(i - first_i_prefetched, null); // null out to save memory
+                    }
+
+                    if (contents == null)
+                        contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList);
+
+                    // if mm is not prefetched, it is the same as original_mm
+                    // will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version
+                    // even when prefetched, the processMessagePart is somewhat expensive because the attachments have to be extracted etc.
+
+                    // we could overlap processMessagePart with do_imap_prefetch by prefetching in a separate thread, since prefetch is network limited.
+                    // but profiling shows processMessagePart takes only 1/4th the time of do_imap_prefetch so overlapping would be a relatively small gain.
+                    // not worth the effort right now.
+                    ed.attachments = attachmentsList;
+                    if (fetchConfig.downloadAttachments)
+                        ed.attachmentsYetToBeDownloaded = false; // we've already downloaded our attachments
+
+                    // concat all the contents parts
+                    StringBuilder sb = new StringBuilder();
+                    for (String s : contents) {
+                        sb.append(s);
+                        sb.append("\n");
+                    }
+
+                    String contentStr = sb.toString();
+                    if (!messageLooksOk(contentStr)) {
+                        dataErrors.add("Skipping message as it seems to have very long words: " + ed);
+                        continue;
+                    }
+                    contentStr = IndexUtils.normalizeNewlines(contentStr); // just get rid of \r's
+
+                    archive.addDoc(ed, contentStr);
+
+                    List<LinkInfo> linkList = new ArrayList<LinkInfo>();
+                    // linkList might be used only for slant
+                    IndexUtils.populateDocLinks(ed, contentStr, linkList, true);
+                    ed.links = linkList;
+                    stats.nMessagesAdded++;
+                } catch (Exception ex) {
+                    // sometimes we get unexpected folder closed, so try again
+                    boolean retry = false;
+                    if (ex instanceof javax.mail.FolderClosedException) {
+                        log.warn("Oops, thread " + threadID + " got the folder closed in its face! " + ex.getMessage());
+
+                        // sometimes we get this exception about folder closed
+                        // retry up to 3 times, then give up
+                        if (nTriesForThisMessage < 3) {
+                            retry = true;
+                            log.info("Re-opening email store; attempt #" + (nTriesForThisMessage + 1) + " for message " + i);
+                            nTriesForThisMessage++;
+                            messages = openFolderAndGetMessages();
+                            fetchHeaders(messages);
+                            --i; // adjust the message index n try again
+                        }
+                    }
+
+                    if (!retry) {
+                        // we sometimes see UnsupportedEncodingException with x-utf8utf8 mime type and ParseException
+                        // nothing much can be done, just create a dummy doc and add it to the cache
+                        nErrors++;
+                        stats.nErrors++;
+                        EmailDocument ed = new EmailDocument(Integer.toString(messageNum));
+                        log.warn("Exception reading message from " + folder_name() + " Message #" + messageNum + " " + ex.getMessage() + "\n" + Util.stackTrace(ex));
+
+                        ed.setErrorString(Util.stackTrace(ex));
+                    }
+                }
+            }
+        } catch (Throwable t) {
+            Util.print_exception(t, log);
+        } finally {
+            //				if (cancelled && false) // TODO: disable for now as currently only indexes are rolled back and allDocs/blobs are not rolled back in sync yet
+            //					archive.rollbackIndexWrites();
+            //				else
+            currentStatus = JSONUtils.getStatusJSON("Saving archive...");
+            archive.close();
+        }
+
+        fetchedFolderInfo.lastSeenUID = highestUID;
+        log.info("at end of fetch, folder info is " + fetchedFolderInfo);
+
+        log.info("emailfetcher thread completed, archive has " + archive.getAllDocs().size() + " docs");
+    }
+
+    public FolderInfo getFetchedFolderInfo() { // folder bookkeeping of this fetcher; its lastSeenUID is assigned at the end of a fetch
+        return this.fetchedFolderInfo;
+    }
+
+    /** Connects to the email store, looks up the folder to fetch and reports how many messages it holds (0 when no folder is returned). */
+    private int openFolderAndGetMessageCount() throws MessagingException {
+        folder = null; // drop any stale handle before (re)connecting
+        store = emailStore.connect();
+        folder = emailStore.get_folder(store, folder_name());
+
+        // an unavailable folder is reported as empty rather than as an error
+        final int messageCount = (folder == null) ? 0 : folder.getMessageCount();
+        return messageCount;
+    }
+
+    /**
+     * Returns the messages to be fetched from the folder, calling openFolderAndGetMessageCount() first if no folder is set yet.
+     * Selection: an active server-side filter wins; otherwise, if the whole folder is requested from a (non-mbox) UIDFolder the archive has seen before, only UIDs after the last seen one;
+     * otherwise the messages numbered [begin_msg_index, end_msg_index) (1-based, end exclusive), or the whole folder when that range is unusable.
+     * Comment by @vihari: begin/end idx are used both for uid filtering and Mbox message indexing, which makes this code hard to predict.
+     * @return the selected messages, or null if no folder could be obtained
+     */
+    private Message[] openFolderAndGetMessages() throws MessagingException {
+        if (folder == null)
+            openFolderAndGetMessageCount();
+
+        Message[] messages = null;
+        if (folder == null)
+            return messages;
+
+        String descr = emailStore.getAccountID() + ":" + folder;
+        boolean haveUID = false;
+        int count = folder.getMessageCount();
+        use_uid_if_available = (begin_msg_index == 1 && end_msg_index == count + 1);
+        log.info("use_uid_if_available is set to " + use_uid_if_available);
+
+        if (fetchConfig.filter != null && fetchConfig.filter.isActive()) {
+            log.info("Issuing server side filters for " + fetchConfig.filter);
+            boolean useReceivedDateTerms = descr.indexOf("yahoo.com") >= 0;
+            messages = folder.search(fetchConfig.filter.convertToSearchTerm(useReceivedDateTerms));
+        } else {
+            // mbox provider claims to provide UIDFolder but the uids are bogus so we treat mboxemailstore folders as not uidfolders
+            boolean is_uid_folder = (folder instanceof UIDFolder) && !(emailStore instanceof MboxEmailStore);
+
+            if (use_uid_if_available && is_uid_folder) {
+                // for uidfolders, we want to update the last seen uid in the FolderInfo
+                long uid = archive.getLastUIDForFolder(emailStore.getAccountID(), folder_name());
+                if (uid > 0) {
+                    messages = ((UIDFolder) folder).getMessagesByUID(uid + 1, UIDFolder.LASTUID);
+                    log.info("Archive has already seen this folder: " + descr + " will only fetch messages from uid " + uid + " onwards, " + messages.length + " messages will be incrementally fetched");
+                    haveUID = true;
+                } else
+                    log.info(descr + " is a UIDFolder but not seen before");
+            } else
+                log.info(descr + " is not a UIDFolder");
+
+            if (!haveUID) {
+                log.info("All " + count + " messages in " + descr + " will be fetched");
+                // if there are 8 messages, count = 8, end_msg_index will be 9
+                boolean rangeGiven = begin_msg_index > 0 && end_msg_index > 0;
+                if (rangeGiven && end_msg_index <= count + 1) {
+                    // we have to use only specified messages; folder.getMessage() takes 1-based message numbers
+                    int nMessages = end_msg_index - begin_msg_index;
+                    Message[] newMessages = new Message[nMessages];
+                    for (int i = 0; i < nMessages; i++)
+                        newMessages[i] = folder.getMessage(begin_msg_index + i);
+                    log.info("total # of messages: " + count + " reduced # of messages: " + newMessages.length);
+                    messages = newMessages;
+                } else { // no usable range: use the full messages instead of leaving messages null (callers dereference it)
+                    if (rangeGiven)
+                        log.warn("Warning: bad end_msg_index " + end_msg_index + " count = " + count);
+                    messages = folder.getMessages();
+                }
+            }
+        }
+
+        return messages;
+    }
+
+    /**
+     * Main fetch+index method; every Throwable is caught and logged here, and the folder and store are closed on exit.
+     * The assumption that the heap is big enough to fit all the messages in the folder is not scalable for larger archives.
+     * Instead, mbox folders are read in batches of BATCH messages; other stores hand the whole message array to the indexer at once.
+     * fetchHeaders may be penalised due to multiple requests of fetch?
+     * In order to make indexing of large archives possible, fetch of non-MboxEmailStore formats is penalised. It is possible to avoid this by handling Mbox and IMAP/POP formats differently.
+     */
+    public void run() {
+        currentStatus = JSONUtils.getStatusJSON("Starting to process " + folder_name());
+
+        isCancelled = false;
+        Thread.currentThread().setName("EmailFetcher");
+        nErrors = 0;
+        // use_uid is set only if we are reading the whole folder. otherwise we won't use it, and we won't update the highest UID seen for the folder in the archive.
+        try {
+            int nMessages = openFolderAndGetMessageCount();
+            log.info("Total number of messages: " + nMessages);
+
+            if (emailStore instanceof MboxEmailStore) {
+                // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages()
+                // so we process in batches
+                //TODO: Ideally, should cap on buffer size rather than on number of messages.
+                final int BATCH = 10000;
+                int nbatches = nMessages / BATCH;
+                nMessagesProcessedSuccess = 0;
+                long st = System.currentTimeMillis();
+                int b;
+                for (b = 0; b < nbatches + 1; b++) {
+                    begin_msg_index = b * BATCH + 1;
+                    end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1;
+                    log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages);
+                    Message[] messages = openFolderAndGetMessages();
+                    currentStatus = JSONUtils.getStatusJSON("");
+                    if (isCancelled)
+                        return;
+
+                    // openFolderAndGetMessages() can return null (e.g. no folder available), so guard before dereferencing
+                    if (messages != null && messages.length > 0) {
+                        try {
+                            if (fetchConfig.downloadMessages) {
+                                log.info(nMessages + " messages will be fetched for indexing");
+                                fetchAndIndexMessages(folder, messages, begin_msg_index, nMessages);
+                            } else {
+                                // this is for memory test screening mode.
+                                // we create a dummy archive without any real contents
+                                // messages holds only the current batch, so the loop is bounded by its length, not by nMessages
+                                for (int i = 0; i < messages.length; i++) {
+                                    // id is the 0-based position within the whole folder, so ids stay distinct across batches
+                                    String unique_id_as_string = Long.toString(begin_msg_index - 1 + i);
+                                    EmailDocument ed = convertToEmailDocument((MimeMessage) messages[i], unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc.
+                                    archive.addDocWithoutContents(ed);
+                                }
+                            }
+                        } catch (Exception e) {
+                            log.error("Exception trying to fetch messages, results will be incomplete! " + e + "\n" + Util.stackTrace(e));
+                        }
+                    }
+                    log.info("Fetch stats for this fetcher thread: " + stats);
+                }
+                log.info("Read #" + nMessages + " messages in #" + b + " batches of size: " + BATCH + " in " + (System.currentTimeMillis() - st) + "ms");
+            } else {
+                // IMAP etc are pretty efficient with lazily populating message objects, so unlike mbox, its ok to use openFolderAndGetMessages() on the entire folder.
+                // remember to init the begin/end_msg_index before calling openFolderAndGetMessages
+                begin_msg_index = 1;
+                end_msg_index = nMessages + 1;
+                nMessagesProcessedSuccess = 0;
+                Message[] messages = openFolderAndGetMessages();
+
+                long st = System.currentTimeMillis();
+                currentStatus = JSONUtils.getStatusJSON("");
+                if (isCancelled)
+                    return;
+
+                if (messages != null && messages.length > 0) {
+                    try {
+                        fetchHeaders(messages); // always fetch headers
+                        if (fetchConfig.downloadMessages) {
+                            log.info(nMessages + " messages will be fetched for indexing");
+                            //we process all the messages together here unlike the case of mstor
+                            //hence the begin index is always 0
+                            fetchAndIndexMessages(folder, messages, 0, messages.length);
+                        } else {
+                            // this is for memory test screening mode.
+                            // we create a dummy archive without any real contents
+                            for (int i = 0; i < nMessages && i < messages.length; i++) {
+                                String unique_id_as_string = Long.toString(i);
+                                EmailDocument ed = convertToEmailDocument((MimeMessage) messages[i], unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc.
+                                archive.addDocWithoutContents(ed);
+                            }
+                        }
+                    } catch (Exception e) {
+                        Util.print_exception("Exception trying to fetch messages, results will be incomplete! ", e, log);
+                    }
+                }
+                log.info("Read #" + nMessages + " messages in  in " + (System.currentTimeMillis() - st) + "ms");
+            }
+        } catch (Throwable t) {
+            if (t instanceof OutOfMemoryError)
+                this.mayHaveRunOutOfMemory = true;
+            // this is important, because there could be an out of memory etc over here.
+            Util.print_exception(t, log);
+        } finally {
+            // close folder and store in separate try blocks so that a failure closing the folder cannot leave the store open
+            try {
+                if (folder != null && folder.isOpen())
+                    folder.close(false);
+            } catch (Exception e) {
+                Util.print_exception(e);
+            }
+            try {
+                if (store != null)
+                    store.close();
+            } catch (Exception e) {
+                Util.print_exception(e);
+            }
+        }
+    }
+
+	/*
+	 * code for handling other kinds of headers, e.g. to find location of the
+	 * message -- not used right now, but may use in the future.
+	 * public void processHeaders(MimeMessage m) throws Exception
+	 * {
+	 * Address[] froms = m.getFrom();
+	 * if (froms == null)
+	 * return;
+	 * InternetAddress a = (InternetAddress) froms[0];
+	 * ContactInfo ci = addressBook.getContactInfoForAddress(a);
+	 * Enumeration<Header> e = (Enumeration<Header>) m.getAllHeaders();
+	 * String lastReceivedHeader = null;
+	 * while (e.hasMoreElements())
+	 * {
+	 * Header h = e.nextElement();
+	 * String n = h.getName();
+	 * String v = h.getValue();
+	 * // log.info ("header: " + n + " = " + n);
+	 * String s = n.toLowerCase();
+	 * if ("x-mailer".equals(s) || "user-agent".equals(s))
+	 * {
+	 * log.warn (m.getFrom()[0] + " --> " + n + " " + v);
+	 * ci.addMailer(v);
+	 * }
+	 * if ("x-originating-ip".equals(s) || "x-yahoo-post-ip".equals(s))
+	 * {
+	 * log.warn (m.getFrom()[0] + " --> " + n + " " + v);
+	 * ci.addIPAddr(v);
+	 * }
+	 * if ("x-yahoo-profile".equals(s))
+	 * log.warn (m.getFrom()[0] + " --> " + n + " " + v);
+	 * if ("message-id".equals(s))
+	 * {
+	 * log.warn("messageID = " + v);
+	 * ci.addMessageID(v);
+	 * }
+	 * if ("received".equals(s) || "x-received".equals(s))
+	 * {
+	 * lastReceivedHeader = v;
+	 * }
+	 * }
+	 * 
+	 * // sometimes the headers have an extra ctrl-m at the end, strip it if
+	 * this is the case.
+	 * if (lastReceivedHeader != null && lastReceivedHeader.endsWith("\r"))
+	 * lastReceivedHeader = lastReceivedHeader.substring(0,
+	 * lastReceivedHeader.length()-1);
+	 * 
+	 * ci.addLastReceivedHeader(lastReceivedHeader);
+	 * 
+	 * String from = froms[0].toString();
+	 * 
+	 * log.info (from + " lastReceived " + lastReceivedHeader);
+	 * if (lastReceivedHeader == null)
+	 * log.warn ("WARNING: " + from + " --> no received header!?");
+	 * else
+	 * {
+	 * String ipAddrStr = processLastReceived(lastReceivedHeader);
+	 * if (ipAddrStr != null)
+	 * {
+	 * byte[] ipAddrBytes = Util.parseIPAddress(ipAddrStr);
+	 * if (ipAddrBytes != null)
+	 * {
+	 * // InetAddress ipAddr = InetAddress.getByAddress(ipAddrBytes);
+	 * // log.info ("Received: " + locationService.lookupLocation(ipAddr));
+	 * }
+	 * }
+	 * }
+	 * }
+	 */
+
+    public String toString() { // string form of this fetcher; delegates entirely to Util.fieldsToString
+        return Util.fieldsToString(this);
+    }
+}
diff --git a/src/java/edu/stanford/muse/webapp/EmailRenderer.java b/src/java/edu/stanford/muse/webapp/EmailRenderer.java
index 6ffa3b2..b9bf797 100755
--- a/src/java/edu/stanford/muse/webapp/EmailRenderer.java
+++ b/src/java/edu/stanford/muse/webapp/EmailRenderer.java
@@ -1,536 +1,536 @@
-package edu.stanford.muse.webapp;
-
-import java.io.IOException;
-import java.util.*;
-
-import javax.mail.Address;
-import javax.mail.internet.InternetAddress;
-
-import edu.stanford.muse.datacache.Blob;
-import edu.stanford.muse.datacache.BlobStore;
-import edu.stanford.muse.email.AddressBook;
-import edu.stanford.muse.email.Contact;
-import edu.stanford.muse.groups.SimilarGroup;
-import edu.stanford.muse.index.*;
-import edu.stanford.muse.ner.model.NEType;
-import edu.stanford.muse.util.Pair;
-import edu.stanford.muse.util.Span;
-import edu.stanford.muse.util.Util;
-
-/** This class has util methods to display an email message in an html page */
-
-public class EmailRenderer {
-
-	static final int	TEXT_WRAP_WIDTH	= 80;	// used to be 80, but that wraps
-												// around too soon. 120 is too
-												// much with courier font.
-
-    public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
-                                                          Set<String> highlightTerms)
-            throws Exception{
-        return pagesForDocuments(ds, archive, datasetTitle, null, highlightTerms, null, MultiDoc.ClusteringType.MONTHLY);
-    }
-
-    public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
-                                                          Set<String> highlightTerms, Collection<Blob> highlightAttachments)
-            throws Exception{
-        return pagesForDocuments(ds, archive, datasetTitle, null, highlightTerms, highlightAttachments, MultiDoc.ClusteringType.MONTHLY);
-    }
-
-    public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
-														  Set<Integer> highlightContactIds, Set<String> highlightTerms)
-			throws Exception{
-		return pagesForDocuments(ds, archive, datasetTitle, highlightContactIds, highlightTerms, null, MultiDoc.ClusteringType.MONTHLY);
-	}
-
-	public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
-                                                          Set<Integer> highlightContactIds, Set<String> highlightTerms, Collection<Blob> highlightAttachments)
-            throws Exception{
-        return pagesForDocuments(ds, archive, datasetTitle, highlightContactIds, highlightTerms, highlightAttachments, MultiDoc.ClusteringType.MONTHLY);
-    }
-
-	/*
-	 * returns pages and html for a collection of docs, which can be put into a
-	 * jog frame. indexer clusters are used to
-	 *
-	 * Changed the first arg type from: Collection<? extends EmailDocument> to Collection<Document>, as we get Collection<Document> in browse page or from docsforquery, its a hassle to make them all return EmailDocument
-	 * especially when no other document type is used anywhere
-	 */
-	public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
-			Set<Integer> highlightContactIds, Set<String> highlightTerms, Collection<Blob> highlightAttachments, MultiDoc.ClusteringType coptions)
-			throws Exception
-	{
-		StringBuilder html = new StringBuilder();
-		int pageNum = 0;
-		List<String> pages = new ArrayList<String>();
-
-		// need clusters which map to sections in the browsing interface
-		List<MultiDoc> clusters;
-
-        // indexer may or may not have indexed all the docs in ds
-		// if it has, use its clustering (could be yearly or monthly or category
-		// wise)
-		// if (indexer != null && indexer.clustersIncludeAllDocs(ds))
-		// if (indexer != null)
-		clusters = archive.clustersForDocs(ds, coptions);
-		/*
-		 * else { // categorize by month if the docs have dates if
-		 * (EmailUtils.allDocsAreDatedDocs(ds)) clusters =
-		 * IndexUtils.partitionDocsByInterval(new ArrayList<DatedDocument>((Set)
-		 * ds), true); else // must be category docs clusters =
-		 * CategoryDocument.clustersDocsByCategoryName((Collection) ds); }
-		 */
-
-		List<Document> datasetDocs = new ArrayList<>();
-
-		// we build up a hierarchy of <section, document, page>
-		for (MultiDoc md : clusters)
-		{
-			if (md.docs.size() == 0)
-				continue;
-
-			String description = md.description;
-			description = description.replace("\"", "\\\""); // escape a double
-																// quote if any
-																// in the
-																// description
-			html.append("<div class=\"section\" name=\"" + description + "\">\n");
-
-			List<List<String>> clusterResult = new ArrayList<>();
-
-			for (Document d : md.docs)
-			{
-				String pdfAttrib = "";
-				/*
-				 * if (d instanceof PDFDocument) pdfAttrib = "pdfLink=\"" +
-				 * ((PDFDocument) d).relativeURLForPDF + "\"";
-				 */
-				html.append("<div class=\"document\" " + pdfAttrib + ">\n");
-
-				datasetDocs.add(d);
-				pages.add(null);
-				clusterResult.add(null);
-				// clusterResult.add(docPageList);
-				// for (String s: docPageList)
-				{
-					String comment = Util.escapeHTML(d.comment);
-					html.append("<div class=\"page\"");
-					if (!Util.nullOrEmpty(comment))
-						html.append(" comment=\"" + comment + "\"");
-
-					if (!Util.nullOrEmpty(comment) && (d instanceof EmailDocument))
-					{
-						String messageId = d.getUniqueId();
-						html.append(" messageID=\"" + messageId + "\"");
-					}
-
-					if (d.isLiked())
-						html.append(" liked=\"true\"");
-					if (d instanceof EmailDocument && ((EmailDocument) d).doNotTransfer)
-						html.append(" doNotTransfer=\"true\"");
-					if (d instanceof EmailDocument && ((EmailDocument) d).transferWithRestrictions)
-						html.append(" transferWithRestrictions=\"true\"");
-					if (d instanceof EmailDocument && ((EmailDocument) d).reviewed)
-						html.append(" reviewed=\"true\"");
-					if (d instanceof EmailDocument && ((EmailDocument) d).addedToCart)
-						html.append(" addToCart=\"true\"");
-					if (d instanceof EmailDocument)
-						html.append(" pageId='" + pageNum++ + "' " + " signature='" + Util.hash (((EmailDocument) d).getSignature()) + "' docId='" + d.getUniqueId() + "'></div>\n");
-				}
-
-				html.append("</div>"); // document
-			}
-			html.append("</div>\n"); // section
-		}
-
-		DataSet dataset = new DataSet(datasetDocs, archive, datasetTitle, highlightContactIds, highlightTerms, highlightAttachments);
-
-		return new Pair<>(dataset, html.toString());
-	}
-
-	/**
-	 * format given addresses as comma separated html, linewrap after given
-	 * number of chars
-	 * 
-	 * @param addressBook
-	 */
-	public static String formatAddressesAsHTML(Address addrs[], AddressBook addressBook, int lineWrap, Set<String> highlightUnstemmed, Set<String> highlightNames, Set<String> highlightAddresses)
-	{
-		StringBuilder sb = new StringBuilder();
-		int outputLineLength = 0;
-		for (int i = 0; i < addrs.length; i++)
-		{
-			String thisAddrStr;
-
-			Address a = addrs[i];
-			if (a instanceof InternetAddress)
-			{
-				InternetAddress ia = (InternetAddress) a;
-				Pair<String, String> p = JSPHelper.getNameAndURL((InternetAddress) a, addressBook);
-				String url = p.getSecond();
-				String str = ia.toString();
-                String addr = ia.getAddress();
-                boolean match = false;
-                if(str!=null) {
-                    //The goal here is to explain why a doc is selected and hence we should replicate Lucene doc selection and Lucene is case insensitive most of the times
-                    String lc = str.toLowerCase();
-                    if (highlightUnstemmed != null)
-                        for (String hs : highlightUnstemmed) {
-                            String hlc = hs.toLowerCase().replaceAll("^\\W+|\\W+$","");
-                            if (lc.contains(hlc)) {
-                                match = true;
-                                break;
-                            }
-                        }
-                    if (!match && highlightNames != null)
-                        for (String hn : highlightNames) {
-                            String hlc = hn.toLowerCase().replaceAll("^\\W+|\\W+$","");
-                            if (lc.contains(hlc)) {
-                                match = true;
-                                break;
-                            }
-                        }
-                }
-                if(addr!=null){
-                    if (!match && highlightAddresses != null)
-                        for (String ha : highlightAddresses)
-                            if (addr.contains(ha)) {
-                                match = true;
-                                break;
-                            }
-                }
-
-                if(match)
-                    thisAddrStr = ("<a href=\"" + url + "\"><span class=\"hilitedTerm rounded\">" + Util.escapeHTML(str) + "</span></a>");
-                else
-                    thisAddrStr = ("<a href=\"" + url + "\">" + Util.escapeHTML(str) + "</a>");
-
-				if (str != null)
-	                outputLineLength += str.length();
-			}
-			else
-			{
-				String str = a.toString();
-				thisAddrStr = str;
-				outputLineLength += str.length();
-                JSPHelper.log.warn("Address is not an instance of InternetAddress - is of instance: "+a.getClass().getName() + ", highlighting won't work.");
-			}
-
-			if (i + 1 < addrs.length)
-				outputLineLength += 2; // +2 for the comma that will follow...
-
-			if (outputLineLength + 2 > lineWrap)
-			{
-				sb.append("<br/>\n");
-				outputLineLength = 0;
-			}
-			sb.append(thisAddrStr);
-			if (i + 1 < addrs.length)
-				sb.append(", ");
-		}
-
-		return sb.toString();
-	}
-
-	/**
-	 * returns a string for documents.
-	 * 
-	 * @param highlightAttachments
-	 * @throws Exception
-	 */
-    //TODO: inFull, debug params can be removed
-    //TODO: Consider a HighlighterOptions class
-	public static Pair<String, Boolean> htmlForDocument(Document d, Archive archive, String datasetTitle, BlobStore attachmentsStore,
-			Boolean sensitive, Set<Integer> highlightContactIds, Set<String> highlightTerms, Set<Blob> highlightAttachments, Map<String, Map<String, Short>> authorisedEntities,
-			boolean IA_links, boolean inFull, boolean debug) throws Exception
-	{
-		JSPHelper.log.debug("Generating HTML for document: " + d);
-		EmailDocument ed = null;
-		String html = null;
-		boolean overflow = false;
-		if (d instanceof EmailDocument)
-		{
-			// for email docs, 1 doc = 1 page
-			ed = (EmailDocument) d;
-			StringBuilder page = new StringBuilder();
-			page.append("<div class=\"muse-doc\">\n");
-
-			page.append("<div class=\"muse-doc-header\">\n");
-			page.append(EmailRenderer.getHTMLForHeader(archive, ed, sensitive, highlightContactIds, highlightTerms, IA_links, debug));
-			page.append("</div>"); // muse-doc-header
-
-			/*
-			 * Map<String, List<String>> sentimentMap =
-			 * indexer.getSentiments(ed); for (String emotion:
-			 * sentimentMap.keySet()) { page.append ("<b>" + emotion +
-			 * "</b>: "); for (String word: sentimentMap.get(emotion))
-			 * page.append (word + " "); page.append ("<br/>\n");
-			 * page.append("<br/>\n"); }
-			 */
-			page.append("\n<div class=\"muse-doc-body\">\n");
-			Pair<StringBuilder, Boolean> contentsHtml = archive.getHTMLForContents(d, ((EmailDocument) d).getDate(), d.getUniqueId(), sensitive, highlightTerms,
-					authorisedEntities, IA_links, inFull, true);
-
-			StringBuilder htmlMessageBody = contentsHtml.first;
-			overflow = contentsHtml.second;
-			// page.append(ed.getHTMLForContents(indexer, highlightTermsStemmed,
-			// highlightTermsUnstemmed, IA_links));
-			page.append(htmlMessageBody);
-			page.append("\n</div> <!-- .muse-doc-body -->\n"); // muse-doc-body
-
-			// page.append("\n<hr class=\"end-of-browse-contents-line\"/>\n");
-			List<Blob> attachments = ed.attachments;
-			if (attachments != null && attachments.size() > 0)
-			{
-				// show thumbnails of all the attachments
-
-				if (ModeConfig.isPublicMode()) {
-					page.append(attachments.size() + " attachment" + (attachments.size() == 1 ? "" : "s") + ".");
-				} else {
-					page.append("<hr/>\n<div class=\"attachments\">\n");
-					page.append("<table>\n");
-					int i = 0;
-					for (; i < attachments.size(); i++)
-					{
-						if (i % 4 == 0)
-							page.append((i == 0) ? "<tr>\n" : "</tr><tr>\n");
-						page.append("<td>");
-
-						Blob attachment = attachments.get(i);
-						String thumbnailURL = null, attachmentURL = null;
-						boolean is_image = Util.is_image_filename(attachment.filename);
-
-						if (attachmentsStore != null)
-						{
-							String contentFileDataStoreURL = attachmentsStore.get_URL(attachment);
-							attachmentURL = "serveAttachment.jsp?file=" + Util.URLtail(contentFileDataStoreURL);
-							String tnFileDataStoreURL = attachmentsStore.getViewURL(attachment, "tn");
-							if (tnFileDataStoreURL != null)
-								thumbnailURL = "serveAttachment.jsp?file=" + Util.URLtail(tnFileDataStoreURL);
-							else
-							{
-								if (attachment.is_image())
-									thumbnailURL = attachmentURL;
-								else
-									thumbnailURL = "images/sorry.png";
-							}
-						}
-						else
-							JSPHelper.log.warn("attachments store is null!");
-
-						// toString the filename in any case,
-						String s = attachment.filename;
-						// cap to a length of 25, otherwise the attachment name
-						// overflows the tn
-						String display = Util.ellipsize(s, 25);
-                        boolean highlight = highlightAttachments != null && highlightAttachments.contains(attachment);
-                        page.append("&nbsp;" + "<span title=\"" + Util.escapeHTML(s) + "\" class='" + (highlight?"highlight":"") + "'>"+ Util.escapeHTML(display) + "</span>&nbsp;");
-						page.append("<br/>");
-
-						String css_class = "attachment-preview" + (is_image ? " img" : "") + (highlight ? " highlight" : "");
-						String leader = "<img class=\"" + css_class + "\" ";
-
-						// punt on the thumbnail if the attachment tn or content
-						// URL is not found
-						if (thumbnailURL != null && attachmentURL != null)
-						{
-							// d.hashCode() is just something to identify this
-							// page/message
-							page.append("<a rel=\"page" + d.hashCode() + "\" title=\"" + attachment.filename + "\" class=\"" + (highlight?"highlight":"") + "\" href=\"" + attachmentURL + "\">");
-							page.append(leader + "href=\"" + attachmentURL + "\" src=\"" + thumbnailURL + "\"></img>\n");
-							page.append("<a>\n");
-						}
-						else
-						{
-							// page.append
-							// ("&nbsp;<br/>&nbsp;<br/>Not fetched<br/>&nbsp;<br/>&nbsp;&nbsp;&nbsp;");
-							// page.append("<a title=\"" + attachment.filename +
-							// "\" href=\"" + attachmentURL + "\">");
-							page.append(leader + "src=\"images/no-attachment.png\"></img>\n");
-							// page.append ("<a>\n");
-
-							if (thumbnailURL == null)
-								JSPHelper.log.info("No thumbnail for " + attachment);
-							if (attachmentURL == null)
-								JSPHelper.log.info("No attachment URL for " + attachment);
-						}
-						page.append("</td>\n");
-					}
-					if (i % 4 != 0)
-						page.append("</tr>");
-					page.append("</table>");
-					page.append("\n</div>  <!-- .muse-doc-attachments -->\n"); // muse-doc-attachments
-				}
-
-			}
-			page.append("\n</div>  <!-- .muse-doc -->\n"); // .muse-doc
-			html = page.toString();
-		}
-		else if (d instanceof DatedDocument)
-		{
-			/*
-			 * DatedDocument dd = (DatedDocument) d; StringBuilder page = new
-			 * StringBuilder();
-			 * 
-			 * page.append (dd.getHTMLForHeader()); // directly jam in contents
-			 * page.append ("<div class=\"muse-doc\">\n"); page.append
-			 * (dd.getHTMLForContents(indexer)); // directly jam in contents
-			 * page.append ("\n</div>"); // doc-contents return page.toString();
-			 */
-			html = "To be implemented";
-		}
-		else
-		{
-			JSPHelper.log.warn("Unsupported Document: " + d.getClass().getName());
-			html = "";
-		}
-
-		return new Pair<String, Boolean>(html, overflow);
-	}
-
-	/**
-	 * returns a HTML table string for the doc header
-	 * 
-	 * @param sensitive
-	 *            - when set will highlight any sensitive info in subject based
-	 *            on preset regexs
-	 * @throws IOException
-	 */
-	public static StringBuilder getHTMLForHeader(Archive archive, EmailDocument ed, Boolean sensitive, Set<Integer> highlightContactIds, Set<String> highlightTerms,
-			boolean IA_links, boolean debug) throws IOException
-	{
-		AddressBook addressBook = archive.addressBook;
-		GroupAssigner groupAssigner = archive.groupAssigner;
-        Set<String> contactNames = new LinkedHashSet<>();
-        Set<String> contactAddresses = new LinkedHashSet<>();
-        if(highlightContactIds!=null)
-            for(Integer hci: highlightContactIds) {
-                if(hci == null)
-                    continue;
-                Contact c = archive.addressBook.getContact(hci);
-                if(c==null)
-                    continue;
-                contactNames.addAll(c.names);
-                contactAddresses.addAll(c.emails);
-            }
-        contactNames.addAll(highlightTerms);
-
-		StringBuilder result = new StringBuilder();
-		// header table
-		result.append("<table class=\"docheader rounded\">\n");
-		// result.append
-		// ("<tr><td width=\"100px\" align=\"right\" class=\"muted\">Folder:</td><td>"
-		// + this.folderName + "</td></tr>\n");
-		if(debug)
-			result.append("<tr><td>docId: </td><td>"+ed.getUniqueId()+"</td></tr>\n");
-		result.append(JSPHelper.getHTMLForDate(ed.date));
-
-		final String style = "<tr><td align=\"right\" class=\"muted\" valign=\"top\">";
-
-		// email specific headers
-		result.append(style + "From: </td><td align=\"left\">");
-		Address[] addrs = ed.from;
-		if (addrs != null)
-		{
-			result.append(formatAddressesAsHTML(addrs, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses));
-		}
-
-		result.append(style + "To: </td><td align=\"left\">");
-		addrs = ed.to;
-		if (addrs != null)
-			result.append(formatAddressesAsHTML(addrs, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + "");
-
-		result.append("\n</td></tr>\n");
-
-		if (ed.cc != null && ed.cc.length > 0)
-		{
-			result.append(style + "Cc: </td><td align=\"left\">");
-			result.append(formatAddressesAsHTML(ed.cc, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + "");
-			result.append("\n</td></tr>\n");
-		}
-
-		if (ed.bcc != null && ed.bcc.length > 0)
-		{
-			result.append(style + "Bcc: </td><td align=\"left\">");
-			result.append(formatAddressesAsHTML(ed.bcc, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + "");
-			result.append("\n</td></tr>\n");
-		}
-
-		if (groupAssigner != null)
-		{
-			SimilarGroup<String> g = groupAssigner.getClosestGroup(ed);
-			if (g != null && g.size() > 1) // if its just a singleton group, no
-											// point explicitly listing a group
-											// line
-			{
-				String url = "browse?groupIdx=" + groupAssigner.getClosestGroupIdx(ed);
-				result.append(style + "Group: </td>\n");
-				result.append("<td align=\"left\">");
-				String description = g.elementsToString();
-				result.append("<span class=\"facet\" style=\"padding-left:2px;padding-right:2px\" onclick=\"javascript:window.open('" + url + "');\" title=\""
-						+ Util.escapeHTML(description) + "\">" + g.name + "</span></br>");
-				result.append("</td>\n</tr>\n");
-			}
-		}
-
-		String x = ed.description;
-		if (x == null)
-			x = "<None>";
-
-		result.append(style + "Subject: </td>");
-		// <pre> to escape special chars if any in the subject. max 70 chars in
-		// one line, otherwise spill to next line
-		result.append("<td align=\"left\"><b>");
-		x = DatedDocument.formatStringForMaxCharsPerLine(x, 70).toString();
-		if (x.endsWith("\n"))
-			x = x.substring(0, x.length() - 1);
-
-        Span[] names = archive.getAllNamesInDoc(ed, false);
-
-        // Contains all entities and id if it is authorised else null
-        Map<String, Entity> entitiesWithId = new HashMap<>();
-        //we annotate three specially recognized types
-        Map<Short,String> recMap = new HashMap<>();
-        recMap.put(NEType.Type.PERSON.getCode(),"cp");
-        recMap.put(NEType.Type.PLACE.getCode(),"cl");
-        recMap.put(NEType.Type.ORGANISATION.getCode(),"co");
-        Arrays.asList(names).stream().filter(n -> recMap.keySet().contains(NEType.getCoarseType(n.type).getCode()))
-                .forEach(n -> {
-                    Set<String> types = new HashSet<>();
-                    types.add(recMap.get(NEType.getCoarseType(n.type).getCode()));
-                    entitiesWithId.put(n.text, new Entity(n.text, null, types));
-                });
-
-        x = archive.annotate(x, ed.getDate(), ed.getUniqueId(), sensitive, highlightTerms, entitiesWithId, IA_links, false);
-
-		result.append(x);
-		result.append("</b>\n");
-		result.append("\n</td></tr>\n");
-		result.append ("\n" + style + "ID: " + "</td><td>" + Util.hash (ed.getSignature()) + "</td></tr>");
-		result.append("</table>\n"); // end docheader table
-
-		if (ModeConfig.isPublicMode())
-			return new StringBuilder(Util.maskEmailDomain(result.toString()));
-
-		return result;
-	}
-
-	/** I'm not sure what this is used for -- I think its used only for rendering HTML for the message. */
-    public static class Entity {
-        public Map<String, Short> ids;
-        //person,places,orgs, custom
-        public String name;
-        public Set<String> types = new HashSet<String>();
-
-        public Entity(String name, Map<String, Short> ids, Set<String> types) {
-            this.name = name;
-            this.ids = ids;
-            this.types = types;
-        }
-
-        @Override
-        public String toString() {
-            return types.toString();
-        }
-    }
-}
+package edu.stanford.muse.webapp;
+
+import java.io.IOException;
+import java.util.*;
+
+import javax.mail.Address;
+import javax.mail.internet.InternetAddress;
+
+import edu.stanford.muse.datacache.Blob;
+import edu.stanford.muse.datacache.BlobStore;
+import edu.stanford.muse.email.AddressBook;
+import edu.stanford.muse.email.Contact;
+import edu.stanford.muse.groups.SimilarGroup;
+import edu.stanford.muse.index.*;
+import edu.stanford.muse.ner.model.NEType;
+import edu.stanford.muse.util.Pair;
+import edu.stanford.muse.util.Span;
+import edu.stanford.muse.util.Util;
+
+/** This class has util methods to display an email message in an html page */
+
+public class EmailRenderer {
+
+	static final int	TEXT_WRAP_WIDTH	= 80;	// used to be 80, but that wraps
+												// around too soon. 120 is too
+												// much with courier font.
+
+    public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
+                                                          Set<String> highlightTerms)
+            throws Exception{
+        return pagesForDocuments(ds, archive, datasetTitle, null, highlightTerms, null, MultiDoc.ClusteringType.MONTHLY);
+    }
+
+    public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
+                                                          Set<String> highlightTerms, Collection<Blob> highlightAttachments)
+            throws Exception{
+        return pagesForDocuments(ds, archive, datasetTitle, null, highlightTerms, highlightAttachments, MultiDoc.ClusteringType.MONTHLY);
+    }
+
+    public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
+														  Set<Integer> highlightContactIds, Set<String> highlightTerms)
+			throws Exception{
+		return pagesForDocuments(ds, archive, datasetTitle, highlightContactIds, highlightTerms, null, MultiDoc.ClusteringType.MONTHLY);
+	}
+
+	public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
+                                                          Set<Integer> highlightContactIds, Set<String> highlightTerms, Collection<Blob> highlightAttachments)
+            throws Exception{
+        return pagesForDocuments(ds, archive, datasetTitle, highlightContactIds, highlightTerms, highlightAttachments, MultiDoc.ClusteringType.MONTHLY);
+    }
+
+	/*
+	 * returns pages and html for a collection of docs, which can be put into a
+	 * jog frame. indexer clusters are used to
+	 *
+	 * Changed the first arg type from: Collection<? extends EmailDocument> to Collection<Document>, as we get Collection<Document> in browse page or from docsforquery, its a hassle to make them all return EmailDocument
+	 * especially when no other document type is used anywhere
+	 */
+	public static Pair<DataSet, String> pagesForDocuments(Collection<Document> ds, Archive archive, String datasetTitle,
+			Set<Integer> highlightContactIds, Set<String> highlightTerms, Collection<Blob> highlightAttachments, MultiDoc.ClusteringType coptions)
+			throws Exception
+	{
+		StringBuilder html = new StringBuilder();
+		int pageNum = 0;
+		List<String> pages = new ArrayList<String>();
+
+		// need clusters which map to sections in the browsing interface
+		List<MultiDoc> clusters;
+
+        // indexer may or may not have indexed all the docs in ds
+		// if it has, use its clustering (could be yearly or monthly or category
+		// wise)
+		// if (indexer != null && indexer.clustersIncludeAllDocs(ds))
+		// if (indexer != null)
+		clusters = archive.clustersForDocs(ds, coptions);
+		/*
+		 * else { // categorize by month if the docs have dates if
+		 * (EmailUtils.allDocsAreDatedDocs(ds)) clusters =
+		 * IndexUtils.partitionDocsByInterval(new ArrayList<DatedDocument>((Set)
+		 * ds), true); else // must be category docs clusters =
+		 * CategoryDocument.clustersDocsByCategoryName((Collection) ds); }
+		 */
+
+		List<Document> datasetDocs = new ArrayList<>();
+
+		// we build up a hierarchy of <section, document, page>
+		for (MultiDoc md : clusters)
+		{
+			if (md.docs.size() == 0)
+				continue;
+
+			String description = md.description;
+			description = description.replace("\"", "\\\""); // escape a double
+																// quote if any
+																// in the
+																// description
+			html.append("<div class=\"section\" name=\"" + description + "\">\n");
+
+			List<List<String>> clusterResult = new ArrayList<>();
+
+			for (Document d : md.docs)
+			{
+				String pdfAttrib = "";
+				/*
+				 * if (d instanceof PDFDocument) pdfAttrib = "pdfLink=\"" +
+				 * ((PDFDocument) d).relativeURLForPDF + "\"";
+				 */
+				html.append("<div class=\"document\" " + pdfAttrib + ">\n");
+
+				datasetDocs.add(d);
+				pages.add(null);
+				clusterResult.add(null);
+				// clusterResult.add(docPageList);
+				// for (String s: docPageList)
+				{
+					String comment = Util.escapeHTML(d.comment);
+					html.append("<div class=\"page\"");
+					if (!Util.nullOrEmpty(comment))
+						html.append(" comment=\"" + comment + "\"");
+
+					if (!Util.nullOrEmpty(comment) && (d instanceof EmailDocument))
+					{
+						String messageId = d.getUniqueId();
+						html.append(" messageID=\"" + messageId + "\"");
+					}
+
+					if (d.isLiked())
+						html.append(" liked=\"true\"");
+					if (d instanceof EmailDocument && ((EmailDocument) d).doNotTransfer)
+						html.append(" doNotTransfer=\"true\"");
+					if (d instanceof EmailDocument && ((EmailDocument) d).transferWithRestrictions)
+						html.append(" transferWithRestrictions=\"true\"");
+					if (d instanceof EmailDocument && ((EmailDocument) d).reviewed)
+						html.append(" reviewed=\"true\"");
+					if (d instanceof EmailDocument && ((EmailDocument) d).addedToCart)
+						html.append(" addToCart=\"true\"");
+					if (d instanceof EmailDocument)
+						html.append(" pageId='" + pageNum++ + "' " + " signature='" + Util.hash (((EmailDocument) d).getSignature()) + "' docId='" + d.getUniqueId() + "'></div>\n");
+				}
+
+				html.append("</div>"); // document
+			}
+			html.append("</div>\n"); // section
+		}
+
+		DataSet dataset = new DataSet(datasetDocs, archive, datasetTitle, highlightContactIds, highlightTerms, highlightAttachments);
+
+		return new Pair<>(dataset, html.toString());
+	}
+
+	/**
+	 * format given addresses as comma separated html, linewrap after given
+	 * number of chars
+	 * 
+	 * @param addressBook
+	 */
+	public static String formatAddressesAsHTML(Address addrs[], AddressBook addressBook, int lineWrap, Set<String> highlightUnstemmed, Set<String> highlightNames, Set<String> highlightAddresses)
+	{
+		StringBuilder sb = new StringBuilder();
+		int outputLineLength = 0;
+		for (int i = 0; i < addrs.length; i++)
+		{
+			String thisAddrStr;
+
+			Address a = addrs[i];
+			if (a instanceof InternetAddress)
+			{
+				InternetAddress ia = (InternetAddress) a;
+				Pair<String, String> p = JSPHelper.getNameAndURL((InternetAddress) a, addressBook);
+				String url = p.getSecond();
+				String str = ia.getPersonal() == null ? ia.getAddress() : ia.getPersonal() + "<" + ia.getAddress() + ">";
+                String addr = ia.getAddress();
+                boolean match = false;
+                if(str!=null) {
+                    //The goal here is to explain why a doc is selected and hence we should replicate Lucene doc selection and Lucene is case insensitive most of the times
+                    String lc = str.toLowerCase();
+                    if (highlightUnstemmed != null)
+                        for (String hs : highlightUnstemmed) {
+                            String hlc = hs.toLowerCase().replaceAll("^\\W+|\\W+$","");
+                            if (lc.contains(hlc)) {
+                                match = true;
+                                break;
+                            }
+                        }
+                    if (!match && highlightNames != null)
+                        for (String hn : highlightNames) {
+                            String hlc = hn.toLowerCase().replaceAll("^\\W+|\\W+$","");
+                            if (lc.contains(hlc)) {
+                                match = true;
+                                break;
+                            }
+                        }
+                }
+                if(addr!=null){
+                    if (!match && highlightAddresses != null)
+                        for (String ha : highlightAddresses)
+                            if (addr.contains(ha)) {
+                                match = true;
+                                break;
+                            }
+                }
+
+                if(match)
+                    thisAddrStr = ("<a href=\"" + url + "\"><span class=\"hilitedTerm rounded\">" + Util.escapeHTML(str) + "</span></a>");
+                else
+                    thisAddrStr = ("<a href=\"" + url + "\">" + Util.escapeHTML(str) + "</a>");
+
+				if (str != null)
+	                outputLineLength += str.length();
+			}
+			else
+			{
+				String str = a.toString();
+				thisAddrStr = str;
+				outputLineLength += str.length();
+                JSPHelper.log.warn("Address is not an instance of InternetAddress - is of instance: "+a.getClass().getName() + ", highlighting won't work.");
+			}
+
+			if (i + 1 < addrs.length)
+				outputLineLength += 2; // +2 for the comma that will follow...
+
+			if (outputLineLength + 2 > lineWrap)
+			{
+				sb.append("<br/>\n");
+				outputLineLength = 0;
+			}
+			sb.append(thisAddrStr);
+			if (i + 1 < addrs.length)
+				sb.append(", ");
+		}
+
+		return sb.toString();
+	}
+
+	/**
+	 * returns a string for documents.
+	 * 
+	 * @param highlightAttachments
+	 * @throws Exception
+	 */
+    //TODO: inFull, debug params can be removed
+    //TODO: Consider a HighlighterOptions class
+	public static Pair<String, Boolean> htmlForDocument(Document d, Archive archive, String datasetTitle, BlobStore attachmentsStore,
+			Boolean sensitive, Set<Integer> highlightContactIds, Set<String> highlightTerms, Set<Blob> highlightAttachments, Map<String, Map<String, Short>> authorisedEntities,
+			boolean IA_links, boolean inFull, boolean debug) throws Exception
+	{
+		JSPHelper.log.debug("Generating HTML for document: " + d);
+		EmailDocument ed = null;
+		String html = null;
+		boolean overflow = false;
+		if (d instanceof EmailDocument)
+		{
+			// for email docs, 1 doc = 1 page
+			ed = (EmailDocument) d;
+			StringBuilder page = new StringBuilder();
+			page.append("<div class=\"muse-doc\">\n");
+
+			page.append("<div class=\"muse-doc-header\">\n");
+			page.append(EmailRenderer.getHTMLForHeader(archive, ed, sensitive, highlightContactIds, highlightTerms, IA_links, debug));
+			page.append("</div>"); // muse-doc-header
+
+			/*
+			 * Map<String, List<String>> sentimentMap =
+			 * indexer.getSentiments(ed); for (String emotion:
+			 * sentimentMap.keySet()) { page.append ("<b>" + emotion +
+			 * "</b>: "); for (String word: sentimentMap.get(emotion))
+			 * page.append (word + " "); page.append ("<br/>\n");
+			 * page.append("<br/>\n"); }
+			 */
+			page.append("\n<div class=\"muse-doc-body\">\n");
+			Pair<StringBuilder, Boolean> contentsHtml = archive.getHTMLForContents(d, ((EmailDocument) d).getDate(), d.getUniqueId(), sensitive, highlightTerms,
+					authorisedEntities, IA_links, inFull, true);
+
+			StringBuilder htmlMessageBody = contentsHtml.first;
+			overflow = contentsHtml.second;
+			// page.append(ed.getHTMLForContents(indexer, highlightTermsStemmed,
+			// highlightTermsUnstemmed, IA_links));
+			page.append(htmlMessageBody);
+			page.append("\n</div> <!-- .muse-doc-body -->\n"); // muse-doc-body
+
+			// page.append("\n<hr class=\"end-of-browse-contents-line\"/>\n");
+			List<Blob> attachments = ed.attachments;
+			if (attachments != null && attachments.size() > 0)
+			{
+				// show thumbnails of all the attachments
+
+				if (ModeConfig.isPublicMode()) {
+					page.append(attachments.size() + " attachment" + (attachments.size() == 1 ? "" : "s") + ".");
+				} else {
+					page.append("<hr/>\n<div class=\"attachments\">\n");
+					page.append("<table>\n");
+					int i = 0;
+					for (; i < attachments.size(); i++)
+					{
+						if (i % 4 == 0)
+							page.append((i == 0) ? "<tr>\n" : "</tr><tr>\n");
+						page.append("<td>");
+
+						Blob attachment = attachments.get(i);
+						String thumbnailURL = null, attachmentURL = null;
+						boolean is_image = Util.is_image_filename(attachment.filename);
+
+						if (attachmentsStore != null)
+						{
+							String contentFileDataStoreURL = attachmentsStore.get_URL(attachment);
+							attachmentURL = "serveAttachment.jsp?file=" + Util.URLtail(contentFileDataStoreURL);
+							String tnFileDataStoreURL = attachmentsStore.getViewURL(attachment, "tn");
+							if (tnFileDataStoreURL != null)
+								thumbnailURL = "serveAttachment.jsp?file=" + Util.URLtail(tnFileDataStoreURL);
+							else
+							{
+								if (attachment.is_image())
+									thumbnailURL = attachmentURL;
+								else
+									thumbnailURL = "images/sorry.png";
+							}
+						}
+						else
+							JSPHelper.log.warn("attachments store is null!");
+
+						// toString the filename in any case,
+						String s = attachment.filename;
+						// cap to a length of 25, otherwise the attachment name
+						// overflows the tn
+						String display = Util.ellipsize(s, 25);
+                        boolean highlight = highlightAttachments != null && highlightAttachments.contains(attachment);
+                        page.append("&nbsp;" + "<span title=\"" + Util.escapeHTML(s) + "\" class='" + (highlight?"highlight":"") + "'>"+ Util.escapeHTML(display) + "</span>&nbsp;");
+						page.append("<br/>");
+
+						String css_class = "attachment-preview" + (is_image ? " img" : "") + (highlight ? " highlight" : "");
+						String leader = "<img class=\"" + css_class + "\" ";
+
+						// punt on the thumbnail if the attachment tn or content
+						// URL is not found
+						if (thumbnailURL != null && attachmentURL != null)
+						{
+							// d.hashCode() is just something to identify this
+							// page/message
+							page.append("<a rel=\"page" + d.hashCode() + "\" title=\"" + attachment.filename + "\" class=\"" + (highlight?"highlight":"") + "\" href=\"" + attachmentURL + "\">");
+							page.append(leader + "href=\"" + attachmentURL + "\" src=\"" + thumbnailURL + "\"></img>\n");
+							page.append("<a>\n");
+						}
+						else
+						{
+							// page.append
+							// ("&nbsp;<br/>&nbsp;<br/>Not fetched<br/>&nbsp;<br/>&nbsp;&nbsp;&nbsp;");
+							// page.append("<a title=\"" + attachment.filename +
+							// "\" href=\"" + attachmentURL + "\">");
+							page.append(leader + "src=\"images/no-attachment.png\"></img>\n");
+							// page.append ("<a>\n");
+
+							if (thumbnailURL == null)
+								JSPHelper.log.info("No thumbnail for " + attachment);
+							if (attachmentURL == null)
+								JSPHelper.log.info("No attachment URL for " + attachment);
+						}
+						page.append("</td>\n");
+					}
+					if (i % 4 != 0)
+						page.append("</tr>");
+					page.append("</table>");
+					page.append("\n</div>  <!-- .muse-doc-attachments -->\n"); // muse-doc-attachments
+				}
+
+			}
+			page.append("\n</div>  <!-- .muse-doc -->\n"); // .muse-doc
+			html = page.toString();
+		}
+		else if (d instanceof DatedDocument)
+		{
+			/*
+			 * DatedDocument dd = (DatedDocument) d; StringBuilder page = new
+			 * StringBuilder();
+			 * 
+			 * page.append (dd.getHTMLForHeader()); // directly jam in contents
+			 * page.append ("<div class=\"muse-doc\">\n"); page.append
+			 * (dd.getHTMLForContents(indexer)); // directly jam in contents
+			 * page.append ("\n</div>"); // doc-contents return page.toString();
+			 */
+			html = "To be implemented";
+		}
+		else
+		{
+			JSPHelper.log.warn("Unsupported Document: " + d.getClass().getName());
+			html = "";
+		}
+
+		return new Pair<String, Boolean>(html, overflow);
+	}
+
+	/**
+	 * returns a HTML table string for the doc header
+	 * 
+	 * @param sensitive
+	 *            - when set will highlight any sensitive info in subject based
+	 *            on preset regexs
+	 * @throws IOException
+	 */
+	public static StringBuilder getHTMLForHeader(Archive archive, EmailDocument ed, Boolean sensitive, Set<Integer> highlightContactIds, Set<String> highlightTerms,
+			boolean IA_links, boolean debug) throws IOException
+	{
+		AddressBook addressBook = archive.addressBook;
+		GroupAssigner groupAssigner = archive.groupAssigner;
+        Set<String> contactNames = new LinkedHashSet<>();
+        Set<String> contactAddresses = new LinkedHashSet<>();
+        if(highlightContactIds!=null)
+            for(Integer hci: highlightContactIds) {
+                if(hci == null)
+                    continue;
+                Contact c = archive.addressBook.getContact(hci);
+                if(c==null)
+                    continue;
+                contactNames.addAll(c.names);
+                contactAddresses.addAll(c.emails);
+            }
+        contactNames.addAll(highlightTerms);
+
+		StringBuilder result = new StringBuilder();
+		// header table
+		result.append("<table class=\"docheader rounded\">\n");
+		// result.append
+		// ("<tr><td width=\"100px\" align=\"right\" class=\"muted\">Folder:</td><td>"
+		// + this.folderName + "</td></tr>\n");
+		if(debug)
+			result.append("<tr><td>docId: </td><td>"+ed.getUniqueId()+"</td></tr>\n");
+		result.append(JSPHelper.getHTMLForDate(ed.date));
+
+		final String style = "<tr><td align=\"right\" class=\"muted\" valign=\"top\">";
+
+		// email specific headers
+		result.append(style + "From: </td><td align=\"left\">");
+		Address[] addrs = ed.from;
+		if (addrs != null)
+		{
+			result.append(formatAddressesAsHTML(addrs, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses));
+		}
+
+		result.append(style + "To: </td><td align=\"left\">");
+		addrs = ed.to;
+		if (addrs != null)
+			result.append(formatAddressesAsHTML(addrs, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + "");
+
+		result.append("\n</td></tr>\n");
+
+		if (ed.cc != null && ed.cc.length > 0)
+		{
+			result.append(style + "Cc: </td><td align=\"left\">");
+			result.append(formatAddressesAsHTML(ed.cc, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + "");
+			result.append("\n</td></tr>\n");
+		}
+
+		if (ed.bcc != null && ed.bcc.length > 0)
+		{
+			result.append(style + "Bcc: </td><td align=\"left\">");
+			result.append(formatAddressesAsHTML(ed.bcc, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + "");
+			result.append("\n</td></tr>\n");
+		}
+
+		if (groupAssigner != null)
+		{
+			SimilarGroup<String> g = groupAssigner.getClosestGroup(ed);
+			if (g != null && g.size() > 1) // if its just a singleton group, no
+											// point explicitly listing a group
+											// line
+			{
+				String url = "browse?groupIdx=" + groupAssigner.getClosestGroupIdx(ed);
+				result.append(style + "Group: </td>\n");
+				result.append("<td align=\"left\">");
+				String description = g.elementsToString();
+				result.append("<span class=\"facet\" style=\"padding-left:2px;padding-right:2px\" onclick=\"javascript:window.open('" + url + "');\" title=\""
+						+ Util.escapeHTML(description) + "\">" + g.name + "</span></br>");
+				result.append("</td>\n</tr>\n");
+			}
+		}
+
+		String x = ed.description;
+		if (x == null)
+			x = "<None>";
+
+		result.append(style + "Subject: </td>");
+		// <pre> to escape special chars if any in the subject. max 70 chars in
+		// one line, otherwise spill to next line
+		result.append("<td align=\"left\"><b>");
+		x = DatedDocument.formatStringForMaxCharsPerLine(x, 70).toString();
+		if (x.endsWith("\n"))
+			x = x.substring(0, x.length() - 1);
+
+        Span[] names = archive.getAllNamesInDoc(ed, false);
+
+        // Contains all entities and id if it is authorised else null
+        Map<String, Entity> entitiesWithId = new HashMap<>();
+        //we annotate three specially recognized types
+        Map<Short,String> recMap = new HashMap<>();
+        recMap.put(NEType.Type.PERSON.getCode(),"cp");
+        recMap.put(NEType.Type.PLACE.getCode(),"cl");
+        recMap.put(NEType.Type.ORGANISATION.getCode(),"co");
+        Arrays.asList(names).stream().filter(n -> recMap.keySet().contains(NEType.getCoarseType(n.type).getCode()))
+                .forEach(n -> {
+                    Set<String> types = new HashSet<>();
+                    types.add(recMap.get(NEType.getCoarseType(n.type).getCode()));
+                    entitiesWithId.put(n.text, new Entity(n.text, null, types));
+                });
+
+        x = archive.annotate(x, ed.getDate(), ed.getUniqueId(), sensitive, highlightTerms, entitiesWithId, IA_links, false);
+
+		result.append(x);
+		result.append("</b>\n");
+		result.append("\n</td></tr>\n");
+		result.append ("\n" + style + "ID: " + "</td><td>" + Util.hash (ed.getSignature()) + "</td></tr>");
+		result.append("</table>\n"); // end docheader table
+
+		if (ModeConfig.isPublicMode())
+			return new StringBuilder(Util.maskEmailDomain(result.toString()));
+
+		return result;
+	}
+
+	/** I'm not sure what this is used for -- I think its used only for rendering HTML for the message. */
+    public static class Entity {
+        public Map<String, Short> ids;
+        //person,places,orgs, custom
+        public String name;
+        public Set<String> types = new HashSet<String>();
+
+        public Entity(String name, Map<String, Short> ids, Set<String> types) {
+            this.name = name;
+            this.ids = ids;
+            this.types = types;
+        }
+
+        @Override
+        public String toString() {
+            return types.toString();
+        }
+    }
+}

From c2934374e316cbb24f615fba8e05085909d4b8cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= <arostov83@gmail.com>
Date: Fri, 5 May 2017 22:37:07 +0300
Subject: [PATCH 03/33] encoding in attachments

---
 src/java/edu/stanford/muse/email/EmailFetcherThread.java | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index e09caab..53835c3 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -24,6 +24,7 @@
 import edu.stanford.muse.webapp.HTMLUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.james.mime4j.codec.DecoderUtil;
 import org.json.JSONArray;
 import org.json.JSONException;
 import org.json.JSONObject;
@@ -588,6 +589,9 @@ private void handleAttachments(int idx, Message m, Part p, List<String> textList
         String filename = null;
         try {
             filename = p.getFileName();
+            if (filename != null) {
+                filename = DecoderUtil.decodeEncodedWords(filename, null);
+            }
         } catch (Exception e) {
             // seen this happen with:
             // Folders__gmail-sent Message #12185 Expected ';', got "Message"

From d923dedcb292586792d575652a6cf1a24dfa3c3a Mon Sep 17 00:00:00 2001
From: Gleb Suvorov <suvorov.gleb@gmail.com>
Date: Sat, 6 May 2017 00:15:14 +0300
Subject: [PATCH 04/33] unverified fix for Cyrillic in search request

---
 .../edu/stanford/muse/webapp/JSPHelper.java   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/java/edu/stanford/muse/webapp/JSPHelper.java b/src/java/edu/stanford/muse/webapp/JSPHelper.java
index f6a05bb..f894481 100755
--- a/src/java/edu/stanford/muse/webapp/JSPHelper.java
+++ b/src/java/edu/stanford/muse/webapp/JSPHelper.java
@@ -267,17 +267,17 @@ public static String[] convertRequestParamsToUTF8(String params[]) throws Unsupp
 	// converts an array of strings from iso-8859-1 to utf8. useful for converting i18n chars in http request parameters
 	public static String convertRequestParamToUTF8(String param) throws UnsupportedEncodingException
 	{
-		if (RUNNING_ON_JETTY)
-		{
-			log.info("running on jetty: no conversion for " + param);
+	//	if (RUNNING_ON_JETTY)
+	//	{
+	//		log.info("running on jetty: no conversion for " + param);
 			return param;
-		}
-		if (param == null)
-			return null;
-		String newParam = new String(param.getBytes("ISO-8859-1"), "UTF-8");
-		if (!newParam.equals(param))
-			log.info("Converted to utf-8: " + param + " -> " + newParam);
-		return newParam;
+	//	}
+	//	if (param == null)
+	//		return null;
+	//	String newParam = new String(param.getBytes("ISO-8859-1"), "UTF-8");
+	//	if (!newParam.equals(param))
+	//		log.info("Converted to utf-8: " + param + " -> " + newParam);
+	//	return newParam;
 	}
 
 	public static boolean runningOnLocalhost(HttpServletRequest request)

From fa61cf73cdb791cf0db014df276de41c2c1c23b7 Mon Sep 17 00:00:00 2001
From: Gleb Suvorov <suvorov.gleb@gmail.com>
Date: Wed, 24 May 2017 00:52:58 +0300
Subject: [PATCH 05/33] small memory fix

---
 src/java/edu/stanford/muse/email/EmailFetcherThread.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index 53835c3..bb3192b 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -1282,7 +1282,7 @@ public void run() {
                 // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages()
                 // so we process in batches
                 //TODO: Ideally, should cap on buffer size rather than on number of messages.
-                final int BATCH = 10000;
+                final int BATCH = 100; //it ate too much memory more than 12gb with 10000
                 int nbatches = nMessages / BATCH;
                 nMessagesProcessedSuccess = 0;
                 long st = System.currentTimeMillis();

From 182ca3fbce9aca54f52c0b5fd346ba9743f86670 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= <arostov83@gmail.com>
Date: Thu, 25 May 2017 22:25:50 +0300
Subject: [PATCH 06/33] encoding in download attachments

---
 src/java/edu/stanford/muse/util/Util.java           |  5 ++---
 .../edu/stanford/muse/webapp/EmailRenderer.java     | 13 ++++++-------
 src/java/edu/stanford/muse/webapp/JSPHelper.java    | 10 ++++++++--
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/java/edu/stanford/muse/util/Util.java b/src/java/edu/stanford/muse/util/Util.java
index 3991b95..c5419df 100755
--- a/src/java/edu/stanford/muse/util/Util.java
+++ b/src/java/edu/stanford/muse/util/Util.java
@@ -2412,9 +2412,8 @@ public static void test_tail()
 	 * actual file in the URL.
 	 * returns null if the input is null.
 	 */
-	public static String URLtail(String url)
-	{
-		return tail(url, "/");
+	public static String URLtail(String url) {
+			return URLEncode(tail(url, "/"));
 	}
 
 	/**
diff --git a/src/java/edu/stanford/muse/webapp/EmailRenderer.java b/src/java/edu/stanford/muse/webapp/EmailRenderer.java
index f413e3b..f56ccb3 100755
--- a/src/java/edu/stanford/muse/webapp/EmailRenderer.java
+++ b/src/java/edu/stanford/muse/webapp/EmailRenderer.java
@@ -1,11 +1,5 @@
 package edu.stanford.muse.webapp;
 
-import java.io.IOException;
-import java.util.*;
-
-import javax.mail.Address;
-import javax.mail.internet.InternetAddress;
-
 import edu.stanford.muse.datacache.Blob;
 import edu.stanford.muse.datacache.BlobStore;
 import edu.stanford.muse.email.AddressBook;
@@ -17,6 +11,11 @@
 import edu.stanford.muse.util.Span;
 import edu.stanford.muse.util.Util;
 
+import javax.mail.Address;
+import javax.mail.internet.InternetAddress;
+import java.io.IOException;
+import java.util.*;
+
 /** This class has util methods to display an email message in an html page */
 
 public class EmailRenderer {
@@ -337,7 +336,7 @@ public static Pair<String, Boolean> htmlForDocument(Document d, Archive archive,
 							// d.hashCode() is just something to identify this
 							// page/message
 							page.append("<a rel=\"page" + d.hashCode() + "\" title=\"" + attachment.filename + "\" class=\"" + (highlight?"highlight":"") + "\" href=\"" + attachmentURL + "\">");
-							page.append(leader + "href=\"" + attachmentURL + "\" src=\"" + thumbnailURL + "\"></img>\n");
+							page.append(leader + "href=\"" + attachmentURL + "\" download src=\"" + thumbnailURL + "\"></img>\n");
 							page.append("<a>\n");
 						}
 						else
diff --git a/src/java/edu/stanford/muse/webapp/JSPHelper.java b/src/java/edu/stanford/muse/webapp/JSPHelper.java
index f894481..0e4fe61 100755
--- a/src/java/edu/stanford/muse/webapp/JSPHelper.java
+++ b/src/java/edu/stanford/muse/webapp/JSPHelper.java
@@ -44,6 +44,8 @@
 import javax.servlet.http.HttpSession;
 import javax.xml.transform.TransformerException;
 import java.io.*;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
 import java.util.*;
 import java.util.stream.Collectors;
 
@@ -1589,7 +1591,11 @@ public static void serveBlob(HttpServletRequest request, HttpServletResponse res
 	{
 		HttpSession session = request.getSession();
 		String filename = request.getParameter("file");
-		filename = convertRequestParamToUTF8(filename);
+		try {
+			filename = URLDecoder.decode(filename, "utf-8");
+		} catch (Exception e) {
+			throw new RuntimeException(e.getMessage(), e);
+		}
 		String baseDir = (String) getSessionAttribute(session, "cacheDir");
 
 		if (filename.indexOf(".." + File.separator) >= 0) // avoid file injection!
@@ -1663,7 +1669,7 @@ public static void writeFileToResponse(HttpSession session, HttpServletResponse
 		if (asAttachment)
 		{
 			response.setHeader("Content-Length", String.valueOf(file.length()));
-			response.setHeader("Content-Disposition", "attachment; filename=\"" + file.getName() + "\"");
+			response.setHeader("Content-Disposition", "attachment; filename=\"" + URLEncoder.encode(file.getName(), "utf-8") + "\"");
 		}
 		// Prepare streams.
 		BufferedInputStream input = null;

From 78c6329e392e785a53c9e89e601e1d6b5fd5c2dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= <arostov83@gmail.com>
Date: Fri, 26 May 2017 00:25:13 +0300
Subject: [PATCH 07/33] encoding in file picker

---
 src/java/edu/stanford/muse/webapp/JSPHelper.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/webapp/JSPHelper.java b/src/java/edu/stanford/muse/webapp/JSPHelper.java
index 0e4fe61..67c6139 100755
--- a/src/java/edu/stanford/muse/webapp/JSPHelper.java
+++ b/src/java/edu/stanford/muse/webapp/JSPHelper.java
@@ -1669,7 +1669,8 @@ public static void writeFileToResponse(HttpSession session, HttpServletResponse
 		if (asAttachment)
 		{
 			response.setHeader("Content-Length", String.valueOf(file.length()));
-			response.setHeader("Content-Disposition", "attachment; filename=\"" + URLEncoder.encode(file.getName(), "utf-8") + "\"");
+			String fileName = URLEncoder.encode(file.getName(), "utf-8").replace("+", "%20");
+			response.setHeader("Content-Disposition", "attachment; filename=\"" + fileName + "\"");
 		}
 		// Prepare streams.
 		BufferedInputStream input = null;

From 15277480d653ae2ca6de208e68ea57da1c2a6e3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?=
 =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= <arostov83@gmail.com>
Date: Fri, 26 May 2017 00:25:56 +0300
Subject: [PATCH 08/33] replace plus with space on downloaded filename

---
 src/java/edu/stanford/muse/util/Util.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/util/Util.java b/src/java/edu/stanford/muse/util/Util.java
index c5419df..5c0a8b4 100755
--- a/src/java/edu/stanford/muse/util/Util.java
+++ b/src/java/edu/stanford/muse/util/Util.java
@@ -2413,7 +2413,7 @@ public static void test_tail()
 	 * returns null if the input is null.
 	 */
 	public static String URLtail(String url) {
-			return URLEncode(tail(url, "/"));
+		return URLEncode(tail(url, "/"));
 	}
 
 	/**

From 1999d666ef7087370f893ca3e1b9aeb5c15f728c Mon Sep 17 00:00:00 2001
From: Gleb Suvorov <suvorov.gleb@gmail.com>
Date: Fri, 26 May 2017 00:33:26 +0300
Subject: [PATCH 09/33] gradual memory consumption fix for standalone.jar

---
 src/java/edu/stanford/muse/email/EmailFetcherThread.java | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index bb3192b..ec4430a 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -1282,7 +1282,13 @@ public void run() {
                 // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages()
                 // so we process in batches
                 //TODO: Ideally, should cap on buffer size rather than on number of messages.
-                final int BATCH = 100; //it ate too much memory more than 12gb with 10000
+            	int nMessagesperbathc = 10000;
+            	long maxMemory = Runtime.getRuntime().maxMemory();
+            	 if (maxMemory <= 4294967296L ) {	nMessagesperbathc = 100;	} 
+            	 	else {
+            	 		if  (maxMemory<= 8294967296L)  {	nMessagesperbathc = 1000;	}  
+            	 		}
+				final int BATCH = nMessagesperbathc; //gradual decrease of batch size due to memory size
                 int nbatches = nMessages / BATCH;
                 nMessagesProcessedSuccess = 0;
                 long st = System.currentTimeMillis();

From aed6ec3e998b23ee54544b56acac709f2ee0d5b0 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Sun, 4 Jun 2017 22:02:58 +0300
Subject: [PATCH 10/33] save json on fs

---
 .../muse/email/EmailFetcherThread.java        |   2 +
 .../muse/email/json/ArchiveSaver.java         | 125 ++++++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 src/java/edu/stanford/muse/email/json/ArchiveSaver.java

diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index ec4430a..db983f8 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -17,6 +17,7 @@
 
 import com.sun.mail.imap.IMAPFolder;
 import edu.stanford.muse.datacache.Blob;
+import edu.stanford.muse.email.json.ArchiveSaver;
 import edu.stanford.muse.index.*;
 import edu.stanford.muse.util.EmailUtils;
 import edu.stanford.muse.util.JSONUtils;
@@ -1365,6 +1366,7 @@ public void run() {
                 }
                 log.info("Read #" + nMessages + " messages in  in " + (System.currentTimeMillis() - st) + "ms");
             }
+            new ArchiveSaver().save(archive);
         } catch (Throwable t) {
             if (t instanceof OutOfMemoryError)
                 this.mayHaveRunOutOfMemory = true;
diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
new file mode 100644
index 0000000..275b39f
--- /dev/null
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -0,0 +1,125 @@
+package edu.stanford.muse.email.json;
+
+import edu.stanford.muse.email.AddressBook;
+import edu.stanford.muse.index.Archive;
+import edu.stanford.muse.index.Document;
+import edu.stanford.muse.index.EmailDocument;
+import edu.stanford.muse.webapp.JSPHelper;
+import org.codehaus.plexus.util.StringOutputStream;
+
+import javax.mail.Address;
+import javax.mail.internet.InternetAddress;
+import java.io.*;
+import java.util.List;
+
+/**
+ * Created by sunchise on 04.06.17.
+ */
+public class ArchiveSaver {
+
+    public void save(Archive archive) {
+        String fileName = System.getProperty("user.home") + File.separator + "archive.json";
+        File file = new File(fileName);
+        if (file.exists()) {
+            file.delete();
+        }
+        try {
+            file.createNewFile();
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+        BufferedWriter stream;
+        try {
+             stream = new BufferedWriter(new FileWriter(file));
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+        List<Document> allDocs = archive.getAllDocs();
+        int i = 1;
+        try {
+            stream.append("[");
+            for (Document doc : allDocs) {
+                if (i > 1) {
+                    stream.append(",");
+                }
+                stream.append("{");
+                EmailDocument emailDocument = (EmailDocument) doc;
+                stream.append("\"emailId\": " + i++ + ",");
+                stream.append("\"dateField\": \"" + emailDocument.getDate().getTime() + "\",");
+                stream.append("\"isSent\": " + true + ",");
+                stream.append("\"toField\": [");
+                if (emailDocument.to != null) {
+                    boolean first = true;
+                    for (Address address : emailDocument.to) {
+                        if (!first) {
+                            stream.append(",");
+                        }
+                        InternetAddress internetAddress = (InternetAddress) address;
+                        stream.append("[");
+                        stream.append(getAddressString(internetAddress));
+                        stream.append("]");
+                        first = false;
+                    }
+                }
+                stream.append("],");
+                stream.append("\"ccField\": [");
+                if (emailDocument.cc != null && emailDocument.cc.length != 0) {
+                    boolean first = true;
+                    for (Address address : emailDocument.cc) {
+                        if (!first) {
+                            stream.append(",");
+                        }
+                        InternetAddress internetAddress = (InternetAddress) address;
+                        stream.append("[");
+                        stream.append(getAddressString(internetAddress));
+                        stream.append("]");
+                        first = false;
+                    }
+                } else {
+                    stream.append("[");
+                    stream.append("\"ccPlaceholder\",\"ccPlaceholder\"");
+                    stream.append("]");
+                }
+                stream.append("],");
+
+                stream.append("\"fromField\": [");
+                if (emailDocument.from != null && emailDocument.from.length > 0) {
+                    boolean first = true;
+                    for (Address address : emailDocument.from) {
+                        if (!first) {
+                            stream.append(",");
+                        }
+                        InternetAddress internetAddress = (InternetAddress) address;
+                        stream.append("[");
+                        stream.append(getAddressString(internetAddress));
+                        stream.append("], ");
+                        stream.append("\"" + internetAddress.getAddress() + "\"");
+                        first = false;
+                    }
+                }  else {
+                    stream.append("[");
+                    stream.append("\"fromPlaceholder\",\"fromPlaceholder\"");
+                    stream.append("], ");
+                    stream.append("\"fromPlaceholder\"");
+                }
+                stream.append("],");
+                stream.append("\"subject\": \"" + String.valueOf(emailDocument.getSubject()).trim().replaceAll("\"", "''").replaceAll("\n", " ") + "\"");
+                stream.append("}");
+            }
+            stream.append("]");
+            stream.flush();
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+    }
+
+
+
+    private String getAddressString(InternetAddress internetAddress) {
+        return "\""
+                + (internetAddress.getPersonal() == null
+                        ? internetAddress.getAddress()
+                        : internetAddress.getPersonal().replaceAll("\"", "''"))
+                + "\", \"" + internetAddress.getAddress() + "\"";
+    }
+}

From 4917c8b3ec2eba8331b983be6aee6f65b94ca43b Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Mon, 5 Jun 2017 00:02:24 +0300
Subject: [PATCH 11/33] save json on fs by servlet

---
 .../stanford/muse/email/json/EmailInfo.java   | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 src/java/edu/stanford/muse/email/json/EmailInfo.java

diff --git a/src/java/edu/stanford/muse/email/json/EmailInfo.java b/src/java/edu/stanford/muse/email/json/EmailInfo.java
new file mode 100644
index 0000000..5af3927
--- /dev/null
+++ b/src/java/edu/stanford/muse/email/json/EmailInfo.java
@@ -0,0 +1,56 @@
+package edu.stanford.muse.email.json;
+
+import java.io.Serializable;
+
+/**
+ * Created by sunchise on 03.06.17.
+ */
+public class EmailInfo implements Serializable {
+
+    private final int emailId;
+    private final String dateField;
+    private final boolean isSent;
+    private final String[][] toField;
+    private final String[][] ccField;
+    private final Object[] fromField;
+    private final String subject;
+
+
+    public EmailInfo(int emailId, String dateField, boolean isSent, String[][] toField, String[][] ccField, Object[] fromField, String subject) {
+        this.emailId = emailId;
+        this.dateField = dateField;
+        this.isSent = isSent;
+        this.toField = toField;
+        this.ccField = ccField;
+        this.fromField = fromField;
+        this.subject = subject;
+    }
+
+    public int getEmailId() {
+        return emailId;
+    }
+
+    public String getDateField() {
+        return dateField;
+    }
+
+    public boolean isSent() {
+        return isSent;
+    }
+
+    public String[][] getToField() {
+        return toField;
+    }
+
+    public String[][] getCcField() {
+        return ccField;
+    }
+
+    public Object[] getFromField() {
+        return fromField;
+    }
+
+    public String getSubject() {
+        return subject;
+    }
+}

From 427adf56cda3fa632b71070f9a87d5227e879c21 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Mon, 5 Jun 2017 20:52:09 +0300
Subject: [PATCH 12/33] save json filter special chars

---
 .../muse/email/json/ArchiveSaver.java         | 84 +++++++++----------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index 275b39f..615251c 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -28,85 +28,79 @@ public void save(Archive archive) {
         } catch (IOException e) {
             throw new RuntimeException(e.getMessage(), e);
         }
-        BufferedWriter stream;
-        try {
-             stream = new BufferedWriter(new FileWriter(file));
-        } catch (IOException e) {
-            throw new RuntimeException(e.getMessage(), e);
-        }
         List<Document> allDocs = archive.getAllDocs();
         int i = 1;
-        try {
-            stream.append("[");
+        try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) {
+            append(stream, "[");
             for (Document doc : allDocs) {
                 if (i > 1) {
-                    stream.append(",");
+                    append(stream, ",");
                 }
-                stream.append("{");
+                append(stream, "{");
                 EmailDocument emailDocument = (EmailDocument) doc;
-                stream.append("\"emailId\": " + i++ + ",");
-                stream.append("\"dateField\": \"" + emailDocument.getDate().getTime() + "\",");
-                stream.append("\"isSent\": " + true + ",");
-                stream.append("\"toField\": [");
+                append(stream, "\"emailId\": " + i++ + ",");
+                append(stream, "\"dateField\": \"" + emailDocument.getDate().getTime() + "\",");
+                append(stream, "\"isSent\": " + true + ",");
+                append(stream, "\"toField\": [");
                 if (emailDocument.to != null) {
                     boolean first = true;
                     for (Address address : emailDocument.to) {
                         if (!first) {
-                            stream.append(",");
+                            append(stream, ",");
                         }
                         InternetAddress internetAddress = (InternetAddress) address;
-                        stream.append("[");
-                        stream.append(getAddressString(internetAddress));
-                        stream.append("]");
+                        append(stream, "[");
+                        append(stream, getAddressString(internetAddress));
+                        append(stream, "]");
                         first = false;
                     }
                 }
-                stream.append("],");
-                stream.append("\"ccField\": [");
+                append(stream, "],");
+                append(stream, "\"ccField\": [");
                 if (emailDocument.cc != null && emailDocument.cc.length != 0) {
                     boolean first = true;
                     for (Address address : emailDocument.cc) {
                         if (!first) {
-                            stream.append(",");
+                            append(stream, ",");
                         }
                         InternetAddress internetAddress = (InternetAddress) address;
-                        stream.append("[");
-                        stream.append(getAddressString(internetAddress));
-                        stream.append("]");
+                        append(stream, "[");
+                        append(stream, getAddressString(internetAddress));
+                        append(stream, "]");
                         first = false;
                     }
                 } else {
-                    stream.append("[");
-                    stream.append("\"ccPlaceholder\",\"ccPlaceholder\"");
-                    stream.append("]");
+                    append(stream, "[");
+                    append(stream, "\"ccPlaceholder\",\"ccPlaceholder\"");
+                    append(stream, "]");
                 }
-                stream.append("],");
+                append(stream, "],");
 
-                stream.append("\"fromField\": [");
+                append(stream, "\"fromField\": [");
                 if (emailDocument.from != null && emailDocument.from.length > 0) {
                     boolean first = true;
                     for (Address address : emailDocument.from) {
                         if (!first) {
-                            stream.append(",");
+                            append(stream, ",");
                         }
                         InternetAddress internetAddress = (InternetAddress) address;
-                        stream.append("[");
-                        stream.append(getAddressString(internetAddress));
-                        stream.append("], ");
-                        stream.append("\"" + internetAddress.getAddress() + "\"");
+                        append(stream, "[");
+                        append(stream, getAddressString(internetAddress));
+                        append(stream, "], ");
+                        append(stream, "\"" + internetAddress.getAddress() + "\"");
                         first = false;
                     }
                 }  else {
-                    stream.append("[");
-                    stream.append("\"fromPlaceholder\",\"fromPlaceholder\"");
-                    stream.append("], ");
-                    stream.append("\"fromPlaceholder\"");
+                    append(stream, "[");
+                    append(stream, "\"fromPlaceholder\",\"fromPlaceholder\"");
+                    append(stream, "], ");
+                    append(stream, "\"fromPlaceholder\"");
                 }
-                stream.append("],");
-                stream.append("\"subject\": \"" + String.valueOf(emailDocument.getSubject()).trim().replaceAll("\"", "''").replaceAll("\n", " ") + "\"");
-                stream.append("}");
+                append(stream, "],");
+                append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).trim().replaceAll("\"", "''").replaceAll("\n", " ") + "\"");
+                append(stream, "}");
             }
-            stream.append("]");
+            append(stream, "]");
             stream.flush();
         } catch (IOException e) {
             throw new RuntimeException(e.getMessage(), e);
@@ -114,6 +108,12 @@ public void save(Archive archive) {
     }
 
 
+    private void append(Writer stream, String string) throws IOException {
+        string = string.replaceAll("\"", "''");
+        string = string.replaceAll("\n", " ");
+        string = string.trim();
+        stream.append(string);
+    }
 
     private String getAddressString(InternetAddress internetAddress) {
         return "\""

From aed3a79e278d8614fc3cb61a38b1b24d03c7e0d5 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Mon, 5 Jun 2017 21:19:19 +0300
Subject: [PATCH 13/33] save json filter special chars

---
 .../stanford/muse/email/json/ArchiveSaver.java    | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index 615251c..6d86ffc 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -87,7 +87,7 @@ public void save(Archive archive) {
                         append(stream, "[");
                         append(stream, getAddressString(internetAddress));
                         append(stream, "], ");
-                        append(stream, "\"" + internetAddress.getAddress() + "\"");
+                        append(stream, "\"" + internetAddress.getAddress().replaceAll("\"", "''") + "\"");
                         first = false;
                     }
                 }  else {
@@ -97,7 +97,7 @@ public void save(Archive archive) {
                     append(stream, "\"fromPlaceholder\"");
                 }
                 append(stream, "],");
-                append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).trim().replaceAll("\"", "''").replaceAll("\n", " ") + "\"");
+                append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "''") + "\"");
                 append(stream, "}");
             }
             append(stream, "]");
@@ -109,8 +109,11 @@ public void save(Archive archive) {
 
 
     private void append(Writer stream, String string) throws IOException {
-        string = string.replaceAll("\"", "''");
-        string = string.replaceAll("\n", " ");
+        string = string.replaceAll("\\\n", " ");
+        string = string.replaceAll("\\\r", " ");
+        string = string.replaceAll(" {2,}", " ");
+        string = string.replaceAll("\\\" ", "\"");
+        string = string.replaceAll(" \\\"", "\"");
         string = string.trim();
         stream.append(string);
     }
@@ -118,8 +121,8 @@ private void append(Writer stream, String string) throws IOException {
     private String getAddressString(InternetAddress internetAddress) {
         return "\""
                 + (internetAddress.getPersonal() == null
-                        ? internetAddress.getAddress()
+                        ? internetAddress.getAddress().replaceAll("\"", "''")
                         : internetAddress.getPersonal().replaceAll("\"", "''"))
-                + "\", \"" + internetAddress.getAddress() + "\"";
+                + "\", \"" + internetAddress.getAddress().replaceAll("\"", "''") + "\"";
     }
 }

From a399e6471f0f47f1f7490460d9a082eb4c746b5f Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Mon, 5 Jun 2017 21:42:56 +0300
Subject: [PATCH 14/33] save json filter special chars

---
 src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index 6d86ffc..be27816 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -110,6 +110,7 @@ public void save(Archive archive) {
 
     private void append(Writer stream, String string) throws IOException {
         string = string.replaceAll("\\\n", " ");
+        string = string.replaceAll("\\\\", "\\\\");
         string = string.replaceAll("\\\r", " ");
         string = string.replaceAll(" {2,}", " ");
         string = string.replaceAll("\\\" ", "\"");

From 9aecdc2d704949934da9feaa596fd0210416f9c9 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Mon, 5 Jun 2017 22:02:16 +0300
Subject: [PATCH 15/33] save json filter special chars

---
 src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index be27816..38d28f3 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -110,7 +110,7 @@ public void save(Archive archive) {
 
     private void append(Writer stream, String string) throws IOException {
         string = string.replaceAll("\\\n", " ");
-        string = string.replaceAll("\\\\", "\\\\");
+        string = string.replaceAll("\\\\", "\\\\\\\\");
         string = string.replaceAll("\\\r", " ");
         string = string.replaceAll(" {2,}", " ");
         string = string.replaceAll("\\\" ", "\"");

From ca8d4e37de0b9f3f3cad70bfe8072b785942f315 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Mon, 5 Jun 2017 23:01:22 +0300
Subject: [PATCH 16/33] save json filter special chars

---
 .../edu/stanford/muse/email/json/ArchiveSaver.java     | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index 38d28f3..b2244a1 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -109,12 +109,14 @@ public void save(Archive archive) {
 
 
     private void append(Writer stream, String string) throws IOException {
-        string = string.replaceAll("\\\n", " ");
+        string = string.replaceAll("\\s", " ");
+        string = string.replaceAll("\\n", " ");
         string = string.replaceAll("\\\\", "\\\\\\\\");
-        string = string.replaceAll("\\\r", " ");
+        string = string.replaceAll("\\r", " ");
         string = string.replaceAll(" {2,}", " ");
-        string = string.replaceAll("\\\" ", "\"");
-        string = string.replaceAll(" \\\"", "\"");
+        string = string.replaceAll("\" ", "\"");
+        string = string.replaceAll(" \"", "\"");
+        string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}]", "");
         string = string.trim();
         stream.append(string);
     }

From c8ef54585dc98e6bcccec2c577883d2a2e66d7c4 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Tue, 6 Jun 2017 00:33:57 +0300
Subject: [PATCH 17/33] json format

---
 .../stanford/muse/email/json/ArchiveSaver.java   | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index b2244a1..9533ccc 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -39,7 +39,7 @@ public void save(Archive archive) {
                 append(stream, "{");
                 EmailDocument emailDocument = (EmailDocument) doc;
                 append(stream, "\"emailId\": " + i++ + ",");
-                append(stream, "\"dateField\": \"" + emailDocument.getDate().getTime() + "\",");
+                append(stream, "\"dateField\": \"" + emailDocument.getDate().getTime() / 1000 + "\",");
                 append(stream, "\"isSent\": " + true + ",");
                 append(stream, "\"toField\": [");
                 if (emailDocument.to != null) {
@@ -87,7 +87,7 @@ public void save(Archive archive) {
                         append(stream, "[");
                         append(stream, getAddressString(internetAddress));
                         append(stream, "], ");
-                        append(stream, "\"" + internetAddress.getAddress().replaceAll("\"", "''") + "\"");
+                        append(stream, "\"" + internetAddress.getAddress().replaceAll("\"", "'") + "\"");
                         first = false;
                     }
                 }  else {
@@ -97,7 +97,7 @@ public void save(Archive archive) {
                     append(stream, "\"fromPlaceholder\"");
                 }
                 append(stream, "],");
-                append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "''") + "\"");
+                append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "'").replace("Subject: ", "") + "\"");
                 append(stream, "}");
             }
             append(stream, "]");
@@ -122,10 +122,10 @@ private void append(Writer stream, String string) throws IOException {
     }
 
     private String getAddressString(InternetAddress internetAddress) {
-        return "\""
-                + (internetAddress.getPersonal() == null
-                        ? internetAddress.getAddress().replaceAll("\"", "''")
-                        : internetAddress.getPersonal().replaceAll("\"", "''"))
-                + "\", \"" + internetAddress.getAddress().replaceAll("\"", "''") + "\"";
+        String personal = (internetAddress.getPersonal() == null
+                                ? internetAddress.getAddress()
+                                : internetAddress.getPersonal())
+                .replaceAll("\"", "'");
+        return "\"" + personal + "\", \"" + personal + "\"";
     }
 }

From 429b639e9993a597d3069cdce6c1612fc9d630c5 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Tue, 6 Jun 2017 01:36:32 +0300
Subject: [PATCH 18/33] json format

---
 .../edu/stanford/muse/email/json/ArchiveSaver.java     | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index 9533ccc..af7444f 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -76,7 +76,7 @@ public void save(Archive archive) {
                 }
                 append(stream, "],");
 
-                append(stream, "\"fromField\": [");
+                append(stream, "\"fromField\": ");
                 if (emailDocument.from != null && emailDocument.from.length > 0) {
                     boolean first = true;
                     for (Address address : emailDocument.from) {
@@ -86,17 +86,15 @@ public void save(Archive archive) {
                         InternetAddress internetAddress = (InternetAddress) address;
                         append(stream, "[");
                         append(stream, getAddressString(internetAddress));
-                        append(stream, "], ");
-                        append(stream, "\"" + internetAddress.getAddress().replaceAll("\"", "'") + "\"");
+                        append(stream, "] ");
                         first = false;
                     }
                 }  else {
                     append(stream, "[");
                     append(stream, "\"fromPlaceholder\",\"fromPlaceholder\"");
-                    append(stream, "], ");
-                    append(stream, "\"fromPlaceholder\"");
+                    append(stream, "] ");
                 }
-                append(stream, "],");
+                append(stream, ",");
                 append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "'").replace("Subject: ", "") + "\"");
                 append(stream, "}");
             }

From f0bd327f85500fa2fe1732f0919de7d846969eb8 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Wed, 7 Jun 2017 00:44:50 +0300
Subject: [PATCH 19/33] megagraph5

---
 src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index af7444f..ae018e1 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -114,7 +114,7 @@ private void append(Writer stream, String string) throws IOException {
         string = string.replaceAll(" {2,}", " ");
         string = string.replaceAll("\" ", "\"");
         string = string.replaceAll(" \"", "\"");
-        string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}]", "");
+        string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", "");
         string = string.trim();
         stream.append(string);
     }

From 3ff6e29149f39ed13a34f8008d27c5efc5023bf9 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Wed, 7 Jun 2017 00:56:59 +0300
Subject: [PATCH 20/33] fix

---
 src/java/edu/stanford/muse/index/EmailDocument.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/index/EmailDocument.java b/src/java/edu/stanford/muse/index/EmailDocument.java
index 9daa68a..03875e2 100755
--- a/src/java/edu/stanford/muse/index/EmailDocument.java
+++ b/src/java/edu/stanford/muse/index/EmailDocument.java
@@ -51,7 +51,7 @@ public class EmailDocument extends DatedDocument implements Serializable
 
 	public String folderName, emailSource;
 	public Set<String> folderNames = new LinkedHashSet<>(), emailSources = new LinkedHashSet<>(); // email can now belong to multiple folders, folderName field also maintained for backward compatibility
-	public Address[] to, from, cc, bcc;
+	public Address[] to,  from, cc, bcc;
 	
 	public String messageID;
 	public String sentToMailingLists[];

From 54dbae205f1f4b44917181ccd4bc53370812708f Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Thu, 8 Jun 2017 01:17:25 +0300
Subject: [PATCH 21/33] fix

---
 src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index ae018e1..a6b1291 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -80,14 +80,11 @@ public void save(Archive archive) {
                 if (emailDocument.from != null && emailDocument.from.length > 0) {
                     boolean first = true;
                     for (Address address : emailDocument.from) {
-                        if (!first) {
-                            append(stream, ",");
-                        }
                         InternetAddress internetAddress = (InternetAddress) address;
                         append(stream, "[");
                         append(stream, getAddressString(internetAddress));
                         append(stream, "] ");
-                        first = false;
+                        break;
                     }
                 }  else {
                     append(stream, "[");

From 2ca05efa12fd171aefcdc46c92bbfc68e99c3ad2 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Fri, 9 Jun 2017 00:08:35 +0300
Subject: [PATCH 22/33] refactoring

---
 .../muse/email/json/ArchiveSaver.java         |  95 +++------
 .../edu/stanford/muse/email/json/Email.java   | 188 ++++++++++++++++++
 .../muse/email/json/EmailNameAgregator.java   |  66 ++++++
 3 files changed, 281 insertions(+), 68 deletions(-)
 create mode 100644 src/java/edu/stanford/muse/email/json/Email.java
 create mode 100644 src/java/edu/stanford/muse/email/json/EmailNameAgregator.java

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index a6b1291..cb96b24 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -1,16 +1,14 @@
 package edu.stanford.muse.email.json;
 
-import edu.stanford.muse.email.AddressBook;
 import edu.stanford.muse.index.Archive;
 import edu.stanford.muse.index.Document;
 import edu.stanford.muse.index.EmailDocument;
-import edu.stanford.muse.webapp.JSPHelper;
-import org.codehaus.plexus.util.StringOutputStream;
 
 import javax.mail.Address;
 import javax.mail.internet.InternetAddress;
 import java.io.*;
 import java.util.List;
+import java.util.Map;
 
 /**
  * Created by sunchise on 04.06.17.
@@ -29,71 +27,47 @@ public void save(Archive archive) {
             throw new RuntimeException(e.getMessage(), e);
         }
         List<Document> allDocs = archive.getAllDocs();
+        EmailNameAgregator emailNameAgregator = new EmailNameAgregator(allDocs);
         int i = 1;
         try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) {
             append(stream, "[");
+            boolean fail = false;
             for (Document doc : allDocs) {
-                if (i > 1) {
+                if (i > 1 && !fail) {
                     append(stream, ",");
                 }
-                append(stream, "{");
-                EmailDocument emailDocument = (EmailDocument) doc;
-                append(stream, "\"emailId\": " + i++ + ",");
-                append(stream, "\"dateField\": \"" + emailDocument.getDate().getTime() / 1000 + "\",");
-                append(stream, "\"isSent\": " + true + ",");
-                append(stream, "\"toField\": [");
-                if (emailDocument.to != null) {
-                    boolean first = true;
-                    for (Address address : emailDocument.to) {
-                        if (!first) {
-                            append(stream, ",");
-                        }
+                fail = false;
+                final EmailDocument emailDocument = (EmailDocument) doc;
+                Email email = new Email(i,
+                        emailDocument.date,
+                        true,
+                        emailDocument.getSubject(),
+                        emailDocument.from == null || emailDocument.from.length == 0 ? null : emailNameAgregator.getName(emailDocument.getFromEmailAddress()),
+                        emailDocument.getFromEmailAddress());
+                if (emailDocument.cc != null) {
+                    for (Address address : emailDocument.cc) {
                         InternetAddress internetAddress = (InternetAddress) address;
-                        append(stream, "[");
-                        append(stream, getAddressString(internetAddress));
-                        append(stream, "]");
-                        first = false;
+                        email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress());
                     }
                 }
-                append(stream, "],");
-                append(stream, "\"ccField\": [");
-                if (emailDocument.cc != null && emailDocument.cc.length != 0) {
-                    boolean first = true;
-                    for (Address address : emailDocument.cc) {
-                        if (!first) {
-                            append(stream, ",");
-                        }
+                if (emailDocument.bcc != null) {
+                    for (Address address : emailDocument.bcc) {
                         InternetAddress internetAddress = (InternetAddress) address;
-                        append(stream, "[");
-                        append(stream, getAddressString(internetAddress));
-                        append(stream, "]");
-                        first = false;
+                        email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress());
                     }
-                } else {
-                    append(stream, "[");
-                    append(stream, "\"ccPlaceholder\",\"ccPlaceholder\"");
-                    append(stream, "]");
                 }
-                append(stream, "],");
-
-                append(stream, "\"fromField\": ");
-                if (emailDocument.from != null && emailDocument.from.length > 0) {
-                    boolean first = true;
-                    for (Address address : emailDocument.from) {
+                if (emailDocument.to != null) {
+                    for (Address address : emailDocument.to) {
                         InternetAddress internetAddress = (InternetAddress) address;
-                        append(stream, "[");
-                        append(stream, getAddressString(internetAddress));
-                        append(stream, "] ");
-                        break;
+                        email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress());
                     }
-                }  else {
-                    append(stream, "[");
-                    append(stream, "\"fromPlaceholder\",\"fromPlaceholder\"");
-                    append(stream, "] ");
                 }
-                append(stream, ",");
-                append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "'").replace("Subject: ", "") + "\"");
-                append(stream, "}");
+                if (email.check()) {
+                    append(stream, email.toJson());
+                } else {
+                    fail = true;
+                }
+                i++;
             }
             append(stream, "]");
             stream.flush();
@@ -104,23 +78,8 @@ public void save(Archive archive) {
 
 
     private void append(Writer stream, String string) throws IOException {
-        string = string.replaceAll("\\s", " ");
-        string = string.replaceAll("\\n", " ");
-        string = string.replaceAll("\\\\", "\\\\\\\\");
-        string = string.replaceAll("\\r", " ");
-        string = string.replaceAll(" {2,}", " ");
-        string = string.replaceAll("\" ", "\"");
-        string = string.replaceAll(" \"", "\"");
-        string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", "");
         string = string.trim();
         stream.append(string);
     }
 
-    private String getAddressString(InternetAddress internetAddress) {
-        String personal = (internetAddress.getPersonal() == null
-                                ? internetAddress.getAddress()
-                                : internetAddress.getPersonal())
-                .replaceAll("\"", "'");
-        return "\"" + personal + "\", \"" + personal + "\"";
-    }
 }
diff --git a/src/java/edu/stanford/muse/email/json/Email.java b/src/java/edu/stanford/muse/email/json/Email.java
new file mode 100644
index 0000000..7fbd499
--- /dev/null
+++ b/src/java/edu/stanford/muse/email/json/Email.java
@@ -0,0 +1,188 @@
+package edu.stanford.muse.email.json;
+
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.mail.Address;
+import javax.mail.internet.InternetAddress;
+import java.io.*;
+import java.util.Calendar;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashSet;
+/*
+{
+    "emailId": 3,
+    "dateField": "1496222800",
+    "isSent": true,
+    "toField": [
+      [
+        "Александр Игоревич",
+        "Александр Игоревич"
+      ]
+    ],
+    "ccField": [
+      [
+        "ccPlaceholder",
+        "ccPlaceholder"
+      ]
+    ],
+    "fromField": [
+      "WWF России",
+      "WWF России"
+    ],
+    "subject": "Барс по имени Крюк"
+  }
+ */
+
+
+public class Email {
+    private final Logger log = LoggerFactory.getLogger(Email.class);
+
+    private final String id;
+
+    private final Date date;
+
+    private final boolean isSent;
+
+    private final Collection<EmailAddress> to = new HashSet<>();
+
+    private final Collection<EmailAddress> cc = new HashSet<>();
+
+    private final EmailAddress from;
+
+    private final String subject;
+
+    private String toJson;
+
+    public Email(String id, Date date, boolean isSent, EmailAddress from, String subject) {
+        this.id = id;
+        // Floor for missing or too-old dates: 1999-09-11, 00:00 in the default time zone.
+        Calendar calendar = Calendar.getInstance();
+        calendar.clear(); // drop the current time-of-day so the floor is the same on every run
+        calendar.set(1999, Calendar.SEPTEMBER, 11);
+        Date minDate = calendar.getTime();
+        if (date == null || minDate.compareTo(date) > 0) {
+            date = minDate;
+        }
+        this.date = date;
+        this.isSent = isSent;
+        this.from = from;
+        this.subject = subject;
+    }
+
+
+    public Email(int id, Date date, boolean isSent, String subject, String fromName, String fromEmail) {
+        this(String.valueOf(id), date, isSent, subject, fromName, fromEmail);
+    }
+
+    public Email(String id, Date date, boolean isSent, String subject, String fromName, String fromEmail) {
+        this(id, date, isSent, new EmailAddress(fromName, fromEmail), subject);
+    }
+
+    public void addTo(EmailAddress emailAddress) {
+        toJson = null;
+        to.add(emailAddress);
+    }
+
+    public void addTo(String name, String email) {
+        toJson = null;
+        addTo(new EmailAddress(name, email));
+    }
+
+    public void addCc(EmailAddress emailAddress) {
+        toJson = null; // invalidate the cached JSON so toJson() rebuilds it
+        cc.add(emailAddress); // CC recipients belong in the cc set (rendered as "ccField"), not in "to"
+    }
+
+    public void addCc(String name, String email) {
+        toJson = null;
+        addCc(new EmailAddress(name, email));
+    }
+
+    public String toJson() {
+        if (toJson == null) {
+            StringBuilder stream = new StringBuilder();
+            stream.append("{");
+            stream.append("\"emailId\": ").append(id).append(",");
+            stream.append("\"dateField\": ").append(date.getTime() / 1000).append(",");
+            stream.append("\"isSent\": ").append(isSent).append(",");
+            stream.append("\"toField\": [");
+            stream.append(to.stream().map(EmailAddress::toJson).reduce((s, s2) -> s + "," + s2).orElse(""));
+            stream.append("],");
+            stream.append("\"ccField\": [");
+            if (cc.isEmpty()) {
+                stream.append(new EmailAddress("ccPlaceholder", "ccPlaceholder").toJson());
+            } else {
+                stream.append(cc.stream().map(EmailAddress::toJson).reduce((s, s2) -> s + "," + s2).orElse(""));
+            }
+            stream.append("],");
+            stream.append("\"fromField\": ");
+            if (from == null) {
+                stream.append(new EmailAddress("fromPlaceholder", "fromPlaceholder").toJson());
+            } else {
+                stream.append(from.toJson());
+            }
+            stream.append(",");
+            stream.append("\"subject\": \"");
+            append(stream, String.valueOf(subject).replaceAll("\"", "'").replace("Subject: ", "")).append("\"");
+            stream.append("}");
+            toJson = stream.toString();
+        }
+        return toJson;
+    }
+
+    public boolean check() {
+        try {
+            new JSONObject(toJson());
+        } catch (JSONException e) {
+            log.error("Not right format of json\n\n" + toJson + "\n\n" + e.getMessage());
+            return false;
+        }
+        return true;
+    }
+
+    public static class EmailAddress {
+        private final String name;
+        private final String email;
+
+        public EmailAddress(String name, String email) {
+            this.name = name;
+            this.email = email;
+        }
+
+        public String getName() {
+            return name == null ? getEmail() : name.replaceAll("\"", "'"); // address fallback gets the same quote escaping as names
+        }
+
+        public String getEmail() {
+            return email.replaceAll("\"", "'");
+        }
+
+        public String toJson() {
+            StringBuilder stream = new StringBuilder();
+            append(stream, "[");
+            append(stream, "\"" + getName() + "\"");
+            append(stream, ",");
+            append(stream, "\"" + getEmail() + "\"");
+            append(stream, "]");
+            return stream.toString();
+        }
+    }
+
+    private static StringBuilder append(StringBuilder stream, String string) {
+        string = string.replaceAll("\\s", " ");
+        string = string.replaceAll("\\n", " ");
+        string = string.replaceAll("\\\\", "\\\\\\\\");
+        string = string.replaceAll("\\r", " ");
+        string = string.replaceAll(" {2,}", " ");
+        string = string.replaceAll("\" ", "\"");
+        string = string.replaceAll(" \"", "\"");
+        string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", "");
+        string = string.trim();
+        stream.append(string);
+        return stream;
+    }
+}
diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
new file mode 100644
index 0000000..04d9038
--- /dev/null
+++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
@@ -0,0 +1,66 @@
+package edu.stanford.muse.email.json;
+
+import edu.stanford.muse.index.Document;
+import edu.stanford.muse.index.EmailDocument;
+
+import javax.mail.Address;
+import javax.mail.internet.InternetAddress;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+public class EmailNameAgregator {
+    private List<Document> allDocs;
+    final Map<String, String> emailNameMap = new HashMap<>();
+
+    public EmailNameAgregator(List<Document> allDocs) {
+        this.allDocs = allDocs;
+        init();
+    }
+
+    private void init() {
+        emailNameMap.clear();
+        allDocs.forEach(document -> {
+            EmailDocument emailDocument = (EmailDocument) document;
+            if (emailDocument.to != null) {
+                for (Address address : emailDocument.to) {
+                    appendToEmailNameMap(emailNameMap, (InternetAddress) address);
+                }
+            }
+            if (emailDocument.cc != null) {
+                for (Address address : emailDocument.cc) {
+                    appendToEmailNameMap(emailNameMap, (InternetAddress) address);
+                }
+            }
+            if (emailDocument.bcc != null) {
+                for (Address address : emailDocument.bcc) {
+                    appendToEmailNameMap(emailNameMap, (InternetAddress) address);
+                }
+            }
+        });
+    }
+
+    public String getName(String email) {
+        return emailNameMap.get(email);
+    }
+
+
+    private void appendToEmailNameMap(Map<String, String> emailNameMap, InternetAddress internetAddress) {
+        String email = internetAddress.getAddress();
+        String personal = internetAddress.getPersonal();
+        String name = emailNameMap.get(email);
+        if (name != null) {
+            if (personal != null && name.length() < personal.length()) {
+                if (personal.contains(" ") || (!name.contains(" "))) {
+                    emailNameMap.put(email, personal);
+                } else {
+                    if (!name.contains(" ") && personal.contains(" ")) {
+                        emailNameMap.put(email, personal);
+                    }
+                }
+            }
+        } else {
+            emailNameMap.put(email, personal);
+        }
+    }
+}

From 2b38c147648d590845ca3b9d076ff067b3d46a3e Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Fri, 9 Jun 2017 00:50:06 +0300
Subject: [PATCH 23/33] fix

---
 .../muse/email/json/EmailNameAgregator.java     | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
index 04d9038..8d0e0e0 100644
--- a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
+++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
@@ -48,15 +48,22 @@ public String getName(String email) {
     private void appendToEmailNameMap(Map<String, String> emailNameMap, InternetAddress internetAddress) {
         String email = internetAddress.getAddress();
         String personal = internetAddress.getPersonal();
+        if (personal == null) {
+            return;
+        }
         String name = emailNameMap.get(email);
         if (name != null) {
-            if (personal != null && name.length() < personal.length()) {
+            if (name.length() < personal.length()) {
                 if (personal.contains(" ") || (!name.contains(" "))) {
                     emailNameMap.put(email, personal);
-                } else {
-                    if (!name.contains(" ") && personal.contains(" ")) {
-                        emailNameMap.put(email, personal);
-                    }
+                }
+            } else if (!name.contains(" ") && personal.contains(" ")) {
+                emailNameMap.put(email, personal);
+            } else if (name.contains(" ") && personal.contains(" ")) {
+                int nameWordsCount = name.split(" ").length;
+                int personalWordsCount = personal.split(" ").length;
+                if (personalWordsCount < 4 && personalWordsCount < nameWordsCount) {
+                    emailNameMap.put(email, personal);
                 }
             }
         } else {

From 4d6437d4e238c49a283691fc1d689bb4c77557cf Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Fri, 9 Jun 2017 02:26:39 +0300
Subject: [PATCH 24/33] fix

---
 src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 4 +++-
 src/java/edu/stanford/muse/email/json/Email.java        | 8 ++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index cb96b24..0c658f8 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -3,6 +3,7 @@
 import edu.stanford.muse.index.Archive;
 import edu.stanford.muse.index.Document;
 import edu.stanford.muse.index.EmailDocument;
+import edu.stanford.muse.util.Util;
 
 import javax.mail.Address;
 import javax.mail.internet.InternetAddress;
@@ -38,7 +39,8 @@ public void save(Archive archive) {
                 }
                 fail = false;
                 final EmailDocument emailDocument = (EmailDocument) doc;
-                Email email = new Email(i,
+                String messageID = Util.hash (emailDocument.getSignature());
+                Email email = new Email(messageID,
                         emailDocument.date,
                         true,
                         emailDocument.getSubject(),
diff --git a/src/java/edu/stanford/muse/email/json/Email.java b/src/java/edu/stanford/muse/email/json/Email.java
index 7fbd499..414845c 100644
--- a/src/java/edu/stanford/muse/email/json/Email.java
+++ b/src/java/edu/stanford/muse/email/json/Email.java
@@ -106,7 +106,7 @@ public String toJson() {
         if (toJson == null) {
             StringBuilder stream = new StringBuilder();
             stream.append("{");
-            stream.append("\"emailId\": ").append(id).append(",");
+            stream.append("\"emailId\": \"").append(id).append("\",");
             stream.append("\"dateField\": ").append(date.getTime() / 1000).append(",");
             stream.append("\"isSent\": ").append(isSent).append(",");
             stream.append("\"toField\": [");
@@ -127,7 +127,11 @@ public String toJson() {
             }
             stream.append(",");
             stream.append("\"subject\": \"");
-            append(stream, String.valueOf(subject).replaceAll("\"", "'").replace("Subject: ", "")).append("\"");
+            String formatedSubject = subject == null ? "Without subject" : String.valueOf(subject).replaceAll("\"", "'").replace("Subject: ", "").trim();
+            if ("null".equals(formatedSubject)) {
+                formatedSubject = "Without subject";
+            }
+            append(stream, formatedSubject).append("\"");
             stream.append("}");
             toJson = stream.toString();
         }

From 0b407fa01dcebfdb66ea9c692dc14cf87609c9dc Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Sun, 11 Jun 2017 01:07:44 +0300
Subject: [PATCH 25/33] fix

---
 .../muse/email/json/ArchiveSaver.java         | 17 ++++++-
 .../muse/email/json/EmailNameAgregator.java   | 49 +++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index 0c658f8..c3f29c1 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -8,6 +8,7 @@
 import javax.mail.Address;
 import javax.mail.internet.InternetAddress;
 import java.io.*;
+import java.util.Base64;
 import java.util.List;
 import java.util.Map;
 
@@ -16,8 +17,20 @@
  */
 public class ArchiveSaver {
 
+    private final String archiveName; // archive title; save() Base64-encodes it to name the output folder
+
+    public ArchiveSaver(String archiveName) {
+        this.archiveName = archiveName; // stored only; no file is touched until save() runs
+    }
+
     public void save(Archive archive) {
-        String fileName = System.getProperty("user.home") + File.separator + "archive.json";
+        String folderName = new String(Base64.getEncoder().encode(archiveName.getBytes()));
+        String folderPath = System.getProperty("user.home") + File.separator + folderName;
+        File folder = new File(folderPath);
+        if (!folder.exists()) {
+            folder.mkdir();
+        }
+        String fileName = folderPath + File.separator + "archive.json";
         File file = new File(fileName);
         if (file.exists()) {
             file.delete();
@@ -29,6 +42,7 @@ public void save(Archive archive) {
         }
         List<Document> allDocs = archive.getAllDocs();
         EmailNameAgregator emailNameAgregator = new EmailNameAgregator(allDocs);
+        emailNameAgregator.save(folderPath + File.separator + "email-names.json");
         int i = 1;
         try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) {
             append(stream, "[");
@@ -73,6 +87,7 @@ public void save(Archive archive) {
             }
             append(stream, "]");
             stream.flush();
+            stream.close();
         } catch (IOException e) {
             throw new RuntimeException(e.getMessage(), e);
         }
diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
index 8d0e0e0..16cf9d7 100644
--- a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
+++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
@@ -1,10 +1,14 @@
 package edu.stanford.muse.email.json;
 
+import com.google.gson.Gson;
+import com.google.gson.reflect.TypeToken;
 import edu.stanford.muse.index.Document;
 import edu.stanford.muse.index.EmailDocument;
+import org.json.JSONObject;
 
 import javax.mail.Address;
 import javax.mail.internet.InternetAddress;
+import java.io.*;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -18,6 +22,21 @@ public EmailNameAgregator(List<Document> allDocs) {
         init();
     }
 
+
+    /**
+     * Creates an aggregator over {@code allDocs}. When {@code fileName} is non-null and names an
+     * existing file, the map entries are read from it via load(); in every other case init() runs.
+     */
+    public EmailNameAgregator(List<Document> allDocs, String fileName) {
+        this.allDocs = allDocs;
+        boolean hasSavedMap = fileName != null && new File(fileName).exists();
+        if (hasSavedMap) {
+            load(fileName);
+        } else {
+            init();
+        }
+    }
+
     private void init() {
         emailNameMap.clear();
         allDocs.forEach(document -> {
@@ -70,4 +89,34 @@ private void appendToEmailNameMap(Map<String, String> emailNameMap, InternetAddr
             emailNameMap.put(email, personal);
         }
     }
+
+    public void save(String fileName) {
+        File file = new File(fileName);
+        if (file.exists()) {
+            file.delete(); // drop any previous dump so the map is written from scratch
+        }
+        JSONObject json = new JSONObject(emailNameMap);
+        try (Writer writer = new FileWriter(file)) {
+            // Writes emailNameMap as one JSON object; try-with-resources flushes and closes the writer.
+            json.write(writer);
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Reads the JSON object stored in {@code fileName} and merges its entries into emailNameMap.
+     * A missing or unreadable file surfaces as a RuntimeException wrapping the IOException.
+     */
+    public void load(String fileName) {
+        // try-with-resources closes the reader on every path, so no explicit close() is needed
+        try (FileReader fileReader = new FileReader(fileName)) {
+            Map<String, String> tempMap = new Gson().fromJson(fileReader, new TypeToken<Map<String, String>>() {}.getType());
+            if (tempMap != null) { // fromJson may return null, e.g. for empty input
+                emailNameMap.putAll(tempMap);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+    }
 }

From 384e97e784d57f9334b3adef39ac78ddda11e1c5 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Tue, 13 Jun 2017 23:08:52 +0300
Subject: [PATCH 26/33] pass the archive title to ArchiveSaver

---
 src/java/edu/stanford/muse/email/EmailFetcherThread.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index db983f8..63a520a 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -1366,7 +1366,7 @@ public void run() {
                 }
                 log.info("Read #" + nMessages + " messages in  in " + (System.currentTimeMillis() - st) + "ms");
             }
-            new ArchiveSaver().save(archive);
+            new ArchiveSaver(archive.archiveTitle).save(archive);
         } catch (Throwable t) {
             if (t instanceof OutOfMemoryError)
                 this.mayHaveRunOutOfMemory = true;

From c704c374300c80c64e6f13229d6e780a66b8084f Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Thu, 15 Jun 2017 01:30:46 +0300
Subject: [PATCH 27/33] bump tika to 1.15 and pdfbox to 2.0.6

---
 pom-common.xml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pom-common.xml b/pom-common.xml
index 3e39c2a..63c659f 100755
--- a/pom-common.xml
+++ b/pom-common.xml
@@ -164,17 +164,17 @@
 		<dependency>
 			<groupId>org.apache.tika</groupId>
 			<artifactId>tika-parsers</artifactId>
-			<version>1.14</version>
+			<version>1.15</version>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.tika</groupId>
 			<artifactId>tika-core</artifactId>
-			<version>1.14</version>
+			<version>1.15</version>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.pdfbox</groupId>
 			<artifactId>pdfbox</artifactId>
-			<version>1.8.1</version>
+			<version>2.0.6</version>
 		</dependency>
 
 		<dependency>

From f2c91f00f1820b23c33f3ef0a251c84a42ad30aa Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Thu, 15 Jun 2017 01:32:57 +0300
Subject: [PATCH 28/33] log tika parse failures, use pdfbox 2 PDFToImage, allow empty multipart

---
 src/java/edu/stanford/muse/datacache/Blob.java     | 14 +++++++++++---
 .../edu/stanford/muse/datacache/BlobStore.java     |  2 +-
 .../stanford/muse/email/EmailFetcherThread.java    |  1 +
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/java/edu/stanford/muse/datacache/Blob.java b/src/java/edu/stanford/muse/datacache/Blob.java
index 19150d0..e952541 100755
--- a/src/java/edu/stanford/muse/datacache/Blob.java
+++ b/src/java/edu/stanford/muse/datacache/Blob.java
@@ -19,12 +19,14 @@
 import edu.stanford.muse.util.Util;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -132,9 +134,15 @@ public Pair<String, String> getContent(BlobStore store)
 		try {
 			// skip mp3 files, tika has trouble with it and hangs
 			if (!Util.nullOrEmpty(this.filename) && !this.filename.toLowerCase().endsWith(".mp3"))
-				parser.parse(stream, handler, metadata, context);
-	
-		    String[] names = metadata.names();
+				try {
+					parser.parse(stream, handler, metadata, context);
+				} catch (Exception e) {
+					log.error(e.getMessage(), e);
+					log.error(filename);
+					throw new RuntimeException(e.getMessage(), e);
+			}
+
+			String[] names = metadata.names();
 		    //Arrays.sort(names);
 		    for (String name : names) {
 		    	// some metadata tags are problematic and result in large hex strings... ignore them. (caused memory problems with Henry's archive)
diff --git a/src/java/edu/stanford/muse/datacache/BlobStore.java b/src/java/edu/stanford/muse/datacache/BlobStore.java
index 960d22b..44c8daf 100755
--- a/src/java/edu/stanford/muse/datacache/BlobStore.java
+++ b/src/java/edu/stanford/muse/datacache/BlobStore.java
@@ -468,7 +468,7 @@ public void generate_thumbnail(Blob b) throws IOException {
                 tnFilename = tmp_filename.substring(0, tmp_filename.length() - ".pdf".length()); // strip the ".pdf"
                 tnFilename += "1.png";
                 String[] args = new String[]{"-imageType", "png", "-startPage", "1", "-endPage", "1", tmp_filename};
-                org.apache.pdfbox.PDFToImage.main(args);
+                org.apache.pdfbox.tools.PDFToImage.main(args);
                 log.info("Saving PDF thumbnail to " + tnFilename);
                 filename = filename + ".png"; // make sure the suffix for the thumbnail is named with a .png suffix in the cache
             } catch (Throwable e) {
diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index 63a520a..8c18a09 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -504,6 +504,7 @@ private List<String> processMessagePart(int messageNum, Message m, Part p, List<
             // rfc822 mime type is for embedded mbox format or some such (appears for things like
             // forwarded messages). the content appears to be just a multipart.
             Object o = p.getContent();
+            System.setProperty("mail.mime.multipart.allowempty", "true");
             if (o instanceof Multipart) {
                 Multipart allParts = (Multipart) o;
                 if (p.isMimeType("multipart/alternative")) {

From 31ff59977622fb9222d344dbf29704f28f9d3242 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Sun, 18 Jun 2017 23:40:50 +0300
Subject: [PATCH 29/33] do not forget emails

---
 src/java/edu/stanford/muse/email/EmailFetcherThread.java | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index 8c18a09..52168ea 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -1297,7 +1297,9 @@ public void run() {
                 int b;
                 for (b = 0; b < nbatches + 1; b++) {
                     begin_msg_index = b * BATCH + 1;
-                    end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1;
+                    end_msg_index = Math.min((b + 1) * BATCH, nMessages);
+                    log.info("begin_msg_index: " + begin_msg_index);
+                    log.info("end_msg_index: " + end_msg_index);
                     log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages);
                     Message[] messages = openFolderAndGetMessages();
                     currentStatus = JSONUtils.getStatusJSON("");

From bcbb7cbe4319ea4c2b7e20d5e721b5fcc8604b1e Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Wed, 21 Jun 2017 22:07:38 +0300
Subject: [PATCH 30/33] save archive data in special folder

---
 src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
index c3f29c1..9fd3abf 100644
--- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
+++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java
@@ -25,11 +25,16 @@ public ArchiveSaver(String archiveName) {
 
     public void save(Archive archive) {
         String folderName = new String(Base64.getEncoder().encode(archiveName.getBytes()));
-        String folderPath = System.getProperty("user.home") + File.separator + folderName;
+        String folderPath = System.getProperty("user.home") + File.separator + "epadd-data";
         File folder = new File(folderPath);
         if (!folder.exists()) {
             folder.mkdir();
         }
+        folderPath += File.separator + folderName;
+        folder = new File(folderPath);
+        if (!folder.exists()) {
+            folder.mkdir();
+        }
         String fileName = folderPath + File.separator + "archive.json";
         File file = new File(fileName);
         if (file.exists()) {

From bfb5eb711c5ba0738b95faf85f20fb732e07fe19 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Thu, 22 Jun 2017 22:26:23 +0300
Subject: [PATCH 31/33] eliminate wild chars

---
 .../muse/email/json/EmailNameAgregator.java         | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
index 16cf9d7..2006563 100644
--- a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
+++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
@@ -70,6 +70,7 @@ private void appendToEmailNameMap(Map<String, String> emailNameMap, InternetAddr
         if (personal == null) {
             return;
         }
+        personal = removeWildChars(personal);
         String name = emailNameMap.get(email);
         if (name != null) {
             if (name.length() < personal.length()) {
@@ -90,6 +91,18 @@ private void appendToEmailNameMap(Map<String, String> emailNameMap, InternetAddr
         }
     }
 
+    private String removeWildChars(String string) {
+        // \s already matches \n and \r, so this single pass turns every whitespace character into a space
+        string = string.replaceAll("\\s", " ");
+        // doubles each backslash (presumably so the name stays valid inside the JSON dump; TODO confirm)
+        string = string.replaceAll("\\\\", "\\\\\\\\");
+        string = string.replaceAll(" {2,}", " ");
+        string = string.replaceAll("\" ", "\"");
+        string = string.replaceAll(" \"", "\"");
+        string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", "");
+        return string.trim();
+    }
+
     public void save(String fileName) {
         File file = new File(fileName);
         if (file.exists()) {

From 4d113338c6110d2e57cbffe27731f04ba3311017 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Sun, 2 Jul 2017 21:38:38 +0300
Subject: [PATCH 32/33] strip a trailing apostrophe from cleaned names

---
 src/java/edu/stanford/muse/email/json/EmailNameAgregator.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
index 2006563..1090ec9 100644
--- a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
+++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java
@@ -100,6 +100,9 @@ private String removeWildChars(String string) {
         string = string.replaceAll("\" ", "\"");
         string = string.replaceAll(" \"", "\"");
         string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", "");
+        if (string.endsWith("'")) {
+            string = string.substring(0, string.length() - 1);
+        }
         return string.trim();
     }
 

From 45fde7a551194dcda2dfd4008318fd712b0479c7 Mon Sep 17 00:00:00 2001
From: arostov <arostov83@gmail.com>
Date: Sun, 2 Jul 2017 22:29:01 +0300
Subject: [PATCH 33/33] error catching

---
 .../edu/stanford/muse/email/EmailFetcherThread.java  | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
index 52168ea..e89a7ef 100755
--- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java
+++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java
@@ -1101,7 +1101,17 @@ private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset
                     }
 
                     if (contents == null)
-                        contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList);
+                        try {
+                            contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList);
+                        } catch (Exception e) {
+                            log.error(e.getMessage(), e);
+                            try {
+                                log.error("MessageId: " + originalMessage.getMessageID());
+                            } catch (MessagingException e1) {
+                                log.error(e.getMessage(), e);
+                            }
+                            throw e;
+                        }
 
                     // if mm is not prefetched, it is the same as original_mm
                     // will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version