From a477d8d3aed29dc28cc50433c92b0b21cf0450f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?= =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= Date: Fri, 5 May 2017 02:33:52 +0300 Subject: [PATCH 01/33] encoding --- .../stanford/muse/email/EmailFetcherThread.java | 16 +++++++++++----- .../edu/stanford/muse/webapp/EmailRenderer.java | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index ebff296..e09caab 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -472,12 +472,18 @@ private List processMessagePart(int messageNum, Message m, Part p, List< String content; String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8"); try { - // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us - if (FORCED_ENCODING != null) { + if (type.contains("charset=")) { byte b[] = Util.getBytesFromStream(p.getInputStream()); - content = new String(b, FORCED_ENCODING); - } else - content = (String) p.getContent(); + content = new String(b, type.substring(type.indexOf("charset=") + "charset=".length())); + } else { + // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us + if (FORCED_ENCODING != null) { + byte b[] = Util.getBytesFromStream(p.getInputStream()); + content = new String(b, FORCED_ENCODING); + } else { + content = (String) p.getContent(); + } + } } catch (UnsupportedEncodingException uee) { dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion"); // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers. diff --git a/src/java/edu/stanford/muse/webapp/EmailRenderer.java b/src/java/edu/stanford/muse/webapp/EmailRenderer.java index 6ffa3b2..f413e3b 100755 --- a/src/java/edu/stanford/muse/webapp/EmailRenderer.java +++ b/src/java/edu/stanford/muse/webapp/EmailRenderer.java @@ -168,7 +168,7 @@ public static String formatAddressesAsHTML(Address addrs[], AddressBook addressB InternetAddress ia = (InternetAddress) a; Pair p = JSPHelper.getNameAndURL((InternetAddress) a, addressBook); String url = p.getSecond(); - String str = ia.toString(); + String str = ia.getPersonal() == null ? ia.getAddress() : ia.getPersonal() + "<" + ia.getAddress() + ">"; String addr = ia.getAddress(); boolean match = false; if(str!=null) { From dcb6f6f812e8f0dbd4e94c7549ebf4831238a8aa Mon Sep 17 00:00:00 2001 From: Gleb Suvorov Date: Fri, 5 May 2017 02:52:47 +0300 Subject: [PATCH 02/33] cyrillic encoding fix --- .../muse/email/EmailFetcherThread.java | 2891 +++++++++-------- .../stanford/muse/webapp/EmailRenderer.java | 1072 +++--- 2 files changed, 1984 insertions(+), 1979 deletions(-) diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index ebff296..89945fa 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -1,1443 +1,1448 @@ -/* - * Copyright (C) 2012 The Stanford MobiSocial Laboratory - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package edu.stanford.muse.email; - -import com.sun.mail.imap.IMAPFolder; -import edu.stanford.muse.datacache.Blob; -import edu.stanford.muse.index.*; -import edu.stanford.muse.util.EmailUtils; -import edu.stanford.muse.util.JSONUtils; -import edu.stanford.muse.util.Util; -import edu.stanford.muse.webapp.HTMLUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.json.JSONArray; -import org.json.JSONException; -import org.json.JSONObject; -import org.jsoup.Jsoup; - -import javax.activation.DataHandler; -import javax.activation.DataSource; -import javax.mail.*; -import javax.mail.internet.AddressException; -import javax.mail.internet.InternetAddress; -import javax.mail.internet.MimeMessage; -import java.io.*; -import java.security.GeneralSecurityException; -import java.util.*; - -class EmailFetcherStats implements Cloneable, Serializable { - private final static long serialVersionUID = 1L; - - int nTotalMessages; // total # of messages to process - int nMessagesAdded; // running total messages newly added to the archive - int nMessagesAlreadyPresent; // running messages that were already present - int nErrors = 0; - int nMessagesFiltered = 0; - - public void merge(EmailFetcherStats other) { - this.nMessagesAdded += other.nMessagesAdded; - this.nMessagesAlreadyPresent += other.nMessagesAlreadyPresent; - this.nMessagesFiltered += other.nMessagesFiltered; - this.nErrors += other.nErrors; - this.nTotalMessages += other.nTotalMessages; - } - - public String toString() { - return Util.fieldsToString(this); - } -} - -/** - * Important class for importing email. - * implements an email fetcher for a range of message #s within a single folder. - * In contrast, MTEmailFetcher is responsible for an entire email account, including multiple folders. - * and MuseEmailFetcher is responsible for multiple accounts (but for a single user) - * email fetcher stats is associated with a single email fetcher - */ -public class EmailFetcherThread implements Runnable, Serializable { - private final static long serialVersionUID = 1L; - - public static final int IMAP_PREFETCH_BUFSIZE = 20 * 1024 * 1024; - /* used for buffering imap prefetch data -- necessary for good imap performance*/ - public static final String FORCED_ENCODING = "UTF-8"; - - public static Log log = LogFactory.getLog(EmailFetcherThread.class); - - // set up INVALID_DATE - public static Date INVALID_DATE; // like 0xdeadbeef - - static { - Calendar c = new GregorianCalendar(); - c.set(Calendar.YEAR, 1960); - c.set(Calendar.DAY_OF_MONTH, 1); - c.set(Calendar.MONTH, Calendar.JANUARY); - c.set(Calendar.HOUR_OF_DAY, 0); - c.set(Calendar.MINUTE, 0); - c.set(Calendar.SECOND, 0); - c.set(Calendar.MILLISECOND, 0); - INVALID_DATE = c.getTime(); - } - - private FetchConfig fetchConfig; - private boolean mayHaveRunOutOfMemory = false; - private FolderInfo fetchedFolderInfo; - transient Folder folder; - boolean use_uid_if_available; - - protected int threadID; - protected EmailStore emailStore; - - protected boolean isCancelled; - - public static boolean verbose = false; - public static boolean debug = false; - - // notes: begin_msg_index is always correct. end_msg_index = -1 means nMessages in folder. - // note: msg # begin_msg_index will be processed. msg # end_msg_index will not be processed. - protected int begin_msg_index = 0, end_msg_index = -1; - - EmailFetcherStats stats = new EmailFetcherStats(); - String currentStatus; - - - int totalMessagesInFetch, messagesCompletedInFetch; // this fetcher may be part of a bigger fetch operation. we need to track the progress of the bigger fetch in order to track progress accurately. - - public int getTotalMessagesInFetch() { - return totalMessagesInFetch; - } - - public void setTotalMessagesInFetch(int totalMessagesInFetch) { - this.totalMessagesInFetch = totalMessagesInFetch; - } - - public int getMessagesCompletedInFetch() { - return messagesCompletedInFetch; - } - - public void setMessagesCompletedInFetch(int messagesCompletedInFetch) { - this.messagesCompletedInFetch = messagesCompletedInFetch; - } - - // stats - int nMessagesProcessedSuccess, nUncachedMessagesProcessed, nMessagesCached; // running count of # of messages processed successfully - int nErrors = 0; - - public void cancel() { - isCancelled = true; - } - - public void setFetchConfig(FetchConfig fc) { - this.fetchConfig = fc; - } - - public int getThreadID() { - return threadID; - } - - public void setThreadID(int threadID) { - this.threadID = threadID; - } - - public int getNMessagesProcessed() { - return nMessagesProcessedSuccess; - } - - public int getNUncachedMessagesProcessed() { - return nUncachedMessagesProcessed; - } - - protected String folder_name() { - return fetchedFolderInfo.longName; - } - - protected String email_source() { - return fetchedFolderInfo.accountKey; - } - - public boolean mayHaveRunOutOfMemory() { - return mayHaveRunOutOfMemory; - } - - // private String folderPrefix; // prefix for folder files - transient Store store; // we don't really need this serialized across sessions - - transient Archive archive; - Collection dataErrors = new LinkedHashSet(); // log of input data errors - - Date prevDate = null; - - /* - * // comment out unused constructors, so it's cleaner/easier to trace the - * setting member fields. - * public EmailFetcherThread() { super(); } - * - * public EmailFetcherThread(EmailStore store, String folder_name) - * { - * this.emailStore = store; - * this.folder_name = folder_name; - * } - */ - - public EmailFetcherThread(EmailStore store, FolderInfo fi, int begin_msg_index, int end_msg_index) { - this.emailStore = store; - this.fetchedFolderInfo = fi; - stats.nTotalMessages = end_msg_index - begin_msg_index; - this.begin_msg_index = begin_msg_index; - this.end_msg_index = end_msg_index; - } - - public void setArchive(Archive a) { - archive = a; - } - - public Archive getArchive() { - return archive; - } - - /** - * merges results with another email fetcher. does some lightweight work - * including updating stats. consider removing this and simplifying in the - * future - */ - public void merge(EmailFetcherThread other) { - verify(); - if (other != null) { - other.verify(); - - // TOFIX: we should eliminate duplicates - dataErrors.addAll(other.dataErrors); - stats.merge(other.stats); - - nMessagesProcessedSuccess += other.nMessagesProcessedSuccess; - nErrors += other.nErrors; - mayHaveRunOutOfMemory |= other.mayHaveRunOutOfMemory; - } - verify(); - } - - /** - * intern a bunch of addrs, to save memory - * - * @throws UnsupportedEncodingException - */ - private static void internAddressList(Address[] addrs) throws UnsupportedEncodingException { - if (addrs == null) - return; - - for (Address a : addrs) { - if (a instanceof InternetAddress) { - InternetAddress ia = (InternetAddress) a; - String address = ia.getAddress(), personal = ia.getPersonal(); - if (address != null) - ia.setAddress(InternTable.intern(address)); - if (personal != null) - ia.setPersonal(InternTable.intern(personal)); - } - } - } - - /** - * Key method for importing email: converts a javamail obj. to our own data structure (EmailDocument) - */ - //public EmailDocument convertToEmailDocument(MimeMessage m, int num, String url) throws MessagingException, IOException - private EmailDocument convertToEmailDocument(MimeMessage m, String id) throws MessagingException, IOException { - // get the date. - // prevDate is a hack for the cases where the message is lacking an explicit Date: header. e.g. - // From hangal Sun Jun 10 13:46:46 2001 - // To: ewatkins@stanford.edu - // Subject: Re: return value bugs - // though the date is on the From separator line, the mbox provider fails to parse it and provide it to us. - // so as a hack, we will assign such messages the same date as the previous one this fetcher has seen! ;-) - // update: having the exact same date causes the message to be considered a duplicate, so just increment - // the timestamp it by 1 millisecond! - // a better fix would be to improve the parsing in the provider - - boolean hackyDate = false; - Date d = m.getSentDate(); - if (d == null) - d = m.getReceivedDate(); - if (d == null) { - if (prevDate != null) { - long newTime = prevDate.getTime() + 1L; // added +1 so that this email is not considered the same object as the prev. one if they are in the same thread - d = new Date(newTime); - dataErrors.add("No date for message id:" + id + ": " + EmailUtils.formatMessageHeader(m) + " assigned approximate date"); - } else { - d = INVALID_DATE; // wrong, but what can we do... :-( - dataErrors.add("No date for message id:" + id + ": " + EmailUtils.formatMessageHeader(m) + " assigned deliberately invalid date"); - } - hackyDate = true; - } else { - Calendar c = new GregorianCalendar(); - c.setTime(d); - int yy = c.get(Calendar.YEAR); - if (yy < 1960 || yy > 2020) { - dataErrors.add("Probably bad date: " + Util.formatDate(c) + " message: " + EmailUtils.formatMessageHeader(m)); - hackyDate = true; - } - } - - if (hackyDate && prevDate != null) { - long newTime = prevDate.getTime() + 1L; // added +1 so that this email is not considered the same object as the prev. one if they are in the same thread - d = new Date(newTime); - Util.ASSERT(!d.equals(prevDate)); - } - - Calendar c = new GregorianCalendar(); - c.setTime(d != null ? d : new Date()); - - prevDate = d; - - Address to[] = null, cc[] = null, bcc[] = null; - Address[] from = null; - try { - // allrecip = m.getAllRecipients(); // turns out to be too expensive because it looks for newsgroup headers for imap - // assemble to, cc, bcc into a list and copy it into allrecip - List
list = new ArrayList
(); - from = m.getFrom(); - to = m.getRecipients(Message.RecipientType.TO); - if (to != null) - list.addAll(Arrays.asList(to)); - cc = m.getRecipients(Message.RecipientType.CC); - if (cc != null) - list.addAll(Arrays.asList(cc)); - bcc = m.getRecipients(Message.RecipientType.BCC); - if (bcc != null) - list.addAll(Arrays.asList(bcc)); - - // intern the strings in these addresses to save memory cos they are repeated often in a large archive - internAddressList(from); - internAddressList(to); - internAddressList(cc); - internAddressList(bcc); - } catch (AddressException ae) { - String s = "Bad address in folder " + folder_name() + " message id" + id + " " + ae; - dataErrors.add(s); - } - - // take a deep breath. This object is going to live longer than most of us. - EmailDocument ed = new EmailDocument(id, email_source(), folder_name(), to, cc, bcc, from, m.getSubject(), m.getMessageID(), c.getTime()); - - String[] headers = m.getHeader("List-Post"); - if (headers != null && headers.length > 0) { - // trim the headers because they usually look like: "" - ed.sentToMailingLists = new String[headers.length]; - int i = 0; - for (String header : headers) { - header = header.trim(); - header = header.toLowerCase(); - - if (header.startsWith("<") && header.endsWith(">")) - header = header.substring(1, header.length() - 1); - if (header.startsWith("mailto:") && !"mailto:".equals(header)) // defensive check in case header == "mailto:" - header = header.substring(("mailto:").length()); - ed.sentToMailingLists[i++] = header; - } - } - if (hackyDate) { - String s = "Guessed date " + Util.formatDate(c) + " for message id: " + id + ": " + ed.getHeader(); - dataErrors.add(s); - ed.hackyDate = true; - } - - // check if the message has attachments. - // if it does and we're not downloading attachments, then we mark the ed as such. - // otherwise we had a problem where a message header (and maybe text) was downloaded but without attachments in one run - // but in a subsequent run where attachments were needed, we thought the message was already cached and there was no - // need to recompute it, leaving the attachments field in this ed incorrect. - List attachmentNames = getAttachmentNames(m, m); - if (!Util.nullOrEmpty(attachmentNames)) { - ed.attachmentsYetToBeDownloaded = true; // will set it to false later if attachments really were downloaded (not sure why) - // log.info ("added " + attachmentNames.size() + " attachments to message: " + ed); - } - return ed; - } - - /* - * we try to get the attachment names cheaply, i.e. without having to - * process the whole message - */ - private List getAttachmentNames(MimeMessage m, Part p) throws MessagingException, IOException { - List result = new ArrayList(); - try { - if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) { - if (p.isMimeType("multipart/alternative")) - return result; // ignore alternative's because real attachments don't have alternatives - DataHandler dh = p.getDataHandler(); - DataSource ds = dh.getDataSource(); - if (ds instanceof MultipartDataSource) { - MultipartDataSource mpds = (MultipartDataSource) ds; - for (int i = 0; i < mpds.getCount(); i++) - result.addAll(getAttachmentNames(m, mpds.getBodyPart(i))); - } else { - String name = ds.getName(); - if (!Util.nullOrEmpty(name)) - result.add(name); - } - } else { - String filename = p.getFileName(); - if (filename != null) - result.add(filename); - } - } catch (Exception e) { - // sometimes we see javax.mail.MessagingException: Unable to load BODYSTRUCTURE - // in this case, just ignore, not much we can do i guess. - Util.print_exception(e, log); - } - return result; - } - - // public void setEmailCache (DocCache cache) - // { - // this.cache = cache; - // } - - /** - * this method returns the text content of the message as a list of strings - * // each element of the list could be the content of a multipart message - * // m is the top level subject - * // p is the specific part that we are processing (p could be == m) - * also sets up names of attachments (though it will not download the - * attachment unless downloadAttachments is true) - */ - private List processMessagePart(int messageNum, Message m, Part p, List attachmentsList) throws MessagingException, IOException { - List list = new ArrayList(); // return list - if (p == null) { - dataErrors.add("part is null: " + folder_name() + " idx " + messageNum); - return list; - } - - if (p == m && p.isMimeType("text/html")) { - /* - String s = "top level part is html! message:" + m.getSubject() + " " + m.getDescription(); - dataErrors.add(s); - */ - // we don't normally expect the top-level part to have content-type text/html - // but we saw this happen on some sample archives pst -> emailchemy. so allow it and handle it by parsing the html - String html = (String) p.getContent(); - String text = Util.unescapeHTML(html); - org.jsoup.nodes.Document doc = Jsoup.parse(text); - - StringBuilder sb = new StringBuilder(); - HTMLUtils.extractTextFromHTML(doc.body(), sb); - list.add(sb.toString()); - return list; - } - - if (p.isMimeType("text/plain")) { - //make sure, p is not wrongly labelled as plain text. - Enumeration headers = p.getAllHeaders(); - boolean dirty = false; - if (headers != null) - while (headers.hasMoreElements()) { - Header h = (Header) headers.nextElement(); - String name = h.getName(); - String value = h.getValue(); - if (name != null && value != null) { - if (name.equals("Content-transfer-encoding") && value.equals("base64")) { - dirty = true; - break; - } - } - } - String fname = p.getFileName(); - if (fname != null) { - int idx = fname.lastIndexOf('.'); - if ((idx < fname.length()) && (idx >= 0)) { - String extension = fname.substring(idx); - //anything extension other than .txt is suspicious. - if (!extension.equals(".txt")) - dirty = true; - } - } - if (dirty) { - dataErrors.add("Dirty message part, has conflicting message part headers." + folder_name() + " Message# " + messageNum); - return list; - } - - log.debug("Message part with content type text/plain"); - String content; - String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8"); - try { - // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us - if (FORCED_ENCODING != null) { - byte b[] = Util.getBytesFromStream(p.getInputStream()); - content = new String(b, FORCED_ENCODING); - } else - content = (String) p.getContent(); - } catch (UnsupportedEncodingException uee) { - dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion"); - // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers. - // we're using the workaround suggested on this page: http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4304013 - // though it may be better to consider official support for utf-7 or other encodings. - - // TOFIX: I get an exception for utfutf8-encoding which has a base64 encoding embedded on it. - // Unsupported encoding: gmail-sent Message #10477 type text/plain; charset=x-utf8utf8; name="newyorker.txt", - // the hack below doesn't work for it. - ByteArrayOutputStream bao = new ByteArrayOutputStream(); - p.writeTo(bao); - content = bao.toString(); - } - list.add(content); - } else if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) { - // rfc822 mime type is for embedded mbox format or some such (appears for things like - // forwarded messages). the content appears to be just a multipart. - Object o = p.getContent(); - if (o instanceof Multipart) { - Multipart allParts = (Multipart) o; - if (p.isMimeType("multipart/alternative")) { - // this is an alternative mime type. v common case to have text and html alternatives - // so just process the text part if there is one, and avoid fetching the alternatives. - // useful esp. because many ordinary messages are alternative: text and html and we don't want to fetch the html. - // revisit in future we want to retain the html alternative for display purposes - Part[] parts = new Part[allParts.getCount()]; - for (int i = 0; i < parts.length; i++) - parts[i] = allParts.getBodyPart(i); - - for (int i = 0; i < parts.length; i++) { - Part thisPart = parts[i]; - if (thisPart.isMimeType("text/plain")) { - // common case, return quickly - list.add((String) thisPart.getContent()); - log.debug("Multipart/alternative with content type text/plain"); - return list; - } - } - - // no text part, let's look for an html part. this happens for html parts. - for (int i = 0; i < allParts.getCount(); i++) { - Part thisPart = parts[i]; - if (thisPart.isMimeType("text/html")) { - // common case, return quickly - String html = (String) thisPart.getContent(); - String text = Util.unescapeHTML(html); - org.jsoup.nodes.Document doc = Jsoup.parse(text); - - StringBuilder sb = new StringBuilder(); - HTMLUtils.extractTextFromHTML(doc.body(), sb); - list.add(sb.toString()); - - log.debug("Multipart/alternative with content type text/html"); - return list; - } - } - - // no text or html part. hmmm... blindly process the first part only - if (allParts.getCount() >= 1) - list.addAll(processMessagePart(messageNum, m, allParts.getBodyPart(0), attachmentsList)); - } else { - // process it like a regular multipart - for (int i = 0; i < allParts.getCount(); i++) { - BodyPart bp = allParts.getBodyPart(i); - list.addAll(processMessagePart(messageNum, m, bp, attachmentsList)); - } - } - } else if (o instanceof Part) - list.addAll(processMessagePart(messageNum, m, (Part) o, attachmentsList)); - else - dataErrors.add("Unhandled part content, " + folder_name() + " Message #" + messageNum + "Java type: " + o.getClass() + " Content-Type: " + p.getContentType()); - } else { - try { - // do attachments only if downloadAttachments is set. - // some apps do not need attachments, so this saves some time. - // however, it seems like a lot of time is taken in imap prefetch, which gets attachments too? - if (fetchConfig.downloadAttachments) - handleAttachments(messageNum, m, p, list, attachmentsList); - } catch (Exception e) { - dataErrors.add("Ignoring attachment for " + folder_name() + " Message #" + messageNum + ": " + Util.stackTrace(e)); - } - } - - return list; - } - - /** - * recursively processes attachments, fetching and saving it if needed - * parses the given part p, and adds it to hte attachmentsList. - * in some cases, like a text/html type without a filename, we instead append it to the textlist - * @throws MessagingException - */ - private void handleAttachments(int idx, Message m, Part p, List textList, List attachmentsList) throws MessagingException { - String ct = null; - if (!(m instanceof MimeMessage)) { - Exception e = new IllegalArgumentException("Not a MIME message!"); - e.fillInStackTrace(); - log.warn(Util.stackTrace(e)); - return; - } - - String filename = null; - try { - filename = p.getFileName(); - } catch (Exception e) { - // seen this happen with: - // Folders__gmail-sent Message #12185 Expected ';', got "Message" - // javax.mail.internet.ParseException: Expected ';', got "Message" - - dataErrors.add("Unable to read attachment name: " + folder_name() + " Message# " + idx); - return; - } - - String sanitizedFName = Util.sanitizeFolderName(emailStore.getAccountID() + "." + folder_name()); - if (filename == null) { - String tempFname = sanitizedFName + "." + idx; - dataErrors.add("attachment filename is null for " + sanitizedFName + " Message#" + idx + " assigning it the name: " + tempFname); - if (p.isMimeType("text/html")) { - try { - log.info("Turning message " + sanitizedFName + " Message#" + idx + " into text although it is an attachment"); - String html = (String) p.getContent(); - String text = Util.unescapeHTML(html); - org.jsoup.nodes.Document doc = Jsoup.parse(text); - - StringBuilder sb = new StringBuilder(); - HTMLUtils.extractTextFromHTML(doc.body(), sb); - textList.add(sb.toString()); - return; - } catch (Exception e) { - Util.print_exception("Error reading contents of text/html multipart without a filename!", e, log); - return; - } - } - filename = tempFname; - } - - // Replacing any of the disallowed filename characters (\/:*?"<>|&) to _ - // (note: & causes problems with URLs for serveAttachment etc, so it's also replaced) - String newFilename = Util.sanitizeFileName(filename); - - // Updating filename if it's changed after sanitizing. - if (!newFilename.equals(filename)) { - log.info("Filename changed from " + filename + " to " + newFilename); - filename = newFilename; - } - - try { - ct = p.getContentType(); - if (filename.indexOf(".") < 0) // no ext in filename... let's fix it if possible - { - // Using startsWith instead of equals because sometimes the ct has crud beyond the image/jpeg;...crud.... - // Below are the most common file types, more type can be added if needed - - // Most common APPLICATION TYPE - if (ct.startsWith("application/pdf")) - filename = filename + ".pdf"; - if (ct.startsWith("application/zip")) - filename = filename + ",zip"; - // Most common IMAGE TYPE - if (ct.startsWith("image/jpeg")) - filename = filename + ".jpg"; - if (ct.startsWith("image/gif")) - filename = filename + ".gif"; - if (ct.startsWith("image/png")) - filename = filename + ".png"; - // Most Common VIDEO TYPE - if (ct.startsWith("video/x-ms-wmv")) - filename = filename + ".wmv"; - // Most Common AUDIO TYPE - if (ct.startsWith("audio/mpeg")) - filename = filename + ".mp3"; - if (ct.startsWith("audio/mp4")) - filename = filename + ".mp4"; - // Most Common TEXT TYPE - if (ct.startsWith("text/html")) - filename = filename + ".html"; - // Windows Office - if (ct.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) //Word - filename = filename + ".docx"; - if (ct.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) //Excel - filename = filename + ".xlsx"; - if (ct.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) //PowerPoint - filename = filename + ".pptx"; - } - // retain only up to first semi-colon; often ct is something like text/plain; name="filename"' we don't want to log the filename - int x = ct.indexOf(";"); - if (x >= 0) - ct = ct.substring(0, x); - log.info("Attachment content type: " + ct + " filename = " + Util.blurKeepingExtension(filename)); - } catch (Exception pex) { - dataErrors.add("Can't read CONTENT-TYPE: " + ct + " filename:" + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\n Exception: " + pex + "\n" + Util.stackTrace(pex)); - return; - } - - // if (filename == null && !p.isMimeType("text/html") && !p.isMimeType("message/partial")) // expected not to have a filename with mime type text/html - // log.warn ("Attachment filename is null: " + Util.stackTrace()); - - - boolean success = true; - // the size passed in here is the part size, which is not really the binary blob size. - // when we read the stream below in blobStore.add(), we'll set it again to the binary blob size - Blob b = new EmailAttachmentBlob(filename, p.getSize(), (MimeMessage) m, p); - - if (fetchConfig.downloadAttachments) { - // this containment check is only on the basis of file name and size currently, - // not on the actual hash - if (archive.getBlobStore().contains(b)) { - log.debug("Cache hit! " + b); - } else { - try { - if (filename.endsWith(".tif")) - log.info("Fetching attachment..." + Util.blurKeepingExtension(filename)); - - // performance critical! use large buffer! currently 256KB - // stream will be closed by callee - - long start = System.currentTimeMillis(); - long nBytes = archive.getBlobStore().add(b, new BufferedInputStream(p.getInputStream(), 256 * 1024)); - long end = System.currentTimeMillis(); - if (nBytes != -1) { - long diff = end - start; - String s = "attachment size " + nBytes + " bytes, fetched in " + diff + " millis"; - if (diff > 0) - s += " (" + (nBytes / diff) + " KB/s)"; - log.info(s); - } - - Util.ASSERT(archive.getBlobStore().contains(b)); - - } catch (IOException ioe) { - success = false; - dataErrors.add("WARNING: Unable to fetch attachment: filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\nException: " + ioe); - ioe.printStackTrace(System.out); - } - } - - if (success) { - attachmentsList.add(b); - - /// generate thumbnail only if not already cached - try { - archive.getBlobStore().generate_thumbnail(b); // supplement - } catch (IOException ioe) { - log.warn("failed to create thumbnail, filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\nException: " + ioe); - ioe.printStackTrace(System.out); - } - } - } - } - - @SuppressWarnings("unused") - private static String processLastReceived(String header) { - header = header.toLowerCase(); - StringTokenizer st = new StringTokenizer(header, " \t()[]"); - String x = st.nextToken(); - if (!x.equals("from")) { - log.warn("Warning: unrecognized header: " + header); - return null; - } - - while (st.hasMoreTokens()) { - String s = st.nextToken(); - if (Character.isDigit(s.charAt(0))) { - log.warn("IP address: " + s); - return s; - } - } - return null; - } - - public void verify() { - } - - public void finish() { - currentStatus = JSONUtils.getStatusJSON("Verifying email headers..."); - currentStatus = JSONUtils.getStatusJSON(""); - } - - /** - * prepare a status json with up to N_TEASERS teasers from the most recent - * emails, starting backwards from idx. specifically ask for ArrayList as - * List.get() can be costly otherwise. - */ - private static String getStatusJSONWithTeasers(String message, int pctComplete, long secsElapsed, long secsRemaining, ArrayList emails, int N_TEASERS) { - JSONObject json = new JSONObject(); - try { - json.put("pctComplete", pctComplete); - json.put("message", message); - json.put("secsElapsed", secsElapsed); - json.put("secsRemaining", secsRemaining); - if (!Util.nullOrEmpty(emails)) { - JSONArray arr = new JSONArray(); - int idx_end = emails.size(); - int idx_start = idx_end - N_TEASERS; - if (idx_start < 0) - idx_start = 0; - for (int i = idx_start, j = 0; i < idx_end; i++) { - EmailDocument email = emails.get(i); - if (email != null) { - String subject = email.description; - if (!Util.nullOrEmpty(subject)) - arr.put(j++, subject); - } - } - json.put("teasers", arr); - } - } catch (JSONException jsone) { - try { - json.put("error", jsone.toString()); - } catch (Exception e) { - Util.report_exception(e); - } - } - return json.toString(); - } - - /** - * best effort to prefetch messages for messages[startMsgIdx] onwards, up to - * the IMAP_PREFETCH_BUFSIZE - * return List if bodyTextOnly is true, otherwise List - */ - private List do_imap_prefetch(Message[] messages, int startMsgIdx, Folder folder, boolean bodyTextOnly) { - // its perfectly ok for correctness for this method to do nothing and return null - List prefetchedMessages = null; - try { - - if (IMAP_PREFETCH_BUFSIZE > 0 && folder instanceof IMAPFolder) { - int prefetch_messages_size = 0; - - int start_message_num = messages[startMsgIdx].getMessageNumber(); - int end_message_num = start_message_num; - - List messageNums = new ArrayList(); - - // figure out message num range to fetch. if anything is unusual -- bad content type, non-consec. msg nums etc -- break out. - // non consec. message numbers are a problem because they cause a very long imap command string, which we found was returning an "invalid command" response. - int prev_message_num = -1; - for (int msgIdx = startMsgIdx; msgIdx < messages.length; msgIdx++) { - if (bodyTextOnly) { - String contentType = messages[msgIdx].getContentType().toLowerCase(); - if (!contentType.startsWith("multipart/") && !contentType.startsWith("text/plain")) { - log.info("Warn: message idx" + msgIdx + " msg#" + messages[msgIdx].getMessageNumber() + " has unexpected content type " + contentType); - break; - } - } - - // check if sequence is as expected - int next_message_num = messages[msgIdx].getMessageNumber(); // may be better to switch this to uid and prefetcher uses uid fetch - if (next_message_num != prev_message_num + 1 && prev_message_num != -1) - break; - - // if this message would push prefetch size beyond the buf size, break out, not including this message - if (prefetch_messages_size + messages[msgIdx].getSize() >= IMAP_PREFETCH_BUFSIZE) - break; - prev_message_num = next_message_num; - prefetch_messages_size += messages[msgIdx].getSize(); - messageNums.add(next_message_num); - } - - if (messageNums.size() == 0) - return null; - - // now we prefetch messages from start_message_num to end_message_num - long startMillis = System.currentTimeMillis(); - log.info("prefetching " + messageNums.size() + " messages"); - ImapPrefetcher prefetcher = bodyTextOnly ? new TextOnlyImapPrefetcher(((ImapPopEmailStore) emailStore).session, messageNums) : new ImapPrefetcher(((ImapPopEmailStore) emailStore).session, messageNums); - prefetchedMessages = (List) ((IMAPFolder) folder).doCommand(prefetcher); // start_message_num, end_message_num)); - long elapsedMillis = System.currentTimeMillis() - startMillis; - long kb_per_sec = prefetch_messages_size / elapsedMillis; - log.info("prefetched " + messageNums.size() + " messages in " + Util.blur(folder.getName()) + " [" + start_message_num + ":" + end_message_num + "], " + Util.commatize(prefetch_messages_size / 1024) + "KB in " + Util.commatize(elapsedMillis) + "ms (" + Util.commatize(kb_per_sec) + " KB/sec)"); - } - } catch (Exception e) { - Util.print_exception(e, log); - } - return prefetchedMessages; - } - - private void fetchHeaders(Message[] messages) throws MessagingException { - // fetch headers (don't do it for mbox folders, waste of time) - // this is an essential perf. step so that we fetch the headers in bulk. - // otherwise it takes a long time to fetch header info one at a time for each message - if (!(emailStore instanceof MboxEmailStore)) { - long startTimeMillis = System.currentTimeMillis(); - currentStatus = JSONUtils.getStatusJSON("Reading headers from " + folder.getName() + "..."); - FetchProfile fp = new FetchProfile(); - fp.add(FetchProfile.Item.ENVELOPE); - fp.add(FetchProfile.Item.CONTENT_INFO); - fp.add(UIDFolder.FetchProfileItem.UID); // important, otherwise reading UIDs takes a long time later - fp.add("List-Post"); - folder.fetch(messages, fp); - long endTimeMillis = System.currentTimeMillis(); - log.info("Done fetching headers: " + Util.commatize(endTimeMillis - startTimeMillis) + "ms"); - } - } - - private void fetchHeaders(int nMessages) throws MessagingException { - // fetch headers (don't do it for mbox folders, waste of time) - // this is an essential perf. step so that we fetch the headers in bulk. - // otherwise it takes a long time to fetch header info one at a time for each message - if (!(emailStore instanceof MboxEmailStore)) { - long startTimeMillis = System.currentTimeMillis(); - currentStatus = JSONUtils.getStatusJSON("Reading headers from " + folder.getName() + "..."); - FetchProfile fp = new FetchProfile(); - fp.add(FetchProfile.Item.ENVELOPE); - fp.add(FetchProfile.Item.CONTENT_INFO); - fp.add(UIDFolder.FetchProfileItem.UID); // important, otherwise reading UIDs takes a long time later - fp.add("List-Post"); - for (int i = 0; i < nMessages; i++) { - Message[] messages = new Message[]{folder.getMessage(i)}; - folder.fetch(messages, fp); - } - long endTimeMillis = System.currentTimeMillis(); - log.info("Done fetching headers: " + Util.commatize(endTimeMillis - startTimeMillis) + "ms"); - } - } - - private Message[] removeMessagesAlreadyInArchive(Archive archive, Message[] messages) { - // early out for the common case that we have an empty archive - if (archive.getAllDocs().size() == 0) - return messages; - - List resultList = new ArrayList(); - for (int i = 0; i < messages.length; i++) { - //int idx = messages[i].getMessageNumber(); - Message m = messages[i]; - MimeMessage mm = (MimeMessage) m; - try { - EmailDocument ed = convertToEmailDocument(mm, "dummy"); // id doesn't really matter here - if (archive.containsDoc(ed)) { - stats.nMessagesAlreadyPresent++; - dataErrors.add("Duplicate message: " + ed); // note: report.jsp depends on this exact string - continue; - } - } catch (Exception e) { - Util.print_exception(e, log); - } - resultList.add(mm); - messages[i] = null; // no harm explicitly nulling out messages - } - Message[] resultArray = resultList.toArray(new Message[0]); - return resultArray; - } - - /** - * Make few post checks on the content and returns true if the message looks - * ok - */ - private boolean messageLooksOk(String content) { - if (content == null) - //let others handle it. - return true; - String[] lines = content.split("\n"); - int badlines = 0; - if (lines.length > 50) - for (String line : lines) { - if (!line.contains(" ")) - badlines++; - else - badlines = 0; - if (badlines > 50) - return false; - } - return true; - } - - //keep track of the total time elapsed in fetching messages across batches - static long fetchStartTime = System.currentTimeMillis(); - - /** - * fetch given message idx's in given folder -- @performance critical - * - * @param offset - the original offset of the first message in the messages array, important to initialize - * for proper assignment of unique id or doc Id - */ - //private void fetchUncachedMessages(String sanitizedFName, Folder folder, DocCache cache, List msgIdxs) throws MessagingException, FileNotFoundException, IOException, GeneralSecurityException { - private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset, int totalMessages) throws MessagingException, IOException, GeneralSecurityException { - //mark the processing of new batch - if (offset == 0) - fetchStartTime = System.currentTimeMillis(); - - currentStatus = JSONUtils.getStatusJSON((emailStore instanceof MboxEmailStore) ? "Parsing " + folder.getName() + " (can take a while)..." : "Reading " + folder.getName() + "..."); - - // bulk fetch of all message headers - int n = messages.length; - - // eliminate any messages the archive already has - messages = removeMessagesAlreadyInArchive(archive, messages); - - log.info(n - messages.length + " message(s) already in the archive"); - - ArrayList emails = new ArrayList(); - - // for performance, we need to do bulk prefetches, instead of fetching 1 message at a time - // prefetchedMessages will be a temp cache of prefetched messages - int first_i_prefetched = -1, last_i_prefetched = -1; - List prefetchedMessages = null; // the type of this can be either list if text only, otherwise list - - long highestUID = archive.getLastUIDForFolder(fetchedFolderInfo.accountKey, fetchedFolderInfo.longName); - long lastAssignedUID = highestUID; - boolean bodyTextOnly = !fetchConfig.downloadAttachments; - try { - archive.openForWrite(); - for (int i = 0; i < messages.length; i++) { - // critical step: (thanks, yourkit!) - // null out the ref to the previous message, otherwise it stays in memory, and the heap effectively needs to be as big as the size of all messages - if (i > 0) - messages[i - 1] = null; - - if (isCancelled) - break; - - Message m = messages[i]; - MimeMessage mm = (MimeMessage) m; - - if (i >= last_i_prefetched) { - // critical perf. step: do a bulk imap prefetch - // the prefetch will fetch as many messages as possible up to a max buffer size, and return the messages prefetched - // last_i_prefetched tracks what is the last index into idxs that we have prefetched. - // when we run out of prefetched messages, we do another bulk prefetch - - prefetchedMessages = do_imap_prefetch(messages, i, folder, bodyTextOnly); - if (prefetchedMessages != null) { - first_i_prefetched = i; - last_i_prefetched = i + prefetchedMessages.size(); - } - } - - int pctDone = ((i + offset) * 100) / totalMessages; - long elapsedMillis = System.currentTimeMillis() - fetchStartTime; - long unprocessedSecs = Util.getUnprocessedMessage(i + offset, totalMessages, elapsedMillis); - int N_TEASERS = 50; // 50 ok here, because it takes a long time to fetch and process messages, so teaser computation is relatively not expensive - int nTriesForThisMessage = 0; - currentStatus = getStatusJSONWithTeasers("Reading " + Util.commatize(totalMessages) + " messages from " + folder.getName() + "...", pctDone, elapsedMillis / 1000, unprocessedSecs, emails, N_TEASERS); - - int messageNum = mm.getMessageNumber(); - - try { - long unique_id; - - // if we have uid, that's even better - // don't use uid's for mbox, it has a bug and always gives -1 - // see http://james.apache.org/server/rfclist/imap4/rfc2060.txt for uid spec - if (folder instanceof UIDFolder && !(emailStore instanceof MboxEmailStore)) { - long uid = ((UIDFolder) folder).getUID(m); - unique_id = uid; - } else - unique_id = lastAssignedUID + 1 + i + offset; // +1 since i starts from 0 (but lastAssignedUID can be -1 -- is that safe? -sgh) - - if (unique_id > highestUID) - highestUID = unique_id; - - String unique_id_as_string = Long.toString(unique_id); - - // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive - // not a serious perf. concern now, but revisit if needed - EmailDocument ed = convertToEmailDocument(mm, unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc. - // need to check this again, because there might be duplicates such within the set we are currently processing. - if (archive.containsDoc(ed)) { - stats.nMessagesAlreadyPresent++; - dataErrors.add("Duplicate message: " + ed); // note: report.jsp depends on this specific string - continue; - } - - MimeMessage originalMessage = mm; // this is the mm that has all the headers etc. - List attachmentsList = new ArrayList(); - - // if we already have it prefetched, use the prefetched version - List contents = null; - - if (first_i_prefetched >= 0 && prefetchedMessages != null) { - if (!fetchConfig.downloadAttachments) { - // text only means the prefetchedMessages are stored directly as a list of strings - String content = (String) prefetchedMessages.get(i - first_i_prefetched); // note: this_mm only has the prefetched content, but not the headers - contents = new ArrayList(); - - try { - // a special for yahoo which routinely uses quoted-printable. content looks like =0A0D.... = etc. - if (mm.isMimeType("multipart/alternative")) { - Multipart mm_mp = (Multipart) mm.getContent(); - Part p0 = mm_mp.getBodyPart(0); - if (p0 instanceof com.sun.mail.imap.IMAPBodyPart) { - String encoding = ((com.sun.mail.imap.IMAPBodyPart) p0).getEncoding(); - if ("quoted-printable".equals(encoding)) { - content = new String(Util.getBytesFromStream(javax.mail.internet.MimeUtility.decode(new java.io.ByteArrayInputStream(content.getBytes()), "quoted-printable"))); - } - } - } - } catch (Exception e) { - Util.print_exception("Error trying to parse encoding of multipart", e, log); - } - - contents.add(content); - } else { - // subtle issue here: the contentType of the prefetchedMessage needs to be be set to the original_mm's content-type. - // this was found for cases where the original message is multipart-alternative with a text and html part. - // if we don't set prefetchedMessage's content type, it gets a mime type of text/plain and a body = the entire multipart including both parts. - // found on sgh's sent mail w/subject: "text to add in help" from Fri, 7 Jun 2013 - MimeMessage prefetchedMessage = (MimeMessage) prefetchedMessages.get(i - first_i_prefetched); - String contentTypeHeaders[] = originalMessage.getHeader("Content-Type"); - String contentTypeHeader = null; - if (contentTypeHeaders != null && contentTypeHeaders.length == 1) - contentTypeHeader = contentTypeHeaders[0]; - - if (!Util.nullOrEmpty(contentTypeHeader)) // we do care about body structure, hang on to it - prefetchedMessage.setHeader("Content-Type", contentTypeHeader); - mm = prefetchedMessage; - } - prefetchedMessages.set(i - first_i_prefetched, null); // null out to save memory - } - - if (contents == null) - contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList); - - // if mm is not prefetched, it is the same as original_mm - // will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version - // even when prefetched, the processMessagePart is somewhat expensive because the attachments have to be extracted etc. - - // we could overlap processMessagePart with do_imap_prefetch by prefetching in a separate thread, since prefetch is network limited. - // but profiling shows processMessagePart takes only 1/4th the time of do_imap_prefetch so overlapping would be a relatively small gain. - // not worth the effort right now. - ed.attachments = attachmentsList; - if (fetchConfig.downloadAttachments) - ed.attachmentsYetToBeDownloaded = false; // we've already downloaded our attachments - - // concat all the contents parts - StringBuilder sb = new StringBuilder(); - for (String s : contents) { - sb.append(s); - sb.append("\n"); - } - - String contentStr = sb.toString(); - if (!messageLooksOk(contentStr)) { - dataErrors.add("Skipping message as it seems to have very long words: " + ed); - continue; - } - contentStr = IndexUtils.normalizeNewlines(contentStr); // just get rid of \r's - - archive.addDoc(ed, contentStr); - - List linkList = new ArrayList(); - // linkList might be used only for slant - IndexUtils.populateDocLinks(ed, contentStr, linkList, true); - ed.links = linkList; - stats.nMessagesAdded++; - } catch (Exception ex) { - // sometimes we get unexpected folder closed, so try again - boolean retry = false; - if (ex instanceof javax.mail.FolderClosedException) { - log.warn("Oops, thread " + threadID + " got the folder closed in its face! " + ex.getMessage()); - - // sometimes we get this exception about folder closed - // retry up to 3 times, then give up - if (nTriesForThisMessage < 3) { - retry = true; - log.info("Re-opening email store; attempt #" + (nTriesForThisMessage + 1) + " for message " + i); - nTriesForThisMessage++; - messages = openFolderAndGetMessages(); - fetchHeaders(messages); - --i; // adjust the message index n try again - } - } - - if (!retry) { - // we sometimes see UnsupportedEncodingException with x-utf8utf8 mime type and ParseException - // nothing much can be done, just create a dummy doc and add it to the cache - nErrors++; - stats.nErrors++; - EmailDocument ed = new EmailDocument(Integer.toString(messageNum)); - log.warn("Exception reading message from " + folder_name() + " Message #" + messageNum + " " + ex.getMessage() + "\n" + Util.stackTrace(ex)); - - ed.setErrorString(Util.stackTrace(ex)); - } - } - } - } catch (Throwable t) { - Util.print_exception(t, log); - } finally { - // if (cancelled && false) // TODO: disable for now as currently only indexes are rolled back and allDocs/blobs are not rolled back in sync yet - // archive.rollbackIndexWrites(); - // else - currentStatus = JSONUtils.getStatusJSON("Saving archive..."); - archive.close(); - } - - fetchedFolderInfo.lastSeenUID = highestUID; - log.info("at end of fetch, folder info is " + fetchedFolderInfo); - - log.info("emailfetcher thread completed, archive has " + archive.getAllDocs().size() + " docs"); - } - - public FolderInfo getFetchedFolderInfo() { - return fetchedFolderInfo; - } - - private int openFolderAndGetMessageCount() throws MessagingException { - folder = null; - - store = emailStore.connect(); - folder = emailStore.get_folder(store, folder_name()); - if (folder != null) - return folder.getMessageCount(); - else - return 0; - } - - /** - * Comment by @vihari - * Not sure what uid id and folder are,I think this code should be more predictable - * The params begin idx and end idx are used for both uid filtering and Mbox message indexing. - * does not make sense - */ - private Message[] openFolderAndGetMessages() throws MessagingException { - if (folder == null) - openFolderAndGetMessageCount(); - - Message[] messages = null; - if (folder == null) - return messages; - - String descr = emailStore.getAccountID() + ":" + folder; - boolean haveUID = false; - int count = folder.getMessageCount(); - use_uid_if_available = (begin_msg_index == 1 && end_msg_index == count + 1); - log.info("use_uid_if_available is set to " + use_uid_if_available); - - if (fetchConfig.filter != null && fetchConfig.filter.isActive()) { - log.info("Issuing server side filters for " + fetchConfig.filter); - boolean useReceivedDateTerms = descr.indexOf("yahoo.com") >= 0; - messages = folder.search(fetchConfig.filter.convertToSearchTerm(useReceivedDateTerms)); - } else { - // mbox provider claims to provide UIDFolder but the uids are bogus so we treat mboemailstore folders as not uidfolders - boolean is_uid_folder = (folder instanceof UIDFolder) && !(emailStore instanceof MboxEmailStore); - - if (use_uid_if_available && is_uid_folder) { - // for uidfolders, we want to update the last seen uid in the FolderInfo - long uid = archive.getLastUIDForFolder(emailStore.getAccountID(), folder_name()); - if (uid > 0) { - messages = ((UIDFolder) folder).getMessagesByUID(uid + 1, UIDFolder.LASTUID); - log.info("Archive has already seen this folder: " + descr + " will only fetch messages from uid " + uid + " onwards, " + messages.length + " messages will be incrementally fetched"); - haveUID = true; - } else - log.info(descr + " is a UIDFolder but not seen before"); - } else - log.info(descr + " is not a UIDFolder"); - - if (!haveUID) { - log.info("All " + count + " messages in " + descr + " will be fetched"); - //messages = folder.getMessages(); - - if (begin_msg_index > 0 && end_msg_index > 0) { - // we have to use only specified messages - // if there are 8 messages, count = 8, end_msg_index will be 9 - if (end_msg_index > count + 1) - log.warn("Warning: bad end_msg_index " + end_msg_index + " count = " + count); // use the full messages - else { - int nMessages = end_msg_index - begin_msg_index; - Message[] newMessages = new Message[nMessages]; - for (int i = 0; i < end_msg_index - begin_msg_index; i++) - newMessages[i] = folder.getMessage(begin_msg_index + i);//messages[begin_msg_index - 1 + i]; // -1 cos messages array is indexed from 0, but begin_msg_index from 1 - log.info("total # of messages: " + count + " reduced # of messages: " + newMessages.length); - messages = newMessages; - } - } - } - } - - return messages; - } - - /** - * main fetch+index method - * The assumptions that the heap is big enough to enough to fit all the messages i the folder is not scalable for larger archive. - * Instead, we process each message individually. - * fetchHeaders may be penalised due to multiple requests of fetch? - * In order to make indexing of large archives possible, fetch of NON-MBOXEmailstrore formats is penalised. It is possible to avoid this by handling MBox and IMAP/POP formats differently. - */ - public void run() { - currentStatus = JSONUtils.getStatusJSON("Starting to process " + folder_name()); - - isCancelled = false; - Thread.currentThread().setName("EmailFetcher"); - nErrors = 0; - //Message[] messages = null; - // use_uid is set only if we are reading the whole folder. otherwise we won't use it, and we won't update the highest UID seen for the folder in the archive. - try { - // long t1 = System.currentTimeMillis(); - int nMessages = openFolderAndGetMessageCount(); - log.info("Total number of messages: " + nMessages); - - if (emailStore instanceof MboxEmailStore) { - // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages() - // so we process in batches - //TODO: Ideally, should cap on buffer size rather than on number of messages. - final int BATCH = 10000; - int nbatches = nMessages / BATCH; - nMessagesProcessedSuccess = 0; - long st = System.currentTimeMillis(); - int b; - for (b = 0; b < nbatches + 1; b++) { - begin_msg_index = b * BATCH + 1; - end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1; - log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages); - Message[] messages = openFolderAndGetMessages(); - currentStatus = JSONUtils.getStatusJSON(""); - if (isCancelled) - return; - - if (messages.length > 0) { - try { - if (fetchConfig.downloadMessages) { - log.info(nMessages + " messages will be fetched for indexing"); - fetchAndIndexMessages(folder, messages, begin_msg_index, nMessages); - } else { - // this is for memory test screening mode. - // we create a dummy archive without any real contents - for (int i = 0; i < nMessages; i++) { - String unique_id_as_string = Long.toString(i); - - // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive - // not a serious perf. concern now, but revisit if needed - EmailDocument ed = convertToEmailDocument((MimeMessage) messages[i], unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc. - archive.addDocWithoutContents(ed); - } - } - } catch (Exception e) { - log.error("Exception trying to fetch messages, results will be incomplete! " + e + "\n" + Util.stackTrace(e)); - } - } - log.info("Fetch stats for this fetcher thread: " + stats); - } - log.info("Read #" + nMessages + " messages in #" + b + " batches of size: " + BATCH + " in " + (System.currentTimeMillis() - st) + "ms"); - } else { - // IMAP etc are pretty efficient with lazily populating message objects, so unlike mbox, its ok to use openFolderAndGetMessages() on the entire folder. - // remember to init the begin/end_msg_index before calling openFolderAndGetMessages - begin_msg_index = 1; - end_msg_index = nMessages + 1; - nMessagesProcessedSuccess = 0; - Message[] messages = openFolderAndGetMessages(); - - long st = System.currentTimeMillis(); - currentStatus = JSONUtils.getStatusJSON(""); - if (isCancelled) - return; - - if (messages.length > 0) { - try { - fetchHeaders(messages); // always fetch headers - if (fetchConfig.downloadMessages) { - log.info(nMessages + " messages will be fetched for indexing"); - //we process all the messages together here unlike the case of mstor - //hence the begin index is always 0 - fetchAndIndexMessages(folder, messages, 0, messages.length); - } else { - // this is for memory test screening mode. - // we create a dummy archive without any real contents - for (int i = 0; i < nMessages && i < messages.length; i++) { - String unique_id_as_string = Long.toString(i); - - // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive - // not a serious perf. concern now, but revisit if needed - EmailDocument ed = convertToEmailDocument((MimeMessage) messages[i], unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc. - archive.addDocWithoutContents(ed); - } - } - } catch (Exception e) { - Util.print_exception("Exception trying to fetch messages, results will be incomplete! ", e, log); - } - } - log.info("Read #" + nMessages + " messages in in " + (System.currentTimeMillis() - st) + "ms"); - } - } catch (Throwable t) { - if (t instanceof OutOfMemoryError) - this.mayHaveRunOutOfMemory = true; - // this is important, because there could be an out of memory etc over here. - Util.print_exception(t, log); - } finally { - try { - if (folder != null) - folder.close(false); - if (store != null) - store.close(); - } catch (Exception e) { - Util.print_exception(e); - } - } - } - - /* - * code for handling other kinds of headers, e.g. to find location of the - * message -- not used right now, but may use in the future. - * public void processHeaders(MimeMessage m) throws Exception - * { - * Address[] froms = m.getFrom(); - * if (froms == null) - * return; - * InternetAddress a = (InternetAddress) froms[0]; - * ContactInfo ci = addressBook.getContactInfoForAddress(a); - * Enumeration
e = (Enumeration
) m.getAllHeaders(); - * String lastReceivedHeader = null; - * while (e.hasMoreElements()) - * { - * Header h = e.nextElement(); - * String n = h.getName(); - * String v = h.getValue(); - * // log.info ("header: " + n + " = " + n); - * String s = n.toLowerCase(); - * if ("x-mailer".equals(s) || "user-agent".equals(s)) - * { - * log.warn (m.getFrom()[0] + " --> " + n + " " + v); - * ci.addMailer(v); - * } - * if ("x-originating-ip".equals(s) || "x-yahoo-post-ip".equals(s)) - * { - * log.warn (m.getFrom()[0] + " --> " + n + " " + v); - * ci.addIPAddr(v); - * } - * if ("x-yahoo-profile".equals(s)) - * log.warn (m.getFrom()[0] + " --> " + n + " " + v); - * if ("message-id".equals(s)) - * { - * log.warn("messageID = " + v); - * ci.addMessageID(v); - * } - * if ("received".equals(s) || "x-received".equals(s)) - * { - * lastReceivedHeader = v; - * } - * } - * - * // sometimes the headers have an extra ctrl-m at the end, strip it if - * this is the case. - * if (lastReceivedHeader != null && lastReceivedHeader.endsWith("\r")) - * lastReceivedHeader = lastReceivedHeader.substring(0, - * lastReceivedHeader.length()-1); - * - * ci.addLastReceivedHeader(lastReceivedHeader); - * - * String from = froms[0].toString(); - * - * log.info (from + " lastReceived " + lastReceivedHeader); - * if (lastReceivedHeader == null) - * log.warn ("WARNING: " + from + " --> no received header!?"); - * else - * { - * String ipAddrStr = processLastReceived(lastReceivedHeader); - * if (ipAddrStr != null) - * { - * byte[] ipAddrBytes = Util.parseIPAddress(ipAddrStr); - * if (ipAddrBytes != null) - * { - * // InetAddress ipAddr = InetAddress.getByAddress(ipAddrBytes); - * // log.info ("Received: " + locationService.lookupLocation(ipAddr)); - * } - * } - * } - * } - */ - - public String toString() { - return Util.fieldsToString(this); - } -} +/* + * Copyright (C) 2012 The Stanford MobiSocial Laboratory + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package edu.stanford.muse.email; + +import com.sun.mail.imap.IMAPFolder; +import edu.stanford.muse.datacache.Blob; +import edu.stanford.muse.index.*; +import edu.stanford.muse.util.EmailUtils; +import edu.stanford.muse.util.JSONUtils; +import edu.stanford.muse.util.Util; +import edu.stanford.muse.webapp.HTMLUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import org.jsoup.Jsoup; + +import javax.activation.DataHandler; +import javax.activation.DataSource; +import javax.mail.*; +import javax.mail.internet.AddressException; +import javax.mail.internet.InternetAddress; +import javax.mail.internet.MimeMessage; +import java.io.*; +import java.security.GeneralSecurityException; +import java.util.*; + +class EmailFetcherStats implements Cloneable, Serializable { + private final static long serialVersionUID = 1L; + + int nTotalMessages; // total # of messages to process + int nMessagesAdded; // running total messages newly added to the archive + int nMessagesAlreadyPresent; // running messages that were already present + int nErrors = 0; + int nMessagesFiltered = 0; + + public void merge(EmailFetcherStats other) { + this.nMessagesAdded += other.nMessagesAdded; + this.nMessagesAlreadyPresent += other.nMessagesAlreadyPresent; + this.nMessagesFiltered += other.nMessagesFiltered; + this.nErrors += other.nErrors; + this.nTotalMessages += other.nTotalMessages; + } + + public String toString() { + return Util.fieldsToString(this); + } +} + +/** + * Important class for importing email. + * implements an email fetcher for a range of message #s within a single folder. + * In contrast, MTEmailFetcher is responsible for an entire email account, including multiple folders. + * and MuseEmailFetcher is responsible for multiple accounts (but for a single user) + * email fetcher stats is associated with a single email fetcher + */ +public class EmailFetcherThread implements Runnable, Serializable { + private final static long serialVersionUID = 1L; + + public static final int IMAP_PREFETCH_BUFSIZE = 20 * 1024 * 1024; + /* used for buffering imap prefetch data -- necessary for good imap performance*/ + public static final String FORCED_ENCODING = "UTF-8"; + + public static Log log = LogFactory.getLog(EmailFetcherThread.class); + + // set up INVALID_DATE + public static Date INVALID_DATE; // like 0xdeadbeef + + static { + Calendar c = new GregorianCalendar(); + c.set(Calendar.YEAR, 1960); + c.set(Calendar.DAY_OF_MONTH, 1); + c.set(Calendar.MONTH, Calendar.JANUARY); + c.set(Calendar.HOUR_OF_DAY, 0); + c.set(Calendar.MINUTE, 0); + c.set(Calendar.SECOND, 0); + c.set(Calendar.MILLISECOND, 0); + INVALID_DATE = c.getTime(); + } + + private FetchConfig fetchConfig; + private boolean mayHaveRunOutOfMemory = false; + private FolderInfo fetchedFolderInfo; + transient Folder folder; + boolean use_uid_if_available; + + protected int threadID; + protected EmailStore emailStore; + + protected boolean isCancelled; + + public static boolean verbose = false; + public static boolean debug = false; + + // notes: begin_msg_index is always correct. end_msg_index = -1 means nMessages in folder. + // note: msg # begin_msg_index will be processed. msg # end_msg_index will not be processed. + protected int begin_msg_index = 0, end_msg_index = -1; + + EmailFetcherStats stats = new EmailFetcherStats(); + String currentStatus; + + + int totalMessagesInFetch, messagesCompletedInFetch; // this fetcher may be part of a bigger fetch operation. we need to track the progress of the bigger fetch in order to track progress accurately. + + public int getTotalMessagesInFetch() { + return totalMessagesInFetch; + } + + public void setTotalMessagesInFetch(int totalMessagesInFetch) { + this.totalMessagesInFetch = totalMessagesInFetch; + } + + public int getMessagesCompletedInFetch() { + return messagesCompletedInFetch; + } + + public void setMessagesCompletedInFetch(int messagesCompletedInFetch) { + this.messagesCompletedInFetch = messagesCompletedInFetch; + } + + // stats + int nMessagesProcessedSuccess, nUncachedMessagesProcessed, nMessagesCached; // running count of # of messages processed successfully + int nErrors = 0; + + public void cancel() { + isCancelled = true; + } + + public void setFetchConfig(FetchConfig fc) { + this.fetchConfig = fc; + } + + public int getThreadID() { + return threadID; + } + + public void setThreadID(int threadID) { + this.threadID = threadID; + } + + public int getNMessagesProcessed() { + return nMessagesProcessedSuccess; + } + + public int getNUncachedMessagesProcessed() { + return nUncachedMessagesProcessed; + } + + protected String folder_name() { + return fetchedFolderInfo.longName; + } + + protected String email_source() { + return fetchedFolderInfo.accountKey; + } + + public boolean mayHaveRunOutOfMemory() { + return mayHaveRunOutOfMemory; + } + + // private String folderPrefix; // prefix for folder files + transient Store store; // we don't really need this serialized across sessions + + transient Archive archive; + Collection dataErrors = new LinkedHashSet(); // log of input data errors + + Date prevDate = null; + + /* + * // comment out unused constructors, so it's cleaner/easier to trace the + * setting member fields. + * public EmailFetcherThread() { super(); } + * + * public EmailFetcherThread(EmailStore store, String folder_name) + * { + * this.emailStore = store; + * this.folder_name = folder_name; + * } + */ + + public EmailFetcherThread(EmailStore store, FolderInfo fi, int begin_msg_index, int end_msg_index) { + this.emailStore = store; + this.fetchedFolderInfo = fi; + stats.nTotalMessages = end_msg_index - begin_msg_index; + this.begin_msg_index = begin_msg_index; + this.end_msg_index = end_msg_index; + } + + public void setArchive(Archive a) { + archive = a; + } + + public Archive getArchive() { + return archive; + } + + /** + * merges results with another email fetcher. does some lightweight work + * including updating stats. consider removing this and simplifying in the + * future + */ + public void merge(EmailFetcherThread other) { + verify(); + if (other != null) { + other.verify(); + + // TOFIX: we should eliminate duplicates + dataErrors.addAll(other.dataErrors); + stats.merge(other.stats); + + nMessagesProcessedSuccess += other.nMessagesProcessedSuccess; + nErrors += other.nErrors; + mayHaveRunOutOfMemory |= other.mayHaveRunOutOfMemory; + } + verify(); + } + + /** + * intern a bunch of addrs, to save memory + * + * @throws UnsupportedEncodingException + */ + private static void internAddressList(Address[] addrs) throws UnsupportedEncodingException { + if (addrs == null) + return; + + for (Address a : addrs) { + if (a instanceof InternetAddress) { + InternetAddress ia = (InternetAddress) a; + String address = ia.getAddress(), personal = ia.getPersonal(); + if (address != null) + ia.setAddress(InternTable.intern(address)); + if (personal != null) + ia.setPersonal(InternTable.intern(personal)); + } + } + } + + /** + * Key method for importing email: converts a javamail obj. to our own data structure (EmailDocument) + */ + //public EmailDocument convertToEmailDocument(MimeMessage m, int num, String url) throws MessagingException, IOException + private EmailDocument convertToEmailDocument(MimeMessage m, String id) throws MessagingException, IOException { + // get the date. + // prevDate is a hack for the cases where the message is lacking an explicit Date: header. e.g. + // From hangal Sun Jun 10 13:46:46 2001 + // To: ewatkins@stanford.edu + // Subject: Re: return value bugs + // though the date is on the From separator line, the mbox provider fails to parse it and provide it to us. + // so as a hack, we will assign such messages the same date as the previous one this fetcher has seen! ;-) + // update: having the exact same date causes the message to be considered a duplicate, so just increment + // the timestamp it by 1 millisecond! + // a better fix would be to improve the parsing in the provider + + boolean hackyDate = false; + Date d = m.getSentDate(); + if (d == null) + d = m.getReceivedDate(); + if (d == null) { + if (prevDate != null) { + long newTime = prevDate.getTime() + 1L; // added +1 so that this email is not considered the same object as the prev. one if they are in the same thread + d = new Date(newTime); + dataErrors.add("No date for message id:" + id + ": " + EmailUtils.formatMessageHeader(m) + " assigned approximate date"); + } else { + d = INVALID_DATE; // wrong, but what can we do... :-( + dataErrors.add("No date for message id:" + id + ": " + EmailUtils.formatMessageHeader(m) + " assigned deliberately invalid date"); + } + hackyDate = true; + } else { + Calendar c = new GregorianCalendar(); + c.setTime(d); + int yy = c.get(Calendar.YEAR); + if (yy < 1960 || yy > 2020) { + dataErrors.add("Probably bad date: " + Util.formatDate(c) + " message: " + EmailUtils.formatMessageHeader(m)); + hackyDate = true; + } + } + + if (hackyDate && prevDate != null) { + long newTime = prevDate.getTime() + 1L; // added +1 so that this email is not considered the same object as the prev. one if they are in the same thread + d = new Date(newTime); + Util.ASSERT(!d.equals(prevDate)); + } + + Calendar c = new GregorianCalendar(); + c.setTime(d != null ? d : new Date()); + + prevDate = d; + + Address to[] = null, cc[] = null, bcc[] = null; + Address[] from = null; + try { + // allrecip = m.getAllRecipients(); // turns out to be too expensive because it looks for newsgroup headers for imap + // assemble to, cc, bcc into a list and copy it into allrecip + List
list = new ArrayList
(); + from = m.getFrom(); + to = m.getRecipients(Message.RecipientType.TO); + if (to != null) + list.addAll(Arrays.asList(to)); + cc = m.getRecipients(Message.RecipientType.CC); + if (cc != null) + list.addAll(Arrays.asList(cc)); + bcc = m.getRecipients(Message.RecipientType.BCC); + if (bcc != null) + list.addAll(Arrays.asList(bcc)); + + // intern the strings in these addresses to save memory cos they are repeated often in a large archive + internAddressList(from); + internAddressList(to); + internAddressList(cc); + internAddressList(bcc); + } catch (AddressException ae) { + String s = "Bad address in folder " + folder_name() + " message id" + id + " " + ae; + dataErrors.add(s); + } + + // take a deep breath. This object is going to live longer than most of us. + EmailDocument ed = new EmailDocument(id, email_source(), folder_name(), to, cc, bcc, from, m.getSubject(), m.getMessageID(), c.getTime()); + + String[] headers = m.getHeader("List-Post"); + if (headers != null && headers.length > 0) { + // trim the headers because they usually look like: "" + ed.sentToMailingLists = new String[headers.length]; + int i = 0; + for (String header : headers) { + header = header.trim(); + header = header.toLowerCase(); + + if (header.startsWith("<") && header.endsWith(">")) + header = header.substring(1, header.length() - 1); + if (header.startsWith("mailto:") && !"mailto:".equals(header)) // defensive check in case header == "mailto:" + header = header.substring(("mailto:").length()); + ed.sentToMailingLists[i++] = header; + } + } + if (hackyDate) { + String s = "Guessed date " + Util.formatDate(c) + " for message id: " + id + ": " + ed.getHeader(); + dataErrors.add(s); + ed.hackyDate = true; + } + + // check if the message has attachments. + // if it does and we're not downloading attachments, then we mark the ed as such. + // otherwise we had a problem where a message header (and maybe text) was downloaded but without attachments in one run + // but in a subsequent run where attachments were needed, we thought the message was already cached and there was no + // need to recompute it, leaving the attachments field in this ed incorrect. + List attachmentNames = getAttachmentNames(m, m); + if (!Util.nullOrEmpty(attachmentNames)) { + ed.attachmentsYetToBeDownloaded = true; // will set it to false later if attachments really were downloaded (not sure why) + // log.info ("added " + attachmentNames.size() + " attachments to message: " + ed); + } + return ed; + } + + /* + * we try to get the attachment names cheaply, i.e. without having to + * process the whole message + */ + private List getAttachmentNames(MimeMessage m, Part p) throws MessagingException, IOException { + List result = new ArrayList(); + try { + if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) { + if (p.isMimeType("multipart/alternative")) + return result; // ignore alternative's because real attachments don't have alternatives + DataHandler dh = p.getDataHandler(); + DataSource ds = dh.getDataSource(); + if (ds instanceof MultipartDataSource) { + MultipartDataSource mpds = (MultipartDataSource) ds; + for (int i = 0; i < mpds.getCount(); i++) + result.addAll(getAttachmentNames(m, mpds.getBodyPart(i))); + } else { + String name = ds.getName(); + if (!Util.nullOrEmpty(name)) + result.add(name); + } + } else { + String filename = p.getFileName(); + if (filename != null) + result.add(filename); + } + } catch (Exception e) { + // sometimes we see javax.mail.MessagingException: Unable to load BODYSTRUCTURE + // in this case, just ignore, not much we can do i guess. + Util.print_exception(e, log); + } + return result; + } + + // public void setEmailCache (DocCache cache) + // { + // this.cache = cache; + // } + + /** + * this method returns the text content of the message as a list of strings + * // each element of the list could be the content of a multipart message + * // m is the top level subject + * // p is the specific part that we are processing (p could be == m) + * also sets up names of attachments (though it will not download the + * attachment unless downloadAttachments is true) + */ + private List processMessagePart(int messageNum, Message m, Part p, List attachmentsList) throws MessagingException, IOException { + List list = new ArrayList(); // return list + if (p == null) { + dataErrors.add("part is null: " + folder_name() + " idx " + messageNum); + return list; + } + + if (p == m && p.isMimeType("text/html")) { + /* + String s = "top level part is html! message:" + m.getSubject() + " " + m.getDescription(); + dataErrors.add(s); + */ + // we don't normally expect the top-level part to have content-type text/html + // but we saw this happen on some sample archives pst -> emailchemy. so allow it and handle it by parsing the html + String html = (String) p.getContent(); + String text = Util.unescapeHTML(html); + org.jsoup.nodes.Document doc = Jsoup.parse(text); + + StringBuilder sb = new StringBuilder(); + HTMLUtils.extractTextFromHTML(doc.body(), sb); + list.add(sb.toString()); + return list; + } + + if (p.isMimeType("text/plain")) { + //make sure, p is not wrongly labelled as plain text. + Enumeration headers = p.getAllHeaders(); + boolean dirty = false; + if (headers != null) + while (headers.hasMoreElements()) { + Header h = (Header) headers.nextElement(); + String name = h.getName(); + String value = h.getValue(); + if (name != null && value != null) { + if (name.equals("Content-transfer-encoding") && value.equals("base64")) { + dirty = true; + break; + } + } + } + String fname = p.getFileName(); + if (fname != null) { + int idx = fname.lastIndexOf('.'); + if ((idx < fname.length()) && (idx >= 0)) { + String extension = fname.substring(idx); + //anything extension other than .txt is suspicious. + if (!extension.equals(".txt")) + dirty = true; + } + } + if (dirty) { + dataErrors.add("Dirty message part, has conflicting message part headers." + folder_name() + " Message# " + messageNum); + return list; + } + + log.debug("Message part with content type text/plain"); + String content; + String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8"); + try { + if (type.contains("charset=")) { + byte b[] = Util.getBytesFromStream(p.getInputStream()); + content = new String(b, type.substring(type.indexOf("charset=") + "charset=".length())); + } else { + // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us + if (FORCED_ENCODING != null) { + byte b[] = Util.getBytesFromStream(p.getInputStream()); + content = new String(b, FORCED_ENCODING); + } else + content = (String) p.getContent(); + } + } catch (UnsupportedEncodingException uee) { + dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion"); + // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers. + // we're using the workaround suggested on this page: http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4304013 + // though it may be better to consider official support for utf-7 or other encodings. + + // TOFIX: I get an exception for utfutf8-encoding which has a base64 encoding embedded on it. + // Unsupported encoding: gmail-sent Message #10477 type text/plain; charset=x-utf8utf8; name="newyorker.txt", + // the hack below doesn't work for it. + ByteArrayOutputStream bao = new ByteArrayOutputStream(); + p.writeTo(bao); + content = bao.toString(); + } + list.add(content); + } else if (p.isMimeType("multipart/*") || p.isMimeType("message/rfc822")) { + // rfc822 mime type is for embedded mbox format or some such (appears for things like + // forwarded messages). the content appears to be just a multipart. + Object o = p.getContent(); + if (o instanceof Multipart) { + Multipart allParts = (Multipart) o; + if (p.isMimeType("multipart/alternative")) { + // this is an alternative mime type. v common case to have text and html alternatives + // so just process the text part if there is one, and avoid fetching the alternatives. + // useful esp. because many ordinary messages are alternative: text and html and we don't want to fetch the html. + // revisit in future we want to retain the html alternative for display purposes + Part[] parts = new Part[allParts.getCount()]; + for (int i = 0; i < parts.length; i++) + parts[i] = allParts.getBodyPart(i); + + for (int i = 0; i < parts.length; i++) { + Part thisPart = parts[i]; + if (thisPart.isMimeType("text/plain")) { + // common case, return quickly + list.add((String) thisPart.getContent()); + log.debug("Multipart/alternative with content type text/plain"); + return list; + } + } + + // no text part, let's look for an html part. this happens for html parts. + for (int i = 0; i < allParts.getCount(); i++) { + Part thisPart = parts[i]; + if (thisPart.isMimeType("text/html")) { + // common case, return quickly + String html = (String) thisPart.getContent(); + String text = Util.unescapeHTML(html); + org.jsoup.nodes.Document doc = Jsoup.parse(text); + + StringBuilder sb = new StringBuilder(); + HTMLUtils.extractTextFromHTML(doc.body(), sb); + list.add(sb.toString()); + + log.debug("Multipart/alternative with content type text/html"); + return list; + } + } + + // no text or html part. hmmm... blindly process the first part only + if (allParts.getCount() >= 1) + list.addAll(processMessagePart(messageNum, m, allParts.getBodyPart(0), attachmentsList)); + } else { + // process it like a regular multipart + for (int i = 0; i < allParts.getCount(); i++) { + BodyPart bp = allParts.getBodyPart(i); + list.addAll(processMessagePart(messageNum, m, bp, attachmentsList)); + } + } + } else if (o instanceof Part) + list.addAll(processMessagePart(messageNum, m, (Part) o, attachmentsList)); + else + dataErrors.add("Unhandled part content, " + folder_name() + " Message #" + messageNum + "Java type: " + o.getClass() + " Content-Type: " + p.getContentType()); + } else { + try { + // do attachments only if downloadAttachments is set. + // some apps do not need attachments, so this saves some time. + // however, it seems like a lot of time is taken in imap prefetch, which gets attachments too? + if (fetchConfig.downloadAttachments) + handleAttachments(messageNum, m, p, list, attachmentsList); + } catch (Exception e) { + dataErrors.add("Ignoring attachment for " + folder_name() + " Message #" + messageNum + ": " + Util.stackTrace(e)); + } + } + + return list; + } + + /** + * recursively processes attachments, fetching and saving it if needed + * parses the given part p, and adds it to hte attachmentsList. + * in some cases, like a text/html type without a filename, we instead append it to the textlist + * @throws MessagingException + */ + private void handleAttachments(int idx, Message m, Part p, List textList, List attachmentsList) throws MessagingException { + String ct = null; + if (!(m instanceof MimeMessage)) { + Exception e = new IllegalArgumentException("Not a MIME message!"); + e.fillInStackTrace(); + log.warn(Util.stackTrace(e)); + return; + } + + String filename = null; + try { + filename = p.getFileName(); + } catch (Exception e) { + // seen this happen with: + // Folders__gmail-sent Message #12185 Expected ';', got "Message" + // javax.mail.internet.ParseException: Expected ';', got "Message" + + dataErrors.add("Unable to read attachment name: " + folder_name() + " Message# " + idx); + return; + } + + String sanitizedFName = Util.sanitizeFolderName(emailStore.getAccountID() + "." + folder_name()); + if (filename == null) { + String tempFname = sanitizedFName + "." + idx; + dataErrors.add("attachment filename is null for " + sanitizedFName + " Message#" + idx + " assigning it the name: " + tempFname); + if (p.isMimeType("text/html")) { + try { + log.info("Turning message " + sanitizedFName + " Message#" + idx + " into text although it is an attachment"); + String html = (String) p.getContent(); + String text = Util.unescapeHTML(html); + org.jsoup.nodes.Document doc = Jsoup.parse(text); + + StringBuilder sb = new StringBuilder(); + HTMLUtils.extractTextFromHTML(doc.body(), sb); + textList.add(sb.toString()); + return; + } catch (Exception e) { + Util.print_exception("Error reading contents of text/html multipart without a filename!", e, log); + return; + } + } + filename = tempFname; + } + + // Replacing any of the disallowed filename characters (\/:*?"<>|&) to _ + // (note: & causes problems with URLs for serveAttachment etc, so it's also replaced) + String newFilename = Util.sanitizeFileName(filename); + + // Updating filename if it's changed after sanitizing. + if (!newFilename.equals(filename)) { + log.info("Filename changed from " + filename + " to " + newFilename); + filename = newFilename; + } + + try { + ct = p.getContentType(); + if (filename.indexOf(".") < 0) // no ext in filename... let's fix it if possible + { + // Using startsWith instead of equals because sometimes the ct has crud beyond the image/jpeg;...crud.... + // Below are the most common file types, more type can be added if needed + + // Most common APPLICATION TYPE + if (ct.startsWith("application/pdf")) + filename = filename + ".pdf"; + if (ct.startsWith("application/zip")) + filename = filename + ",zip"; + // Most common IMAGE TYPE + if (ct.startsWith("image/jpeg")) + filename = filename + ".jpg"; + if (ct.startsWith("image/gif")) + filename = filename + ".gif"; + if (ct.startsWith("image/png")) + filename = filename + ".png"; + // Most Common VIDEO TYPE + if (ct.startsWith("video/x-ms-wmv")) + filename = filename + ".wmv"; + // Most Common AUDIO TYPE + if (ct.startsWith("audio/mpeg")) + filename = filename + ".mp3"; + if (ct.startsWith("audio/mp4")) + filename = filename + ".mp4"; + // Most Common TEXT TYPE + if (ct.startsWith("text/html")) + filename = filename + ".html"; + // Windows Office + if (ct.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) //Word + filename = filename + ".docx"; + if (ct.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) //Excel + filename = filename + ".xlsx"; + if (ct.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) //PowerPoint + filename = filename + ".pptx"; + } + // retain only up to first semi-colon; often ct is something like text/plain; name="filename"' we don't want to log the filename + int x = ct.indexOf(";"); + if (x >= 0) + ct = ct.substring(0, x); + log.info("Attachment content type: " + ct + " filename = " + Util.blurKeepingExtension(filename)); + } catch (Exception pex) { + dataErrors.add("Can't read CONTENT-TYPE: " + ct + " filename:" + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\n Exception: " + pex + "\n" + Util.stackTrace(pex)); + return; + } + + // if (filename == null && !p.isMimeType("text/html") && !p.isMimeType("message/partial")) // expected not to have a filename with mime type text/html + // log.warn ("Attachment filename is null: " + Util.stackTrace()); + + + boolean success = true; + // the size passed in here is the part size, which is not really the binary blob size. + // when we read the stream below in blobStore.add(), we'll set it again to the binary blob size + Blob b = new EmailAttachmentBlob(filename, p.getSize(), (MimeMessage) m, p); + + if (fetchConfig.downloadAttachments) { + // this containment check is only on the basis of file name and size currently, + // not on the actual hash + if (archive.getBlobStore().contains(b)) { + log.debug("Cache hit! " + b); + } else { + try { + if (filename.endsWith(".tif")) + log.info("Fetching attachment..." + Util.blurKeepingExtension(filename)); + + // performance critical! use large buffer! currently 256KB + // stream will be closed by callee + + long start = System.currentTimeMillis(); + long nBytes = archive.getBlobStore().add(b, new BufferedInputStream(p.getInputStream(), 256 * 1024)); + long end = System.currentTimeMillis(); + if (nBytes != -1) { + long diff = end - start; + String s = "attachment size " + nBytes + " bytes, fetched in " + diff + " millis"; + if (diff > 0) + s += " (" + (nBytes / diff) + " KB/s)"; + log.info(s); + } + + Util.ASSERT(archive.getBlobStore().contains(b)); + + } catch (IOException ioe) { + success = false; + dataErrors.add("WARNING: Unable to fetch attachment: filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\nException: " + ioe); + ioe.printStackTrace(System.out); + } + } + + if (success) { + attachmentsList.add(b); + + /// generate thumbnail only if not already cached + try { + archive.getBlobStore().generate_thumbnail(b); // supplement + } catch (IOException ioe) { + log.warn("failed to create thumbnail, filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate().toString() + "\nException: " + ioe); + ioe.printStackTrace(System.out); + } + } + } + } + + @SuppressWarnings("unused") + private static String processLastReceived(String header) { + header = header.toLowerCase(); + StringTokenizer st = new StringTokenizer(header, " \t()[]"); + String x = st.nextToken(); + if (!x.equals("from")) { + log.warn("Warning: unrecognized header: " + header); + return null; + } + + while (st.hasMoreTokens()) { + String s = st.nextToken(); + if (Character.isDigit(s.charAt(0))) { + log.warn("IP address: " + s); + return s; + } + } + return null; + } + + public void verify() { + } + + public void finish() { + currentStatus = JSONUtils.getStatusJSON("Verifying email headers..."); + currentStatus = JSONUtils.getStatusJSON(""); + } + + /** + * prepare a status json with up to N_TEASERS teasers from the most recent + * emails, starting backwards from idx. specifically ask for ArrayList as + * List.get() can be costly otherwise. + */ + private static String getStatusJSONWithTeasers(String message, int pctComplete, long secsElapsed, long secsRemaining, ArrayList emails, int N_TEASERS) { + JSONObject json = new JSONObject(); + try { + json.put("pctComplete", pctComplete); + json.put("message", message); + json.put("secsElapsed", secsElapsed); + json.put("secsRemaining", secsRemaining); + if (!Util.nullOrEmpty(emails)) { + JSONArray arr = new JSONArray(); + int idx_end = emails.size(); + int idx_start = idx_end - N_TEASERS; + if (idx_start < 0) + idx_start = 0; + for (int i = idx_start, j = 0; i < idx_end; i++) { + EmailDocument email = emails.get(i); + if (email != null) { + String subject = email.description; + if (!Util.nullOrEmpty(subject)) + arr.put(j++, subject); + } + } + json.put("teasers", arr); + } + } catch (JSONException jsone) { + try { + json.put("error", jsone.toString()); + } catch (Exception e) { + Util.report_exception(e); + } + } + return json.toString(); + } + + /** + * best effort to prefetch messages for messages[startMsgIdx] onwards, up to + * the IMAP_PREFETCH_BUFSIZE + * return List if bodyTextOnly is true, otherwise List + */ + private List do_imap_prefetch(Message[] messages, int startMsgIdx, Folder folder, boolean bodyTextOnly) { + // its perfectly ok for correctness for this method to do nothing and return null + List prefetchedMessages = null; + try { + + if (IMAP_PREFETCH_BUFSIZE > 0 && folder instanceof IMAPFolder) { + int prefetch_messages_size = 0; + + int start_message_num = messages[startMsgIdx].getMessageNumber(); + int end_message_num = start_message_num; + + List messageNums = new ArrayList(); + + // figure out message num range to fetch. if anything is unusual -- bad content type, non-consec. msg nums etc -- break out. + // non consec. message numbers are a problem because they cause a very long imap command string, which we found was returning an "invalid command" response. + int prev_message_num = -1; + for (int msgIdx = startMsgIdx; msgIdx < messages.length; msgIdx++) { + if (bodyTextOnly) { + String contentType = messages[msgIdx].getContentType().toLowerCase(); + if (!contentType.startsWith("multipart/") && !contentType.startsWith("text/plain")) { + log.info("Warn: message idx" + msgIdx + " msg#" + messages[msgIdx].getMessageNumber() + " has unexpected content type " + contentType); + break; + } + } + + // check if sequence is as expected + int next_message_num = messages[msgIdx].getMessageNumber(); // may be better to switch this to uid and prefetcher uses uid fetch + if (next_message_num != prev_message_num + 1 && prev_message_num != -1) + break; + + // if this message would push prefetch size beyond the buf size, break out, not including this message + if (prefetch_messages_size + messages[msgIdx].getSize() >= IMAP_PREFETCH_BUFSIZE) + break; + prev_message_num = next_message_num; + prefetch_messages_size += messages[msgIdx].getSize(); + messageNums.add(next_message_num); + } + + if (messageNums.size() == 0) + return null; + + // now we prefetch messages from start_message_num to end_message_num + long startMillis = System.currentTimeMillis(); + log.info("prefetching " + messageNums.size() + " messages"); + ImapPrefetcher prefetcher = bodyTextOnly ? new TextOnlyImapPrefetcher(((ImapPopEmailStore) emailStore).session, messageNums) : new ImapPrefetcher(((ImapPopEmailStore) emailStore).session, messageNums); + prefetchedMessages = (List) ((IMAPFolder) folder).doCommand(prefetcher); // start_message_num, end_message_num)); + long elapsedMillis = System.currentTimeMillis() - startMillis; + long kb_per_sec = prefetch_messages_size / elapsedMillis; + log.info("prefetched " + messageNums.size() + " messages in " + Util.blur(folder.getName()) + " [" + start_message_num + ":" + end_message_num + "], " + Util.commatize(prefetch_messages_size / 1024) + "KB in " + Util.commatize(elapsedMillis) + "ms (" + Util.commatize(kb_per_sec) + " KB/sec)"); + } + } catch (Exception e) { + Util.print_exception(e, log); + } + return prefetchedMessages; + } + + private void fetchHeaders(Message[] messages) throws MessagingException { + // fetch headers (don't do it for mbox folders, waste of time) + // this is an essential perf. step so that we fetch the headers in bulk. + // otherwise it takes a long time to fetch header info one at a time for each message + if (!(emailStore instanceof MboxEmailStore)) { + long startTimeMillis = System.currentTimeMillis(); + currentStatus = JSONUtils.getStatusJSON("Reading headers from " + folder.getName() + "..."); + FetchProfile fp = new FetchProfile(); + fp.add(FetchProfile.Item.ENVELOPE); + fp.add(FetchProfile.Item.CONTENT_INFO); + fp.add(UIDFolder.FetchProfileItem.UID); // important, otherwise reading UIDs takes a long time later + fp.add("List-Post"); + folder.fetch(messages, fp); + long endTimeMillis = System.currentTimeMillis(); + log.info("Done fetching headers: " + Util.commatize(endTimeMillis - startTimeMillis) + "ms"); + } + } + + private void fetchHeaders(int nMessages) throws MessagingException { + // fetch headers (don't do it for mbox folders, waste of time) + // this is an essential perf. step so that we fetch the headers in bulk. + // otherwise it takes a long time to fetch header info one at a time for each message + if (!(emailStore instanceof MboxEmailStore)) { + long startTimeMillis = System.currentTimeMillis(); + currentStatus = JSONUtils.getStatusJSON("Reading headers from " + folder.getName() + "..."); + FetchProfile fp = new FetchProfile(); + fp.add(FetchProfile.Item.ENVELOPE); + fp.add(FetchProfile.Item.CONTENT_INFO); + fp.add(UIDFolder.FetchProfileItem.UID); // important, otherwise reading UIDs takes a long time later + fp.add("List-Post"); + for (int i = 0; i < nMessages; i++) { + Message[] messages = new Message[]{folder.getMessage(i)}; + folder.fetch(messages, fp); + } + long endTimeMillis = System.currentTimeMillis(); + log.info("Done fetching headers: " + Util.commatize(endTimeMillis - startTimeMillis) + "ms"); + } + } + + private Message[] removeMessagesAlreadyInArchive(Archive archive, Message[] messages) { + // early out for the common case that we have an empty archive + if (archive.getAllDocs().size() == 0) + return messages; + + List resultList = new ArrayList(); + for (int i = 0; i < messages.length; i++) { + //int idx = messages[i].getMessageNumber(); + Message m = messages[i]; + MimeMessage mm = (MimeMessage) m; + try { + EmailDocument ed = convertToEmailDocument(mm, "dummy"); // id doesn't really matter here + if (archive.containsDoc(ed)) { + stats.nMessagesAlreadyPresent++; + dataErrors.add("Duplicate message: " + ed); // note: report.jsp depends on this exact string + continue; + } + } catch (Exception e) { + Util.print_exception(e, log); + } + resultList.add(mm); + messages[i] = null; // no harm explicitly nulling out messages + } + Message[] resultArray = resultList.toArray(new Message[0]); + return resultArray; + } + + /** + * Make few post checks on the content and returns true if the message looks + * ok + */ + private boolean messageLooksOk(String content) { + if (content == null) + //let others handle it. + return true; + String[] lines = content.split("\n"); + int badlines = 0; + if (lines.length > 50) + for (String line : lines) { + if (!line.contains(" ")) + badlines++; + else + badlines = 0; + if (badlines > 50) + return false; + } + return true; + } + + //keep track of the total time elapsed in fetching messages across batches + static long fetchStartTime = System.currentTimeMillis(); + + /** + * fetch given message idx's in given folder -- @performance critical + * + * @param offset - the original offset of the first message in the messages array, important to initialize + * for proper assignment of unique id or doc Id + */ + //private void fetchUncachedMessages(String sanitizedFName, Folder folder, DocCache cache, List msgIdxs) throws MessagingException, FileNotFoundException, IOException, GeneralSecurityException { + private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset, int totalMessages) throws MessagingException, IOException, GeneralSecurityException { + //mark the processing of new batch + if (offset == 0) + fetchStartTime = System.currentTimeMillis(); + + currentStatus = JSONUtils.getStatusJSON((emailStore instanceof MboxEmailStore) ? "Parsing " + folder.getName() + " (can take a while)..." : "Reading " + folder.getName() + "..."); + + // bulk fetch of all message headers + int n = messages.length; + + // eliminate any messages the archive already has + messages = removeMessagesAlreadyInArchive(archive, messages); + + log.info(n - messages.length + " message(s) already in the archive"); + + ArrayList emails = new ArrayList(); + + // for performance, we need to do bulk prefetches, instead of fetching 1 message at a time + // prefetchedMessages will be a temp cache of prefetched messages + int first_i_prefetched = -1, last_i_prefetched = -1; + List prefetchedMessages = null; // the type of this can be either list if text only, otherwise list + + long highestUID = archive.getLastUIDForFolder(fetchedFolderInfo.accountKey, fetchedFolderInfo.longName); + long lastAssignedUID = highestUID; + boolean bodyTextOnly = !fetchConfig.downloadAttachments; + try { + archive.openForWrite(); + for (int i = 0; i < messages.length; i++) { + // critical step: (thanks, yourkit!) + // null out the ref to the previous message, otherwise it stays in memory, and the heap effectively needs to be as big as the size of all messages + if (i > 0) + messages[i - 1] = null; + + if (isCancelled) + break; + + Message m = messages[i]; + MimeMessage mm = (MimeMessage) m; + + if (i >= last_i_prefetched) { + // critical perf. step: do a bulk imap prefetch + // the prefetch will fetch as many messages as possible up to a max buffer size, and return the messages prefetched + // last_i_prefetched tracks what is the last index into idxs that we have prefetched. + // when we run out of prefetched messages, we do another bulk prefetch + + prefetchedMessages = do_imap_prefetch(messages, i, folder, bodyTextOnly); + if (prefetchedMessages != null) { + first_i_prefetched = i; + last_i_prefetched = i + prefetchedMessages.size(); + } + } + + int pctDone = ((i + offset) * 100) / totalMessages; + long elapsedMillis = System.currentTimeMillis() - fetchStartTime; + long unprocessedSecs = Util.getUnprocessedMessage(i + offset, totalMessages, elapsedMillis); + int N_TEASERS = 50; // 50 ok here, because it takes a long time to fetch and process messages, so teaser computation is relatively not expensive + int nTriesForThisMessage = 0; + currentStatus = getStatusJSONWithTeasers("Reading " + Util.commatize(totalMessages) + " messages from " + folder.getName() + "...", pctDone, elapsedMillis / 1000, unprocessedSecs, emails, N_TEASERS); + + int messageNum = mm.getMessageNumber(); + + try { + long unique_id; + + // if we have uid, that's even better + // don't use uid's for mbox, it has a bug and always gives -1 + // see http://james.apache.org/server/rfclist/imap4/rfc2060.txt for uid spec + if (folder instanceof UIDFolder && !(emailStore instanceof MboxEmailStore)) { + long uid = ((UIDFolder) folder).getUID(m); + unique_id = uid; + } else + unique_id = lastAssignedUID + 1 + i + offset; // +1 since i starts from 0 (but lastAssignedUID can be -1 -- is that safe? -sgh) + + if (unique_id > highestUID) + highestUID = unique_id; + + String unique_id_as_string = Long.toString(unique_id); + + // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive + // not a serious perf. concern now, but revisit if needed + EmailDocument ed = convertToEmailDocument(mm, unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc. + // need to check this again, because there might be duplicates such within the set we are currently processing. + if (archive.containsDoc(ed)) { + stats.nMessagesAlreadyPresent++; + dataErrors.add("Duplicate message: " + ed); // note: report.jsp depends on this specific string + continue; + } + + MimeMessage originalMessage = mm; // this is the mm that has all the headers etc. + List attachmentsList = new ArrayList(); + + // if we already have it prefetched, use the prefetched version + List contents = null; + + if (first_i_prefetched >= 0 && prefetchedMessages != null) { + if (!fetchConfig.downloadAttachments) { + // text only means the prefetchedMessages are stored directly as a list of strings + String content = (String) prefetchedMessages.get(i - first_i_prefetched); // note: this_mm only has the prefetched content, but not the headers + contents = new ArrayList(); + + try { + // a special for yahoo which routinely uses quoted-printable. content looks like =0A0D.... = etc. + if (mm.isMimeType("multipart/alternative")) { + Multipart mm_mp = (Multipart) mm.getContent(); + Part p0 = mm_mp.getBodyPart(0); + if (p0 instanceof com.sun.mail.imap.IMAPBodyPart) { + String encoding = ((com.sun.mail.imap.IMAPBodyPart) p0).getEncoding(); + if ("quoted-printable".equals(encoding)) { + content = new String(Util.getBytesFromStream(javax.mail.internet.MimeUtility.decode(new java.io.ByteArrayInputStream(content.getBytes()), "quoted-printable"))); + } + } + } + } catch (Exception e) { + Util.print_exception("Error trying to parse encoding of multipart", e, log); + } + + contents.add(content); + } else { + // subtle issue here: the contentType of the prefetchedMessage needs to be be set to the original_mm's content-type. + // this was found for cases where the original message is multipart-alternative with a text and html part. + // if we don't set prefetchedMessage's content type, it gets a mime type of text/plain and a body = the entire multipart including both parts. + // found on sgh's sent mail w/subject: "text to add in help" from Fri, 7 Jun 2013 + MimeMessage prefetchedMessage = (MimeMessage) prefetchedMessages.get(i - first_i_prefetched); + String contentTypeHeaders[] = originalMessage.getHeader("Content-Type"); + String contentTypeHeader = null; + if (contentTypeHeaders != null && contentTypeHeaders.length == 1) + contentTypeHeader = contentTypeHeaders[0]; + + if (!Util.nullOrEmpty(contentTypeHeader)) // we do care about body structure, hang on to it + prefetchedMessage.setHeader("Content-Type", contentTypeHeader); + mm = prefetchedMessage; + } + prefetchedMessages.set(i - first_i_prefetched, null); // null out to save memory + } + + if (contents == null) + contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList); + + // if mm is not prefetched, it is the same as original_mm + // will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version + // even when prefetched, the processMessagePart is somewhat expensive because the attachments have to be extracted etc. + + // we could overlap processMessagePart with do_imap_prefetch by prefetching in a separate thread, since prefetch is network limited. + // but profiling shows processMessagePart takes only 1/4th the time of do_imap_prefetch so overlapping would be a relatively small gain. + // not worth the effort right now. + ed.attachments = attachmentsList; + if (fetchConfig.downloadAttachments) + ed.attachmentsYetToBeDownloaded = false; // we've already downloaded our attachments + + // concat all the contents parts + StringBuilder sb = new StringBuilder(); + for (String s : contents) { + sb.append(s); + sb.append("\n"); + } + + String contentStr = sb.toString(); + if (!messageLooksOk(contentStr)) { + dataErrors.add("Skipping message as it seems to have very long words: " + ed); + continue; + } + contentStr = IndexUtils.normalizeNewlines(contentStr); // just get rid of \r's + + archive.addDoc(ed, contentStr); + + List linkList = new ArrayList(); + // linkList might be used only for slant + IndexUtils.populateDocLinks(ed, contentStr, linkList, true); + ed.links = linkList; + stats.nMessagesAdded++; + } catch (Exception ex) { + // sometimes we get unexpected folder closed, so try again + boolean retry = false; + if (ex instanceof javax.mail.FolderClosedException) { + log.warn("Oops, thread " + threadID + " got the folder closed in its face! " + ex.getMessage()); + + // sometimes we get this exception about folder closed + // retry up to 3 times, then give up + if (nTriesForThisMessage < 3) { + retry = true; + log.info("Re-opening email store; attempt #" + (nTriesForThisMessage + 1) + " for message " + i); + nTriesForThisMessage++; + messages = openFolderAndGetMessages(); + fetchHeaders(messages); + --i; // adjust the message index n try again + } + } + + if (!retry) { + // we sometimes see UnsupportedEncodingException with x-utf8utf8 mime type and ParseException + // nothing much can be done, just create a dummy doc and add it to the cache + nErrors++; + stats.nErrors++; + EmailDocument ed = new EmailDocument(Integer.toString(messageNum)); + log.warn("Exception reading message from " + folder_name() + " Message #" + messageNum + " " + ex.getMessage() + "\n" + Util.stackTrace(ex)); + + ed.setErrorString(Util.stackTrace(ex)); + } + } + } + } catch (Throwable t) { + Util.print_exception(t, log); + } finally { + // if (cancelled && false) // TODO: disable for now as currently only indexes are rolled back and allDocs/blobs are not rolled back in sync yet + // archive.rollbackIndexWrites(); + // else + currentStatus = JSONUtils.getStatusJSON("Saving archive..."); + archive.close(); + } + + fetchedFolderInfo.lastSeenUID = highestUID; + log.info("at end of fetch, folder info is " + fetchedFolderInfo); + + log.info("emailfetcher thread completed, archive has " + archive.getAllDocs().size() + " docs"); + } + + public FolderInfo getFetchedFolderInfo() { + return fetchedFolderInfo; + } + + private int openFolderAndGetMessageCount() throws MessagingException { + folder = null; + + store = emailStore.connect(); + folder = emailStore.get_folder(store, folder_name()); + if (folder != null) + return folder.getMessageCount(); + else + return 0; + } + + /** + * Comment by @vihari + * Not sure what uid id and folder are,I think this code should be more predictable + * The params begin idx and end idx are used for both uid filtering and Mbox message indexing. + * does not make sense + */ + private Message[] openFolderAndGetMessages() throws MessagingException { + if (folder == null) + openFolderAndGetMessageCount(); + + Message[] messages = null; + if (folder == null) + return messages; + + String descr = emailStore.getAccountID() + ":" + folder; + boolean haveUID = false; + int count = folder.getMessageCount(); + use_uid_if_available = (begin_msg_index == 1 && end_msg_index == count + 1); + log.info("use_uid_if_available is set to " + use_uid_if_available); + + if (fetchConfig.filter != null && fetchConfig.filter.isActive()) { + log.info("Issuing server side filters for " + fetchConfig.filter); + boolean useReceivedDateTerms = descr.indexOf("yahoo.com") >= 0; + messages = folder.search(fetchConfig.filter.convertToSearchTerm(useReceivedDateTerms)); + } else { + // mbox provider claims to provide UIDFolder but the uids are bogus so we treat mboemailstore folders as not uidfolders + boolean is_uid_folder = (folder instanceof UIDFolder) && !(emailStore instanceof MboxEmailStore); + + if (use_uid_if_available && is_uid_folder) { + // for uidfolders, we want to update the last seen uid in the FolderInfo + long uid = archive.getLastUIDForFolder(emailStore.getAccountID(), folder_name()); + if (uid > 0) { + messages = ((UIDFolder) folder).getMessagesByUID(uid + 1, UIDFolder.LASTUID); + log.info("Archive has already seen this folder: " + descr + " will only fetch messages from uid " + uid + " onwards, " + messages.length + " messages will be incrementally fetched"); + haveUID = true; + } else + log.info(descr + " is a UIDFolder but not seen before"); + } else + log.info(descr + " is not a UIDFolder"); + + if (!haveUID) { + log.info("All " + count + " messages in " + descr + " will be fetched"); + //messages = folder.getMessages(); + + if (begin_msg_index > 0 && end_msg_index > 0) { + // we have to use only specified messages + // if there are 8 messages, count = 8, end_msg_index will be 9 + if (end_msg_index > count + 1) + log.warn("Warning: bad end_msg_index " + end_msg_index + " count = " + count); // use the full messages + else { + int nMessages = end_msg_index - begin_msg_index; + Message[] newMessages = new Message[nMessages]; + for (int i = 0; i < end_msg_index - begin_msg_index; i++) + newMessages[i] = folder.getMessage(begin_msg_index + i);//messages[begin_msg_index - 1 + i]; // -1 cos messages array is indexed from 0, but begin_msg_index from 1 + log.info("total # of messages: " + count + " reduced # of messages: " + newMessages.length); + messages = newMessages; + } + } + } + } + + return messages; + } + + /** + * main fetch+index method + * The assumptions that the heap is big enough to enough to fit all the messages i the folder is not scalable for larger archive. + * Instead, we process each message individually. + * fetchHeaders may be penalised due to multiple requests of fetch? + * In order to make indexing of large archives possible, fetch of NON-MBOXEmailstrore formats is penalised. It is possible to avoid this by handling MBox and IMAP/POP formats differently. + */ + public void run() { + currentStatus = JSONUtils.getStatusJSON("Starting to process " + folder_name()); + + isCancelled = false; + Thread.currentThread().setName("EmailFetcher"); + nErrors = 0; + //Message[] messages = null; + // use_uid is set only if we are reading the whole folder. otherwise we won't use it, and we won't update the highest UID seen for the folder in the archive. + try { + // long t1 = System.currentTimeMillis(); + int nMessages = openFolderAndGetMessageCount(); + log.info("Total number of messages: " + nMessages); + + if (emailStore instanceof MboxEmailStore) { + // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages() + // so we process in batches + //TODO: Ideally, should cap on buffer size rather than on number of messages. + final int BATCH = 10000; + int nbatches = nMessages / BATCH; + nMessagesProcessedSuccess = 0; + long st = System.currentTimeMillis(); + int b; + for (b = 0; b < nbatches + 1; b++) { + begin_msg_index = b * BATCH + 1; + end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1; + log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages); + Message[] messages = openFolderAndGetMessages(); + currentStatus = JSONUtils.getStatusJSON(""); + if (isCancelled) + return; + + if (messages.length > 0) { + try { + if (fetchConfig.downloadMessages) { + log.info(nMessages + " messages will be fetched for indexing"); + fetchAndIndexMessages(folder, messages, begin_msg_index, nMessages); + } else { + // this is for memory test screening mode. + // we create a dummy archive without any real contents + for (int i = 0; i < nMessages; i++) { + String unique_id_as_string = Long.toString(i); + + // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive + // not a serious perf. concern now, but revisit if needed + EmailDocument ed = convertToEmailDocument((MimeMessage) messages[i], unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc. + archive.addDocWithoutContents(ed); + } + } + } catch (Exception e) { + log.error("Exception trying to fetch messages, results will be incomplete! " + e + "\n" + Util.stackTrace(e)); + } + } + log.info("Fetch stats for this fetcher thread: " + stats); + } + log.info("Read #" + nMessages + " messages in #" + b + " batches of size: " + BATCH + " in " + (System.currentTimeMillis() - st) + "ms"); + } else { + // IMAP etc are pretty efficient with lazily populating message objects, so unlike mbox, its ok to use openFolderAndGetMessages() on the entire folder. + // remember to init the begin/end_msg_index before calling openFolderAndGetMessages + begin_msg_index = 1; + end_msg_index = nMessages + 1; + nMessagesProcessedSuccess = 0; + Message[] messages = openFolderAndGetMessages(); + + long st = System.currentTimeMillis(); + currentStatus = JSONUtils.getStatusJSON(""); + if (isCancelled) + return; + + if (messages.length > 0) { + try { + fetchHeaders(messages); // always fetch headers + if (fetchConfig.downloadMessages) { + log.info(nMessages + " messages will be fetched for indexing"); + //we process all the messages together here unlike the case of mstor + //hence the begin index is always 0 + fetchAndIndexMessages(folder, messages, 0, messages.length); + } else { + // this is for memory test screening mode. + // we create a dummy archive without any real contents + for (int i = 0; i < nMessages && i < messages.length; i++) { + String unique_id_as_string = Long.toString(i); + + // well, we already converted to emaildoc above during removeMessagesAlreadyInArchive + // not a serious perf. concern now, but revisit if needed + EmailDocument ed = convertToEmailDocument((MimeMessage) messages[i], unique_id_as_string); // this messageNum is mostly for debugging, it should not be used for equals etc. + archive.addDocWithoutContents(ed); + } + } + } catch (Exception e) { + Util.print_exception("Exception trying to fetch messages, results will be incomplete! ", e, log); + } + } + log.info("Read #" + nMessages + " messages in in " + (System.currentTimeMillis() - st) + "ms"); + } + } catch (Throwable t) { + if (t instanceof OutOfMemoryError) + this.mayHaveRunOutOfMemory = true; + // this is important, because there could be an out of memory etc over here. + Util.print_exception(t, log); + } finally { + try { + if (folder != null) + folder.close(false); + if (store != null) + store.close(); + } catch (Exception e) { + Util.print_exception(e); + } + } + } + + /* + * code for handling other kinds of headers, e.g. to find location of the + * message -- not used right now, but may use in the future. + * public void processHeaders(MimeMessage m) throws Exception + * { + * Address[] froms = m.getFrom(); + * if (froms == null) + * return; + * InternetAddress a = (InternetAddress) froms[0]; + * ContactInfo ci = addressBook.getContactInfoForAddress(a); + * Enumeration
e = (Enumeration
) m.getAllHeaders(); + * String lastReceivedHeader = null; + * while (e.hasMoreElements()) + * { + * Header h = e.nextElement(); + * String n = h.getName(); + * String v = h.getValue(); + * // log.info ("header: " + n + " = " + n); + * String s = n.toLowerCase(); + * if ("x-mailer".equals(s) || "user-agent".equals(s)) + * { + * log.warn (m.getFrom()[0] + " --> " + n + " " + v); + * ci.addMailer(v); + * } + * if ("x-originating-ip".equals(s) || "x-yahoo-post-ip".equals(s)) + * { + * log.warn (m.getFrom()[0] + " --> " + n + " " + v); + * ci.addIPAddr(v); + * } + * if ("x-yahoo-profile".equals(s)) + * log.warn (m.getFrom()[0] + " --> " + n + " " + v); + * if ("message-id".equals(s)) + * { + * log.warn("messageID = " + v); + * ci.addMessageID(v); + * } + * if ("received".equals(s) || "x-received".equals(s)) + * { + * lastReceivedHeader = v; + * } + * } + * + * // sometimes the headers have an extra ctrl-m at the end, strip it if + * this is the case. + * if (lastReceivedHeader != null && lastReceivedHeader.endsWith("\r")) + * lastReceivedHeader = lastReceivedHeader.substring(0, + * lastReceivedHeader.length()-1); + * + * ci.addLastReceivedHeader(lastReceivedHeader); + * + * String from = froms[0].toString(); + * + * log.info (from + " lastReceived " + lastReceivedHeader); + * if (lastReceivedHeader == null) + * log.warn ("WARNING: " + from + " --> no received header!?"); + * else + * { + * String ipAddrStr = processLastReceived(lastReceivedHeader); + * if (ipAddrStr != null) + * { + * byte[] ipAddrBytes = Util.parseIPAddress(ipAddrStr); + * if (ipAddrBytes != null) + * { + * // InetAddress ipAddr = InetAddress.getByAddress(ipAddrBytes); + * // log.info ("Received: " + locationService.lookupLocation(ipAddr)); + * } + * } + * } + * } + */ + + public String toString() { + return Util.fieldsToString(this); + } +} diff --git a/src/java/edu/stanford/muse/webapp/EmailRenderer.java b/src/java/edu/stanford/muse/webapp/EmailRenderer.java index 6ffa3b2..b9bf797 100755 --- a/src/java/edu/stanford/muse/webapp/EmailRenderer.java +++ b/src/java/edu/stanford/muse/webapp/EmailRenderer.java @@ -1,536 +1,536 @@ -package edu.stanford.muse.webapp; - -import java.io.IOException; -import java.util.*; - -import javax.mail.Address; -import javax.mail.internet.InternetAddress; - -import edu.stanford.muse.datacache.Blob; -import edu.stanford.muse.datacache.BlobStore; -import edu.stanford.muse.email.AddressBook; -import edu.stanford.muse.email.Contact; -import edu.stanford.muse.groups.SimilarGroup; -import edu.stanford.muse.index.*; -import edu.stanford.muse.ner.model.NEType; -import edu.stanford.muse.util.Pair; -import edu.stanford.muse.util.Span; -import edu.stanford.muse.util.Util; - -/** This class has util methods to display an email message in an html page */ - -public class EmailRenderer { - - static final int TEXT_WRAP_WIDTH = 80; // used to be 80, but that wraps - // around too soon. 120 is too - // much with courier font. - - public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, - Set highlightTerms) - throws Exception{ - return pagesForDocuments(ds, archive, datasetTitle, null, highlightTerms, null, MultiDoc.ClusteringType.MONTHLY); - } - - public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, - Set highlightTerms, Collection highlightAttachments) - throws Exception{ - return pagesForDocuments(ds, archive, datasetTitle, null, highlightTerms, highlightAttachments, MultiDoc.ClusteringType.MONTHLY); - } - - public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, - Set highlightContactIds, Set highlightTerms) - throws Exception{ - return pagesForDocuments(ds, archive, datasetTitle, highlightContactIds, highlightTerms, null, MultiDoc.ClusteringType.MONTHLY); - } - - public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, - Set highlightContactIds, Set highlightTerms, Collection highlightAttachments) - throws Exception{ - return pagesForDocuments(ds, archive, datasetTitle, highlightContactIds, highlightTerms, highlightAttachments, MultiDoc.ClusteringType.MONTHLY); - } - - /* - * returns pages and html for a collection of docs, which can be put into a - * jog frame. indexer clusters are used to - * - * Changed the first arg type from: Collection to Collection, as we get Collection in browse page or from docsforquery, its a hassle to make them all return EmailDocument - * especially when no other document type is used anywhere - */ - public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, - Set highlightContactIds, Set highlightTerms, Collection highlightAttachments, MultiDoc.ClusteringType coptions) - throws Exception - { - StringBuilder html = new StringBuilder(); - int pageNum = 0; - List pages = new ArrayList(); - - // need clusters which map to sections in the browsing interface - List clusters; - - // indexer may or may not have indexed all the docs in ds - // if it has, use its clustering (could be yearly or monthly or category - // wise) - // if (indexer != null && indexer.clustersIncludeAllDocs(ds)) - // if (indexer != null) - clusters = archive.clustersForDocs(ds, coptions); - /* - * else { // categorize by month if the docs have dates if - * (EmailUtils.allDocsAreDatedDocs(ds)) clusters = - * IndexUtils.partitionDocsByInterval(new ArrayList((Set) - * ds), true); else // must be category docs clusters = - * CategoryDocument.clustersDocsByCategoryName((Collection) ds); } - */ - - List datasetDocs = new ArrayList<>(); - - // we build up a hierarchy of - for (MultiDoc md : clusters) - { - if (md.docs.size() == 0) - continue; - - String description = md.description; - description = description.replace("\"", "\\\""); // escape a double - // quote if any - // in the - // description - html.append("
\n"); - - List> clusterResult = new ArrayList<>(); - - for (Document d : md.docs) - { - String pdfAttrib = ""; - /* - * if (d instanceof PDFDocument) pdfAttrib = "pdfLink=\"" + - * ((PDFDocument) d).relativeURLForPDF + "\""; - */ - html.append("
\n"); - - datasetDocs.add(d); - pages.add(null); - clusterResult.add(null); - // clusterResult.add(docPageList); - // for (String s: docPageList) - { - String comment = Util.escapeHTML(d.comment); - html.append("
\n"); - } - - html.append("
"); // document - } - html.append("
\n"); // section - } - - DataSet dataset = new DataSet(datasetDocs, archive, datasetTitle, highlightContactIds, highlightTerms, highlightAttachments); - - return new Pair<>(dataset, html.toString()); - } - - /** - * format given addresses as comma separated html, linewrap after given - * number of chars - * - * @param addressBook - */ - public static String formatAddressesAsHTML(Address addrs[], AddressBook addressBook, int lineWrap, Set highlightUnstemmed, Set highlightNames, Set highlightAddresses) - { - StringBuilder sb = new StringBuilder(); - int outputLineLength = 0; - for (int i = 0; i < addrs.length; i++) - { - String thisAddrStr; - - Address a = addrs[i]; - if (a instanceof InternetAddress) - { - InternetAddress ia = (InternetAddress) a; - Pair p = JSPHelper.getNameAndURL((InternetAddress) a, addressBook); - String url = p.getSecond(); - String str = ia.toString(); - String addr = ia.getAddress(); - boolean match = false; - if(str!=null) { - //The goal here is to explain why a doc is selected and hence we should replicate Lucene doc selection and Lucene is case insensitive most of the times - String lc = str.toLowerCase(); - if (highlightUnstemmed != null) - for (String hs : highlightUnstemmed) { - String hlc = hs.toLowerCase().replaceAll("^\\W+|\\W+$",""); - if (lc.contains(hlc)) { - match = true; - break; - } - } - if (!match && highlightNames != null) - for (String hn : highlightNames) { - String hlc = hn.toLowerCase().replaceAll("^\\W+|\\W+$",""); - if (lc.contains(hlc)) { - match = true; - break; - } - } - } - if(addr!=null){ - if (!match && highlightAddresses != null) - for (String ha : highlightAddresses) - if (addr.contains(ha)) { - match = true; - break; - } - } - - if(match) - thisAddrStr = ("" + Util.escapeHTML(str) + ""); - else - thisAddrStr = ("" + Util.escapeHTML(str) + ""); - - if (str != null) - outputLineLength += str.length(); - } - else - { - String str = a.toString(); - thisAddrStr = str; - outputLineLength += str.length(); - JSPHelper.log.warn("Address is not an instance of InternetAddress - is of instance: "+a.getClass().getName() + ", highlighting won't work."); - } - - if (i + 1 < addrs.length) - outputLineLength += 2; // +2 for the comma that will follow... - - if (outputLineLength + 2 > lineWrap) - { - sb.append("
\n"); - outputLineLength = 0; - } - sb.append(thisAddrStr); - if (i + 1 < addrs.length) - sb.append(", "); - } - - return sb.toString(); - } - - /** - * returns a string for documents. - * - * @param highlightAttachments - * @throws Exception - */ - //TODO: inFull, debug params can be removed - //TODO: Consider a HighlighterOptions class - public static Pair htmlForDocument(Document d, Archive archive, String datasetTitle, BlobStore attachmentsStore, - Boolean sensitive, Set highlightContactIds, Set highlightTerms, Set highlightAttachments, Map> authorisedEntities, - boolean IA_links, boolean inFull, boolean debug) throws Exception - { - JSPHelper.log.debug("Generating HTML for document: " + d); - EmailDocument ed = null; - String html = null; - boolean overflow = false; - if (d instanceof EmailDocument) - { - // for email docs, 1 doc = 1 page - ed = (EmailDocument) d; - StringBuilder page = new StringBuilder(); - page.append("
\n"); - - page.append("
\n"); - page.append(EmailRenderer.getHTMLForHeader(archive, ed, sensitive, highlightContactIds, highlightTerms, IA_links, debug)); - page.append("
"); // muse-doc-header - - /* - * Map> sentimentMap = - * indexer.getSentiments(ed); for (String emotion: - * sentimentMap.keySet()) { page.append ("" + emotion + - * ": "); for (String word: sentimentMap.get(emotion)) - * page.append (word + " "); page.append ("
\n"); - * page.append("
\n"); } - */ - page.append("\n
\n"); - Pair contentsHtml = archive.getHTMLForContents(d, ((EmailDocument) d).getDate(), d.getUniqueId(), sensitive, highlightTerms, - authorisedEntities, IA_links, inFull, true); - - StringBuilder htmlMessageBody = contentsHtml.first; - overflow = contentsHtml.second; - // page.append(ed.getHTMLForContents(indexer, highlightTermsStemmed, - // highlightTermsUnstemmed, IA_links)); - page.append(htmlMessageBody); - page.append("\n
\n"); // muse-doc-body - - // page.append("\n
\n"); - List attachments = ed.attachments; - if (attachments != null && attachments.size() > 0) - { - // show thumbnails of all the attachments - - if (ModeConfig.isPublicMode()) { - page.append(attachments.size() + " attachment" + (attachments.size() == 1 ? "" : "s") + "."); - } else { - page.append("
\n
\n"); - page.append("\n"); - int i = 0; - for (; i < attachments.size(); i++) - { - if (i % 4 == 0) - page.append((i == 0) ? "\n" : "\n"); - page.append("\n"); - } - if (i % 4 != 0) - page.append(""); - page.append("
"); - - Blob attachment = attachments.get(i); - String thumbnailURL = null, attachmentURL = null; - boolean is_image = Util.is_image_filename(attachment.filename); - - if (attachmentsStore != null) - { - String contentFileDataStoreURL = attachmentsStore.get_URL(attachment); - attachmentURL = "serveAttachment.jsp?file=" + Util.URLtail(contentFileDataStoreURL); - String tnFileDataStoreURL = attachmentsStore.getViewURL(attachment, "tn"); - if (tnFileDataStoreURL != null) - thumbnailURL = "serveAttachment.jsp?file=" + Util.URLtail(tnFileDataStoreURL); - else - { - if (attachment.is_image()) - thumbnailURL = attachmentURL; - else - thumbnailURL = "images/sorry.png"; - } - } - else - JSPHelper.log.warn("attachments store is null!"); - - // toString the filename in any case, - String s = attachment.filename; - // cap to a length of 25, otherwise the attachment name - // overflows the tn - String display = Util.ellipsize(s, 25); - boolean highlight = highlightAttachments != null && highlightAttachments.contains(attachment); - page.append(" " + ""+ Util.escapeHTML(display) + " "); - page.append("
"); - - String css_class = "attachment-preview" + (is_image ? " img" : "") + (highlight ? " highlight" : ""); - String leader = ""); - page.append(leader + "href=\"" + attachmentURL + "\" src=\"" + thumbnailURL + "\">\n"); - page.append("\n"); - } - else - { - // page.append - // (" 
 
Not fetched
 
   "); - // page.append("
"); - page.append(leader + "src=\"images/no-attachment.png\">\n"); - // page.append ("\n"); - - if (thumbnailURL == null) - JSPHelper.log.info("No thumbnail for " + attachment); - if (attachmentURL == null) - JSPHelper.log.info("No attachment URL for " + attachment); - } - page.append("
"); - page.append("\n
\n"); // muse-doc-attachments - } - - } - page.append("\n
\n"); // .muse-doc - html = page.toString(); - } - else if (d instanceof DatedDocument) - { - /* - * DatedDocument dd = (DatedDocument) d; StringBuilder page = new - * StringBuilder(); - * - * page.append (dd.getHTMLForHeader()); // directly jam in contents - * page.append ("
\n"); page.append - * (dd.getHTMLForContents(indexer)); // directly jam in contents - * page.append ("\n
"); // doc-contents return page.toString(); - */ - html = "To be implemented"; - } - else - { - JSPHelper.log.warn("Unsupported Document: " + d.getClass().getName()); - html = ""; - } - - return new Pair(html, overflow); - } - - /** - * returns a HTML table string for the doc header - * - * @param sensitive - * - when set will highlight any sensitive info in subject based - * on preset regexs - * @throws IOException - */ - public static StringBuilder getHTMLForHeader(Archive archive, EmailDocument ed, Boolean sensitive, Set highlightContactIds, Set highlightTerms, - boolean IA_links, boolean debug) throws IOException - { - AddressBook addressBook = archive.addressBook; - GroupAssigner groupAssigner = archive.groupAssigner; - Set contactNames = new LinkedHashSet<>(); - Set contactAddresses = new LinkedHashSet<>(); - if(highlightContactIds!=null) - for(Integer hci: highlightContactIds) { - if(hci == null) - continue; - Contact c = archive.addressBook.getContact(hci); - if(c==null) - continue; - contactNames.addAll(c.names); - contactAddresses.addAll(c.emails); - } - contactNames.addAll(highlightTerms); - - StringBuilder result = new StringBuilder(); - // header table - result.append("\n"); - // result.append - // ("\n"); - if(debug) - result.append("\n"); - result.append(JSPHelper.getHTMLForDate(ed.date)); - - final String style = "\n"); - - if (ed.cc != null && ed.cc.length > 0) - { - result.append(style + "Cc: \n"); - } - - if (ed.bcc != null && ed.bcc.length > 0) - { - result.append(style + "Bcc: \n"); - } - - if (groupAssigner != null) - { - SimilarGroup g = groupAssigner.getClosestGroup(ed); - if (g != null && g.size() > 1) // if its just a singleton group, no - // point explicitly listing a group - // line - { - String url = "browse?groupIdx=" + groupAssigner.getClosestGroupIdx(ed); - result.append(style + "Group: \n"); - result.append("\n\n"); - } - } - - String x = ed.description; - if (x == null) - x = ""; - - result.append(style + "Subject: "); - //
 to escape special chars if any in the subject. max 70 chars in
-		// one line, otherwise spill to next line
-		result.append("
\n"); - result.append ("\n" + style + "ID: " + ""); - result.append("
Folder:" - // + this.folderName + "
docId: "+ed.getUniqueId()+"
"; - - // email specific headers - result.append(style + "From: "); - Address[] addrs = ed.from; - if (addrs != null) - { - result.append(formatAddressesAsHTML(addrs, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses)); - } - - result.append(style + "To: "); - addrs = ed.to; - if (addrs != null) - result.append(formatAddressesAsHTML(addrs, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + ""); - - result.append("\n
"); - result.append(formatAddressesAsHTML(ed.cc, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + ""); - result.append("\n
"); - result.append(formatAddressesAsHTML(ed.bcc, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + ""); - result.append("\n
"); - String description = g.elementsToString(); - result.append("" + g.name + "
"); - result.append("
"); - x = DatedDocument.formatStringForMaxCharsPerLine(x, 70).toString(); - if (x.endsWith("\n")) - x = x.substring(0, x.length() - 1); - - Span[] names = archive.getAllNamesInDoc(ed, false); - - // Contains all entities and id if it is authorised else null - Map entitiesWithId = new HashMap<>(); - //we annotate three specially recognized types - Map recMap = new HashMap<>(); - recMap.put(NEType.Type.PERSON.getCode(),"cp"); - recMap.put(NEType.Type.PLACE.getCode(),"cl"); - recMap.put(NEType.Type.ORGANISATION.getCode(),"co"); - Arrays.asList(names).stream().filter(n -> recMap.keySet().contains(NEType.getCoarseType(n.type).getCode())) - .forEach(n -> { - Set types = new HashSet<>(); - types.add(recMap.get(NEType.getCoarseType(n.type).getCode())); - entitiesWithId.put(n.text, new Entity(n.text, null, types)); - }); - - x = archive.annotate(x, ed.getDate(), ed.getUniqueId(), sensitive, highlightTerms, entitiesWithId, IA_links, false); - - result.append(x); - result.append("\n"); - result.append("\n
" + Util.hash (ed.getSignature()) + "
\n"); // end docheader table - - if (ModeConfig.isPublicMode()) - return new StringBuilder(Util.maskEmailDomain(result.toString())); - - return result; - } - - /** I'm not sure what this is used for -- I think its used only for rendering HTML for the message. */ - public static class Entity { - public Map ids; - //person,places,orgs, custom - public String name; - public Set types = new HashSet(); - - public Entity(String name, Map ids, Set types) { - this.name = name; - this.ids = ids; - this.types = types; - } - - @Override - public String toString() { - return types.toString(); - } - } -} +package edu.stanford.muse.webapp; + +import java.io.IOException; +import java.util.*; + +import javax.mail.Address; +import javax.mail.internet.InternetAddress; + +import edu.stanford.muse.datacache.Blob; +import edu.stanford.muse.datacache.BlobStore; +import edu.stanford.muse.email.AddressBook; +import edu.stanford.muse.email.Contact; +import edu.stanford.muse.groups.SimilarGroup; +import edu.stanford.muse.index.*; +import edu.stanford.muse.ner.model.NEType; +import edu.stanford.muse.util.Pair; +import edu.stanford.muse.util.Span; +import edu.stanford.muse.util.Util; + +/** This class has util methods to display an email message in an html page */ + +public class EmailRenderer { + + static final int TEXT_WRAP_WIDTH = 80; // used to be 80, but that wraps + // around too soon. 120 is too + // much with courier font. + + public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, + Set highlightTerms) + throws Exception{ + return pagesForDocuments(ds, archive, datasetTitle, null, highlightTerms, null, MultiDoc.ClusteringType.MONTHLY); + } + + public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, + Set highlightTerms, Collection highlightAttachments) + throws Exception{ + return pagesForDocuments(ds, archive, datasetTitle, null, highlightTerms, highlightAttachments, MultiDoc.ClusteringType.MONTHLY); + } + + public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, + Set highlightContactIds, Set highlightTerms) + throws Exception{ + return pagesForDocuments(ds, archive, datasetTitle, highlightContactIds, highlightTerms, null, MultiDoc.ClusteringType.MONTHLY); + } + + public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, + Set highlightContactIds, Set highlightTerms, Collection highlightAttachments) + throws Exception{ + return pagesForDocuments(ds, archive, datasetTitle, highlightContactIds, highlightTerms, highlightAttachments, MultiDoc.ClusteringType.MONTHLY); + } + + /* + * returns pages and html for a collection of docs, which can be put into a + * jog frame. indexer clusters are used to + * + * Changed the first arg type from: Collection to Collection, as we get Collection in browse page or from docsforquery, its a hassle to make them all return EmailDocument + * especially when no other document type is used anywhere + */ + public static Pair pagesForDocuments(Collection ds, Archive archive, String datasetTitle, + Set highlightContactIds, Set highlightTerms, Collection highlightAttachments, MultiDoc.ClusteringType coptions) + throws Exception + { + StringBuilder html = new StringBuilder(); + int pageNum = 0; + List pages = new ArrayList(); + + // need clusters which map to sections in the browsing interface + List clusters; + + // indexer may or may not have indexed all the docs in ds + // if it has, use its clustering (could be yearly or monthly or category + // wise) + // if (indexer != null && indexer.clustersIncludeAllDocs(ds)) + // if (indexer != null) + clusters = archive.clustersForDocs(ds, coptions); + /* + * else { // categorize by month if the docs have dates if + * (EmailUtils.allDocsAreDatedDocs(ds)) clusters = + * IndexUtils.partitionDocsByInterval(new ArrayList((Set) + * ds), true); else // must be category docs clusters = + * CategoryDocument.clustersDocsByCategoryName((Collection) ds); } + */ + + List datasetDocs = new ArrayList<>(); + + // we build up a hierarchy of + for (MultiDoc md : clusters) + { + if (md.docs.size() == 0) + continue; + + String description = md.description; + description = description.replace("\"", "\\\""); // escape a double + // quote if any + // in the + // description + html.append("
\n"); + + List> clusterResult = new ArrayList<>(); + + for (Document d : md.docs) + { + String pdfAttrib = ""; + /* + * if (d instanceof PDFDocument) pdfAttrib = "pdfLink=\"" + + * ((PDFDocument) d).relativeURLForPDF + "\""; + */ + html.append("
\n"); + + datasetDocs.add(d); + pages.add(null); + clusterResult.add(null); + // clusterResult.add(docPageList); + // for (String s: docPageList) + { + String comment = Util.escapeHTML(d.comment); + html.append("
\n"); + } + + html.append("
"); // document + } + html.append("
\n"); // section + } + + DataSet dataset = new DataSet(datasetDocs, archive, datasetTitle, highlightContactIds, highlightTerms, highlightAttachments); + + return new Pair<>(dataset, html.toString()); + } + + /** + * format given addresses as comma separated html, linewrap after given + * number of chars + * + * @param addressBook + */ + public static String formatAddressesAsHTML(Address addrs[], AddressBook addressBook, int lineWrap, Set highlightUnstemmed, Set highlightNames, Set highlightAddresses) + { + StringBuilder sb = new StringBuilder(); + int outputLineLength = 0; + for (int i = 0; i < addrs.length; i++) + { + String thisAddrStr; + + Address a = addrs[i]; + if (a instanceof InternetAddress) + { + InternetAddress ia = (InternetAddress) a; + Pair p = JSPHelper.getNameAndURL((InternetAddress) a, addressBook); + String url = p.getSecond(); + String str = ia.getPersonal() == null ? ia.getAddress() : ia.getPersonal() + "<" + ia.getAddress() + ">"; + String addr = ia.getAddress(); + boolean match = false; + if(str!=null) { + //The goal here is to explain why a doc is selected and hence we should replicate Lucene doc selection and Lucene is case insensitive most of the times + String lc = str.toLowerCase(); + if (highlightUnstemmed != null) + for (String hs : highlightUnstemmed) { + String hlc = hs.toLowerCase().replaceAll("^\\W+|\\W+$",""); + if (lc.contains(hlc)) { + match = true; + break; + } + } + if (!match && highlightNames != null) + for (String hn : highlightNames) { + String hlc = hn.toLowerCase().replaceAll("^\\W+|\\W+$",""); + if (lc.contains(hlc)) { + match = true; + break; + } + } + } + if(addr!=null){ + if (!match && highlightAddresses != null) + for (String ha : highlightAddresses) + if (addr.contains(ha)) { + match = true; + break; + } + } + + if(match) + thisAddrStr = ("" + Util.escapeHTML(str) + ""); + else + thisAddrStr = ("" + Util.escapeHTML(str) + ""); + + if (str != null) + outputLineLength += str.length(); + } + else + { + String str = a.toString(); + thisAddrStr = str; + outputLineLength += str.length(); + JSPHelper.log.warn("Address is not an instance of InternetAddress - is of instance: "+a.getClass().getName() + ", highlighting won't work."); + } + + if (i + 1 < addrs.length) + outputLineLength += 2; // +2 for the comma that will follow... + + if (outputLineLength + 2 > lineWrap) + { + sb.append("
\n"); + outputLineLength = 0; + } + sb.append(thisAddrStr); + if (i + 1 < addrs.length) + sb.append(", "); + } + + return sb.toString(); + } + + /** + * returns a string for documents. + * + * @param highlightAttachments + * @throws Exception + */ + //TODO: inFull, debug params can be removed + //TODO: Consider a HighlighterOptions class + public static Pair htmlForDocument(Document d, Archive archive, String datasetTitle, BlobStore attachmentsStore, + Boolean sensitive, Set highlightContactIds, Set highlightTerms, Set highlightAttachments, Map> authorisedEntities, + boolean IA_links, boolean inFull, boolean debug) throws Exception + { + JSPHelper.log.debug("Generating HTML for document: " + d); + EmailDocument ed = null; + String html = null; + boolean overflow = false; + if (d instanceof EmailDocument) + { + // for email docs, 1 doc = 1 page + ed = (EmailDocument) d; + StringBuilder page = new StringBuilder(); + page.append("
\n"); + + page.append("
\n"); + page.append(EmailRenderer.getHTMLForHeader(archive, ed, sensitive, highlightContactIds, highlightTerms, IA_links, debug)); + page.append("
"); // muse-doc-header + + /* + * Map> sentimentMap = + * indexer.getSentiments(ed); for (String emotion: + * sentimentMap.keySet()) { page.append ("" + emotion + + * ": "); for (String word: sentimentMap.get(emotion)) + * page.append (word + " "); page.append ("
\n"); + * page.append("
\n"); } + */ + page.append("\n
\n"); + Pair contentsHtml = archive.getHTMLForContents(d, ((EmailDocument) d).getDate(), d.getUniqueId(), sensitive, highlightTerms, + authorisedEntities, IA_links, inFull, true); + + StringBuilder htmlMessageBody = contentsHtml.first; + overflow = contentsHtml.second; + // page.append(ed.getHTMLForContents(indexer, highlightTermsStemmed, + // highlightTermsUnstemmed, IA_links)); + page.append(htmlMessageBody); + page.append("\n
\n"); // muse-doc-body + + // page.append("\n
\n"); + List attachments = ed.attachments; + if (attachments != null && attachments.size() > 0) + { + // show thumbnails of all the attachments + + if (ModeConfig.isPublicMode()) { + page.append(attachments.size() + " attachment" + (attachments.size() == 1 ? "" : "s") + "."); + } else { + page.append("
\n
\n"); + page.append("\n"); + int i = 0; + for (; i < attachments.size(); i++) + { + if (i % 4 == 0) + page.append((i == 0) ? "\n" : "\n"); + page.append("\n"); + } + if (i % 4 != 0) + page.append(""); + page.append("
"); + + Blob attachment = attachments.get(i); + String thumbnailURL = null, attachmentURL = null; + boolean is_image = Util.is_image_filename(attachment.filename); + + if (attachmentsStore != null) + { + String contentFileDataStoreURL = attachmentsStore.get_URL(attachment); + attachmentURL = "serveAttachment.jsp?file=" + Util.URLtail(contentFileDataStoreURL); + String tnFileDataStoreURL = attachmentsStore.getViewURL(attachment, "tn"); + if (tnFileDataStoreURL != null) + thumbnailURL = "serveAttachment.jsp?file=" + Util.URLtail(tnFileDataStoreURL); + else + { + if (attachment.is_image()) + thumbnailURL = attachmentURL; + else + thumbnailURL = "images/sorry.png"; + } + } + else + JSPHelper.log.warn("attachments store is null!"); + + // toString the filename in any case, + String s = attachment.filename; + // cap to a length of 25, otherwise the attachment name + // overflows the tn + String display = Util.ellipsize(s, 25); + boolean highlight = highlightAttachments != null && highlightAttachments.contains(attachment); + page.append(" " + ""+ Util.escapeHTML(display) + " "); + page.append("
"); + + String css_class = "attachment-preview" + (is_image ? " img" : "") + (highlight ? " highlight" : ""); + String leader = ""); + page.append(leader + "href=\"" + attachmentURL + "\" src=\"" + thumbnailURL + "\">\n"); + page.append("\n"); + } + else + { + // page.append + // (" 
 
Not fetched
 
   "); + // page.append("
"); + page.append(leader + "src=\"images/no-attachment.png\">\n"); + // page.append ("\n"); + + if (thumbnailURL == null) + JSPHelper.log.info("No thumbnail for " + attachment); + if (attachmentURL == null) + JSPHelper.log.info("No attachment URL for " + attachment); + } + page.append("
"); + page.append("\n
\n"); // muse-doc-attachments + } + + } + page.append("\n
\n"); // .muse-doc + html = page.toString(); + } + else if (d instanceof DatedDocument) + { + /* + * DatedDocument dd = (DatedDocument) d; StringBuilder page = new + * StringBuilder(); + * + * page.append (dd.getHTMLForHeader()); // directly jam in contents + * page.append ("
\n"); page.append + * (dd.getHTMLForContents(indexer)); // directly jam in contents + * page.append ("\n
"); // doc-contents return page.toString(); + */ + html = "To be implemented"; + } + else + { + JSPHelper.log.warn("Unsupported Document: " + d.getClass().getName()); + html = ""; + } + + return new Pair(html, overflow); + } + + /** + * returns a HTML table string for the doc header + * + * @param sensitive + * - when set will highlight any sensitive info in subject based + * on preset regexs + * @throws IOException + */ + public static StringBuilder getHTMLForHeader(Archive archive, EmailDocument ed, Boolean sensitive, Set highlightContactIds, Set highlightTerms, + boolean IA_links, boolean debug) throws IOException + { + AddressBook addressBook = archive.addressBook; + GroupAssigner groupAssigner = archive.groupAssigner; + Set contactNames = new LinkedHashSet<>(); + Set contactAddresses = new LinkedHashSet<>(); + if(highlightContactIds!=null) + for(Integer hci: highlightContactIds) { + if(hci == null) + continue; + Contact c = archive.addressBook.getContact(hci); + if(c==null) + continue; + contactNames.addAll(c.names); + contactAddresses.addAll(c.emails); + } + contactNames.addAll(highlightTerms); + + StringBuilder result = new StringBuilder(); + // header table + result.append("\n"); + // result.append + // ("\n"); + if(debug) + result.append("\n"); + result.append(JSPHelper.getHTMLForDate(ed.date)); + + final String style = "\n"); + + if (ed.cc != null && ed.cc.length > 0) + { + result.append(style + "Cc: \n"); + } + + if (ed.bcc != null && ed.bcc.length > 0) + { + result.append(style + "Bcc: \n"); + } + + if (groupAssigner != null) + { + SimilarGroup g = groupAssigner.getClosestGroup(ed); + if (g != null && g.size() > 1) // if its just a singleton group, no + // point explicitly listing a group + // line + { + String url = "browse?groupIdx=" + groupAssigner.getClosestGroupIdx(ed); + result.append(style + "Group: \n"); + result.append("\n\n"); + } + } + + String x = ed.description; + if (x == null) + x = ""; + + result.append(style + "Subject: "); + //
 to escape special chars if any in the subject. max 70 chars in
+		// one line, otherwise spill to next line
+		result.append("
\n"); + result.append ("\n" + style + "ID: " + ""); + result.append("
Folder:" + // + this.folderName + "
docId: "+ed.getUniqueId()+"
"; + + // email specific headers + result.append(style + "From: "); + Address[] addrs = ed.from; + if (addrs != null) + { + result.append(formatAddressesAsHTML(addrs, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses)); + } + + result.append(style + "To: "); + addrs = ed.to; + if (addrs != null) + result.append(formatAddressesAsHTML(addrs, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + ""); + + result.append("\n
"); + result.append(formatAddressesAsHTML(ed.cc, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + ""); + result.append("\n
"); + result.append(formatAddressesAsHTML(ed.bcc, addressBook, TEXT_WRAP_WIDTH, highlightTerms, contactNames, contactAddresses) + ""); + result.append("\n
"); + String description = g.elementsToString(); + result.append("" + g.name + "
"); + result.append("
"); + x = DatedDocument.formatStringForMaxCharsPerLine(x, 70).toString(); + if (x.endsWith("\n")) + x = x.substring(0, x.length() - 1); + + Span[] names = archive.getAllNamesInDoc(ed, false); + + // Contains all entities and id if it is authorised else null + Map entitiesWithId = new HashMap<>(); + //we annotate three specially recognized types + Map recMap = new HashMap<>(); + recMap.put(NEType.Type.PERSON.getCode(),"cp"); + recMap.put(NEType.Type.PLACE.getCode(),"cl"); + recMap.put(NEType.Type.ORGANISATION.getCode(),"co"); + Arrays.asList(names).stream().filter(n -> recMap.keySet().contains(NEType.getCoarseType(n.type).getCode())) + .forEach(n -> { + Set types = new HashSet<>(); + types.add(recMap.get(NEType.getCoarseType(n.type).getCode())); + entitiesWithId.put(n.text, new Entity(n.text, null, types)); + }); + + x = archive.annotate(x, ed.getDate(), ed.getUniqueId(), sensitive, highlightTerms, entitiesWithId, IA_links, false); + + result.append(x); + result.append("\n"); + result.append("\n
" + Util.hash (ed.getSignature()) + "
\n"); // end docheader table + + if (ModeConfig.isPublicMode()) + return new StringBuilder(Util.maskEmailDomain(result.toString())); + + return result; + } + + /** I'm not sure what this is used for -- I think its used only for rendering HTML for the message. */ + public static class Entity { + public Map ids; + //person,places,orgs, custom + public String name; + public Set types = new HashSet(); + + public Entity(String name, Map ids, Set types) { + this.name = name; + this.ids = ids; + this.types = types; + } + + @Override + public String toString() { + return types.toString(); + } + } +} From c2934374e316cbb24f615fba8e05085909d4b8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?= =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= Date: Fri, 5 May 2017 22:37:07 +0300 Subject: [PATCH 03/33] encoding in attachments --- src/java/edu/stanford/muse/email/EmailFetcherThread.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index e09caab..53835c3 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -24,6 +24,7 @@ import edu.stanford.muse.webapp.HTMLUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.james.mime4j.codec.DecoderUtil; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; @@ -588,6 +589,9 @@ private void handleAttachments(int idx, Message m, Part p, List textList String filename = null; try { filename = p.getFileName(); + if (filename != null) { + filename = DecoderUtil.decodeEncodedWords(filename, null); + } } catch (Exception e) { // seen this happen with: // Folders__gmail-sent Message #12185 Expected ';', got "Message" From d923dedcb292586792d575652a6cf1a24dfa3c3a Mon Sep 17 00:00:00 2001 From: Gleb Suvorov Date: Sat, 6 May 2017 00:15:14 +0300 Subject: [PATCH 04/33] unverified fix for cyrillic in search request --- .../edu/stanford/muse/webapp/JSPHelper.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/java/edu/stanford/muse/webapp/JSPHelper.java b/src/java/edu/stanford/muse/webapp/JSPHelper.java index f6a05bb..f894481 100755 --- a/src/java/edu/stanford/muse/webapp/JSPHelper.java +++ b/src/java/edu/stanford/muse/webapp/JSPHelper.java @@ -267,17 +267,17 @@ public static String[] convertRequestParamsToUTF8(String params[]) throws Unsupp // converts an array of strings from iso-8859-1 to utf8. useful for converting i18n chars in http request parameters public static String convertRequestParamToUTF8(String param) throws UnsupportedEncodingException { - if (RUNNING_ON_JETTY) - { - log.info("running on jetty: no conversion for " + param); + // if (RUNNING_ON_JETTY) + // { + // log.info("running on jetty: no conversion for " + param); return param; - } - if (param == null) - return null; - String newParam = new String(param.getBytes("ISO-8859-1"), "UTF-8"); - if (!newParam.equals(param)) - log.info("Converted to utf-8: " + param + " -> " + newParam); - return newParam; + // } + // if (param == null) + // return null; + // String newParam = new String(param.getBytes("ISO-8859-1"), "UTF-8"); + // if (!newParam.equals(param)) + // log.info("Converted to utf-8: " + param + " -> " + newParam); + // return newParam; } public static boolean runningOnLocalhost(HttpServletRequest request) From fa61cf73cdb791cf0db014df276de41c2c1c23b7 Mon Sep 17 00:00:00 2001 From: Gleb Suvorov Date: Wed, 24 May 2017 00:52:58 +0300 Subject: [PATCH 05/33] small memory fix --- src/java/edu/stanford/muse/email/EmailFetcherThread.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index 53835c3..bb3192b 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -1282,7 +1282,7 @@ public void run() { // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages() // so we process in batches //TODO: Ideally, should cap on buffer size rather than on number of messages. - final int BATCH = 10000; + final int BATCH = 100; //it ate too much memory more than 12gb with 10000 int nbatches = nMessages / BATCH; nMessagesProcessedSuccess = 0; long st = System.currentTimeMillis(); From 182ca3fbce9aca54f52c0b5fd346ba9743f86670 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?= =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= Date: Thu, 25 May 2017 22:25:50 +0300 Subject: [PATCH 06/33] encoding in download attachments --- src/java/edu/stanford/muse/util/Util.java | 5 ++--- .../edu/stanford/muse/webapp/EmailRenderer.java | 13 ++++++------- src/java/edu/stanford/muse/webapp/JSPHelper.java | 10 ++++++++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/java/edu/stanford/muse/util/Util.java b/src/java/edu/stanford/muse/util/Util.java index 3991b95..c5419df 100755 --- a/src/java/edu/stanford/muse/util/Util.java +++ b/src/java/edu/stanford/muse/util/Util.java @@ -2412,9 +2412,8 @@ public static void test_tail() * actual file in the URL. * returns null if the input is null. */ - public static String URLtail(String url) - { - return tail(url, "/"); + public static String URLtail(String url) { + return URLEncode(tail(url, "/")); } /** diff --git a/src/java/edu/stanford/muse/webapp/EmailRenderer.java b/src/java/edu/stanford/muse/webapp/EmailRenderer.java index f413e3b..f56ccb3 100755 --- a/src/java/edu/stanford/muse/webapp/EmailRenderer.java +++ b/src/java/edu/stanford/muse/webapp/EmailRenderer.java @@ -1,11 +1,5 @@ package edu.stanford.muse.webapp; -import java.io.IOException; -import java.util.*; - -import javax.mail.Address; -import javax.mail.internet.InternetAddress; - import edu.stanford.muse.datacache.Blob; import edu.stanford.muse.datacache.BlobStore; import edu.stanford.muse.email.AddressBook; @@ -17,6 +11,11 @@ import edu.stanford.muse.util.Span; import edu.stanford.muse.util.Util; +import javax.mail.Address; +import javax.mail.internet.InternetAddress; +import java.io.IOException; +import java.util.*; + /** This class has util methods to display an email message in an html page */ public class EmailRenderer { @@ -337,7 +336,7 @@ public static Pair htmlForDocument(Document d, Archive archive, // d.hashCode() is just something to identify this // page/message page.append(""); - page.append(leader + "href=\"" + attachmentURL + "\" src=\"" + thumbnailURL + "\">\n"); + page.append(leader + "href=\"" + attachmentURL + "\" download src=\"" + thumbnailURL + "\">\n"); page.append("\n"); } else diff --git a/src/java/edu/stanford/muse/webapp/JSPHelper.java b/src/java/edu/stanford/muse/webapp/JSPHelper.java index f894481..0e4fe61 100755 --- a/src/java/edu/stanford/muse/webapp/JSPHelper.java +++ b/src/java/edu/stanford/muse/webapp/JSPHelper.java @@ -44,6 +44,8 @@ import javax.servlet.http.HttpSession; import javax.xml.transform.TransformerException; import java.io.*; +import java.net.URLDecoder; +import java.net.URLEncoder; import java.util.*; import java.util.stream.Collectors; @@ -1589,7 +1591,11 @@ public static void serveBlob(HttpServletRequest request, HttpServletResponse res { HttpSession session = request.getSession(); String filename = request.getParameter("file"); - filename = convertRequestParamToUTF8(filename); + try { + filename = URLDecoder.decode(filename, "utf-8"); + } catch (Exception e) { + throw new RuntimeException(e.getMessage(), e); + } String baseDir = (String) getSessionAttribute(session, "cacheDir"); if (filename.indexOf(".." + File.separator) >= 0) // avoid file injection! @@ -1663,7 +1669,7 @@ public static void writeFileToResponse(HttpSession session, HttpServletResponse if (asAttachment) { response.setHeader("Content-Length", String.valueOf(file.length())); - response.setHeader("Content-Disposition", "attachment; filename=\"" + file.getName() + "\""); + response.setHeader("Content-Disposition", "attachment; filename=\"" + URLEncoder.encode(file.getName(), "utf-8") + "\""); } // Prepare streams. BufferedInputStream input = null; From 78c6329e392e785a53c9e89e601e1d6b5fd5c2dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?= =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= Date: Fri, 26 May 2017 00:25:13 +0300 Subject: [PATCH 07/33] encoding in file picker --- src/java/edu/stanford/muse/webapp/JSPHelper.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/webapp/JSPHelper.java b/src/java/edu/stanford/muse/webapp/JSPHelper.java index 0e4fe61..67c6139 100755 --- a/src/java/edu/stanford/muse/webapp/JSPHelper.java +++ b/src/java/edu/stanford/muse/webapp/JSPHelper.java @@ -1669,7 +1669,8 @@ public static void writeFileToResponse(HttpSession session, HttpServletResponse if (asAttachment) { response.setHeader("Content-Length", String.valueOf(file.length())); - response.setHeader("Content-Disposition", "attachment; filename=\"" + URLEncoder.encode(file.getName(), "utf-8") + "\""); + String fileName = URLEncoder.encode(file.getName(), "utf-8").replace("+", "%20"); + response.setHeader("Content-Disposition", "attachment; filename=\"" + fileName + "\""); } // Prepare streams. BufferedInputStream input = null; From 15277480d653ae2ca6de208e68ea57da1c2a6e3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A0=D0=BE=D1=81=D1=82=D0=BE=D0=B2=20=D0=90=D0=BB=D0=B5?= =?UTF-8?q?=D0=BA=D1=81=D0=B0=D0=BD=D0=B4=D1=80?= Date: Fri, 26 May 2017 00:25:56 +0300 Subject: [PATCH 08/33] replace plus to spase on downloaded filename --- src/java/edu/stanford/muse/util/Util.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/util/Util.java b/src/java/edu/stanford/muse/util/Util.java index c5419df..5c0a8b4 100755 --- a/src/java/edu/stanford/muse/util/Util.java +++ b/src/java/edu/stanford/muse/util/Util.java @@ -2413,7 +2413,7 @@ public static void test_tail() * returns null if the input is null. */ public static String URLtail(String url) { - return URLEncode(tail(url, "/")); + return URLEncode(tail(url, "/")); } /** From 1999d666ef7087370f893ca3e1b9aeb5c15f728c Mon Sep 17 00:00:00 2001 From: Gleb Suvorov Date: Fri, 26 May 2017 00:33:26 +0300 Subject: [PATCH 09/33] gradual memory consumption fix for standalone.jar --- src/java/edu/stanford/muse/email/EmailFetcherThread.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index bb3192b..ec4430a 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -1282,7 +1282,13 @@ public void run() { // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages() // so we process in batches //TODO: Ideally, should cap on buffer size rather than on number of messages. - final int BATCH = 100; //it ate too much memory more than 12gb with 10000 + int nMessagesperbathc = 10000; + long maxMemory = Runtime.getRuntime().maxMemory(); + if (maxMemory <= 4294967296L ) { nMessagesperbathc = 100; } + else { + if (maxMemory<= 8294967296L) { nMessagesperbathc = 1000; } + } + final int BATCH = nMessagesperbathc; //gradual decrease of batch size due to memory size int nbatches = nMessages / BATCH; nMessagesProcessedSuccess = 0; long st = System.currentTimeMillis(); From aed6ec3e998b23ee54544b56acac709f2ee0d5b0 Mon Sep 17 00:00:00 2001 From: arostov Date: Sun, 4 Jun 2017 22:02:58 +0300 Subject: [PATCH 10/33] save json on fs --- .../muse/email/EmailFetcherThread.java | 2 + .../muse/email/json/ArchiveSaver.java | 125 ++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 src/java/edu/stanford/muse/email/json/ArchiveSaver.java diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index ec4430a..db983f8 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -17,6 +17,7 @@ import com.sun.mail.imap.IMAPFolder; import edu.stanford.muse.datacache.Blob; +import edu.stanford.muse.email.json.ArchiveSaver; import edu.stanford.muse.index.*; import edu.stanford.muse.util.EmailUtils; import edu.stanford.muse.util.JSONUtils; @@ -1365,6 +1366,7 @@ public void run() { } log.info("Read #" + nMessages + " messages in in " + (System.currentTimeMillis() - st) + "ms"); } + new ArchiveSaver().save(archive); } catch (Throwable t) { if (t instanceof OutOfMemoryError) this.mayHaveRunOutOfMemory = true; diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java new file mode 100644 index 0000000..275b39f --- /dev/null +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -0,0 +1,125 @@ +package edu.stanford.muse.email.json; + +import edu.stanford.muse.email.AddressBook; +import edu.stanford.muse.index.Archive; +import edu.stanford.muse.index.Document; +import edu.stanford.muse.index.EmailDocument; +import edu.stanford.muse.webapp.JSPHelper; +import org.codehaus.plexus.util.StringOutputStream; + +import javax.mail.Address; +import javax.mail.internet.InternetAddress; +import java.io.*; +import java.util.List; + +/** + * Created by sunchise on 04.06.17. + */ +public class ArchiveSaver { + + public void save(Archive archive) { + String fileName = System.getProperty("user.home") + File.separator + "archive.json"; + File file = new File(fileName); + if (file.exists()) { + file.delete(); + } + try { + file.createNewFile(); + } catch (IOException e) { + throw new RuntimeException(e.getMessage(), e); + } + BufferedWriter stream; + try { + stream = new BufferedWriter(new FileWriter(file)); + } catch (IOException e) { + throw new RuntimeException(e.getMessage(), e); + } + List allDocs = archive.getAllDocs(); + int i = 1; + try { + stream.append("["); + for (Document doc : allDocs) { + if (i > 1) { + stream.append(","); + } + stream.append("{"); + EmailDocument emailDocument = (EmailDocument) doc; + stream.append("\"emailId\": " + i++ + ","); + stream.append("\"dateField\": \"" + emailDocument.getDate().getTime() + "\","); + stream.append("\"isSent\": " + true + ","); + stream.append("\"toField\": ["); + if (emailDocument.to != null) { + boolean first = true; + for (Address address : emailDocument.to) { + if (!first) { + stream.append(","); + } + InternetAddress internetAddress = (InternetAddress) address; + stream.append("["); + stream.append(getAddressString(internetAddress)); + stream.append("]"); + first = false; + } + } + stream.append("],"); + stream.append("\"ccField\": ["); + if (emailDocument.cc != null && emailDocument.cc.length != 0) { + boolean first = true; + for (Address address : emailDocument.cc) { + if (!first) { + stream.append(","); + } + InternetAddress internetAddress = (InternetAddress) address; + stream.append("["); + stream.append(getAddressString(internetAddress)); + stream.append("]"); + first = false; + } + } else { + stream.append("["); + stream.append("\"ccPlaceholder\",\"ccPlaceholder\""); + stream.append("]"); + } + stream.append("],"); + + stream.append("\"fromField\": ["); + if (emailDocument.from != null && emailDocument.from.length > 0) { + boolean first = true; + for (Address address : emailDocument.from) { + if (!first) { + stream.append(","); + } + InternetAddress internetAddress = (InternetAddress) address; + stream.append("["); + stream.append(getAddressString(internetAddress)); + stream.append("], "); + stream.append("\"" + internetAddress.getAddress() + "\""); + first = false; + } + } else { + stream.append("["); + stream.append("\"fromPlaceholder\",\"fromPlaceholder\""); + stream.append("], "); + stream.append("\"fromPlaceholder\""); + } + stream.append("],"); + stream.append("\"subject\": \"" + String.valueOf(emailDocument.getSubject()).trim().replaceAll("\"", "''").replaceAll("\n", " ") + "\""); + stream.append("}"); + } + stream.append("]"); + stream.flush(); + } catch (IOException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + + + private String getAddressString(InternetAddress internetAddress) { + return "\"" + + (internetAddress.getPersonal() == null + ? internetAddress.getAddress() + : internetAddress.getPersonal().replaceAll("\"", "''")) + + "\", \"" + internetAddress.getAddress() + "\""; + } +} From 4917c8b3ec2eba8331b983be6aee6f65b94ca43b Mon Sep 17 00:00:00 2001 From: arostov Date: Mon, 5 Jun 2017 00:02:24 +0300 Subject: [PATCH 11/33] save json on fs by servlet --- .../stanford/muse/email/json/EmailInfo.java | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 src/java/edu/stanford/muse/email/json/EmailInfo.java diff --git a/src/java/edu/stanford/muse/email/json/EmailInfo.java b/src/java/edu/stanford/muse/email/json/EmailInfo.java new file mode 100644 index 0000000..5af3927 --- /dev/null +++ b/src/java/edu/stanford/muse/email/json/EmailInfo.java @@ -0,0 +1,56 @@ +package edu.stanford.muse.email.json; + +import java.io.Serializable; + +/** + * Created by sunchise on 03.06.17. + */ +public class EmailInfo implements Serializable { + + private final int emailId; + private final String dateField; + private final boolean isSent; + private final String[][] toField; + private final String[][] ccField; + private final Object[] fromField; + private final String subject; + + + public EmailInfo(int emailId, String dateField, boolean isSent, String[][] toField, String[][] ccField, Object[] fromField, String subject) { + this.emailId = emailId; + this.dateField = dateField; + this.isSent = isSent; + this.toField = toField; + this.ccField = ccField; + this.fromField = fromField; + this.subject = subject; + } + + public int getEmailId() { + return emailId; + } + + public String getDateField() { + return dateField; + } + + public boolean isSent() { + return isSent; + } + + public String[][] getToField() { + return toField; + } + + public String[][] getCcField() { + return ccField; + } + + public Object[] getFromField() { + return fromField; + } + + public String getSubject() { + return subject; + } +} From 427adf56cda3fa632b71070f9a87d5227e879c21 Mon Sep 17 00:00:00 2001 From: arostov Date: Mon, 5 Jun 2017 20:52:09 +0300 Subject: [PATCH 12/33] save json filter special chars --- .../muse/email/json/ArchiveSaver.java | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index 275b39f..615251c 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -28,85 +28,79 @@ public void save(Archive archive) { } catch (IOException e) { throw new RuntimeException(e.getMessage(), e); } - BufferedWriter stream; - try { - stream = new BufferedWriter(new FileWriter(file)); - } catch (IOException e) { - throw new RuntimeException(e.getMessage(), e); - } List allDocs = archive.getAllDocs(); int i = 1; - try { - stream.append("["); + try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) { + append(stream, "["); for (Document doc : allDocs) { if (i > 1) { - stream.append(","); + append(stream, ","); } - stream.append("{"); + append(stream, "{"); EmailDocument emailDocument = (EmailDocument) doc; - stream.append("\"emailId\": " + i++ + ","); - stream.append("\"dateField\": \"" + emailDocument.getDate().getTime() + "\","); - stream.append("\"isSent\": " + true + ","); - stream.append("\"toField\": ["); + append(stream, "\"emailId\": " + i++ + ","); + append(stream, "\"dateField\": \"" + emailDocument.getDate().getTime() + "\","); + append(stream, "\"isSent\": " + true + ","); + append(stream, "\"toField\": ["); if (emailDocument.to != null) { boolean first = true; for (Address address : emailDocument.to) { if (!first) { - stream.append(","); + append(stream, ","); } InternetAddress internetAddress = (InternetAddress) address; - stream.append("["); - stream.append(getAddressString(internetAddress)); - stream.append("]"); + append(stream, "["); + append(stream, getAddressString(internetAddress)); + append(stream, "]"); first = false; } } - stream.append("],"); - stream.append("\"ccField\": ["); + append(stream, "],"); + append(stream, "\"ccField\": ["); if (emailDocument.cc != null && emailDocument.cc.length != 0) { boolean first = true; for (Address address : emailDocument.cc) { if (!first) { - stream.append(","); + append(stream, ","); } InternetAddress internetAddress = (InternetAddress) address; - stream.append("["); - stream.append(getAddressString(internetAddress)); - stream.append("]"); + append(stream, "["); + append(stream, getAddressString(internetAddress)); + append(stream, "]"); first = false; } } else { - stream.append("["); - stream.append("\"ccPlaceholder\",\"ccPlaceholder\""); - stream.append("]"); + append(stream, "["); + append(stream, "\"ccPlaceholder\",\"ccPlaceholder\""); + append(stream, "]"); } - stream.append("],"); + append(stream, "],"); - stream.append("\"fromField\": ["); + append(stream, "\"fromField\": ["); if (emailDocument.from != null && emailDocument.from.length > 0) { boolean first = true; for (Address address : emailDocument.from) { if (!first) { - stream.append(","); + append(stream, ","); } InternetAddress internetAddress = (InternetAddress) address; - stream.append("["); - stream.append(getAddressString(internetAddress)); - stream.append("], "); - stream.append("\"" + internetAddress.getAddress() + "\""); + append(stream, "["); + append(stream, getAddressString(internetAddress)); + append(stream, "], "); + append(stream, "\"" + internetAddress.getAddress() + "\""); first = false; } } else { - stream.append("["); - stream.append("\"fromPlaceholder\",\"fromPlaceholder\""); - stream.append("], "); - stream.append("\"fromPlaceholder\""); + append(stream, "["); + append(stream, "\"fromPlaceholder\",\"fromPlaceholder\""); + append(stream, "], "); + append(stream, "\"fromPlaceholder\""); } - stream.append("],"); - stream.append("\"subject\": \"" + String.valueOf(emailDocument.getSubject()).trim().replaceAll("\"", "''").replaceAll("\n", " ") + "\""); - stream.append("}"); + append(stream, "],"); + append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).trim().replaceAll("\"", "''").replaceAll("\n", " ") + "\""); + append(stream, "}"); } - stream.append("]"); + append(stream, "]"); stream.flush(); } catch (IOException e) { throw new RuntimeException(e.getMessage(), e); @@ -114,6 +108,12 @@ public void save(Archive archive) { } + private void append(Writer stream, String string) throws IOException { + string = string.replaceAll("\"", "''"); + string = string.replaceAll("\n", " "); + string = string.trim(); + stream.append(string); + } private String getAddressString(InternetAddress internetAddress) { return "\"" From aed3a79e278d8614fc3cb61a38b1b24d03c7e0d5 Mon Sep 17 00:00:00 2001 From: arostov Date: Mon, 5 Jun 2017 21:19:19 +0300 Subject: [PATCH 13/33] save json filter special chars --- .../stanford/muse/email/json/ArchiveSaver.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index 615251c..6d86ffc 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -87,7 +87,7 @@ public void save(Archive archive) { append(stream, "["); append(stream, getAddressString(internetAddress)); append(stream, "], "); - append(stream, "\"" + internetAddress.getAddress() + "\""); + append(stream, "\"" + internetAddress.getAddress().replaceAll("\"", "''") + "\""); first = false; } } else { @@ -97,7 +97,7 @@ public void save(Archive archive) { append(stream, "\"fromPlaceholder\""); } append(stream, "],"); - append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).trim().replaceAll("\"", "''").replaceAll("\n", " ") + "\""); + append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "''") + "\""); append(stream, "}"); } append(stream, "]"); @@ -109,8 +109,11 @@ public void save(Archive archive) { private void append(Writer stream, String string) throws IOException { - string = string.replaceAll("\"", "''"); - string = string.replaceAll("\n", " "); + string = string.replaceAll("\\\n", " "); + string = string.replaceAll("\\\r", " "); + string = string.replaceAll(" {2,}", " "); + string = string.replaceAll("\\\" ", "\""); + string = string.replaceAll(" \\\"", "\""); string = string.trim(); stream.append(string); } @@ -118,8 +121,8 @@ private void append(Writer stream, String string) throws IOException { private String getAddressString(InternetAddress internetAddress) { return "\"" + (internetAddress.getPersonal() == null - ? internetAddress.getAddress() + ? internetAddress.getAddress().replaceAll("\"", "''") : internetAddress.getPersonal().replaceAll("\"", "''")) - + "\", \"" + internetAddress.getAddress() + "\""; + + "\", \"" + internetAddress.getAddress().replaceAll("\"", "''") + "\""; } } From a399e6471f0f47f1f7490460d9a082eb4c746b5f Mon Sep 17 00:00:00 2001 From: arostov Date: Mon, 5 Jun 2017 21:42:56 +0300 Subject: [PATCH 14/33] save json filter special chars --- src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index 6d86ffc..be27816 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -110,6 +110,7 @@ public void save(Archive archive) { private void append(Writer stream, String string) throws IOException { string = string.replaceAll("\\\n", " "); + string = string.replaceAll("\\\\", "\\\\"); string = string.replaceAll("\\\r", " "); string = string.replaceAll(" {2,}", " "); string = string.replaceAll("\\\" ", "\""); From 9aecdc2d704949934da9feaa596fd0210416f9c9 Mon Sep 17 00:00:00 2001 From: arostov Date: Mon, 5 Jun 2017 22:02:16 +0300 Subject: [PATCH 15/33] save json filter special chars --- src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index be27816..38d28f3 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -110,7 +110,7 @@ public void save(Archive archive) { private void append(Writer stream, String string) throws IOException { string = string.replaceAll("\\\n", " "); - string = string.replaceAll("\\\\", "\\\\"); + string = string.replaceAll("\\\\", "\\\\\\\\"); string = string.replaceAll("\\\r", " "); string = string.replaceAll(" {2,}", " "); string = string.replaceAll("\\\" ", "\""); From ca8d4e37de0b9f3f3cad70bfe8072b785942f315 Mon Sep 17 00:00:00 2001 From: arostov Date: Mon, 5 Jun 2017 23:01:22 +0300 Subject: [PATCH 16/33] save json filter special chars --- .../edu/stanford/muse/email/json/ArchiveSaver.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index 38d28f3..b2244a1 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -109,12 +109,14 @@ public void save(Archive archive) { private void append(Writer stream, String string) throws IOException { - string = string.replaceAll("\\\n", " "); + string = string.replaceAll("\\s", " "); + string = string.replaceAll("\\n", " "); string = string.replaceAll("\\\\", "\\\\\\\\"); - string = string.replaceAll("\\\r", " "); + string = string.replaceAll("\\r", " "); string = string.replaceAll(" {2,}", " "); - string = string.replaceAll("\\\" ", "\""); - string = string.replaceAll(" \\\"", "\""); + string = string.replaceAll("\" ", "\""); + string = string.replaceAll(" \"", "\""); + string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}]", ""); string = string.trim(); stream.append(string); } From c8ef54585dc98e6bcccec2c577883d2a2e66d7c4 Mon Sep 17 00:00:00 2001 From: arostov Date: Tue, 6 Jun 2017 00:33:57 +0300 Subject: [PATCH 17/33] json format --- .../stanford/muse/email/json/ArchiveSaver.java | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index b2244a1..9533ccc 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -39,7 +39,7 @@ public void save(Archive archive) { append(stream, "{"); EmailDocument emailDocument = (EmailDocument) doc; append(stream, "\"emailId\": " + i++ + ","); - append(stream, "\"dateField\": \"" + emailDocument.getDate().getTime() + "\","); + append(stream, "\"dateField\": \"" + emailDocument.getDate().getTime() / 1000 + "\","); append(stream, "\"isSent\": " + true + ","); append(stream, "\"toField\": ["); if (emailDocument.to != null) { @@ -87,7 +87,7 @@ public void save(Archive archive) { append(stream, "["); append(stream, getAddressString(internetAddress)); append(stream, "], "); - append(stream, "\"" + internetAddress.getAddress().replaceAll("\"", "''") + "\""); + append(stream, "\"" + internetAddress.getAddress().replaceAll("\"", "'") + "\""); first = false; } } else { @@ -97,7 +97,7 @@ public void save(Archive archive) { append(stream, "\"fromPlaceholder\""); } append(stream, "],"); - append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "''") + "\""); + append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "'").replace("Subject: ", "") + "\""); append(stream, "}"); } append(stream, "]"); @@ -122,10 +122,10 @@ private void append(Writer stream, String string) throws IOException { } private String getAddressString(InternetAddress internetAddress) { - return "\"" - + (internetAddress.getPersonal() == null - ? internetAddress.getAddress().replaceAll("\"", "''") - : internetAddress.getPersonal().replaceAll("\"", "''")) - + "\", \"" + internetAddress.getAddress().replaceAll("\"", "''") + "\""; + String personal = (internetAddress.getPersonal() == null + ? internetAddress.getAddress() + : internetAddress.getPersonal()) + .replaceAll("\"", "'"); + return "\"" + personal + "\", \"" + personal + "\""; } } From 429b639e9993a597d3069cdce6c1612fc9d630c5 Mon Sep 17 00:00:00 2001 From: arostov Date: Tue, 6 Jun 2017 01:36:32 +0300 Subject: [PATCH 18/33] json format --- .../edu/stanford/muse/email/json/ArchiveSaver.java | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index 9533ccc..af7444f 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -76,7 +76,7 @@ public void save(Archive archive) { } append(stream, "],"); - append(stream, "\"fromField\": ["); + append(stream, "\"fromField\": "); if (emailDocument.from != null && emailDocument.from.length > 0) { boolean first = true; for (Address address : emailDocument.from) { @@ -86,17 +86,15 @@ public void save(Archive archive) { InternetAddress internetAddress = (InternetAddress) address; append(stream, "["); append(stream, getAddressString(internetAddress)); - append(stream, "], "); - append(stream, "\"" + internetAddress.getAddress().replaceAll("\"", "'") + "\""); + append(stream, "] "); first = false; } } else { append(stream, "["); append(stream, "\"fromPlaceholder\",\"fromPlaceholder\""); - append(stream, "], "); - append(stream, "\"fromPlaceholder\""); + append(stream, "] "); } - append(stream, "],"); + append(stream, ","); append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "'").replace("Subject: ", "") + "\""); append(stream, "}"); } From f0bd327f85500fa2fe1732f0919de7d846969eb8 Mon Sep 17 00:00:00 2001 From: arostov Date: Wed, 7 Jun 2017 00:44:50 +0300 Subject: [PATCH 19/33] megagraph5 --- src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index af7444f..ae018e1 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -114,7 +114,7 @@ private void append(Writer stream, String string) throws IOException { string = string.replaceAll(" {2,}", " "); string = string.replaceAll("\" ", "\""); string = string.replaceAll(" \"", "\""); - string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}]", ""); + string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", ""); string = string.trim(); stream.append(string); } From 3ff6e29149f39ed13a34f8008d27c5efc5023bf9 Mon Sep 17 00:00:00 2001 From: arostov Date: Wed, 7 Jun 2017 00:56:59 +0300 Subject: [PATCH 20/33] fix --- src/java/edu/stanford/muse/index/EmailDocument.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/index/EmailDocument.java b/src/java/edu/stanford/muse/index/EmailDocument.java index 9daa68a..03875e2 100755 --- a/src/java/edu/stanford/muse/index/EmailDocument.java +++ b/src/java/edu/stanford/muse/index/EmailDocument.java @@ -51,7 +51,7 @@ public class EmailDocument extends DatedDocument implements Serializable public String folderName, emailSource; public Set folderNames = new LinkedHashSet<>(), emailSources = new LinkedHashSet<>(); // email can now belong to multiple folders, folderName field also maintained for backward compatibility - public Address[] to, from, cc, bcc; + public Address[] to, from, cc, bcc; public String messageID; public String sentToMailingLists[]; From 54dbae205f1f4b44917181ccd4bc53370812708f Mon Sep 17 00:00:00 2001 From: arostov Date: Thu, 8 Jun 2017 01:17:25 +0300 Subject: [PATCH 21/33] fix --- src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index ae018e1..a6b1291 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -80,14 +80,11 @@ public void save(Archive archive) { if (emailDocument.from != null && emailDocument.from.length > 0) { boolean first = true; for (Address address : emailDocument.from) { - if (!first) { - append(stream, ","); - } InternetAddress internetAddress = (InternetAddress) address; append(stream, "["); append(stream, getAddressString(internetAddress)); append(stream, "] "); - first = false; + break; } } else { append(stream, "["); From 2ca05efa12fd171aefcdc46c92bbfc68e99c3ad2 Mon Sep 17 00:00:00 2001 From: arostov Date: Fri, 9 Jun 2017 00:08:35 +0300 Subject: [PATCH 22/33] refactoring --- .../muse/email/json/ArchiveSaver.java | 95 +++------ .../edu/stanford/muse/email/json/Email.java | 188 ++++++++++++++++++ .../muse/email/json/EmailNameAgregator.java | 66 ++++++ 3 files changed, 281 insertions(+), 68 deletions(-) create mode 100644 src/java/edu/stanford/muse/email/json/Email.java create mode 100644 src/java/edu/stanford/muse/email/json/EmailNameAgregator.java diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index a6b1291..cb96b24 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -1,16 +1,14 @@ package edu.stanford.muse.email.json; -import edu.stanford.muse.email.AddressBook; import edu.stanford.muse.index.Archive; import edu.stanford.muse.index.Document; import edu.stanford.muse.index.EmailDocument; -import edu.stanford.muse.webapp.JSPHelper; -import org.codehaus.plexus.util.StringOutputStream; import javax.mail.Address; import javax.mail.internet.InternetAddress; import java.io.*; import java.util.List; +import java.util.Map; /** * Created by sunchise on 04.06.17. @@ -29,71 +27,47 @@ public void save(Archive archive) { throw new RuntimeException(e.getMessage(), e); } List allDocs = archive.getAllDocs(); + EmailNameAgregator emailNameAgregator = new EmailNameAgregator(allDocs); int i = 1; try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) { append(stream, "["); + boolean fail = false; for (Document doc : allDocs) { - if (i > 1) { + if (i > 1 && !fail) { append(stream, ","); } - append(stream, "{"); - EmailDocument emailDocument = (EmailDocument) doc; - append(stream, "\"emailId\": " + i++ + ","); - append(stream, "\"dateField\": \"" + emailDocument.getDate().getTime() / 1000 + "\","); - append(stream, "\"isSent\": " + true + ","); - append(stream, "\"toField\": ["); - if (emailDocument.to != null) { - boolean first = true; - for (Address address : emailDocument.to) { - if (!first) { - append(stream, ","); - } + fail = false; + final EmailDocument emailDocument = (EmailDocument) doc; + Email email = new Email(i, + emailDocument.date, + true, + emailDocument.getSubject(), + emailDocument.from == null || emailDocument.from.length == 0 ? null : emailNameAgregator.getName(emailDocument.getFromEmailAddress()), + emailDocument.getFromEmailAddress()); + if (emailDocument.cc != null) { + for (Address address : emailDocument.cc) { InternetAddress internetAddress = (InternetAddress) address; - append(stream, "["); - append(stream, getAddressString(internetAddress)); - append(stream, "]"); - first = false; + email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress()); } } - append(stream, "],"); - append(stream, "\"ccField\": ["); - if (emailDocument.cc != null && emailDocument.cc.length != 0) { - boolean first = true; - for (Address address : emailDocument.cc) { - if (!first) { - append(stream, ","); - } + if (emailDocument.bcc != null) { + for (Address address : emailDocument.bcc) { InternetAddress internetAddress = (InternetAddress) address; - append(stream, "["); - append(stream, getAddressString(internetAddress)); - append(stream, "]"); - first = false; + email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress()); } - } else { - append(stream, "["); - append(stream, "\"ccPlaceholder\",\"ccPlaceholder\""); - append(stream, "]"); } - append(stream, "],"); - - append(stream, "\"fromField\": "); - if (emailDocument.from != null && emailDocument.from.length > 0) { - boolean first = true; - for (Address address : emailDocument.from) { + if (emailDocument.to != null) { + for (Address address : emailDocument.to) { InternetAddress internetAddress = (InternetAddress) address; - append(stream, "["); - append(stream, getAddressString(internetAddress)); - append(stream, "] "); - break; + email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress()); } - } else { - append(stream, "["); - append(stream, "\"fromPlaceholder\",\"fromPlaceholder\""); - append(stream, "] "); } - append(stream, ","); - append(stream, "\"subject\": \"" + String.valueOf(emailDocument.getSubject()).replaceAll("\"", "'").replace("Subject: ", "") + "\""); - append(stream, "}"); + if (email.check()) { + append(stream, email.toJson()); + } else { + fail = true; + } + i++; } append(stream, "]"); stream.flush(); @@ -104,23 +78,8 @@ public void save(Archive archive) { private void append(Writer stream, String string) throws IOException { - string = string.replaceAll("\\s", " "); - string = string.replaceAll("\\n", " "); - string = string.replaceAll("\\\\", "\\\\\\\\"); - string = string.replaceAll("\\r", " "); - string = string.replaceAll(" {2,}", " "); - string = string.replaceAll("\" ", "\""); - string = string.replaceAll(" \"", "\""); - string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", ""); string = string.trim(); stream.append(string); } - private String getAddressString(InternetAddress internetAddress) { - String personal = (internetAddress.getPersonal() == null - ? internetAddress.getAddress() - : internetAddress.getPersonal()) - .replaceAll("\"", "'"); - return "\"" + personal + "\", \"" + personal + "\""; - } } diff --git a/src/java/edu/stanford/muse/email/json/Email.java b/src/java/edu/stanford/muse/email/json/Email.java new file mode 100644 index 0000000..7fbd499 --- /dev/null +++ b/src/java/edu/stanford/muse/email/json/Email.java @@ -0,0 +1,188 @@ +package edu.stanford.muse.email.json; + +import org.json.JSONException; +import org.json.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.mail.Address; +import javax.mail.internet.InternetAddress; +import java.io.*; +import java.util.Calendar; +import java.util.Collection; +import java.util.Date; +import java.util.HashSet; +/* +{ + "emailId": 3, + "dateField": "1496222800", + "isSent": true, + "toField": [ + [ + "Александр Игоревич", + "Александр Игоревич" + ] + ], + "ccField": [ + [ + "ccPlaceholder", + "ccPlaceholder" + ] + ], + "fromField": [ + "WWF России", + "WWF России" + ], + "subject": "Барс по имени Крюк" + } + */ + + +public class Email { + private final Logger log = LoggerFactory.getLogger(Email.class); + + private final String id; + + private final Date date; + + private final boolean isSent; + + private final Collection to = new HashSet<>(); + + private final Collection cc = new HashSet<>(); + + private final EmailAddress from; + + private final String subject; + + private String toJson; + + public Email(String id, Date date, boolean isSent, EmailAddress from, String subject) { + this.id = id; + Calendar calendar = Calendar.getInstance(); + calendar.set(Calendar.YEAR, 1999); + calendar.set(Calendar.MONTH, Calendar.SEPTEMBER); + calendar.set(Calendar.DAY_OF_MONTH, 11); + Date minDate = calendar.getTime(); + if (date == null || minDate.compareTo(date) > 0) { + date = minDate; + } + this.date = date; + this.isSent = isSent; + this.from = from; + this.subject = subject; + } + + + public Email(int id, Date date, boolean isSent, String subject, String fromName, String fromEmail) { + this(String.valueOf(id), date, isSent, subject, fromName, fromEmail); + } + + public Email(String id, Date date, boolean isSent, String subject, String fromName, String fromEmail) { + this(id, date, isSent, new EmailAddress(fromName, fromEmail), subject); + } + + public void addTo(EmailAddress emailAddress) { + toJson = null; + to.add(emailAddress); + } + + public void addTo(String name, String email) { + toJson = null; + addTo(new EmailAddress(name, email)); + } + + public void addCc(EmailAddress emailAddress) { + toJson = null; + to.add(emailAddress); + } + + public void addCc(String name, String email) { + toJson = null; + addCc(new EmailAddress(name, email)); + } + + public String toJson() { + if (toJson == null) { + StringBuilder stream = new StringBuilder(); + stream.append("{"); + stream.append("\"emailId\": ").append(id).append(","); + stream.append("\"dateField\": ").append(date.getTime() / 1000).append(","); + stream.append("\"isSent\": ").append(isSent).append(","); + stream.append("\"toField\": ["); + stream.append(to.stream().map(EmailAddress::toJson).reduce((s, s2) -> s + "," + s2).orElse("")); + stream.append("],"); + stream.append("\"ccField\": ["); + if (cc.isEmpty()) { + stream.append(new EmailAddress("ccPlaceholder", "ccPlaceholder").toJson()); + } else { + stream.append(cc.stream().map(EmailAddress::toJson).reduce((s, s2) -> s + "," + s2).orElse("")); + } + stream.append("],"); + stream.append("\"fromField\": "); + if (from == null) { + stream.append(new EmailAddress("fromPlaceholder", "fromPlaceholder").toJson()); + } else { + stream.append(from.toJson()); + } + stream.append(","); + stream.append("\"subject\": \""); + append(stream, String.valueOf(subject).replaceAll("\"", "'").replace("Subject: ", "")).append("\""); + stream.append("}"); + toJson = stream.toString(); + } + return toJson; + } + + public boolean check() { + try { + new JSONObject(toJson()); + } catch (JSONException e) { + log.error("Not right format of json\n\n" + toJson + "\n\n" + e.getMessage()); + return false; + } + return true; + } + + public static class EmailAddress { + private final String name; + private final String email; + + public EmailAddress(String name, String email) { + this.name = name; + this.email = email; + } + + public String getName() { + return name == null ? email : name.replaceAll("\"", "'"); + } + + public String getEmail() { + return email.replaceAll("\"", "'"); + } + + public String toJson() { + StringBuilder stream = new StringBuilder(); + append(stream, "["); + append(stream, "\"" + getName() + "\""); + append(stream, ","); + append(stream, "\"" + getEmail() + "\""); + append(stream, "]"); + return stream.toString(); + } + } + + private static StringBuilder append(StringBuilder stream, String string) { + string = string.replaceAll("\\s", " "); + string = string.replaceAll("\\n", " "); + string = string.replaceAll("\\\\", "\\\\\\\\"); + string = string.replaceAll("\\r", " "); + string = string.replaceAll(" {2,}", " "); + string = string.replaceAll("\" ", "\""); + string = string.replaceAll(" \"", "\""); + string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", ""); + string = string.trim(); + stream.append(string); + return stream; + } +} diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java new file mode 100644 index 0000000..04d9038 --- /dev/null +++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java @@ -0,0 +1,66 @@ +package edu.stanford.muse.email.json; + +import edu.stanford.muse.index.Document; +import edu.stanford.muse.index.EmailDocument; + +import javax.mail.Address; +import javax.mail.internet.InternetAddress; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class EmailNameAgregator { + private List allDocs; + final Map emailNameMap = new HashMap<>(); + + public EmailNameAgregator(List allDocs) { + this.allDocs = allDocs; + init(); + } + + private void init() { + emailNameMap.clear(); + allDocs.forEach(document -> { + EmailDocument emailDocument = (EmailDocument) document; + if (emailDocument.to != null) { + for (Address address : emailDocument.to) { + appendToEmailNameMap(emailNameMap, (InternetAddress) address); + } + } + if (emailDocument.cc != null) { + for (Address address : emailDocument.cc) { + appendToEmailNameMap(emailNameMap, (InternetAddress) address); + } + } + if (emailDocument.bcc != null) { + for (Address address : emailDocument.bcc) { + appendToEmailNameMap(emailNameMap, (InternetAddress) address); + } + } + }); + } + + public String getName(String email) { + return emailNameMap.get(email); + } + + + private void appendToEmailNameMap(Map emailNameMap, InternetAddress internetAddress) { + String email = internetAddress.getAddress(); + String personal = internetAddress.getPersonal(); + String name = emailNameMap.get(email); + if (name != null) { + if (personal != null && name.length() < personal.length()) { + if (personal.contains(" ") || (!name.contains(" "))) { + emailNameMap.put(email, personal); + } else { + if (!name.contains(" ") && personal.contains(" ")) { + emailNameMap.put(email, personal); + } + } + } + } else { + emailNameMap.put(email, personal); + } + } +} From 2b38c147648d590845ca3b9d076ff067b3d46a3e Mon Sep 17 00:00:00 2001 From: arostov Date: Fri, 9 Jun 2017 00:50:06 +0300 Subject: [PATCH 23/33] fix --- .../muse/email/json/EmailNameAgregator.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java index 04d9038..8d0e0e0 100644 --- a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java +++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java @@ -48,15 +48,22 @@ public String getName(String email) { private void appendToEmailNameMap(Map emailNameMap, InternetAddress internetAddress) { String email = internetAddress.getAddress(); String personal = internetAddress.getPersonal(); + if (personal == null) { + return; + } String name = emailNameMap.get(email); if (name != null) { - if (personal != null && name.length() < personal.length()) { + if (name.length() < personal.length()) { if (personal.contains(" ") || (!name.contains(" "))) { emailNameMap.put(email, personal); - } else { - if (!name.contains(" ") && personal.contains(" ")) { - emailNameMap.put(email, personal); - } + } + } else if (!name.contains(" ") && personal.contains(" ")) { + emailNameMap.put(email, personal); + } else if (name.contains(" ") && personal.contains(" ")) { + int nameWordsCount = name.split(" ").length; + int personalWordsCount = personal.split(" ").length; + if (personalWordsCount < 4 && personalWordsCount < nameWordsCount) { + emailNameMap.put(email, personal); } } } else { From 4d6437d4e238c49a283691fc1d689bb4c77557cf Mon Sep 17 00:00:00 2001 From: arostov Date: Fri, 9 Jun 2017 02:26:39 +0300 Subject: [PATCH 24/33] fix --- src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 4 +++- src/java/edu/stanford/muse/email/json/Email.java | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index cb96b24..0c658f8 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -3,6 +3,7 @@ import edu.stanford.muse.index.Archive; import edu.stanford.muse.index.Document; import edu.stanford.muse.index.EmailDocument; +import edu.stanford.muse.util.Util; import javax.mail.Address; import javax.mail.internet.InternetAddress; @@ -38,7 +39,8 @@ public void save(Archive archive) { } fail = false; final EmailDocument emailDocument = (EmailDocument) doc; - Email email = new Email(i, + String messageID = Util.hash (emailDocument.getSignature()); + Email email = new Email(messageID, emailDocument.date, true, emailDocument.getSubject(), diff --git a/src/java/edu/stanford/muse/email/json/Email.java b/src/java/edu/stanford/muse/email/json/Email.java index 7fbd499..414845c 100644 --- a/src/java/edu/stanford/muse/email/json/Email.java +++ b/src/java/edu/stanford/muse/email/json/Email.java @@ -106,7 +106,7 @@ public String toJson() { if (toJson == null) { StringBuilder stream = new StringBuilder(); stream.append("{"); - stream.append("\"emailId\": ").append(id).append(","); + stream.append("\"emailId\": \"").append(id).append("\","); stream.append("\"dateField\": ").append(date.getTime() / 1000).append(","); stream.append("\"isSent\": ").append(isSent).append(","); stream.append("\"toField\": ["); @@ -127,7 +127,11 @@ public String toJson() { } stream.append(","); stream.append("\"subject\": \""); - append(stream, String.valueOf(subject).replaceAll("\"", "'").replace("Subject: ", "")).append("\""); + String formatedSubject = subject == null ? "Without subject" : String.valueOf(subject).replaceAll("\"", "'").replace("Subject: ", "").trim(); + if ("null".equals(formatedSubject)) { + formatedSubject = "Without subject"; + } + append(stream, formatedSubject).append("\""); stream.append("}"); toJson = stream.toString(); } From 0b407fa01dcebfdb66ea9c692dc14cf87609c9dc Mon Sep 17 00:00:00 2001 From: arostov Date: Sun, 11 Jun 2017 01:07:44 +0300 Subject: [PATCH 25/33] fix --- .../muse/email/json/ArchiveSaver.java | 17 ++++++- .../muse/email/json/EmailNameAgregator.java | 49 +++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index 0c658f8..c3f29c1 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -8,6 +8,7 @@ import javax.mail.Address; import javax.mail.internet.InternetAddress; import java.io.*; +import java.util.Base64; import java.util.List; import java.util.Map; @@ -16,8 +17,20 @@ */ public class ArchiveSaver { + private final String archiveName; + + public ArchiveSaver(String archiveName) { + this.archiveName = archiveName; + } + public void save(Archive archive) { - String fileName = System.getProperty("user.home") + File.separator + "archive.json"; + String folderName = new String(Base64.getEncoder().encode(archiveName.getBytes())); + String folderPath = System.getProperty("user.home") + File.separator + folderName; + File folder = new File(folderPath); + if (!folder.exists()) { + folder.mkdir(); + } + String fileName = folderPath + File.separator + "archive.json"; File file = new File(fileName); if (file.exists()) { file.delete(); @@ -29,6 +42,7 @@ public void save(Archive archive) { } List allDocs = archive.getAllDocs(); EmailNameAgregator emailNameAgregator = new EmailNameAgregator(allDocs); + emailNameAgregator.save(folderPath + File.separator + "email-names.json"); int i = 1; try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) { append(stream, "["); @@ -73,6 +87,7 @@ public void save(Archive archive) { } append(stream, "]"); stream.flush(); + stream.close(); } catch (IOException e) { throw new RuntimeException(e.getMessage(), e); } diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java index 8d0e0e0..16cf9d7 100644 --- a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java +++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java @@ -1,10 +1,14 @@ package edu.stanford.muse.email.json; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; import edu.stanford.muse.index.Document; import edu.stanford.muse.index.EmailDocument; +import org.json.JSONObject; import javax.mail.Address; import javax.mail.internet.InternetAddress; +import java.io.*; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -18,6 +22,21 @@ public EmailNameAgregator(List allDocs) { init(); } + + public EmailNameAgregator(List allDocs, String fileName) { + this.allDocs = allDocs; + if (fileName == null) { + init(); + } else { + File file = new File(fileName); + if (file.exists()) { + load(fileName); + } else { + init(); + } + } + } + private void init() { emailNameMap.clear(); allDocs.forEach(document -> { @@ -70,4 +89,34 @@ private void appendToEmailNameMap(Map emailNameMap, InternetAddr emailNameMap.put(email, personal); } } + + public void save(String fileName) { + File file = new File(fileName); + if (file.exists()) { + file.delete(); + } + JSONObject json = new JSONObject(emailNameMap); + try (Writer writer = new FileWriter(file)) { + json.write(writer); + writer.close(); + } catch (IOException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + public void load(String fileName) { + File file = new File(fileName); + if (!file.exists()) { + file.delete(); + } + try (FileReader fileReader = new FileReader(file)) { + Map tempMap = new Gson().fromJson(fileReader, new TypeToken>() {}.getType()); + if (tempMap != null) { + emailNameMap.putAll(tempMap); + } + fileReader.close(); + } catch (IOException e) { + throw new RuntimeException(e.getMessage(), e); + } + } } From 384e97e784d57f9334b3adef39ac78ddda11e1c5 Mon Sep 17 00:00:00 2001 From: arostov Date: Tue, 13 Jun 2017 23:08:52 +0300 Subject: [PATCH 26/33] fix --- src/java/edu/stanford/muse/email/EmailFetcherThread.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index db983f8..63a520a 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -1366,7 +1366,7 @@ public void run() { } log.info("Read #" + nMessages + " messages in in " + (System.currentTimeMillis() - st) + "ms"); } - new ArchiveSaver().save(archive); + new ArchiveSaver(archive.archiveTitle).save(archive); } catch (Throwable t) { if (t instanceof OutOfMemoryError) this.mayHaveRunOutOfMemory = true; From c704c374300c80c64e6f13229d6e780a66b8084f Mon Sep 17 00:00:00 2001 From: arostov Date: Thu, 15 Jun 2017 01:30:46 +0300 Subject: [PATCH 27/33] fix --- pom-common.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom-common.xml b/pom-common.xml index 3e39c2a..63c659f 100755 --- a/pom-common.xml +++ b/pom-common.xml @@ -164,17 +164,17 @@ org.apache.tika tika-parsers - 1.14 + 1.15 org.apache.tika tika-core - 1.14 + 1.15 org.apache.pdfbox pdfbox - 1.8.1 + 2.0.6 From f2c91f00f1820b23c33f3ef0a251c84a42ad30aa Mon Sep 17 00:00:00 2001 From: arostov Date: Thu, 15 Jun 2017 01:32:57 +0300 Subject: [PATCH 28/33] fix --- src/java/edu/stanford/muse/datacache/Blob.java | 14 +++++++++++--- .../edu/stanford/muse/datacache/BlobStore.java | 2 +- .../stanford/muse/email/EmailFetcherThread.java | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/java/edu/stanford/muse/datacache/Blob.java b/src/java/edu/stanford/muse/datacache/Blob.java index 19150d0..e952541 100755 --- a/src/java/edu/stanford/muse/datacache/Blob.java +++ b/src/java/edu/stanford/muse/datacache/Blob.java @@ -19,12 +19,14 @@ import edu.stanford.muse.util.Util; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; import java.io.IOException; import java.io.InputStream; @@ -132,9 +134,15 @@ public Pair getContent(BlobStore store) try { // skip mp3 files, tika has trouble with it and hangs if (!Util.nullOrEmpty(this.filename) && !this.filename.toLowerCase().endsWith(".mp3")) - parser.parse(stream, handler, metadata, context); - - String[] names = metadata.names(); + try { + parser.parse(stream, handler, metadata, context); + } catch (Exception e) { + log.error(e.getMessage(), e); + log.error(filename); + throw new RuntimeException(e.getMessage(), e); + } + + String[] names = metadata.names(); //Arrays.sort(names); for (String name : names) { // some metadata tags are problematic and result in large hex strings... ignore them. (caused memory problems with Henry's archive) diff --git a/src/java/edu/stanford/muse/datacache/BlobStore.java b/src/java/edu/stanford/muse/datacache/BlobStore.java index 960d22b..44c8daf 100755 --- a/src/java/edu/stanford/muse/datacache/BlobStore.java +++ b/src/java/edu/stanford/muse/datacache/BlobStore.java @@ -468,7 +468,7 @@ public void generate_thumbnail(Blob b) throws IOException { tnFilename = tmp_filename.substring(0, tmp_filename.length() - ".pdf".length()); // strip the ".pdf" tnFilename += "1.png"; String[] args = new String[]{"-imageType", "png", "-startPage", "1", "-endPage", "1", tmp_filename}; - org.apache.pdfbox.PDFToImage.main(args); + org.apache.pdfbox.tools.PDFToImage.main(args); log.info("Saving PDF thumbnail to " + tnFilename); filename = filename + ".png"; // make sure the suffix for the thumbnail is named with a .png suffix in the cache } catch (Throwable e) { diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index 63a520a..8c18a09 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -504,6 +504,7 @@ private List processMessagePart(int messageNum, Message m, Part p, List< // rfc822 mime type is for embedded mbox format or some such (appears for things like // forwarded messages). the content appears to be just a multipart. Object o = p.getContent(); + System.setProperty("mail.mime.multipart.allowempty", "true"); if (o instanceof Multipart) { Multipart allParts = (Multipart) o; if (p.isMimeType("multipart/alternative")) { From 31ff59977622fb9222d344dbf29704f28f9d3242 Mon Sep 17 00:00:00 2001 From: arostov Date: Sun, 18 Jun 2017 23:40:50 +0300 Subject: [PATCH 29/33] do not forget emails --- src/java/edu/stanford/muse/email/EmailFetcherThread.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index 8c18a09..52168ea 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -1297,7 +1297,9 @@ public void run() { int b; for (b = 0; b < nbatches + 1; b++) { begin_msg_index = b * BATCH + 1; - end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1; + end_msg_index = Math.min((b + 1) * BATCH, nMessages); + log.info("begin_msg_index: " + begin_msg_index); + log.info("end_msg_index: " + end_msg_index); log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages); Message[] messages = openFolderAndGetMessages(); currentStatus = JSONUtils.getStatusJSON(""); From bcbb7cbe4319ea4c2b7e20d5e721b5fcc8604b1e Mon Sep 17 00:00:00 2001 From: arostov Date: Wed, 21 Jun 2017 22:07:38 +0300 Subject: [PATCH 30/33] save archive data in special folder --- src/java/edu/stanford/muse/email/json/ArchiveSaver.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java index c3f29c1..9fd3abf 100644 --- a/src/java/edu/stanford/muse/email/json/ArchiveSaver.java +++ b/src/java/edu/stanford/muse/email/json/ArchiveSaver.java @@ -25,11 +25,16 @@ public ArchiveSaver(String archiveName) { public void save(Archive archive) { String folderName = new String(Base64.getEncoder().encode(archiveName.getBytes())); - String folderPath = System.getProperty("user.home") + File.separator + folderName; + String folderPath = System.getProperty("user.home") + File.separator + "epadd-data"; File folder = new File(folderPath); if (!folder.exists()) { folder.mkdir(); } + folderPath += File.separator + folderName; + folder = new File(folderPath); + if (!folder.exists()) { + folder.mkdir(); + } String fileName = folderPath + File.separator + "archive.json"; File file = new File(fileName); if (file.exists()) { From bfb5eb711c5ba0738b95faf85f20fb732e07fe19 Mon Sep 17 00:00:00 2001 From: arostov Date: Thu, 22 Jun 2017 22:26:23 +0300 Subject: [PATCH 31/33] eliminate wild chars --- .../muse/email/json/EmailNameAgregator.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java index 16cf9d7..2006563 100644 --- a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java +++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java @@ -70,6 +70,7 @@ private void appendToEmailNameMap(Map emailNameMap, InternetAddr if (personal == null) { return; } + personal = removeWildChars(personal); String name = emailNameMap.get(email); if (name != null) { if (name.length() < personal.length()) { @@ -90,6 +91,18 @@ private void appendToEmailNameMap(Map emailNameMap, InternetAddr } } + private String removeWildChars(String string) { + string = string.replaceAll("\\s", " "); + string = string.replaceAll("\\n", " "); + string = string.replaceAll("\\\\", "\\\\\\\\"); + string = string.replaceAll("\\r", " "); + string = string.replaceAll(" {2,}", " "); + string = string.replaceAll("\" ", "\""); + string = string.replaceAll(" \"", "\""); + string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", ""); + return string.trim(); + } + public void save(String fileName) { File file = new File(fileName); if (file.exists()) { From 4d113338c6110d2e57cbffe27731f04ba3311017 Mon Sep 17 00:00:00 2001 From: arostov Date: Sun, 2 Jul 2017 21:38:38 +0300 Subject: [PATCH 32/33] introduced fix --- src/java/edu/stanford/muse/email/json/EmailNameAgregator.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java index 2006563..1090ec9 100644 --- a/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java +++ b/src/java/edu/stanford/muse/email/json/EmailNameAgregator.java @@ -100,6 +100,9 @@ private String removeWildChars(String string) { string = string.replaceAll("\" ", "\""); string = string.replaceAll(" \"", "\""); string = string.replaceAll("[^\\w\\d\\sёЁА-Яа-я.,:\\\\\\[\\]|'\";()*?!#$%{}@+\\-]", ""); + if (string.endsWith("'")) { + string = string.substring(0, string.length() - 1); + } return string.trim(); } From 45fde7a551194dcda2dfd4008318fd712b0479c7 Mon Sep 17 00:00:00 2001 From: arostov Date: Sun, 2 Jul 2017 22:29:01 +0300 Subject: [PATCH 33/33] error catching --- .../edu/stanford/muse/email/EmailFetcherThread.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/java/edu/stanford/muse/email/EmailFetcherThread.java b/src/java/edu/stanford/muse/email/EmailFetcherThread.java index 52168ea..e89a7ef 100755 --- a/src/java/edu/stanford/muse/email/EmailFetcherThread.java +++ b/src/java/edu/stanford/muse/email/EmailFetcherThread.java @@ -1101,7 +1101,17 @@ private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset } if (contents == null) - contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList); + try { + contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList); + } catch (Exception e) { + log.error(e.getMessage(), e); + try { + log.error("MessageId: " + originalMessage.getMessageID()); + } catch (MessagingException e1) { + log.error(e.getMessage(), e); + } + throw e; + } // if mm is not prefetched, it is the same as original_mm // will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version