-
Notifications
You must be signed in to change notification settings - Fork 4
cyrillic symbols fix for search, emails and attachment #10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
a477d8d
dcb6f6f
19caa83
c293437
d923ded
fa61cf7
182ca3f
78c6329
1527748
1999d66
c9aae5b
aed6ec3
4917c8b
427adf5
aed3a79
a399e64
9aecdc2
ca8d4e3
c8ef545
429b639
f0bd327
3ff6e29
54dbae2
2ca05ef
2b38c14
4d6437d
0b407fa
384e97e
c704c37
f2c91f0
31ff599
bcbb7cb
bfb5eb7
4d11333
45fde7a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,13 +17,15 @@ | |
|
|
||
| import com.sun.mail.imap.IMAPFolder; | ||
| import edu.stanford.muse.datacache.Blob; | ||
| import edu.stanford.muse.email.json.ArchiveSaver; | ||
| import edu.stanford.muse.index.*; | ||
| import edu.stanford.muse.util.EmailUtils; | ||
| import edu.stanford.muse.util.JSONUtils; | ||
| import edu.stanford.muse.util.Util; | ||
| import edu.stanford.muse.webapp.HTMLUtils; | ||
| import org.apache.commons.logging.Log; | ||
| import org.apache.commons.logging.LogFactory; | ||
| import org.apache.james.mime4j.codec.DecoderUtil; | ||
| import org.json.JSONArray; | ||
| import org.json.JSONException; | ||
| import org.json.JSONObject; | ||
|
|
@@ -472,12 +474,18 @@ private List<String> processMessagePart(int messageNum, Message m, Part p, List< | |
| String content; | ||
| String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8"); | ||
| try { | ||
| // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us | ||
| if (FORCED_ENCODING != null) { | ||
| if (type.contains("charset=")) { | ||
| byte b[] = Util.getBytesFromStream(p.getInputStream()); | ||
| content = new String(b, FORCED_ENCODING); | ||
| } else | ||
| content = (String) p.getContent(); | ||
| content = new String(b, type.substring(type.indexOf("charset=") + "charset=".length())); | ||
| } else { | ||
| // if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us | ||
| if (FORCED_ENCODING != null) { | ||
| byte b[] = Util.getBytesFromStream(p.getInputStream()); | ||
| content = new String(b, FORCED_ENCODING); | ||
| } else { | ||
| content = (String) p.getContent(); | ||
| } | ||
| } | ||
| } catch (UnsupportedEncodingException uee) { | ||
| dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion"); | ||
| // a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers. | ||
|
|
@@ -496,6 +504,7 @@ private List<String> processMessagePart(int messageNum, Message m, Part p, List< | |
| // rfc822 mime type is for embedded mbox format or some such (appears for things like | ||
| // forwarded messages). the content appears to be just a multipart. | ||
| Object o = p.getContent(); | ||
| System.setProperty("mail.mime.multipart.allowempty", "true"); | ||
| if (o instanceof Multipart) { | ||
| Multipart allParts = (Multipart) o; | ||
| if (p.isMimeType("multipart/alternative")) { | ||
|
|
@@ -582,6 +591,9 @@ private void handleAttachments(int idx, Message m, Part p, List<String> textList | |
| String filename = null; | ||
| try { | ||
| filename = p.getFileName(); | ||
| if (filename != null) { | ||
| filename = DecoderUtil.decodeEncodedWords(filename, null); | ||
| } | ||
| } catch (Exception e) { | ||
| // seen this happen with: | ||
| // Folders__gmail-sent Message #12185 Expected ';', got "Message" | ||
|
|
@@ -1089,7 +1101,17 @@ private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset | |
| } | ||
|
|
||
| if (contents == null) | ||
| contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList); | ||
| try { | ||
| contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList); | ||
| } catch (Exception e) { | ||
| log.error(e.getMessage(), e); | ||
| try { | ||
| log.error("MessageId: " + originalMessage.getMessageID()); | ||
| } catch (MessagingException e1) { | ||
| log.error(e.getMessage(), e); | ||
| } | ||
| throw e; | ||
| } | ||
|
|
||
| // if mm is not prefetched, it is the same as original_mm | ||
| // will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version | ||
|
|
@@ -1272,14 +1294,22 @@ public void run() { | |
| // this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages() | ||
| // so we process in batches | ||
| //TODO: Ideally, should cap on buffer size rather than on number of messages. | ||
| final int BATCH = 10000; | ||
| int nMessagesperbathc = 10000; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the idea of fixing batch size based on 4GB/8GB limits. Minor typo in var name: bathc -> batch.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I tested this on different computers roughly 10+ times, and it seems OK. |
||
| long maxMemory = Runtime.getRuntime().maxMemory(); | ||
| if (maxMemory <= 4294967296L ) { nMessagesperbathc = 100; } | ||
| else { | ||
| if (maxMemory<= 8294967296L) { nMessagesperbathc = 1000; } | ||
| } | ||
| final int BATCH = nMessagesperbathc; //gradual decrease of batch size due to memory size | ||
| int nbatches = nMessages / BATCH; | ||
| nMessagesProcessedSuccess = 0; | ||
| long st = System.currentTimeMillis(); | ||
| int b; | ||
| for (b = 0; b < nbatches + 1; b++) { | ||
| begin_msg_index = b * BATCH + 1; | ||
| end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1; | ||
| end_msg_index = Math.min((b + 1) * BATCH, nMessages); | ||
| log.info("begin_msg_index: " + begin_msg_index); | ||
| log.info("end_msg_index: " + end_msg_index); | ||
| log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages); | ||
| Message[] messages = openFolderAndGetMessages(); | ||
| currentStatus = JSONUtils.getStatusJSON(""); | ||
|
|
@@ -1349,6 +1379,7 @@ public void run() { | |
| } | ||
| log.info("Read #" + nMessages + " messages in in " + (System.currentTimeMillis() - st) + "ms"); | ||
| } | ||
| new ArchiveSaver(archive.archiveTitle).save(archive); | ||
| } catch (Throwable t) { | ||
| if (t instanceof OutOfMemoryError) | ||
| this.mayHaveRunOutOfMemory = true; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,107 @@ | ||
| package edu.stanford.muse.email.json; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gleb, can you pls explain what this email.json package is for? |
||
|
|
||
| import edu.stanford.muse.index.Archive; | ||
| import edu.stanford.muse.index.Document; | ||
| import edu.stanford.muse.index.EmailDocument; | ||
| import edu.stanford.muse.util.Util; | ||
|
|
||
| import javax.mail.Address; | ||
| import javax.mail.internet.InternetAddress; | ||
| import java.io.*; | ||
| import java.util.Base64; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
|
|
||
| /** | ||
| * Created by sunchise on 04.06.17. | ||
| */ | ||
| public class ArchiveSaver { | ||
|
|
||
| private final String archiveName; | ||
|
|
||
| public ArchiveSaver(String archiveName) { | ||
| this.archiveName = archiveName; | ||
| } | ||
|
|
||
| public void save(Archive archive) { | ||
| String folderName = new String(Base64.getEncoder().encode(archiveName.getBytes())); | ||
| String folderPath = System.getProperty("user.home") + File.separator + "epadd-data"; | ||
| File folder = new File(folderPath); | ||
| if (!folder.exists()) { | ||
| folder.mkdir(); | ||
| } | ||
| folderPath += File.separator + folderName; | ||
| folder = new File(folderPath); | ||
| if (!folder.exists()) { | ||
| folder.mkdir(); | ||
| } | ||
| String fileName = folderPath + File.separator + "archive.json"; | ||
| File file = new File(fileName); | ||
| if (file.exists()) { | ||
| file.delete(); | ||
| } | ||
| try { | ||
| file.createNewFile(); | ||
| } catch (IOException e) { | ||
| throw new RuntimeException(e.getMessage(), e); | ||
| } | ||
| List<Document> allDocs = archive.getAllDocs(); | ||
| EmailNameAgregator emailNameAgregator = new EmailNameAgregator(allDocs); | ||
| emailNameAgregator.save(folderPath + File.separator + "email-names.json"); | ||
| int i = 1; | ||
| try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) { | ||
| append(stream, "["); | ||
| boolean fail = false; | ||
| for (Document doc : allDocs) { | ||
| if (i > 1 && !fail) { | ||
| append(stream, ","); | ||
| } | ||
| fail = false; | ||
| final EmailDocument emailDocument = (EmailDocument) doc; | ||
| String messageID = Util.hash (emailDocument.getSignature()); | ||
| Email email = new Email(messageID, | ||
| emailDocument.date, | ||
| true, | ||
| emailDocument.getSubject(), | ||
| emailDocument.from == null || emailDocument.from.length == 0 ? null : emailNameAgregator.getName(emailDocument.getFromEmailAddress()), | ||
| emailDocument.getFromEmailAddress()); | ||
| if (emailDocument.cc != null) { | ||
| for (Address address : emailDocument.cc) { | ||
| InternetAddress internetAddress = (InternetAddress) address; | ||
| email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress()); | ||
| } | ||
| } | ||
| if (emailDocument.bcc != null) { | ||
| for (Address address : emailDocument.bcc) { | ||
| InternetAddress internetAddress = (InternetAddress) address; | ||
| email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress()); | ||
| } | ||
| } | ||
| if (emailDocument.to != null) { | ||
| for (Address address : emailDocument.to) { | ||
| InternetAddress internetAddress = (InternetAddress) address; | ||
| email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress()); | ||
| } | ||
| } | ||
| if (email.check()) { | ||
| append(stream, email.toJson()); | ||
| } else { | ||
| fail = true; | ||
| } | ||
| i++; | ||
| } | ||
| append(stream, "]"); | ||
| stream.flush(); | ||
| stream.close(); | ||
| } catch (IOException e) { | ||
| throw new RuntimeException(e.getMessage(), e); | ||
| } | ||
| } | ||
|
|
||
|
|
||
| private void append(Writer stream, String string) throws IOException { | ||
| string = string.trim(); | ||
| stream.append(string); | ||
| } | ||
|
|
||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Gleb, thanks for this fix. One comment -- I'm not sure we can assume that charset will always be the last parameter in the string. I think we should make it robust by tokenizing the contentType string on ";", trimming each token, and then, for the token that starts with "charset=", taking token.substring("charset=".length()) as the charset name.
(See Content-Type field spec: https://tools.ietf.org/html/rfc2045#page-12)