Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
a477d8d
encoding
ARostov May 4, 2017
dcb6f6f
cyrillic encoding fix
GSuvorov May 4, 2017
19caa83
Merge remote-tracking branch 'origin/master'
ARostov May 5, 2017
c293437
encoding in attachments
ARostov May 5, 2017
d923ded
unverified fix for cyrillic in search request
GSuvorov May 5, 2017
fa61cf7
small memory fix
GSuvorov May 23, 2017
182ca3f
encoding in download attachments
ARostov May 25, 2017
78c6329
encoding in file picker
ARostov May 25, 2017
1527748
replace plus to spase on downloaded filename
ARostov May 25, 2017
1999d66
gradual memory consumption fix for standalone.jar
GSuvorov May 25, 2017
c9aae5b
Merge branch 'master' of https://github.com/GSuvorov/muse
GSuvorov May 25, 2017
aed6ec3
save json on fs
ARostov Jun 4, 2017
4917c8b
save json on fs by servlet
ARostov Jun 4, 2017
427adf5
save json filter special chars
ARostov Jun 5, 2017
aed3a79
save json filter special chars
ARostov Jun 5, 2017
a399e64
save json filter special chars
ARostov Jun 5, 2017
9aecdc2
save json filter special chars
ARostov Jun 5, 2017
ca8d4e3
save json filter special chars
ARostov Jun 5, 2017
c8ef545
json format
ARostov Jun 5, 2017
429b639
json format
ARostov Jun 5, 2017
f0bd327
megagraph5
ARostov Jun 6, 2017
3ff6e29
fix
ARostov Jun 6, 2017
54dbae2
fix
ARostov Jun 7, 2017
2ca05ef
refactoring
ARostov Jun 8, 2017
2b38c14
fix
ARostov Jun 8, 2017
4d6437d
fix
ARostov Jun 8, 2017
0b407fa
fix
ARostov Jun 10, 2017
384e97e
fix
ARostov Jun 13, 2017
c704c37
fix
ARostov Jun 14, 2017
f2c91f0
fix
ARostov Jun 14, 2017
31ff599
do not forget emails
ARostov Jun 18, 2017
bcbb7cb
save archive data in special folder
ARostov Jun 21, 2017
bfb5eb7
eliminate wild chars
ARostov Jun 22, 2017
4d11333
introduced fix
ARostov Jul 2, 2017
45fde7a
error catching
ARostov Jul 2, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pom-common.xml
Original file line number Diff line number Diff line change
Expand Up @@ -164,17 +164,17 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.14</version>
<version>1.15</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.14</version>
<version>1.15</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>1.8.1</version>
<version>2.0.6</version>
</dependency>

<dependency>
Expand Down
14 changes: 11 additions & 3 deletions src/java/edu/stanford/muse/datacache/Blob.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@
import edu.stanford.muse.util.Util;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.IOException;
import java.io.InputStream;
Expand Down Expand Up @@ -132,9 +134,15 @@ public Pair<String, String> getContent(BlobStore store)
try {
// skip mp3 files, tika has trouble with it and hangs
if (!Util.nullOrEmpty(this.filename) && !this.filename.toLowerCase().endsWith(".mp3"))
parser.parse(stream, handler, metadata, context);

String[] names = metadata.names();
try {
parser.parse(stream, handler, metadata, context);
} catch (Exception e) {
log.error(e.getMessage(), e);
log.error(filename);
throw new RuntimeException(e.getMessage(), e);
}

String[] names = metadata.names();
//Arrays.sort(names);
for (String name : names) {
// some metadata tags are problematic and result in large hex strings... ignore them. (caused memory problems with Henry's archive)
Expand Down
2 changes: 1 addition & 1 deletion src/java/edu/stanford/muse/datacache/BlobStore.java
Original file line number Diff line number Diff line change
Expand Up @@ -468,7 +468,7 @@ public void generate_thumbnail(Blob b) throws IOException {
tnFilename = tmp_filename.substring(0, tmp_filename.length() - ".pdf".length()); // strip the ".pdf"
tnFilename += "1.png";
String[] args = new String[]{"-imageType", "png", "-startPage", "1", "-endPage", "1", tmp_filename};
org.apache.pdfbox.PDFToImage.main(args);
org.apache.pdfbox.tools.PDFToImage.main(args);
log.info("Saving PDF thumbnail to " + tnFilename);
filename = filename + ".png"; // make sure the suffix for the thumbnail is named with a .png suffix in the cache
} catch (Throwable e) {
Expand Down
47 changes: 39 additions & 8 deletions src/java/edu/stanford/muse/email/EmailFetcherThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@

import com.sun.mail.imap.IMAPFolder;
import edu.stanford.muse.datacache.Blob;
import edu.stanford.muse.email.json.ArchiveSaver;
import edu.stanford.muse.index.*;
import edu.stanford.muse.util.EmailUtils;
import edu.stanford.muse.util.JSONUtils;
import edu.stanford.muse.util.Util;
import edu.stanford.muse.webapp.HTMLUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.james.mime4j.codec.DecoderUtil;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
Expand Down Expand Up @@ -472,12 +474,18 @@ private List<String> processMessagePart(int messageNum, Message m, Part p, List<
String content;
String type = p.getContentType(); // new InputStreamReader(p.getInputStream(), "UTF-8");
try {
// if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us
if (FORCED_ENCODING != null) {
if (type.contains("charset=")) {
byte b[] = Util.getBytesFromStream(p.getInputStream());
content = new String(b, FORCED_ENCODING);
} else
content = (String) p.getContent();
content = new String(b, type.substring(type.indexOf("charset=") + "charset=".length()));
Copy link
Copy Markdown
Contributor

@hangal hangal Jun 14, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gleb, thanks for this fix. One comment -- I'm not sure we can assume charset will always be at the end of the string. I think we should make it robust by tokenizing the contentType string on ";", trimming each token, and, for the token that starts with "charset=", taking token.substring("charset=".length()) as the charset name.

(See Content-Type field spec: https://tools.ietf.org/html/rfc2045#page-12)

} else {
// if forced encoding is set, we read the string with that encoding, otherwise we just use whatever p.getContent gives us
if (FORCED_ENCODING != null) {
byte b[] = Util.getBytesFromStream(p.getInputStream());
content = new String(b, FORCED_ENCODING);
} else {
content = (String) p.getContent();
}
}
} catch (UnsupportedEncodingException uee) {
dataErrors.add("Unsupported encoding: " + folder_name() + " Message #" + messageNum + " type " + type + ", using brute force conversion");
// a particularly nasty issue:javamail can't handle utf-7 encoding which is common with hotmail and exchange servers.
Expand All @@ -496,6 +504,7 @@ private List<String> processMessagePart(int messageNum, Message m, Part p, List<
// rfc822 mime type is for embedded mbox format or some such (appears for things like
// forwarded messages). the content appears to be just a multipart.
Object o = p.getContent();
System.setProperty("mail.mime.multipart.allowempty", "true");
if (o instanceof Multipart) {
Multipart allParts = (Multipart) o;
if (p.isMimeType("multipart/alternative")) {
Expand Down Expand Up @@ -582,6 +591,9 @@ private void handleAttachments(int idx, Message m, Part p, List<String> textList
String filename = null;
try {
filename = p.getFileName();
if (filename != null) {
filename = DecoderUtil.decodeEncodedWords(filename, null);
}
} catch (Exception e) {
// seen this happen with:
// Folders__gmail-sent Message #12185 Expected ';', got "Message"
Expand Down Expand Up @@ -1089,7 +1101,17 @@ private void fetchAndIndexMessages(Folder folder, Message[] messages, int offset
}

if (contents == null)
contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList);
try {
contents = processMessagePart(messageNum, originalMessage, mm, attachmentsList);
} catch (Exception e) {
log.error(e.getMessage(), e);
try {
log.error("MessageId: " + originalMessage.getMessageID());
} catch (MessagingException e1) {
log.error(e.getMessage(), e);
}
throw e;
}

// if mm is not prefetched, it is the same as original_mm
// will also work, but will be slow as javamail accesses and fetches each mm separately, instead of using the bulk prefetched version
Expand Down Expand Up @@ -1272,14 +1294,22 @@ public void run() {
// this is a special for mbox'es because we run out of memory if we try to openFolderAndGetMessages()
// so we process in batches
//TODO: Ideally, should cap on buffer size rather than on number of messages.
final int BATCH = 10000;
int nMessagesperbathc = 10000;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the idea of fixing the batch size based on 4GB/8GB memory limits.
But what should the number be? I think we can easily support 1000 for 4GB and 2000 for 8GB.
Has your experience been otherwise?
There is a small disadvantage to small batch sizes: the archive has to be closed and reopened multiple times.

Minor typo in var name: bathc -> batch.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested this on different computers approximately 10+ times, and it seems OK.

long maxMemory = Runtime.getRuntime().maxMemory();
if (maxMemory <= 4294967296L ) { nMessagesperbathc = 100; }
else {
if (maxMemory<= 8294967296L) { nMessagesperbathc = 1000; }
}
final int BATCH = nMessagesperbathc; //gradual decrease of batch size due to memory size
int nbatches = nMessages / BATCH;
nMessagesProcessedSuccess = 0;
long st = System.currentTimeMillis();
int b;
for (b = 0; b < nbatches + 1; b++) {
begin_msg_index = b * BATCH + 1;
end_msg_index = Math.min((b + 1) * BATCH, nMessages) + 1;
end_msg_index = Math.min((b + 1) * BATCH, nMessages);
log.info("begin_msg_index: " + begin_msg_index);
log.info("end_msg_index: " + end_msg_index);
log.info("Fetching messages in index [" + begin_msg_index + ", " + end_msg_index + "] batch: " + b + "/" + nbatches + "\nTotal Messages: " + nMessages);
Message[] messages = openFolderAndGetMessages();
currentStatus = JSONUtils.getStatusJSON("");
Expand Down Expand Up @@ -1349,6 +1379,7 @@ public void run() {
}
log.info("Read #" + nMessages + " messages in in " + (System.currentTimeMillis() - st) + "ms");
}
new ArchiveSaver(archive.archiveTitle).save(archive);
} catch (Throwable t) {
if (t instanceof OutOfMemoryError)
this.mayHaveRunOutOfMemory = true;
Expand Down
107 changes: 107 additions & 0 deletions src/java/edu/stanford/muse/email/json/ArchiveSaver.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package edu.stanford.muse.email.json;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gleb, can you please explain what this email.json package is for?


import edu.stanford.muse.index.Archive;
import edu.stanford.muse.index.Document;
import edu.stanford.muse.index.EmailDocument;
import edu.stanford.muse.util.Util;

import javax.mail.Address;
import javax.mail.internet.InternetAddress;
import java.io.*;
import java.util.Base64;
import java.util.List;
import java.util.Map;

/**
 * Created by sunchise on 04.06.17.
 *
 * Exports the documents of an {@link Archive} as JSON files under
 * {@code <user.home>/epadd-data/<base64(archiveName)>/}:
 * {@code email-names.json} (written by {@link EmailNameAgregator}) and
 * {@code archive.json} (a JSON array with one element per exported e-mail).
 */
public class ArchiveSaver {

    private final String archiveName;

    /**
     * @param archiveName title of the archive; its Base64 form is used as the output folder name
     */
    public ArchiveSaver(String archiveName) {
        this.archiveName = archiveName;
    }

    /**
     * Writes {@code email-names.json} and {@code archive.json} for the given archive.
     * Documents whose {@link Email#check()} returns false are left out of {@code archive.json}.
     *
     * @param archive archive whose documents are exported; every document is cast to {@link EmailDocument}
     * @throws RuntimeException if the output folder cannot be created or the file cannot be written
     */
    public void save(Archive archive) {
        // NOTE(review): getBytes() uses the platform charset, so the folder name of a non-ASCII title
        // differs between machines. Kept as is because readers of this folder may compute it the same way.
        String folderName = new String(Base64.getEncoder().encode(archiveName.getBytes()));
        String folderPath = System.getProperty("user.home") + File.separator + "epadd-data"
                + File.separator + folderName;

        // Standard Base64 may contain '/', which the file system treats as a path separator, so the
        // target can be several levels deep: mkdirs() creates all of them, where mkdir() would fail.
        File folder = new File(folderPath);
        if (!folder.isDirectory() && !folder.mkdirs() && !folder.isDirectory()) {
            throw new RuntimeException("Cannot create folder " + folderPath);
        }

        List<Document> allDocs = archive.getAllDocs();
        EmailNameAgregator emailNameAgregator = new EmailNameAgregator(allDocs);
        emailNameAgregator.save(folderPath + File.separator + "email-names.json");

        // FileWriter creates the file or truncates an existing one, so no explicit delete/create is needed.
        // NOTE(review): FileWriter encodes with the platform charset; confirm what the consumers expect.
        File file = new File(folderPath + File.separator + "archive.json");
        try (BufferedWriter stream = new BufferedWriter(new FileWriter(file))) {
            append(stream, "[");
            boolean wroteAny = false;
            for (Document doc : allDocs) {
                Email email = toEmail((EmailDocument) doc, emailNameAgregator);
                if (!email.check()) {
                    continue;
                }
                // The separator is emitted only once we know another element follows, so skipped
                // documents can never leave a dangling "," before the closing bracket.
                if (wroteAny) {
                    append(stream, ",");
                }
                append(stream, email.toJson());
                wroteAny = true;
            }
            append(stream, "]");
            stream.flush();
        } catch (IOException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /** Builds the JSON model object for one e-mail document, including its recipients. */
    private Email toEmail(EmailDocument emailDocument, EmailNameAgregator emailNameAgregator) {
        String messageID = Util.hash(emailDocument.getSignature());
        Email email = new Email(messageID,
                emailDocument.date,
                true,
                emailDocument.getSubject(),
                emailDocument.from == null || emailDocument.from.length == 0
                        ? null
                        : emailNameAgregator.getName(emailDocument.getFromEmailAddress()),
                emailDocument.getFromEmailAddress());
        // NOTE(review): cc, bcc and to recipients are all registered through addCc, exactly as before.
        // If Email distinguishes recipient kinds, confirm whether that is intended.
        addRecipients(email, emailDocument.cc, emailNameAgregator);
        addRecipients(email, emailDocument.bcc, emailNameAgregator);
        addRecipients(email, emailDocument.to, emailNameAgregator);
        return email;
    }

    /** Adds every address of the (possibly null) array to the e-mail together with its aggregated name. */
    private static void addRecipients(Email email, Address[] addresses, EmailNameAgregator emailNameAgregator) {
        if (addresses == null) {
            return;
        }
        for (Address address : addresses) {
            InternetAddress internetAddress = (InternetAddress) address;
            email.addCc(emailNameAgregator.getName(internetAddress.getAddress()), internetAddress.getAddress());
        }
    }

    /** Appends the string to the writer after trimming leading and trailing whitespace. */
    private void append(Writer stream, String string) throws IOException {
        string = string.trim();
        stream.append(string);
    }

}
Loading