diff --git a/WebContent/entity-types.jsp b/WebContent/entity-types.jsp
index 96426ef3..98179082 100644
--- a/WebContent/entity-types.jsp
+++ b/WebContent/entity-types.jsp
@@ -72,7 +72,7 @@
for(Short type: desc.keySet()){
JSONArray j = new JSONArray();
// if(NEType.Type.OTHER.getCode() == type || NEType.Type.PERSON.getCode() == type || desc.get(type)==null)
- if(NEType.Type.OTHER.getCode() == type || desc.get(type)==null)
+ if(desc.get(type)==null)
continue;
j.put(0, ""+Util.escapeHTML(desc.get(type))+"");
j.put(1, archive.collectionMetadata.entityCounts.getOrDefault(type,0));
diff --git a/WebContent/version.jsp b/WebContent/version.jsp
index 5f3448ff..d65cff32 100644
--- a/WebContent/version.jsp
+++ b/WebContent/version.jsp
@@ -1 +1 @@
-ePADD version Release 11.0.1
\ No newline at end of file
+ePADD version Release 11.0.1 - NB edition
\ No newline at end of file
diff --git a/src/java/edu/stanford/muse/Config.java b/src/java/edu/stanford/muse/Config.java
index 9de024bb..f80bd673 100755
--- a/src/java/edu/stanford/muse/Config.java
+++ b/src/java/edu/stanford/muse/Config.java
@@ -60,6 +60,9 @@ public class Config {
public static String FEATURES_INDEX;
public static String TABOO_FILE = "kill.txt";
+ // location of entities file written by external AI model
+ public static String ENTITIES_FILE;
+
//this is the folder name that contains the cache for internal authority assignment
public static int MAX_ENTITY_FEATURES = 200;
public static int MAX_TRY_TO_RESOLVE_NAMES = 10;
@@ -160,6 +163,8 @@ else if (mode != null)
if (!Util.nullOrEmpty(s))
OPENNLP_NER = Boolean.parseBoolean(s);
+ ENTITIES_FILE = props.getProperty("entities");
+
s = props.getProperty("epadd.default.lexicon", "general");
if (s != null) {
DEFAULT_LEXICON = s;
@@ -308,12 +313,12 @@ private static Properties readProperties() {
}
return props;
}
-
-// 2022-11-14
+
+// 2022-11-14
public static int setLanguage(String lang) {
int ret = 0;
if (Config.EPADD_LANGUAGE.equals(lang)) return 0;
-
+
Config.EPADD_LANGUAGE = lang;
try {
Parameters params = new Parameters();
@@ -322,13 +327,13 @@ public static int setLanguage(String lang) {
.configure(params.properties()
.setFileName(EPADD_PROPS_FILE));
Configuration config = builder.getConfiguration();
- config.setProperty("epadd.language", Config.EPADD_LANGUAGE);
- builder.save();
+ config.setProperty("epadd.language", Config.EPADD_LANGUAGE);
+ builder.save();
ret = 1;
} catch (ConfigurationException e) {
log.warn ("Error in writing "+ EPADD_PROPS_FILE);
ret = -1;
- }
+ }
return ret;
}
diff --git a/src/java/edu/stanford/muse/ie/variants/EntityBook.java b/src/java/edu/stanford/muse/ie/variants/EntityBook.java
index d243a427..bc6de158 100755
--- a/src/java/edu/stanford/muse/ie/variants/EntityBook.java
+++ b/src/java/edu/stanford/muse/ie/variants/EntityBook.java
@@ -73,8 +73,8 @@ public static String canonicalize(String s) {
}
public Integer getEntitiesCountMapModuloThreshold(double threshold) {
-
- List list = summary_L1_entityCountMap.values().stream().filter(summary->summary.score>threshold).collect(Collectors.toList());
- return list.size();
+ // For external entities the score threshold is ignored, so every entry is counted.
+ // The map's own size is returned directly; copying the values into a list first is unnecessary.
+ return summary_L1_entityCountMap.size();
}
diff --git a/src/java/edu/stanford/muse/ie/variants/EntityBookManager.java b/src/java/edu/stanford/muse/ie/variants/EntityBookManager.java
index 2a7b4608..f168061c 100644
--- a/src/java/edu/stanford/muse/ie/variants/EntityBookManager.java
+++ b/src/java/edu/stanford/muse/ie/variants/EntityBookManager.java
@@ -92,7 +92,7 @@ private void recalculateCache(Short giventype){
EmailDocument edoc = mArchive.indexer.docForId(docid);
for (Span span : allspans) {
// bail out if not of entity type that we're looking for, or not enough confidence, but don't bail out if we have to do it for all types, i.e. type is Short.MAX_TYPE
- if (giventype!=Short.MAX_VALUE && (span.type != giventype || span.typeScore < theta))
+ if (giventype!=Short.MAX_VALUE && (span.type != giventype))
continue;
Short type = span.type;//if type is Short.Max_Type then set the type as the current type, if not this is like a NOP.
Double score = new Double(span.typeScore);
@@ -362,7 +362,7 @@ private void fillEntityBookFromLucene(Short type){
for (Span span : allspans) {
// bail out if not of entity type that we're looking for, or not enough confidence
- if (span.type != type || span.typeScore < theta)
+ if (span.type != type)
continue;
String name = span.getText();
diff --git a/src/java/edu/stanford/muse/index/Indexer.java b/src/java/edu/stanford/muse/index/Indexer.java
index 5ce80761..a2d1f39e 100755
--- a/src/java/edu/stanford/muse/index/Indexer.java
+++ b/src/java/edu/stanford/muse/index/Indexer.java
@@ -835,7 +835,7 @@ private synchronized Directory initializeDirectory(Directory dir, String name) t
return dir;
}
-// 2022-09-13
+// 2022-09-13
@Override
protected void finalize() {
try {
@@ -1001,7 +1001,7 @@ private void storeNameOffsets(org.apache.lucene.document.Document doc, List headers)
{
ByteArrayOutputStream bs = new ByteArrayOutputStream();
@@ -1009,29 +1009,31 @@ private void storeHeaders(org.apache.lucene.document.Document doc, List
for (Header h : headers)
{
headerString.add(h.getName() + ": " + h.getValue());
+ if (h.getName().equals("Message-ID"))
+ doc.add(new Field("Message-ID", h.getValue(), storeOnly_ft));
}
try {
ObjectOutputStream oos = new ObjectOutputStream(bs);
oos.writeObject(headerString);
oos.close();
bs.close();
-
+
doc.add(new Field("headers_original", bs.toByteArray(), storeOnly_ft));
} catch (IOException e) {
log.warn("Failed to serialize headers");
e.printStackTrace();
}
-
+
java.io.ObjectInputStream ois;
try {
ois = new java.io.ObjectInputStream(new java.io.ByteArrayInputStream(bs.toByteArray()));
java.util.List sp = (java.util.List) ois.readObject();
-
+
} catch (Exception e) {
e.printStackTrace();
}
-
+
}
private void storeTextHtmlPart(org.apache.lucene.document.Document doc, String textHtmlPart)
diff --git a/src/java/edu/stanford/muse/ner/NER.java b/src/java/edu/stanford/muse/ner/NER.java
index aef671ad..3dee59fc 100755
--- a/src/java/edu/stanford/muse/ner/NER.java
+++ b/src/java/edu/stanford/muse/ner/NER.java
@@ -225,8 +225,8 @@ public void recognizeArchive() throws CancelledException, IOException {
log.warn("title " + title + " content " + content + " in Ner.recognizeArchive()");
continue;
}
- Span[] names = nerModel.find(content);
- Span[] namesT = nerModel.find(title);
+ Span[] names = nerModel.find(ldoc.get("Message-ID"));
+ Span[] namesT = new Span[] {}; // ignore entities in title for now
recTime += System.currentTimeMillis() - st;
st = System.currentTimeMillis();
diff --git a/src/java/edu/stanford/muse/ner/model/LoadResultsModel.java b/src/java/edu/stanford/muse/ner/model/LoadResultsModel.java
new file mode 100644
index 00000000..a7f6fe99
--- /dev/null
+++ b/src/java/edu/stanford/muse/ner/model/LoadResultsModel.java
@@ -0,0 +1,128 @@
+package edu.stanford.muse.ner.model;
+
+import edu.stanford.muse.Config;
+import edu.stanford.muse.ner.tokenize.Tokenizer;
+import edu.stanford.muse.util.Span;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * {@link NERModel} that performs no recognition of its own: it serves entity spans that an
+ * external model has already written to the JSON file named by {@link Config#ENTITIES_FILE}.
+ *
+ * <p>The file is read as an array of objects, each with a {@code "message-id"} string
+ * and an {@code "entities"} array whose items carry {@code "word"}, {@code "start"},
+ * {@code "end"}, {@code "entity_group"} and {@code "score"}.
+ */
+public class LoadResultsModel implements NERModel {
+
+    private static final Logger log = LogManager.getLogger(LoadResultsModel.class);
+
+    /** Maps the external model's entity-group labels to entity type codes. */
+    private static final Map<String, Short> typeMap = buildTypeMap();
+
+    /**
+     * Spans keyed by the message id found in the entities file.
+     * NOTE(review): static, so shared by every instance and added to on each construction.
+     */
+    private static final Map<String, List<Span>> entitiesMap = new HashMap<>();
+
+    /**
+     * Reads the entities file into the cache. A missing setting or an unreadable/malformed file
+     * is logged and leaves the cache with whatever was loaded up to that point.
+     */
+    public LoadResultsModel() {
+        String filename = Config.ENTITIES_FILE;
+        if (filename == null) {
+            log.warn("No entity file defined, skipping");
+            return;
+        }
+        log.info("Start loading entities from {}", filename);
+        // try-with-resources closes the file; JSON is read as UTF-8 rather than the platform charset
+        try (Reader reader = Files.newBufferedReader(Paths.get(filename), StandardCharsets.UTF_8)) {
+            JSONArray emailEntities = (JSONArray) new JSONParser().parse(reader);
+            for (Object item : emailEntities) {
+                JSONObject emailDetails = (JSONObject) item;
+                String msgId = (String) emailDetails.get("message-id");
+                JSONArray entities = (JSONArray) emailDetails.get("entities");
+                List<Span> spans = new ArrayList<>();
+                if (entities != null) {
+                    for (Object entity : entities) {
+                        spans.add(makeSpanFromJson((JSONObject) entity));
+                    }
+                }
+                entitiesMap.put(msgId, spans);
+            }
+        } catch (Exception e) {
+            // pass the exception itself so the stack trace is logged, not just the message
+            log.error("Failed to load entities from {}", filename, e);
+        }
+    }
+
+    /** Builds the read-only label-to-type-code table. */
+    private static Map<String, Short> buildTypeMap() {
+        Map<String, Short> codes = new HashMap<>();
+        codes.put("PER", NEType.Type.PERSON.getCode());
+        codes.put("LOC", NEType.Type.PLACE.getCode());
+        codes.put("ORG", NEType.Type.ORGANISATION.getCode());
+        codes.put("MISC", NEType.Type.OTHER.getCode());
+        return Collections.unmodifiableMap(codes);
+    }
+
+    /**
+     * Converts one entity record into a typed {@link Span}.
+     *
+     * @param entity JSON object with word, start, end, entity_group and score
+     * @return the span; unknown or absent entity groups are typed as OTHER
+     * @throws IllegalArgumentException if word, start, end or score is missing
+     */
+    private static Span makeSpanFromJson(JSONObject entity) {
+        String word = (String) entity.get("word");
+        // Number, not Long/Double: a whole-valued score such as 1 parses as Long
+        Number start = (Number) entity.get("start");
+        Number end = (Number) entity.get("end");
+        String entityGroup = (String) entity.get("entity_group");
+        Number score = (Number) entity.get("score");
+        if (word == null || start == null || end == null || score == null) {
+            throw new IllegalArgumentException("Entity is missing word/start/end/score: " + entity);
+        }
+
+        Span span = new Span(word, start.intValue(), end.intValue());
+        span.setType(
+                typeMap.getOrDefault(entityGroup, NEType.Type.OTHER.getCode()),
+                score.floatValue());
+        return span;
+    }
+
+    /**
+     * @param messageId Message-ID of the message whose entities are wanted
+     * @return the spans loaded for that id, or an empty array when none are cached
+     */
+    @Override
+    public Span[] find(String messageId) {
+        List<Span> spans = entitiesMap.get(messageId);
+        if (spans == null) {
+            log.warn("Could not find {} in entities cache...", messageId);
+            return new Span[0];
+        }
+        return spans.toArray(new Span[0]);
+    }
+
+    /** No-op: this model looks entities up by message id and never uses a tokenizer. */
+    @Override
+    public void setTokenizer(Tokenizer tokenizer) { }
+
+}
diff --git a/src/java/edu/stanford/muse/ner/model/NERModel.java b/src/java/edu/stanford/muse/ner/model/NERModel.java
index f3a06f89..8e270904 100755
--- a/src/java/edu/stanford/muse/ner/model/NERModel.java
+++ b/src/java/edu/stanford/muse/ner/model/NERModel.java
@@ -5,10 +5,10 @@
public interface NERModel {
/**
- * @param content - text in which to find entities
+ * @param messageId - Message-ID header of message
* @return spans of text found in the content that contain the type and offset info. of the entity
*/
- Span[] find (String content);
+ Span[] find (String messageId);
void setTokenizer(Tokenizer tokenizer);
}
diff --git a/src/java/edu/stanford/muse/webapp/JSPHelper.java b/src/java/edu/stanford/muse/webapp/JSPHelper.java
index dbb54057..d58c8a85 100755
--- a/src/java/edu/stanford/muse/webapp/JSPHelper.java
+++ b/src/java/edu/stanford/muse/webapp/JSPHelper.java
@@ -31,6 +31,7 @@
import edu.stanford.muse.index.*;
import edu.stanford.muse.ner.NER;
import edu.stanford.muse.ner.model.DummyNERModel;
+import edu.stanford.muse.ner.model.LoadResultsModel;
import edu.stanford.muse.ner.model.NBModel;
import edu.stanford.muse.ner.model.NERModel;
import edu.stanford.muse.util.DetailedFacetItem;
@@ -302,7 +303,7 @@ public static String convertRequestParamToUTF8(String param) throws UnsupportedE
}
if (param == null)
return null;
-
+
//6.3.2023 There was an issue not being able to import Mbox folders with names
//containing accented characters.
//String newParam = new String(param.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
@@ -427,8 +428,8 @@ public static void fetchAndIndexEmails(Archive archive, MuseEmailFetcher m, Mult
log.info("Using dummy openNLPNER model, all CIC patterns will be treated as valid entities");
nerModel = new DummyNERModel();
} else {
- log.info("Loading Bayesian classifier from: " + modelFile + " ...");
- nerModel = NBModel.loadModelFromRules(NBModel.MODEL_FILENAME);
+ log.info("Loading entities from external file");
+ nerModel = new LoadResultsModel();
}
}
if (nerModel == null) {
@@ -475,7 +476,7 @@ public static void fetchAndIndexEmails(Archive archive, MuseEmailFetcher m, Mult
// add the new stores
// we add the following code to support file metada requirement in epadd+ project
-// 2022-09-05 Added handling for IMAP
+// 2022-09-05 Added handling for IMAP
// archive.collectionMetadata.setFileMetadatas(archive, allFolders);
// if (archive.isMBOX())
archive.collectionMetadata.setFileMetadatas(archive, allFolders);