Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion WebContent/entity-types.jsp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@
for(Short type: desc.keySet()){
JSONArray j = new JSONArray();
// if(NEType.Type.OTHER.getCode() == type || NEType.Type.PERSON.getCode() == type || desc.get(type)==null)
if(NEType.Type.OTHER.getCode() == type || desc.get(type)==null)
if(desc.get(type)==null)
continue;
j.put(0, "<a href='list-entities?type="+type+"&archiveID="+archiveID+"' target='_blank'>"+Util.escapeHTML(desc.get(type))+"</a>");
j.put(1, archive.collectionMetadata.entityCounts.getOrDefault(type,0));
Expand Down
2 changes: 1 addition & 1 deletion WebContent/version.jsp
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ePADD version Release 11.0.1
ePADD version Release 11.0.1 - NB edition
17 changes: 11 additions & 6 deletions src/java/edu/stanford/muse/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ public class Config {
public static String FEATURES_INDEX;
public static String TABOO_FILE = "kill.txt";

// location of entities file written by external AI model
public static String ENTITIES_FILE;

//this is the folder name that contains the cache for internal authority assignment
public static int MAX_ENTITY_FEATURES = 200;
public static int MAX_TRY_TO_RESOLVE_NAMES = 10;
Expand Down Expand Up @@ -160,6 +163,8 @@ else if (mode != null)
if (!Util.nullOrEmpty(s))
OPENNLP_NER = Boolean.parseBoolean(s);

ENTITIES_FILE = props.getProperty("entities");

s = props.getProperty("epadd.default.lexicon", "general");
if (s != null) {
DEFAULT_LEXICON = s;
Expand Down Expand Up @@ -308,12 +313,12 @@ private static Properties readProperties() {
}
return props;
}
// 2022-11-14

// 2022-11-14
public static int setLanguage(String lang) {
int ret = 0;
if (Config.EPADD_LANGUAGE.equals(lang)) return 0;

Config.EPADD_LANGUAGE = lang;
try {
Parameters params = new Parameters();
Expand All @@ -322,13 +327,13 @@ public static int setLanguage(String lang) {
.configure(params.properties()
.setFileName(EPADD_PROPS_FILE));
Configuration config = builder.getConfiguration();
config.setProperty("epadd.language", Config.EPADD_LANGUAGE);
builder.save();
config.setProperty("epadd.language", Config.EPADD_LANGUAGE);
builder.save();
ret = 1;
} catch (ConfigurationException e) {
log.warn ("Error in writing "+ EPADD_PROPS_FILE);
ret = -1;
}
}
return ret;
}

Expand Down
4 changes: 2 additions & 2 deletions src/java/edu/stanford/muse/ie/variants/EntityBook.java
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ public static String canonicalize(String s) {
}

public Integer getEntitiesCountMapModuloThreshold(double threshold) {

List<Summary_L1> list = summary_L1_entityCountMap.values().stream().filter(summary->summary.score>threshold).collect(Collectors.toList());
// for external entities: ignore threshold
List<Summary_L1> list = new ArrayList<>(summary_L1_entityCountMap.values());
return list.size();
}

Expand Down
4 changes: 2 additions & 2 deletions src/java/edu/stanford/muse/ie/variants/EntityBookManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ private void recalculateCache(Short giventype){
EmailDocument edoc = mArchive.indexer.docForId(docid);
for (Span span : allspans) {
// bail out if not of the entity type that we're looking for, or not enough confidence; but don't bail out if we have to do it for all types, i.e. the given type is Short.MAX_VALUE
if (giventype!=Short.MAX_VALUE && (span.type != giventype || span.typeScore < theta))
if (giventype!=Short.MAX_VALUE && (span.type != giventype))
continue;
Short type = span.type;//if type is Short.Max_Type then set the type as the current type, if not this is like a NOP.
Double score = new Double(span.typeScore);
Expand Down Expand Up @@ -362,7 +362,7 @@ private void fillEntityBookFromLucene(Short type){

for (Span span : allspans) {
// bail out if not of entity type that we're looking for, or not enough confidence
if (span.type != type || span.typeScore < theta)
if (span.type != type)
continue;

String name = span.getText();
Expand Down
14 changes: 8 additions & 6 deletions src/java/edu/stanford/muse/index/Indexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -835,7 +835,7 @@ private synchronized Directory initializeDirectory(Directory dir, String name) t
return dir;
}

// 2022-09-13
// 2022-09-13
@Override
protected void finalize() {
try {
Expand Down Expand Up @@ -1001,37 +1001,39 @@ private void storeNameOffsets(org.apache.lucene.document.Document doc, List<Trip
}
}


/**
 * Stores a message's original headers on its Lucene document.
 *
 * <p>Two kinds of stored-only fields are added:
 * <ul>
 *   <li>{@code Message-ID}: the value of each header whose name is {@code Message-ID},
 *       compared case-insensitively so that spellings such as {@code Message-Id} are
 *       also picked up;</li>
 *   <li>{@code headers_original}: all headers as {@code "name: value"} strings, held in a
 *       Java-serialized {@code List<String>}.</li>
 * </ul>
 * If serialization fails, a warning is logged and {@code headers_original} is not added.
 *
 * @param doc     the Lucene document to add the fields to
 * @param headers the headers of the message, in their original order
 */
private void storeHeaders(org.apache.lucene.document.Document doc, List<Header> headers)
{
    List<String> headerString = new ArrayList<>();
    for (Header h : headers)
    {
        headerString.add(h.getName() + ": " + h.getValue());
        // header field names are case-insensitive, so don't miss e.g. "Message-Id"
        if ("Message-ID".equalsIgnoreCase(h.getName()))
            doc.add(new Field("Message-ID", h.getValue(), storeOnly_ft));
    }

    ByteArrayOutputStream bs = new ByteArrayOutputStream();
    // try-with-resources closes (and thereby flushes) the object stream before the bytes are read
    try (ObjectOutputStream oos = new ObjectOutputStream(bs)) {
        oos.writeObject(headerString);
    } catch (IOException e) {
        log.warn("Failed to serialize headers", e);
        return;
    }
    doc.add(new Field("headers_original", bs.toByteArray(), storeOnly_ft));
}

private void storeTextHtmlPart(org.apache.lucene.document.Document doc, String textHtmlPart)
Expand Down
4 changes: 2 additions & 2 deletions src/java/edu/stanford/muse/ner/NER.java
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,8 @@ public void recognizeArchive() throws CancelledException, IOException {
log.warn("title " + title + " content " + content + " in Ner.recognizeArchive()");
continue;
}
Span[] names = nerModel.find(content);
Span[] namesT = nerModel.find(title);
Span[] names = nerModel.find(ldoc.get("Message-ID"));
Span[] namesT = new Span[] {}; // ignore entities in title for now
recTime += System.currentTimeMillis() - st;
st = System.currentTimeMillis();

Expand Down
83 changes: 83 additions & 0 deletions src/java/edu/stanford/muse/ner/model/LoadResultsModel.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package edu.stanford.muse.ner.model;

import edu.stanford.muse.Config;
import edu.stanford.muse.ner.tokenize.Tokenizer;
import edu.stanford.muse.util.Span;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


/**
 * {@link NERModel} that performs no recognition of its own: it serves entity spans that were
 * computed externally and written to the JSON file named by {@link Config#ENTITIES_FILE}.
 *
 * <p>File layout as read by this class: a top-level array of objects, each with a
 * {@code "message-id"} string and an {@code "entities"} array. Every entity object carries
 * {@code "word"} (string), {@code "start"} and {@code "end"} (numbers), {@code "score"}
 * (number) and {@code "entity_group"} (one of {@code PER}, {@code LOC}, {@code ORG},
 * {@code MISC}; anything else is treated as {@code OTHER}).
 *
 * <p>Loaded spans are kept in a static map, so all instances share one cache and each
 * construction adds to (or overwrites entries in) that cache.
 */
public class LoadResultsModel implements NERModel {

    private static final Logger log = LogManager.getLogger(LoadResultsModel.class);

    /** Entity group labels used in the entities file, mapped to entity type codes. */
    private static final Map<String, Short> typeMap = new HashMap<>();

    static {
        typeMap.put("PER", NEType.Type.PERSON.getCode());
        typeMap.put("LOC", NEType.Type.PLACE.getCode());
        typeMap.put("ORG", NEType.Type.ORGANISATION.getCode());
        typeMap.put("MISC", NEType.Type.OTHER.getCode());
    }

    /** Spans per message, keyed by the {@code "message-id"} value found in the entities file. */
    private static final Map<String, List<Span>> entitiesMap = new HashMap<>();

    /**
     * Loads the entities file into the shared cache.
     *
     * <p>Loading is best-effort and never throws: if no file is configured, or the file cannot
     * be read or parsed, the problem is logged and the cache is left with whatever was loaded
     * up to that point. Malformed records and entities are skipped individually.
     */
    public LoadResultsModel() {
        String filename = Config.ENTITIES_FILE;
        if (filename == null) {
            log.warn("No entity file defined, skipping");
            return;
        }
        log.info("Start loading entities from {}", filename);
        // JSON is UTF-8 by specification; do not depend on the platform default charset.
        try (java.io.Reader reader = new java.io.InputStreamReader(
                new java.io.FileInputStream(filename), java.nio.charset.StandardCharsets.UTF_8)) {
            Object parsed = new JSONParser().parse(reader);
            if (!(parsed instanceof JSONArray)) {
                log.error("Entities file {} does not contain a top-level JSON array", filename);
                return;
            }
            for (Object item : (JSONArray) parsed) {
                loadEmailEntities(item);
            }
        } catch (Exception e) {
            // Deliberately broad: a broken entities file must not prevent the model from being created.
            log.error("Failed to load entities from {}", filename, e);
        }
    }

    /** Adds the spans of one per-message record to the cache, skipping anything malformed. */
    private static void loadEmailEntities(Object item) {
        if (!(item instanceof JSONObject)) {
            log.warn("Skipping entities record that is not a JSON object");
            return;
        }
        JSONObject emailDetails = (JSONObject) item;
        Object msgId = emailDetails.get("message-id");
        Object entities = emailDetails.get("entities");
        if (!(msgId instanceof String) || !(entities instanceof JSONArray)) {
            log.warn("Skipping entities record without a message-id string and an entities array (message-id: {})", msgId);
            return;
        }

        List<Span> spans = new ArrayList<>();
        for (Object entity : (JSONArray) entities) {
            if (!(entity instanceof JSONObject)) {
                log.warn("Skipping entity that is not a JSON object for message {}", msgId);
                continue;
            }
            try {
                spans.add(makeSpanFromJson((JSONObject) entity));
            } catch (IllegalArgumentException e) {
                log.warn("Skipping malformed entity for message {}: {}", msgId, e.getMessage());
            }
        }
        entitiesMap.put((String) msgId, spans);
    }

    /**
     * Converts one entity object from the entities file into a typed {@link Span}.
     *
     * @param entity the JSON object describing the entity
     * @return the span, with its type code and score set
     * @throws IllegalArgumentException if {@code word} is not a string or {@code start},
     *         {@code end} or {@code score} is not a number
     */
    private static Span makeSpanFromJson(JSONObject entity) {
        Object word = entity.get("word");
        Object start = entity.get("start");
        Object end = entity.get("end");
        Object score = entity.get("score");
        // The parser yields Long for integral and Double for fractional numbers, so accept any
        // Number: a score written as 1 must not fail where 1.0 succeeds.
        if (!(word instanceof String) || !(start instanceof Number)
                || !(end instanceof Number) || !(score instanceof Number)) {
            throw new IllegalArgumentException(
                    "expected a string \"word\" and numeric \"start\", \"end\" and \"score\" in " + entity);
        }

        Span span = new Span((String) word, ((Number) start).intValue(), ((Number) end).intValue());
        span.setType(
                typeMap.getOrDefault(entity.get("entity_group"), NEType.Type.OTHER.getCode()),
                ((Number) score).floatValue()
        );
        return span;
    }

    /**
     * Looks up the externally computed spans for a message.
     *
     * @param messageId the message id, as it appears in the entities file
     * @return the spans loaded for that message, or an empty array (after logging a warning)
     *         if the message is not in the cache
     */
    @Override
    public Span[] find(String messageId) {
        List<Span> spans = entitiesMap.get(messageId);
        if (spans == null) {
            log.warn("Could not find {} in entities cache...", messageId);
            return new Span[0];
        }
        return spans.toArray(new Span[0]);
    }

    /** No-op: spans are loaded from the entities file, so no tokenizer is used. */
    @Override
    public void setTokenizer(Tokenizer tokenizer) { }

}
4 changes: 2 additions & 2 deletions src/java/edu/stanford/muse/ner/model/NERModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

public interface NERModel {
/**
* @param content - text in which to find entities
* @param messageId - Message-ID header of message
* @return spans of text found in the content that contain the type and offset info. of the entity
*/
Span[] find (String content);
Span[] find (String messageId);

void setTokenizer(Tokenizer tokenizer);
}
9 changes: 5 additions & 4 deletions src/java/edu/stanford/muse/webapp/JSPHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import edu.stanford.muse.index.*;
import edu.stanford.muse.ner.NER;
import edu.stanford.muse.ner.model.DummyNERModel;
import edu.stanford.muse.ner.model.LoadResultsModel;
import edu.stanford.muse.ner.model.NBModel;
import edu.stanford.muse.ner.model.NERModel;
import edu.stanford.muse.util.DetailedFacetItem;
Expand Down Expand Up @@ -302,7 +303,7 @@ public static String convertRequestParamToUTF8(String param) throws UnsupportedE
}
if (param == null)
return null;

//6.3.2023 There was an issue not being able to import Mbox folders with names
//containing accented characters.
//String newParam = new String(param.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
Expand Down Expand Up @@ -427,8 +428,8 @@ public static void fetchAndIndexEmails(Archive archive, MuseEmailFetcher m, Mult
log.info("Using dummy openNLPNER model, all CIC patterns will be treated as valid entities");
nerModel = new DummyNERModel();
} else {
log.info("Loading Bayesian classifier from: " + modelFile + " ...");
nerModel = NBModel.loadModelFromRules(NBModel.MODEL_FILENAME);
log.info("Loading entities from external file");
nerModel = new LoadResultsModel();
}
}
if (nerModel == null) {
Expand Down Expand Up @@ -475,7 +476,7 @@ public static void fetchAndIndexEmails(Archive archive, MuseEmailFetcher m, Mult
// add the new stores

// we add the following code to support the file metadata requirement in the epadd+ project
// 2022-09-05 Added handling for IMAP
// 2022-09-05 Added handling for IMAP
// archive.collectionMetadata.setFileMetadatas(archive, allFolders);
// if (archive.isMBOX())
archive.collectionMetadata.setFileMetadatas(archive, allFolders);
Expand Down