diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/AssemblyManager.scala b/assembly/src/main/scala/org/clulab/reach/assembly/AssemblyManager.scala index fb1e8fa07..39dba3089 100644 --- a/assembly/src/main/scala/org/clulab/reach/assembly/AssemblyManager.scala +++ b/assembly/src/main/scala/org/clulab/reach/assembly/AssemblyManager.scala @@ -2,10 +2,12 @@ package org.clulab.reach.assembly import com.typesafe.scalalogging.LazyLogging import org.clulab.reach.assembly.representations._ -import collection.Map -import collection.immutable + +import collection.{Map, immutable, mutable} import org.clulab.odin._ import org.clulab.reach.mentions.{CorefMention, MentionOps} + +import scala.collection.mutable.ListBuffer // used to differentiate AssemblyModifications from Modifications on mentions import org.clulab.reach.mentions import java.io.File @@ -13,7 +15,7 @@ import java.io.File /** * Stores precedence information for two distinct [[EntityEventRepresentation]] - * @param before the [[EntityEventRepresentation] that precedes [[PrecedenceRelation.after]] + * @param before the [[EntityEventRepresentation]] that precedes [[PrecedenceRelation.after]] * @param after the [[EntityEventRepresentation]] that follows [[PrecedenceRelation.before]] * @param evidence the mentions that serve as evidence for this precedence relation * @param foundBy the name of the Sieve which found this relation @@ -69,6 +71,7 @@ class AssemblyManager( import AssemblyManager._ + private val nonAssemblyMentions = new mutable.ListBuffer[Mention]() // Because modifications don't feature into the hashcode, // a mention's identify at assembly consists of both the mention and its mods (i.e., the "state" of the mention) private var mentionStateToID: immutable.Map[MentionState, IDPointer] = m2id.toMap @@ -90,6 +93,8 @@ class AssemblyManager( // initialize to size of LUT 2 private var nextID: IDPointer = idToEER.size + def getNonAssemblyMentions:Iterable[Mention] = nonAssemblyMentions.toList + /** * 
Retrieve the set of mentions currently tracked by the manager */ @@ -293,12 +298,16 @@ class AssemblyManager( * See [[isValidMention]] for details on validation check * @param m an Odin Mention */ - def trackMention(m: Mention): Unit = isValidMention(m) match { - // do not store Sites, Activations, etc. in LUT 1 - case true => - // get or create an EntityEventRepresentation - val _ = getOrCreateEER(m) - case false => () + def trackMention(m: Mention): Unit = if (isValidMention(m)) { + m match { + case statistic if statistic matches "Statistic" => + nonAssemblyMentions += statistic + case _ => + // get or create an EntityEventRepresentation + val _ = getOrCreateEER(m) + } + } else { + () } /** @@ -572,7 +581,7 @@ class AssemblyManager( new SimpleEntity( id, // TODO: decide whether or not we should use a richer representation for the grounding ID - e.nsId, + e.nsId(), // modifications relevant to assembly if (mods.isDefined) modifications ++ mods.get else modifications, // source mention @@ -651,6 +660,59 @@ class AssemblyManager( */ private def createComplex(m: Mention): Complex = createComplexWithID(m)._1 + + private def createAssociationEventWithID(m: Mention): (Association, IDPointer) = { + // + // handle dispatch + // + + // check for coref + val assoc = getResolvedForm(m) + + // get polarity + val polarity = getPolarityLabel(assoc) + + // mention should be an Association + require(assoc matches "Association", "createAssociationEventWithID only handles Associations") + // mention's polarity should be either positive or negative +// require(polarity == AssemblyManager.positive || polarity == AssemblyManager.negative, "Polarity of Regulation must be positive or negative") +// // all controlled args must be simple events +// require(assoc.arguments("controlled").forall(_ matches "Event"), "The 'controlled' of any Regulation must be an Event") + +// val controllers: Set[IDPointer] = { +// assoc.arguments("controller") +// .toSet[Mention] +// .map(c =>
getOrCreateEERwithID(c)._2) +// } + + val themes: Set[IDPointer] = { + assoc.arguments("theme") + .toSet[Mention] + .map(c => getOrCreateEERwithID(c)._2) + } + + // prepare id + val id = getOrCreateID(m) + + // prepare Association + + val eer = + Association( + id, + themes, + polarity, + Some(m), + this + ) + + // update LUTs + // use original mention for later lookup + updateLUTs(id, m, eer) + + // eer and id pair + (eer, id) + } + // // SimpleEvent creation // @@ -761,7 +823,7 @@ class AssemblyManager( for ( src <- hasSource.arguments(src).toSet[Mention] ) yield { - val gid = src.toBioMention.nsId + val gid = src.toBioMention.nsId() representations.Location(gid).asInstanceOf[AssemblyModification] } // no mods @@ -796,7 +858,7 @@ class AssemblyManager( for ( d <- hasSource.arguments(dest).toSet[Mention] ) yield { - val gid = d.toBioMention.nsId + val gid = d.toBioMention.nsId() representations.Location(gid) } // no mods @@ -1145,13 +1207,11 @@ class AssemblyManager( private def getOrCreateEER(m: Mention): EER = { // ensure this mention should be stored in LUT 1 require(isValidMention(m), s"mention with the label ${m.label} cannot be tracked by the AssemblyManager") - hasMention(m) match { - // if an ID already exists, retrieve the associated representation - case true => - val id = mentionStateToID(getMentionState(m)) - idToEER(id) - // create new representation - case false => createEER(m) + if (hasMention(m)) { + val id = mentionStateToID(getMentionState(m)) + idToEER(id) + } else { + createEER(m) } } @@ -1161,15 +1221,14 @@ class AssemblyManager( * @param m an Odin Mention * @return a tuple of ([[EntityEventRepresentation]], [[IDPointer]]) */ - private def getOrCreateEERwithID(m: Mention): (EER, IDPointer) = hasMention(m) match { - case true => - val id = mentionStateToID(getMentionState(m)) - val eer = getEER(id) - (eer, id) - case false => - val eer = createEER(m) - val id = eer.uniqueID - (eer, id) + private def getOrCreateEERwithID(m: Mention): (EER,
IDPointer) = if (hasMention(m)) { + val id = mentionStateToID(getMentionState(m)) + val eer = getEER(id) + (eer, id) + } else { + val eer = createEER(m) + val id = eer.uniqueID + (eer, id) } /** @@ -1188,6 +1247,7 @@ class AssemblyManager( case se if se matches "SimpleEvent" => createSimpleEventWithID(m) case regulation if regulation matches "Regulation" => createRegulationWithID(m) case activation if activation matches "ActivationEvent" => createActivationWithID(m) + case association if association matches "Association" => createAssociationEventWithID(m) case other => throw new Exception(s"createEERwithID failed for ${other.label}") } } @@ -1758,9 +1818,9 @@ class AssemblyManager( s"Mention(label=${m.label}, text='${m.text}', modifications=${bio.modifications}, doc=$docRepr)" } - def summarizeMentionIndex: Unit = println(mentionIndexSummary.sorted.mkString("\n")) + def summarizeMentionIndex(): Unit = println(mentionIndexSummary.sorted.mkString("\n")) - def summarizeEntities: Unit = println(getSimpleEntities.map(_.summarize).toSeq.sorted.mkString("\n")) + def summarizeEntities(): Unit = println(getSimpleEntities.map(_.summarize).toSeq.sorted.mkString("\n")) // @@ -1888,6 +1948,13 @@ object AssemblyManager { case event if event matches "Event" => isValidMention(event) } + // Associations must have two theme arguments + case association if association matches "Association" => + (association.arguments contains "theme") && (association.arguments("theme").size == 2) + + case significance if significance matches "Significance" => + (significance.arguments contains "kind") && (significance.arguments contains "value") + // assume invalid otherwise case _ => false } diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/export/AssemblyExporter.scala b/assembly/src/main/scala/org/clulab/reach/assembly/export/AssemblyExporter.scala index 19929eb12..60fbf44a5 100644 --- a/assembly/src/main/scala/org/clulab/reach/assembly/export/AssemblyExporter.scala +++
b/assembly/src/main/scala/org/clulab/reach/assembly/export/AssemblyExporter.scala @@ -26,7 +26,7 @@ class AssemblyExporter(val manager: AssemblyManager) extends LazyLogging { val ignoreMods = false // distinct EntityEventRepresentations - val distinctEERS = manager.distinctEERs + val distinctEERS: Set[EER] = manager.distinctEERs // LUT for retrieving IDs to distinct EERs // TODO: A better version of this should probably belong to the manager @@ -121,9 +121,19 @@ class AssemblyExporter(val manager: AssemblyManager) extends LazyLogging { case se: SimpleEvent => se.input.values.flatten.map(m => createInput(m, mods)).mkString(", ") + case assoc: Association => + assoc.controlled.map{ + // get IDs of any events + case event: Event => EERLUT.getOrElse(event.equivalenceHash(ignoreMods = ignoreMods), reportError(assoc, event)) + // represent entities directly + case entity: Entity => + createInput(entity, s"$mods") + }.mkString(", ") + + // inputs to an activation are entities case act: Activation => - act.controlled.map { + act.controller.map { // get IDs of any events case event: Event => EERLUT.getOrElse(event.equivalenceHash(ignoreMods = ignoreMods), reportError(act, event)) // represent entities directly @@ -153,6 +163,15 @@ class AssemblyExporter(val manager: AssemblyManager) extends LazyLogging { case other => createInput(other, mods) }.mkString(", ") + case assoc: Association => + assoc.controlled.map{ + // get IDs of any events + case event: Event => EERLUT.getOrElse(event.equivalenceHash(ignoreMods = ignoreMods), reportError(assoc, event)) + // represent entities directly + case entity: Entity => + createInput(entity, s"$mods") + }.mkString(", ") + // positive activations produce an activated output entity case posact: Activation if posact.polarity == AssemblyManager.positive => posact.controlled.map(c => createInput(c, s"$mods.a")).mkString(", ") @@ -238,10 +257,43 @@ class AssemblyExporter(val manager: AssemblyManager) extends LazyLogging { precededBy(event), 
event.negated, event.evidence, - event + Some(event) ) } - rows.toSeq + + val statisticRows: Set[AssemblyRow] = (manager.getNonAssemblyMentions collect { + case significance if significance matches "Significance" => + AssemblyRow( + significance.arguments("kind").head.text, + significance.arguments("value").head.text, + NONE, + NONE, + NONE, + NONE, + significance.label, + Set.empty, + negated = false, + Set(significance), + None + ) + + case interval if interval matches "Confidence_interval" => + AssemblyRow( + interval.arguments("start").head.text, + interval.arguments("end").head.text, + NONE, + NONE, + interval.arguments("degree").head.text, + NONE, + interval.label, + Set.empty, + negated = false, + Set(interval), + None + ) + }).toSet + + rows.toSeq ++ statisticRows.toSeq } /** for debugging purposes */ @@ -293,6 +345,7 @@ object AssemblyExporter { val ENTITY = "entity" val REGULATION = "Regulation" val ACTIVATION = "Activation" + val ASSOCIATION = "Association" val TRANSLOCATION = "Translocation" // context types @@ -388,6 +441,7 @@ object AssemblyExporter { def getEventLabel(e: EntityEventRepresentation): String = e match { case reg: Regulation => s"$REGULATION (${reg.polarity})" case act: Activation => s"$ACTIVATION (${act.polarity})" + case assoc: Association => s"$ASSOCIATION (${assoc.polarity})" case se: SimpleEvent => se.label case ptm: SimpleEntity if ptm.modifications.exists(_.isInstanceOf[representations.PTM]) => ptm.modifications.find(_.isInstanceOf[representations.PTM]).get.asInstanceOf[representations.PTM].label diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/export/AssemblyRow.scala b/assembly/src/main/scala/org/clulab/reach/assembly/export/AssemblyRow.scala index 6b85c4f94..cff044030 100644 --- a/assembly/src/main/scala/org/clulab/reach/assembly/export/AssemblyRow.scala +++ b/assembly/src/main/scala/org/clulab/reach/assembly/export/AssemblyRow.scala @@ -1,13 +1,17 @@ package org.clulab.reach.assembly.export -import 
org.clulab.odin.Mention +import org.clulab.odin.{EventMention, Mention, TextBoundMention} import org.clulab.reach.assembly.representations.EntityEventRepresentation import org.clulab.reach.mentions._ +import org.clulab.struct.Interval + +import scala.collection.mutable.ListBuffer + /** Fundamental attributes of an EER at export */ trait EERDescription { - val eer: EntityEventRepresentation + val eer: Option[EntityEventRepresentation] val input: String val output: String val controller: String @@ -34,7 +38,7 @@ class AssemblyRow( val negated: Boolean, val evidence: Set[Mention], // to make debugging easier - val eer: EntityEventRepresentation + val eer: Option[EntityEventRepresentation] ) extends EERDescription { // this might serve as a proxy for confidence, though // it would also be good to know how many times this event @@ -50,7 +54,7 @@ class AssemblyRow( def getTextualEvidence: Seq[String] = { evidence.toSeq.map { m => - val text = m.sentenceObj.getSentenceText + val text = getSentenceMarkup(m) cleanText(text) } } @@ -71,9 +75,10 @@ class AssemblyRow( // replace multiple whitespace characters (newlines, tabs, etc) with a single space val cleanContents = contents.replaceAll("\\s+", " ") // check for only whitespace - AssemblyExporter.WHITESPACE.pattern.matcher(cleanContents).matches match { - case false => cleanContents - case true => AssemblyExporter.NONE + if (AssemblyExporter.WHITESPACE.pattern.matcher(cleanContents).matches) { + AssemblyExporter.NONE + } else { + cleanContents } } @@ -126,6 +131,48 @@ class AssemblyRow( ) } + def getSentenceMarkup(m:Mention):String = m match { + case evt:BioEventMention => + val sent = m.sentenceObj + + val mentionInterval = evt.tokenInterval + val label = evt.label + + val argIntervals:Seq[(Interval, String)] = + (Seq((evt.trigger.tokenInterval, s"trigger")) ++ evt.arguments.flatMap{ + a => + val role = a._1 + val intervals = a._2.map(_.tokenInterval) + intervals.sorted.map(i => i -> role) + }.toSeq).sortBy{ + case 
(interval, _) => (interval.start, interval.end) + } + + + val tokens = new ListBuffer[String]() + for(ix <- sent.words.indices){ + if(ix == mentionInterval.start) + tokens += "" + + + for((int, arg) <- argIntervals){ + if(ix == int.start) + tokens += "" + + if(ix == int.end) + tokens += "" + } + + if(ix == mentionInterval.end) + tokens += "" + + tokens += sent.words(ix) + } + + tokens.mkString(" ") + case _ => m.sentenceObj.getSentenceText + } + val columns: Map[String, String] = baseColumns def toRow(cols: Seq[String], sep: String = AssemblyExporter.SEP): String = { @@ -150,6 +197,6 @@ object AssemblyRow { negated: Boolean, evidence: Set[Mention], // to make debugging easier - eer: EntityEventRepresentation + eer: Option[EntityEventRepresentation] ) = new AssemblyRow(input, output, source, destination, controller, eerID, label, precededBy, negated, evidence, eer) } \ No newline at end of file diff --git a/assembly/src/main/scala/org/clulab/reach/assembly/representations/Association.scala b/assembly/src/main/scala/org/clulab/reach/assembly/representations/Association.scala new file mode 100644 index 000000000..35c3ef1e0 --- /dev/null +++ b/assembly/src/main/scala/org/clulab/reach/assembly/representations/Association.scala @@ -0,0 +1,33 @@ +package org.clulab.reach.assembly.representations + +import org.clulab.odin.Mention +import org.clulab.reach.assembly.{AssemblyManager, IDPointer} + +class Association( + val uniqueID: IDPointer, + val controllerPointers: Set[IDPointer], + val controlledPointers: Set[IDPointer], + val polarity: String, + val sourceMention: Option[Mention], + val manager: AssemblyManager + ) extends ComplexEvent { + + + + override val eerString = "assembly.Association" + +} + +object Association{ + def apply(uniqueID: IDPointer, + themePointers: Set[IDPointer], + polarity: String, + sourceMention: Option[Mention], + manager: AssemblyManager) = + new Association(uniqueID, + themePointers.take(1), + themePointers.drop(1), + polarity, + 
sourceMention, + manager) +} diff --git a/bioresources/src/main/resources/org/clulab/reach/kb/NER-Grounding-Override.tsv b/bioresources/src/main/resources/org/clulab/reach/kb/NER-Grounding-Override.tsv index 301d8392b..5b8d7b084 100644 --- a/bioresources/src/main/resources/org/clulab/reach/kb/NER-Grounding-Override.tsv +++ b/bioresources/src/main/resources/org/clulab/reach/kb/NER-Grounding-Override.tsv @@ -841,3 +841,106 @@ CD8 þ T cells D018414 mesh CellType CD8 ϩ T cells D018414 mesh CellType CD8+ T cells D018414 mesh CellType CD8 T cells D018414 mesh CellType +frailty FR00001 frailty BioProcess +loss of fat FR00002 frailty BioProcess +loss of skeleton muscle FR00003 frailty BioProcess +sarcopenia FR00003 frailty BioProcess +frailty index FR00005 frailty BioProcess +frailty syndrome FR00001 frailty BioProcess +disrupted muscle mitochondrial homeostasis FR00004 frailty BioProcess +mitochondrial imbalance FR00006 frailty BioProcess +inflammatory dysregulation FR00007 frailty BioProcess +vascular homeostasis FR00008 frailty BioProcess +lipid metabolism FR00009 frailty BioProcess +insulin resistance FR00010 frailty BioProcess +energy imbalance FR00011 frailty BioProcess +decrease in grip strength FR00012 frailty BioProcess +weight loss FR00013 frailty BioProcess +exhaustion FR00014 frailty BioProcess +weakness FR00015 frailty BioProcess +slowness FR00016 frailty BioProcess +reduced physical activity FR00017 frailty BioProcess +pre-frailty FR00018 frailty BioProcess +inflammatory molecules FR00019 frailty BioEntity +physiological dysregulation FR00020 frailty BioProcess +morbidity FR00022 frailty BioProcess +mortality FR00023 frailty BioProcess +Upregulation of pro inflammatory cytokines FR00021 frailty BioProcess +d-dimer FR00024 frailty BioEntity +tumor necrosis factor-alpha FR00025 frailty BioEntity +inflammatory index FR00026 frailty BioEntity +inflammatory index score FR00026 frailty BioEntity +serum immune mediators FR00027 frailty BioEntity +gait speed FR00028 
frailty PossibleController +muscle strength FR00029 frailty PossibleController +frail phenotype FR00001 frailty BioProcess +physiological dysregulation FR00030 frailty BioProcess +immunosenescence FR00031 frailty BioProcess +inflammaging FR00033 frailty BioProcess +sTNF-RII FR00032 frailty BioEntity +comorbidity FR00034 frailty PossibleController +lymphocyte subpopulations FR00035 frailty BioEntity +Osseointegration FR00036 frailty BioProcess +Biomineralization FR00037 frailty BioProcess +Bone Regeneration FR00038 frailty BioProcess +Pinch Strength FR00039 frailty BioProcess +Motor Activity FR00040 frailty BioProcess +excercise FR00041 frailty BioProcess +Cool-Down Exercise FR00041 frailty BioProcess +Hand Strength FR00043 frailty BioProcess +Skeletal Muscle Enlargement FR00044 frailty BioProcess +Anaerobic Threshold FR00045 frailty BioProcess +Walking FR00046 frailty BioProcess +Gait FR00047 frailty BioProcess +Standing Position FR00048 frailty BioProcess +Physical Fitness FR00049 frailty BioProcess +Muscle Stretching Exercises FR00050 frailty BioProcess +Muscle Stretching Exercise FR00050 frailty BioProcess +Stretching Exercise FR00050 frailty BioProcess +Stretching Exercises FR00050 frailty BioProcess +Pronation FR00051 frailty BioProcess +Psychomotor Performance FR00052 frailty BioProcess +Bone Density FR00053 frailty BioProcess +Posture FR00054 frailty BioProcess +Osteolysis FR00055 frailty BioProcess +Muscle Fatigue FR00056 frailty BioProcess +Osteogenesis FR00057 frailty BioProcess +Articular Range of Motion FR00058 frailty BioProcess +Core Stability FR00059 frailty BioProcess +Postural Balance FR00060 frailty BioProcess +Locomotion FR00061 frailty BioProcess +Physical Endurance FR00062 frailty BioProcess +Exergaming FR00063 frailty BioProcess +Isotonic Contraction FR00064 frailty BioProcess +Physical Exertion FR00065 frailty BioProcess +Musculoskeletal Physiological Phenomena FR00066 frailty BioProcess +Musculoskeletal Phenomena FR00066 frailty BioProcess 
+Excitation Contraction Coupling FR00068 frailty BioProcess +Muscle Strength FR00069 frailty BioProcess +Muscular Strength FR00069 frailty BioProcess +Muscle Contraction FR00070 frailty BioProcess +Muscle Contractions FR00070 frailty BioProcess +Muscular Contraction FR00070 frailty BioProcess +Muscular Contractions FR00070 frailty BioProcess +Intramuscular Absorption FR00071 frailty BioProcess +Chondrogenesis FR00072 frailty BioProcess +Muscle Development FR00073 frailty BioProcess +Muscular Development FR00073 frailty BioProcess +Cardiorespiratory Fitness FR00074 frailty BioProcess +Cardio respiratory Fitness FR00074 frailty BioProcess +Cardio-respiratory Fitness FR00074 frailty BioProcess +Uterine Contraction FR00075 frailty BioProcess +Tonic Immobility Response FR00076 frailty BioProcess +Bone Resorption FR00077 frailty BioProcess +Exercise Tolerance FR00078 frailty BioProcess +Physiologic Calcification FR00079 frailty BioProcess +Stair Climbing FR00080 frailty BioProcess +Muscle Tonus FR00081 frailty BioProcess +Muscular Tonus FR00081 frailty BioProcess +Plyometric Exercise FR00082 frailty BioProcess +Plyometric Exercises FR00082 frailty BioProcess +Isometric Contraction FR00083 frailty BioProcess +Isometric Contractions FR00083 frailty BioProcess +Walking Speed FR00084 frailty BioProcess +Muscle Relaxation FR00085 frailty BioProcess +Muscular Relaxation FR00085 frailty BioProcess diff --git a/bioresources/src/main/resources/org/clulab/reach/kb/ner_stoplist.txt b/bioresources/src/main/resources/org/clulab/reach/kb/ner_stoplist.txt index 0786ca216..fa10c2929 100644 --- a/bioresources/src/main/resources/org/clulab/reach/kb/ner_stoplist.txt +++ b/bioresources/src/main/resources/org/clulab/reach/kb/ner_stoplist.txt @@ -89,4 +89,5 @@ tube water wt figure -fig \ No newline at end of file +fig +mice \ No newline at end of file diff --git a/build.sbt b/build.sbt index 3b3096a55..66e8b3666 100644 --- a/build.sbt +++ b/build.sbt @@ -8,9 +8,9 @@ lazy val commonSettings = 
Seq( // 2.12.12 results in an exception when trying to access // a resource through getResource(). There might be a // change related to the leading / or something similar. - scalaVersion := "2.12.8", + scalaVersion := "2.12.16", - crossScalaVersions := Seq("2.11.12", "2.12.8"), + crossScalaVersions := Seq("2.11.12", "2.12.16"), scalacOptions ++= Seq("-feature", "-unchecked", "-deprecation"), diff --git a/export/src/main/scala/org/clulab/reach/export/TrainingDataExporter.scala b/export/src/main/scala/org/clulab/reach/export/TrainingDataExporter.scala new file mode 100644 index 000000000..2096c8bc7 --- /dev/null +++ b/export/src/main/scala/org/clulab/reach/export/TrainingDataExporter.scala @@ -0,0 +1,180 @@ +package org.clulab.reach.`export` + +import org.clulab.odin.{EventMention, Mention, TextBoundMention} +import org.clulab.struct.Interval +import org.json4s.JsonDSL._ +import org.json4s._ +import org.json4s.jackson.JsonMethods._ + +import scala.annotation.tailrec + +abstract class BaseDatum{ + def json: JValue +} +case class EmptyDatum(words:Seq[String]) extends BaseDatum { + def json: JValue = ("sentence_tokens" -> words.toList) +} + +case class RegulationDatum( + words: Seq[String], + eventIndices: Range, + type_ : String, + polarity: Boolean, + controllerIndices: Range, + controlledIndices: Range, + triggerIndices:Range, + ruleName:Option[String], + rule:Option[String] + ) extends BaseDatum { + val json: JValue = + ("sentence_tokens" -> words.toList) ~ + ("event_indices" -> eventIndices) ~ + ("type" -> type_) ~ + ("polarity" -> polarity) ~ + ("controller_indices" -> controllerIndices) ~ + ("controlled_indices" -> controlledIndices) ~ + ("trigger_indices" -> triggerIndices) ~ + ("rule_name" -> ruleName.orNull) ~ + ("rule" -> rule.orNull) +} + + +case class NoRegulationDatum( + words: Seq[String], + entitiesIndices: Seq[Range], + ) extends BaseDatum { + + val json: JValue = + ("sentence_tokens" -> words.toList) ~ + ("entities_indices" -> entitiesIndices.toList) 
+} + +object TrainingDataExporter { + + def getPolarity(label:String): Boolean = { + val l = label.toLowerCase + if(l contains "positive") + true + else if (l contains "negative") + false + else + true + } + + @tailrec + def getIndices(e:Mention, arg:Option[String] = None): Range = { + e match { + case _: EventMention if arg.nonEmpty && e.arguments.contains(arg.get) => getIndices(e.arguments(arg.get).head, arg) + case _ => e.start to e.end + } + } + + def jsonOutput(mentions: Seq[Mention], + allowedLabels:Option[Set[String]] = None, // If not specified, every event will be returned + includeRule:Boolean = false, + rulesDictionary:Option[Map[String, String]] = None): String = { + + def filterCriteria(e:EventMention): Boolean = { + if(allowedLabels.isEmpty || allowedLabels.get.contains(e.label)) + e.arguments.contains("controller") && e.arguments.contains("controlled") + else + false + } + // Iterate only through events + val events:Seq[EventMention] = + mentions collect { + case e:EventMention if filterCriteria(e) => e + } + + val sentencesWithRegulationIndices = events.map(_.sentence).toSet + + // Get the TB mentions + val tbMentions:Seq[TextBoundMention] = + mentions collect { + case tb:TextBoundMention if tb.matches("BioEntity")=> tb + } + + // Get all the sentences with no event associated to it + val emptyValues = + if(events.nonEmpty){ + val sentences = events.head.document.sentences + (for{ + (s, ix) <- sentences.zipWithIndex + if !sentencesWithRegulationIndices.contains(ix) + } yield EmptyDatum(words = s.words)).toSeq + } + else + Seq.empty[RegulationDatum] + + // Get the sentences with mentions but w/o regulations + val sentenceIxToMentions = tbMentions.groupBy(_.sentence) + + val mentionsWORegulationValues = + for { + (sIx, tbs) <- sentenceIxToMentions + if !(sentencesWithRegulationIndices contains sIx) + } yield { + val words = tbs.head.sentenceObj.words + NoRegulationDatum( + words = words, entitiesIndices = tbs.map(tb => getIndices(tb, None)) + ) + } + 
+ val (regulationValues, hardNoRegulationValues) = + (events map { + e => + val trigger = e.trigger + + val controllerIndices = getIndices(e, Some("controller")) + val controlledIndices = getIndices(e, Some("controlled")) + + val words = e.sentenceObj.words + + // The positive instance + val datum = + RegulationDatum( + words, + getIndices(e), + e.label, + getPolarity(e.label), + controllerIndices, + controlledIndices, + getIndices(trigger), + Some(e.foundBy), + if(includeRule && rulesDictionary.isDefined) { + Some(rulesDictionary.get.getOrElse(e.foundBy, "MISSING VAL")) + } else + None + + ) + + // The "hard" negative instance + // Get all the tb mentions in the same sentence + val sentenceTbMentions = sentenceIxToMentions.getOrElse(e.sentence, Seq.empty) + // Get all the eligible tb mentions (those that aren't participant in the event + val iController = Interval(controllerIndices.start, controllerIndices.end) + val iControlled = Interval(controlledIndices.start, controlledIndices.end) + val otherMentions = sentenceTbMentions.map(tb => getIndices(tb)).filter(ix => { + val iTb = Interval(ix.start, ix.end) + !((iTb overlaps iController) || (iTb overlaps iControlled)) + }) + + // Now do the cross product of all the mentions that aren't participant + val hardNegative = if(otherMentions.nonEmpty) Some(NoRegulationDatum(words, otherMentions)) else None + + (datum, hardNegative) + }).unzip + +// val allDatums:Seq[BaseDatum] = regulationValues ++ hardNoRegulationValues ++ emptyValues ++ mentionsWORegulationValues + + val ret = ("regulations" -> regulationValues.map(_.json)) ~ + ("hardInstances" -> hardNoRegulationValues.collect{case Some(d) => d}.map(_.json)) ~ + ("withoutRegulations" -> mentionsWORegulationValues.map(_.json)) ~ + ("emptySentences" -> emptyValues.map(_.json)) + + + pretty(render(ret)) + } + + +} diff --git a/export/src/main/scala/org/clulab/reach/export/VisualAnalyticsDataExporter.scala 
b/export/src/main/scala/org/clulab/reach/export/VisualAnalyticsDataExporter.scala new file mode 100644 index 000000000..76f4190da --- /dev/null +++ b/export/src/main/scala/org/clulab/reach/export/VisualAnalyticsDataExporter.scala @@ -0,0 +1,176 @@ +package org.clulab.reach.`export` + +import org.clulab.odin.{EventMention, Mention, TextBoundMention} +import org.clulab.reach.FriesEntry +import org.clulab.reach.mentions.{BioEventMention, BioTextBoundMention, Negation} +import org.json4s.{JObject, JValue} +import org.json4s.JsonAST.JArray +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +import scala.annotation.tailrec + +case class RelationDatum( + // Information about the event + controller: String, + controllerId: Seq[String], + controlled: Option[String], + controlledId: Option[Seq[String]], + sentenceTokens: Seq[String], + eventIndices: (Int, Int), + eventCharSpan: (Int, Int), + label: String, + negatedTrigger:Boolean, + polarity: Option[Boolean], // True for positive, False for negative, None for undefined + controllerIndices: (Int, Int), + controllerCharSpan: (Int, Int), + controlledIndices: Option[(Int, Int)], + controlledCharSpan: Option[(Int, Int)], + triggerIndices: Option[(Int, Int)], + triggerCharSpan: Option[(Int, Int)], + // Information about the rule that detected the extraction + ruleName: Option[String], + // Information about the context, to aid transformers + contextLeft: Option[Seq[String]], + contextRight: Option[Seq[String]] + ) { + val json: JValue = { + ("controller" -> controller)~ + ("controller_id" -> controllerId) ~ + ("controlled" -> controlled.orNull) ~ + ("controlled_id" -> controlledId.orNull) ~ + ("sentence_tokens" -> sentenceTokens) ~ + ("event_indices" -> List(eventIndices._1, eventIndices._2)) ~ + ("event_char_span" -> List(eventCharSpan._1, eventCharSpan._2)) ~ + ("label" -> label) ~ + ("negated_trigger" -> negatedTrigger) ~ + ("polarity" -> polarity) ~ + ("controller_indices" -> List(controllerIndices._1, 
controllerIndices._2)) ~ + ("controlled_indices" -> controlledIndices.map(i => List(i._1, i._2))) ~ + ("trigger_indices" -> triggerIndices.map(i => List(i._1, i._2))) ~ + ("controller_char_span" -> List(controllerCharSpan._1, controllerCharSpan._2)) ~ + ("controlled_char_span" -> controlledCharSpan.map(i => List(i._1, i._2))) ~ + ("trigger_char_span" -> triggerCharSpan.map(i => List(i._1, i._2))) ~ + ("rule_name" -> ruleName.orNull) ~ + ("context_left" -> contextLeft.orNull) ~ + ("context_right" -> contextRight.orNull) + } +} + +object RelationDatum{ + + @tailrec + private def getArgument(m:Mention, argumentName:String):Option[Mention] = { + if(m.isInstanceOf[TextBoundMention]) + Some(m) + else { + m.arguments.get(argumentName) match { + case Some(s:Seq[Mention]) => getArgument(s.head, argumentName) + case _ => m.arguments.get("theme") match { + case Some(b) => Some(b.head) + case _ => None + } + + } + } + } + + private def getPolarity(label: String): Option[Boolean] = { + val l = label.toLowerCase + if (l contains "positive") + Some(true) + else if (l contains "negative") + Some(false) + else + None + } + + private def isNegated(evt:BioEventMention): Boolean = { + evt.modifications.exists { + case _: Negation => + true + case _ => false + } + } + + private def getCharSpan(mention:Mention): (Int, Int) = { + val sent = mention.sentenceObj + val (firstToken, lastToken) = (mention.tokenInterval.start, mention.tokenInterval.end-1) + (sent.startOffsets(firstToken), sent.endOffsets(lastToken)) + } + + def fromEventMention(evt:EventMention): RelationDatum = { + val trigger = evt.trigger + val controller = getArgument(evt, "controller").get + val controlled = getArgument(evt, "controlled") + + val contextLeft = + if(evt.sentence > 0){ + Some(evt.document.sentences(evt.sentence - 1).words.toSeq) + } + else None + + val contextRight = + if (evt.sentence < (evt.document.sentences.length - 1)) { + Some(evt.document.sentences(evt.sentence + 1).words.toSeq) + } + else None + + 
RelationDatum( + controller = controller.text, + controllerId = { + val controllerGrounding = controller.asInstanceOf[BioTextBoundMention].grounding().get + Seq(controllerGrounding.namespace, controllerGrounding.id) + }, + controlled = controlled match { + case Some(c) => Some(c.text) + case _ => None + }, + controlledId = controlled match { + case Some(c) => + val controlledGrounding = c.asInstanceOf[BioTextBoundMention].grounding().get + Some(Seq(controlledGrounding.namespace, controlledGrounding.id)) + case _ => None + }, + sentenceTokens = evt.sentenceObj.words, + eventIndices = (evt.start, evt.end), + eventCharSpan = getCharSpan(evt), + label = evt.label, + negatedTrigger = isNegated(evt.asInstanceOf[BioEventMention]), + polarity = getPolarity(evt.label), + controllerIndices = (controller.start, controller.end), + controllerCharSpan = getCharSpan(controller), + controlledIndices = controlled.map(c =>(c.start, c.end)), + controlledCharSpan = controlled.map(getCharSpan), + triggerIndices = Some((trigger.start, trigger.end)), + triggerCharSpan = Some(getCharSpan(trigger)), + ruleName = Some(evt.foundBy), + contextLeft = contextLeft, + contextRight = contextRight + ) + } +} + +object VisualAnalyticsDataExporter { + + def jsonOutput(mentions:Seq[Mention], entry:Option[FriesEntry]):String = { + // Only activations, regulations and associations + val filteredMentions = + mentions.filter{ + m => + val label = m.label.toLowerCase() + m.isInstanceOf[EventMention] && Seq("activation", "regulation", "association").map(label.contains).exists(identity) + } + + val data = filteredMentions.map(e => RelationDatum.fromEventMention(e.asInstanceOf[EventMention])) + val entryText = entry map (_.text) + val json = JObject("mentions" -> JArray(data.map(_.json).toList)) + + pretty(render( + entryText match { + case Some(txt) => json ~ ("text" -> txt) + case None => json + })) + } + +} diff --git a/export/src/main/scala/org/clulab/reach/export/cmu/CMURow.scala 
b/export/src/main/scala/org/clulab/reach/export/cmu/CMURow.scala index b0d801f11..3500c23f8 100644 --- a/export/src/main/scala/org/clulab/reach/export/cmu/CMURow.scala +++ b/export/src/main/scala/org/clulab/reach/export/cmu/CMURow.scala @@ -34,7 +34,7 @@ class CMURow( precededBy, negated, evidence, - eer + Some(eer) ) { /** * Translates Reach namespaces into element types diff --git a/main/build.sbt b/main/build.sbt index 71c0480e3..8b44fd05d 100644 --- a/main/build.sbt +++ b/main/build.sbt @@ -4,7 +4,7 @@ libraryDependencies ++= { val luceVer = "5.3.1" Seq( - "ai.lum" %% "nxmlreader" % "0.1.2", + "ai.lum" %% "nxmlreader" % "0.1.5", "ai.lum" %% "common" % "0.1.4", "commons-io" % "commons-io" % "2.4", "jline" % "jline" % "2.12.1", diff --git a/main/src/main/resources/application.conf b/main/src/main/resources/application.conf index 6b3063b46..a112d062a 100644 --- a/main/src/main/resources/application.conf +++ b/main/src/main/resources/application.conf @@ -20,7 +20,7 @@ outDir = ${rootDir}/output # this is the directory that stores the raw nxml, .csv, and/or .tsv files # this directory *must* exist -papersDir = ${rootDir}/papers +papersDir = ${rootDir}/sers_985 # the encoding of input and output files @@ -37,10 +37,10 @@ ignoreSections = ["references", "materials", "materials|methods", "methods", "su # "assembly-tsv" (assembly output) # "arizona" (Arizona's custom tabular output for assembly) # "cmu" (CMU's custom tabular output for assembly) -outputTypes = ["fries", "cmu", "serial-json", "indexcard", "arizona", "text"] +outputTypes = ["visual-analytics"] #["visual-analytics", "fries", "cmu", "serial-json", "indexcard", "arizona", "text"] # number of simultaneous threads to use for parallelization -threadLimit = 2 +threadLimit = 10 # verbose logging verbose = true @@ -57,7 +57,7 @@ contextEngine { } polarity { - engine = Hybrid //Hybrid//DeepLearning //Linguistic + engine = Linguistic //Hybrid//DeepLearning //Linguistic negCountThreshold = 1 // when lower than or 
equal to this value, use linguistic approach in hybrid method maskOption = tag //tag_name //name //tag savedModel = SavedLSTM_WideBound_u // SavedLSTM // SavedLSTM_WideBound diff --git a/main/src/main/resources/org/clulab/reach/biogrammar/entities/entities.yml b/main/src/main/resources/org/clulab/reach/biogrammar/entities/entities.yml index 4d7e8d666..2a100c698 100644 --- a/main/src/main/resources/org/clulab/reach/biogrammar/entities/entities.yml +++ b/main/src/main/resources/org/clulab/reach/biogrammar/entities/entities.yml @@ -149,7 +149,7 @@ rules: [entity='B-Species'] [entity='I-Species']* - name: ner-cell-lines - label: CellLine + label: [CellLine, CellLineAsParticipant] action: mkNERMentions priority: 3 type: token @@ -157,7 +157,7 @@ rules: [entity='B-CellLine'] [entity='I-CellLine']* - name: ner-organ - label: Organ + label: [Organ, OrganAsParticipant] action: mkNERMentions priority: 3 type: token @@ -165,7 +165,7 @@ rules: [entity='B-Organ'] [entity='I-Organ']* - name: ner-cell-type - label: CellType + label: [CellType, CellTypeAsParticipant] action: mkNERMentions priority: 3 type: token @@ -173,7 +173,7 @@ rules: [entity='B-CellType'] [entity='I-CellType']* - name: ner-tissue-type - label: TissueType + label: [TissueType, TissueTypeAsParticipant] action: mkNERMentions priority: 3 type: token diff --git a/main/src/main/resources/org/clulab/reach/biogrammar/events/association-only_template.yml b/main/src/main/resources/org/clulab/reach/biogrammar/events/association-only_template.yml new file mode 100755 index 000000000..11657862d --- /dev/null +++ b/main/src/main/resources/org/clulab/reach/biogrammar/events/association-only_template.yml @@ -0,0 +1,112 @@ +# +# The rules below apply ONLY to activations (not regulations) +# + +vars: org/clulab/reach/biogrammar/vars.yml + +rules: + + +- name: ${ ruleType }_syntax_results_in + priority: ${ priority } + example: "IL-6 results in subsequent frailty association." 
+ label: ${ label } + action: ${ actionFlow } + pattern: | + trigger = [word=/(?i)^(${ triggers })/ & tag=/^N/] [lemma=/^(${ auxtriggers })/ & tag=/^N/]? + controller:${ controllerType } = [word=/(?i)(${ significance_triggers })$/]) + [ word = /^=$/ ] + (? [ tag = CD ]+ [word = /\%/]?) + + - name: significance_2 + priority: ${ priority } + example: "r= 0.21" + label: Significance + type: token + action: mkSignificance + pattern: | + (? [ word =/(?i)${ significance_triggers }=$/ ]) + (? [ tag = CD ]+ [word = /\%/]?) + + - name: significance_3 + priority: ${ priority } + example: "p =0.21" + label: Significance + type: token + action: mkSignificance + pattern: | + (? [word = /(?i)${ significance_triggers }$/]) + (? [word = /^=/ & tag = /CD/]+ [word = /\%/]?) + + - name: confidence_inetrval_1 + priority: ${ priority } + example: "95 CI: 0.27 - 2.13" + label: Confidence_interval + type: token + pattern: | + (? [ tag = CD ]+ [ word = /\%/]?) + [word = /(?i)ci$/] + [ word = /^[=:]$/ ]? + (? [tag = CD]) + ([ word = /^[-\/]$/ ]* (? [tag = CD]))? 
\ No newline at end of file diff --git a/main/src/main/resources/org/clulab/reach/biogrammar/events_master.yml b/main/src/main/resources/org/clulab/reach/biogrammar/events_master.yml index 3888b7551..ea286d216 100644 --- a/main/src/main/resources/org/clulab/reach/biogrammar/events_master.yml +++ b/main/src/main/resources/org/clulab/reach/biogrammar/events_master.yml @@ -5,7 +5,7 @@ vars: auxtriggers: "activ|regul" posTriggers: "acceler|accept|activat|aid|allow|augment|cataly|caus|contribut|direct|driv|elev|elicit|enabl|enhanc|increas|induc|initi|interconvert|lead|led|mediat|modul|necess|overexpress|potenti|produc|prolong|promot|rais|reactivat|re-express|rescu|restor|retent|signal|stimul|support|synerg|synthes|trigger|underli|up-regul|upregul" negTriggers: "attenu|abolish|abrog|antagon|arrest|attenu|block|deactiv|decreas|degrad|deplet|deregul|diminish|disrupt|down-reg|downreg|dysregul|elimin|impair|imped|inactiv|inhibit|knockdown|limit|loss|lower|negat|nullifi|perturb|prevent|reduc|reliev|repress|resist|restrict|revers|sequester|shutdown|slow|starv|suppress|supress" - + assocTriggers: "associat|connect|link|influence|involv|mechanism" # For the record: OLD triggers, no longer used #posnouns: "acceler|activ|augment|cataly|caus|driv|elev|elicit|enhanc|increas|induc|induct|initi|produc|promot|promot|rais|reactiv|re-express|releas|stimul|trigger|up-regul|upregul" #negnouns: "decreas|inhibit|loss|repress|suppress|supress" @@ -508,3 +508,18 @@ rules: priority: "14" # must be 1 + priority of regulations! controlledType: "BioEntity" controllerType: "PossibleController" + + # Positive associations, from patterns that apply ONLY to associations + - import: org/clulab/reach/biogrammar/events/association-only_template.yml + vars: + label: "Association" + ruleType: "association" + triggers: ${assocTriggers} + actionFlow: "mkAssociation" + priority: "15" # must be 1 + priority of regulations! 
+ controlledType: "PossibleController" + controllerType: "PossibleController" + + - import: org/clulab/reach/biogrammar/events/statistical_significance_events.yml + vars: + priority: "16" \ No newline at end of file diff --git a/main/src/main/resources/org/clulab/reach/biogrammar/taxonomy.yml b/main/src/main/resources/org/clulab/reach/biogrammar/taxonomy.yml index 61bd12eec..958a5493f 100644 --- a/main/src/main/resources/org/clulab/reach/biogrammar/taxonomy.yml +++ b/main/src/main/resources/org/clulab/reach/biogrammar/taxonomy.yml @@ -1,6 +1,9 @@ - Alias - ModificationTrigger - Site +- Statistic: + - Significance + - Confidence_interval - Context: - Species - CellLine @@ -57,6 +60,7 @@ - Deribosylation - Desumoylation - Deubiquitination + - ComplexEvent: - Regulation: - Positive_regulation @@ -64,9 +68,17 @@ - ActivationEvent: - Positive_activation - Negative_activation + - Association: + - Positive_association + - Negative_association + - Entity: # Any BioEntity may appear as the controlled in an Activation - BioEntity: + - TissueTypeAsParticipant + - OrganAsParticipant + - CellLineAsParticipant + - CellTypeAsParticipant - Disease - BioProcess # ex. 
"apoptosis" - BioChemicalEntity: diff --git a/main/src/main/resources/org/clulab/reach/biogrammar/vars.yml b/main/src/main/resources/org/clulab/reach/biogrammar/vars.yml index deba3abe3..344024660 100644 --- a/main/src/main/resources/org/clulab/reach/biogrammar/vars.yml +++ b/main/src/main/resources/org/clulab/reach/biogrammar/vars.yml @@ -76,6 +76,8 @@ noun_modifier_[^bt]: "nmod_[^bt]" noun_modifier_(by|of): "nmod_(by|of)" noun_modifier_o[nf]$: "^nmod_o[nf]$" +significance_triggers: "^([rp]|SMD|I\\d+)" + #ATTN: there are relations mentioned in the folders and files below, but the rules there are fairly specific, #so substituting with a variable could be damaging; changes might need to be made manually in these files if the names of #relations change in the next versions of Universal Dependencies diff --git a/main/src/main/scala/org/clulab/coref/Coref.scala b/main/src/main/scala/org/clulab/coref/Coref.scala index 5e3f2d93e..63dcd1dbd 100644 --- a/main/src/main/scala/org/clulab/coref/Coref.scala +++ b/main/src/main/scala/org/clulab/coref/Coref.scala @@ -318,6 +318,8 @@ class Coref extends LazyLogging { val tbms = mentions.filter(_.isInstanceOf[CorefTextBoundMention]).map(_.asInstanceOf[CorefTextBoundMention]) val sevts = mentions.filter(m => m.isInstanceOf[CorefEventMention] && m.matches("SimpleEvent")).map(_.asInstanceOf[CorefEventMention]) val cevts = mentions.filter(m => m.matches("ComplexEvent")) + // Added by Enrique to avoid discarding the Significance relations + val significanceevts = mentions.filter(m => m.matches("Significance") || m.matches("Confidence_interval")) val resolvedTBMs = resolveTBMs(tbms) val tbmSieveMap = tbmSieves(tbms.filter(_.isGeneric)) @@ -332,7 +334,7 @@ class Coref extends LazyLogging { val resolved = resolvedTBMs ++ resolvedSimple ++ resolvedComplex val retVal = corefDistinct(mentions.flatMap(mention => resolved.getOrElse(mention, Nil))) - retVal + retVal ++ significanceevts } diff --git 
a/main/src/main/scala/org/clulab/coref/CorefUtils.scala b/main/src/main/scala/org/clulab/coref/CorefUtils.scala index ab1ad0ca5..c0ab0de19 100644 --- a/main/src/main/scala/org/clulab/coref/CorefUtils.scala +++ b/main/src/main/scala/org/clulab/coref/CorefUtils.scala @@ -64,7 +64,7 @@ object CorefUtils { lbls match { case binding if lbls contains "Binding" => args.contains("theme") && args("theme").length >= 2 // Binding is not required to be binary anymore (see binding_token_5) - case simple if lbls contains "SimpleEvent" => + case simple if (lbls contains "SimpleEvent" )|| (lbls contains "Association") => ((args.contains("theme") && args("theme").nonEmpty) || (args.contains("substrate") && args("substrate").nonEmpty && args.contains("product") && args("product").nonEmpty)) diff --git a/main/src/main/scala/org/clulab/reach/ReachSystem.scala b/main/src/main/scala/org/clulab/reach/ReachSystem.scala index 24d46ed99..e7a3a627f 100644 --- a/main/src/main/scala/org/clulab/reach/ReachSystem.scala +++ b/main/src/main/scala/org/clulab/reach/ReachSystem.scala @@ -19,6 +19,9 @@ import ai.lum.common.Interval import org.clulab.processors.bionlp.BioNLPProcessor import org.clulab.reach.utils.Preprocess import ai.lum.nxmlreader.standoff.Implicits._ +import org.clulab.odin.impl.RuleReader + +import java.nio.charset.StandardCharsets // import org.clulab.reach.utils.MentionManager @@ -52,6 +55,20 @@ class ReachSystem( val textPreProc = new Preprocess val procAnnotator = new BioNLPProcessor() + + /** Returns a map with all the rule patterns in the system keyed by their name */ + lazy val rulePatternsMap:Map[String, String] = { + Seq(entityRules, modificationRules, eventRules) flatMap { + rules => + val reader = new RuleReader(actions, StandardCharsets.UTF_8, None) + val parsedRules = reader.getRules(rules) + parsedRules map { + pr => + pr.name -> pr.pattern + } + } + }.toMap + /** returns string with all rules used by the system */ def allRules: String = Seq(entityRules, 
modificationRules, eventRules, contextRules).mkString("\n\n") diff --git a/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala b/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala index ef215fdba..95f6cb741 100644 --- a/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala +++ b/main/src/main/scala/org/clulab/reach/darpa/DarpaActions.scala @@ -4,8 +4,9 @@ import com.typesafe.scalalogging.LazyLogging import org.clulab.odin._ import org.clulab.polarity.PolarityEngine import org.clulab.reach._ +import org.clulab.reach.darpa.StrengthHandler.weakLemmas import org.clulab.reach.mentions._ -import org.clulab.reach.mentions.serialization.json.BioTextBoundMention +import org.clulab.reach.mentions.serialization.json.{BioTextBoundMention, formats} import org.clulab.struct.DirectedGraph import scala.annotation.tailrec @@ -247,6 +248,48 @@ class DarpaActions extends Actions with LazyLogging { new BioEventMention(m.copy(arguments = arguments), isDirect = true) } + def mkAssociation(mentions: Seq[Mention], state: State): Seq[Mention] = mentions flatMap { + case m: EventMention if m.matches("Association") => + // themes in a subject position + val theme1s = m.arguments.getOrElse("theme1", Nil).map(_.toBioMention) + // themes in an object position + val theme2s = m.arguments.getOrElse("theme2", Nil).map(_.toBioMention) + + val numPositiveCues = StrengthHandler.countPositiveCues(m) + val numNegativeCues = StrengthHandler.countNegativeCues(m) + + val newLabel = + if(numPositiveCues > 0 && numNegativeCues == 0) + Seq("Positive_association") + else if(numNegativeCues > 0) + if(numNegativeCues % 2 == 0) + Seq("Positive_association") + else + Seq("Negative_association") + else + Seq.empty[String] + + val labels = newLabel ++ m.labels + + + (theme1s, theme2s) match { + case (t1s, t2s) if t1s.nonEmpty && t2s.nonEmpty + => Seq(new BioEventMention(m - "theme1" - "theme2" + ("theme" -> Seq(t1s.head, t2s.head))).copy(labels = labels)) + case (t1s, t2a) if 
/**
  * Counts lexical cues found in and around an event mention, by looking at the
  * event's tokens plus the tokens reachable through the dependency graph.
  */
object StrengthHandler {
  val degree = 2 // Degree up to which we should follow the links in the graph

  // NOTE(review): these sets are compared against sentence lemmas, but some entries
  // look like inflected forms ("higher", "elevated"); confirm they match the lemmatizer output.
  val strongLemmas = Set("higher", "positively", "increase", "elevated")
  val weakLemmas = Set("lower", "negatively", "decrease", "reduce")

  /**
    * Collects the token indexes reachable from `index` by following dependency edges
    * (incoming and outgoing) for up to `degree` hops.
    *
    * @param index        token index to start from
    * @param degree       number of hops still allowed; zero or less yields an empty result
    * @param dependencies dependency graph of the sentence
    * @return the reachable indexes, possibly with duplicates (and possibly including `index`)
    */
  def getSpannedIndexes(index: Int, degree: Int, dependencies: DirectedGraph[String]): Seq[Int] =
    // `<= 0` rather than `== 0`: a negative degree must stop instead of recursing forever
    if (degree <= 0) Seq.empty[Int]
    else {
      // Indexes outside the graph simply have no edges
      val incoming: Seq[(Int, String)] =
        dependencies.incomingEdges.lift(index).map(_.toSeq).getOrElse(Nil)
      val outgoing: Seq[(Int, String)] =
        dependencies.outgoingEdges.lift(index).map(_.toSeq).getOrElse(Nil)

      // Each edge is a tuple of (endpoint index, edge label); keep the endpoint only
      val neighbours: Seq[Int] = (incoming ++ outgoing).map(_._1)

      // Recurse to collect the neighbours of the neighbours
      val furtherOut: Seq[Int] = neighbours.flatMap(getSpannedIndexes(_, degree - 1, dependencies))

      neighbours ++ furtherOut
    }

  /** Number of tokens in or near `mention` whose lemma is in [[strongLemmas]]. */
  def countPositiveCues(mention: Mention): Int = countCues(mention, strongLemmas)

  /** Number of tokens in or near `mention` whose lemma is in [[weakLemmas]]. */
  def countNegativeCues(mention: Mention): Int = countCues(mention, weakLemmas)

  /**
    * Counts the distinct tokens, inside the event or within [[degree]] dependency hops of it,
    * whose lemma belongs to `hints`.
    *
    * @param mention the mention to inspect; anything that is not an [[EventMention]] counts as 0
    * @param hints   the lemmas to look for
    * @return the number of matching tokens; 0 when the sentence has no lemmas
    */
  def countCues(mention: Mention, hints: Set[String]): Int = mention match {
    case event: EventMention =>
      val sentence = event.sentenceObj

      // Get the dependencies of the sentence (an empty graph when there are none)
      val dependencies = sentence.dependencies.getOrElse(new DirectedGraph[String](Nil, None))

      val eventInterval: Seq[Int] = event.tokenInterval

      // Indexes of the words outside the event, up to "degree" hops away
      val spannedIndexes: Seq[Int] = eventInterval.flatMap(getSpannedIndexes(_, degree, dependencies))

      // Each token is counted at most once
      val indexes: Seq[Int] = (eventInterval ++ spannedIndexes).distinct

      // A sentence without lemmas cannot contain any cue (previously this threw on Option.get)
      sentence.lemmas match {
        case Some(lemmas) => indexes.count(i => hints.contains(lemmas(i)))
        case None => 0
      }

    case _ => 0
  }

}
a/main/src/main/scala/org/clulab/reach/indexer/NxmlSearcher.scala +++ b/main/src/main/scala/org/clulab/reach/indexer/NxmlSearcher.scala @@ -219,6 +219,328 @@ class NxmlSearcher(val indexDir:String) { logger.debug("Done.") } + def chiltonUseCase(resultDir:String):Unit = { + val phrases = Seq("adipose-specific phospholipase a2", + "adpla", + "group xvi phospholipase a1\\/a2", + "hras-like suppressor 1", + "hras-like suppressor 3", + "hrasls3", + "hrev107", + "h-rev107", + "hrev107", + "h-rev107", + "hrev107-1", + "h-rev107-1", + "hrev107-1", + "h-rev107-1", + "hrev107-3", + "h-rev 107 protein homolog", + "hrsl3", + "mgc118754.", + "phospholipase a and acyltransferase 3", + "pla2g16", + "plaat3", + "plaat-3", + "plaat3", + "plaat-3", + "renal carcinoma antigen ny-ren-65", + "nmd", + "phosphatidylserine-specific phospholipase a1", + "phospholipase a1 member a", + "pla1a", + "ps-pla1", + "pspla1", + "ps-pla1", + "group 10 secretory phospholipase a2", + "group x secretory phospholipase a2", + "gxpla2", + "gx spla2", + "phosphatidylcholine 2-acylhydrolase 10", + "phospholipase a2 group x", + "pla2g10", + "spla2-x", + "fksg71", + "gxiib", + "gxiii spla2-like", + "phospholipase a2 group xiib", + "pla2g12b", + "pla2g13", + "spla2-gxiib", + "group ib phospholipase a2", + "phosphatidylcholine 2-acylhydrolase 1b", + "phospholipase a2", + "phospholipase a2 group ib", + "pla2", + "pla2a", + "pla2g1b", + "ppla2", + "giic spla2", + "group iia phospholipase a2", + "non-pancreatic secretory phospholipase a2", + "nps-pla2", + "phosphatidylcholine 2-acylhydrolase 2a", + "phospholipase a2 group iia", + "phospholipase a2, membrane associated", + "pla2b", + "pla2g2a", + "pla2l", + "rasf-a", + "phosphatidylcholine 2-acylhydrolase-like protein giic", + "phospholipase a2 group iic", + "pla2g2c", + "giid spla2", + "group iid secretory phospholipase a2", + "phosphatidylcholine 2-acylhydrolase 2d", + "phospholipase a2 group iid", + "pla2g2d", + "pla2iid", + "secretory-type pla, 
stroma-associated homolog", + "spla2-iid", + "spla2s", + "splash", + "giie spla2", + "group iie secretory phospholipase a2", + "phosphatidylcholine 2-acylhydrolase 2e", + "phospholipase a2 group iie", + "pla2g2e", + "spla2-iie", + "phosphatidylcholine 2-acylhydrolase 5", + "phospholipase a2 group v", + "pla2-10", + "pla2g5", + "1-alkyl-2-acetylglycerophosphocholine esterase", + "2-acetyl-1-alkylglycerophosphocholine esterase", + "group-viia phospholipase a2", + "gviia-pla2", + "ldl-associated phospholipase a2", + "ldl-pla2", + "ldl-pla(2)", + "ldl-pla2", + "paf 2-acylhydrolase", + "paf acetylhydrolase", + "pafah", + "phospholipase a2 group vii", + "pla2g7", + "platelet-activating factor acetylhydrolase", + "clec13c", + "m-type receptor", + "phospholipase a2 receptor 1", + "pla2g1r", + "pla2ir", + "pla2r", + "pla2-r", + "pla2r", + "pla2-r", + "pla2r1", + "secretory phospholipase a2 receptor", + "soluble secretory phospholipase a2 receptor", + ) + + val queryStr = phrases map (p => s"(${p})") mkString " OR " + + vanillaUseCase(queryStr, resultDir, 2000000) + } + + def skyeUseCase(resultDir:String):Unit = { + val phrases = Seq( + "CD antigen CD79b", + "Igb", + "IgM Fc fragment receptor", + "Ig gamma-2B chain C region", + "Fas apoptotic inhibitory molecule 3", + "CD antigen CD268", + "Keratin, type I cytoskeletal 15", + "ThB", + "Cytokeratin endo B", + "Wfdc2", + "B- and T-lymphocyte attenuator", + "Keratin-15", + "Keratin D", + "Fc-epsilon-RII", + "Cytokeratin-14", + "PLA2IID", + "Fcer2", + "Lyt3", + "CD antigen CD8b", + "Lymphocyte IgE receptor", + "Cd79b", + "Krt1-14", + "sPLA2-IID", + "Lyb-8", + "Krt18", + "BAFF-R", + "Igh-3", + "Bcmd", + "Keratin-14", + "Lyt-3", + "Faim3", + "WAP domain-containing protein HE4", + "CD antigen CD23", + "Siglec2", + "Tnfrsf13c", + "T-cell surface glycoprotein Lyt-3", + "GIID sPLA2", + "Krtdap", + "Cr2", + "Regulator of Fas-induced apoptosis Toso", + "Pla2g2d", + "Marco", + "Lymphocyte antigen 6D", + "Macrophage receptor with 
collagenous structure", + "B-cell receptor CD22", + "Macrophage receptor MARCO", + "3-HAO", + "CK-14", + "CD antigen CD272", + "Keratin, type I cytoskeletal 18", + "Pla2a2", + "Thymocyte B-cell antigen", + "CD antigen CD352", + "HAD", + "B-cell-activating factor receptor", + "Baffr", + "Cd22", + "Thb", + "Krt15", + "K18", + "N-arachidonyl glycine receptor", + "Fcer2a", + "Br3", + "Cd8b", + "Ig-beta", + "Keratin-18", + "Gpr18", + "Toso", + "Cytokeratin-15", + "Haao", + "Fcmr", + "Ly6d", + "Siglec-2", + "SLAM family member 6", + "Cd8b1", + "Sialic acid-binding Ig-like lectin 2", + "Kdap", + "BL-CAM", + "Secretory-type PLA, stroma-associated homolog", + "BAFF receptor", + "Ly-6D", + "CD antigen CD21", + "Keratinocyte differentiation-associated protein", + "G-protein coupled receptor 18", + "CK-15", + "Group IID secretory phospholipase A2", + "Phosphatidylcholine 2-acylhydrolase 2D", + "Kerd", + "Keratin, type I cytoskeletal 14", + "Krt1-15", + "B-cell maturation defect", + "Lymphocyte antigen 108", + "Ly108", + "B-cell-specific glycoprotein B29", + "Krt14", + "K14", + "Cytokeratin-18", + "NAGly receptor", + "CD antigen CD22", + "Krt1-18", + "Complement receptor type 2", + "Slamf6", + "T-cell surface antigen Leu-14", + "3-hydroxyanthranilate oxygenase", + "T-cell membrane glycoprotein Ly-3", + "B-lymphocyte cell adhesion molecule", + "3-hydroxyanthranilic acid dioxygenase", + "B- and T-lymphocyte-associated protein", + "3-hydroxyanthranilate 3,4-dioxygenase", + "Splash", + "He4", + "Ly-3", + "CK-18", + "BLyS receptor 3", + "Ly61", + "Lymphocyte antigen 3", + "Immunoglobulin-associated B29 protein", + "Complement C3d receptor", + "Btla", + "K15", + ) + + val queryStr = phrases map (p => s"(${p})") mkString " OR " + + vanillaUseCase(queryStr, resultDir, 1000000) + } + /** + * Use case for Frailty, specifically for muscle-related conditions + */ + def frailtyUseCase(resultDir:String): Unit = { + val phrases = Seq( + "Musculoskeletal Development", + "Osseointegration", + 
"Eye Movement", + "Biomineralization", + "Bone Regeneration", + "Pinch Strength", + "Motor Activity", + "Cool-Down Exercise", + "Dependent Ambulation", + "Hand Strength", + "Skeletal Muscle Enlargement", + "Anaerobic Threshold", + "Walking", + "Gait", + "Standing Position", + "Physical Fitness", + "Muscle Stretching Exercises", + "Pronation", + "Psychomotor Performance", + "Bone Density", + "Posture", + "Osteolysis", + "Muscle Fatigue", + "Osteogenesis", + "Articular Range of Motion", + "Core Stability", + "Postural Balance", + "Locomotion", + "Physical Endurance", + "Exergaming", + "Isotonic Contraction", + "Physical Exertion", + "Musculoskeletal Physiological Phenomena", + "Excitation Contraction Coupling", + "Muscle Strength", + "Muscle Contraction", + "Intramuscular Absorption", + "Chondrogenesis", + "Muscle Development", + "Cardiorespiratory Fitness", + "Uterine Contraction", + "Tonic Immobility Response", + "Knee-Chest Position", + "Bone Resorption", + "Exercise Tolerance", + "Physiologic Calcification", + "Stair Climbing", + "Muscle Tonus", + "Plyometric Exercise", + "Isometric Contraction", + "Walking Speed", + "Muscle Relaxation", + "muscle tissue", + "muscular tissue", + "fat tissue", + "adipose tissue", + "lymphoid tissue", + "autophagy", + "nucleophagy", + ) + + val queryStr = phrases map (p => s"(${p})") mkString " OR " + + vanillaUseCase(queryStr, resultDir, 1000000) + } + /** Finds all NXML that contain at least one biochemical interaction */ def useCaseAnyInteraction(resultDir:String, maxDocs:Int) { vanillaUseCase( diff --git a/main/src/test/scala/org/clulab/reach/TestAssociationEvents.scala b/main/src/test/scala/org/clulab/reach/TestAssociationEvents.scala new file mode 100644 index 000000000..ebac79346 --- /dev/null +++ b/main/src/test/scala/org/clulab/reach/TestAssociationEvents.scala @@ -0,0 +1,185 @@ +package org.clulab.reach + +import org.scalatest.{Matchers, FlatSpec} +import org.clulab.reach.mentions._ +import TestUtils._ + +class 
TestAssociationEvents extends FlatSpec with Matchers { + val sent1 = "We found that frailty and pre-frailty were associated with significantly elevated CRP and IL-6 levels across all geographical settings and among community and institutionalized participants." + + val sent2 = "Frailty (SMD = 1.12, 95%CI: 0.27–2.13, p = 0.01; I2 = 99%) and pre-frailty (SMD = 0.56, 95%CI: 0.00–1.11, p = 0.05; I2 = 99%) were associated with higher serum levels of IL-6 versus robust participants." + val sent3 = "A direct association between frailty and elevated levels of inflammation, as marked by elevated interleukin-6 (IL-6)" + val sent4 = "frailty and pre-frailty are associated with higher inflammatory parameters and in particular CRP and IL-6" + val sent5 = "Frailty and pre-frailty are associated with higher CRP and IL 6." + val sent6 = "There is evidence that anti-inflammatory interleukins (ILs) such as IL-10 and IL-4, although implicated to a lesser extent, also associate with the frailty phenotype" + val sent7 = "frailty is associated with alterations in the concentration of pro-inflammatory molecules." + val sent8 = "Frailty-associated physiological dysregulation" + val sent9 = "But when inflammation becomes chronic, often associated with aging or diseases" + val sent10 = "frailty and pre-frailty are associated with higher inflammatory parameter levels, in particular, CRP and IL6" + val sent11 = "Upregulation of pro-inflammatory cytokines has not only been associated with increased morbidity and mortality in older adults but also has been linked to frailty" + val sent12 = "Aging was significantly associated with higher fibrinogen (p=0.04) and D-dimer levels (p=0.01) but only among NF subjects." 
+ val sent13 = "Upregulation of cytokines such as interleukin-1 (IL-1), interleukin-6 (IL-6) and tumor necrosis factor-α (TNF-α) that contribute to systemic inflammation have been independently associated with increased morbidity and mortality in older adults" + val sent14 = "inflammatory index, an additive index of serum IL-6 and soluble TNF-α receptor −1 (TNFR1) has been shown not only to best capture age-associated chronic inflammation but also predict mortality in older adults" + val sent15 = "age was positively associated with TNFR1 (r=0.22; p=0.02), TNFR2 (r=0.25; p=0.02) and the inflammatory index (r=0.28, p=0.008) but not IL-6 (r=0.05; p=0.5)" + val sent16 = "Age was positively associated with both fibrinogen (r=0.21; p=0.04) and D-dimer (r=0.27; p=0.01) (data not shown)" + val sent17 = "Age was associated with increased TNFR1 and TNFR2" + val sent18 = "we did not find IL-6 or CRP to be significantly associated with age" + val sent19 = "Aging was associated with increasing inflammatory index score in our study" + val sent20 = "Elevated levels of IL-6 have been linked to multiple age-associated conditions, such as atherosclerosis, dementia and frailty" + val sent21 = "Frailty in older adults has been associated with superoxide anion overproduction by nicotinamide adenine dinucleotide phosphate-oxidase (NADPH) oxidase and low-grade chronic inflammation" + val sent22 = "We did find aging to be not only associated with both these coagulation markers but also more strongly associated with a pro-thrombotic state than frailty status" + val sent23 = "subclinically higher levels of serum immune mediators in older adults relative to youngsters, both groups devoid of overt infectious disease, is associated with the pathophysiology of chronic conditions such as frailty" + val sent24 = "Enhanced levels of circulating immune mediators such as IL-6 are often taken as a surrogate for “inflammageing” and are consistently associated with the frail phenotype and mortality" + 
val sent25 = "There is evidence that anti-inflammatory interleukins (ILs) such as IL-10 and IL-4, although implicated to a lesser extent, also associate with the frailty phenotype [19–21]" + val sent26 = "other reports have associated augmented serum concentrations of IL-6 with lower gait speed and reduced muscle strength in a context of frailty" + val sent27 = "body of evidence connecting frailty within inflammaging and immunosenescence biomarkers" + val sent28 = "very few studies have assessed the link between lymphocyte subpopulations and frailty status in older adults" + val sent29 = "involvement of chronic inflammation in frailty in later life" + val sent30 = "Inflammatory molecules may directly contribute to frailty" + + + sent1 should "contain four positive association events" in { + val mentions = getBioMentions(sent1) + mentions.filter(_.label == "Positive_association") should have size 6 + } + + sent2 should "contain four positive association events" in { + val mentions = getBioMentions(sent2) + mentions.filter(_.label == "Positive_association") should have size 4 + } + + sent3 should "contain one positive association event" in { + val mentions = getBioMentions(sent3) + mentions.filter(_.label == "Positive_association") should have size 1 + } + + sent4 should "contain four positive association events" in { + val mentions = getBioMentions(sent4) + mentions.filter(_.label == "Positive_association") should have size 4 + } + + sent5 should "contain four association events" in { + val mentions = getBioMentions(sent5) + mentions.filter(_.label == "Positive_association") should have size 4 + } + + sent6 should "contain two association events" in { + val mentions = getBioMentions(sent6) + mentions.filter(_.label == "Association") should have size 2 + } + + sent7 should "contain one association event" in { + val mentions = getBioMentions(sent7) + mentions.filter(_.label == "Association") should have size 1 + } + + sent8 should "contain one association event" in { 
+ val mentions = getBioMentions(sent8) + mentions.filter(_.label == "Association") should have size 1 + } + + sent9 should "contain two association events" in { + val mentions = getBioMentions(sent9) + mentions.filter(_.label == "Association") should have size 2 + } + + sent10 should "contain four positive association events" in { + val mentions = getBioMentions(sent10) + mentions.filter(_.label == "Positive_association") should have size 4 + } + + sent11 should "contain two positive association events" in { + val mentions = getBioMentions(sent11) + mentions.filter(_.label == "Positive_association") should have size 5 + } + + sent12 should "contain two positive association events" in { + val mentions = getBioMentions(sent12) + mentions.filter(_.label == "Positive_association") should have size 2 + } + + sent13 should "contain six positive association events" in { + val mentions = getBioMentions(sent13) + mentions.filter(_.label == "Positive_association") should have size 6 + } + + sent14 should "contain one association event" in { + val mentions = getBioMentions(sent14) + mentions.filter(_.label == "Association") should have size 1 + } + + sent15 should "contain four positive association events" in { + val mentions = getBioMentions(sent15) + mentions.filter(_.label == "Positive_association") should have size 4 + } + + sent16 should "contain two positive association events" in { + val mentions = getBioMentions(sent16) + mentions.filter(_.label == "Positive_association") should have size 2 + } + + sent17 should "contain two positive association events" in { + val mentions = getBioMentions(sent17) + mentions.filter(_.label == "Positive_association") should have size 2 + } + + sent18 should "contain two positive association event" in { + val mentions = getBioMentions(sent18) + mentions.filter(_.label == "Positive_association") should have size 2 + } + + sent19 should "contain one positive association event" in { + val mentions = getBioMentions(sent19) + 
mentions.filter(_.label == "Positive_association") should have size 1 + } + + sent20 should "contain three association events" ignore { + val mentions = getBioMentions(sent20) + mentions.filter(_.label == "Association") should have size 3 + } + + sent21 should "contain one association event" in { + val mentions = getBioMentions(sent21) + mentions.filter(_.label == "Association") should have size 1 + } + + sent23 should "contain one positive association event" in { + val mentions = getBioMentions(sent23) + mentions.filter(_.label == "Positive_association") should have size 1 + } + + sent24 should "contain two association events" in { + val mentions = getBioMentions(sent24) + mentions.filter(_.label == "Association") should have size 2 + } + + sent25 should "contain two association events" in { + val mentions = getBioMentions(sent25) + mentions.filter(_.label == "Association") should have size 2 + } + + sent26 should "contain two negative association events" in { + val mentions = getBioMentions(sent26) + mentions.filter(_.label == "Negative_association") should have size 2 + } + + sent27 should "contain two association events" in { + val mentions = getBioMentions(sent27) + mentions.filter(_.label == "Association") should have size 2 + } + + sent28 should "contain one association event" in { + val mentions = getBioMentions(sent28) + mentions.filter(_.label == "Association") should have size 1 + } + + sent29 should "contain one association event" in { + val mentions = getBioMentions(sent29) + mentions.filter(_.label == "Association") should have size 1 + } + + sent30 should "contain one association event" in { + val mentions = getBioMentions(sent30) + mentions.filter(_.label == "Association") should have size 1 + } +} diff --git a/processors/src/main/scala/org/clulab/processors/bionlp/BioPOSPostProcessor.scala b/processors/src/main/scala/org/clulab/processors/bionlp/BioPOSPostProcessor.scala index f733fcde1..fd7537ba9 100644 --- 
a/processors/src/main/scala/org/clulab/processors/bionlp/BioPOSPostProcessor.scala +++ b/processors/src/main/scala/org/clulab/processors/bionlp/BioPOSPostProcessor.scala @@ -58,6 +58,16 @@ class BioPOSPostProcessor extends SentencePostProcessor { case aa if aa == "His" => tags(i) = "NN" case aa if aa == "Pro" => tags(i) = "NN" + case f if f.toLowerCase() == "frailty" => tags(i) = "NN" + // pre-frailty case + case pf if pf.toLowerCase == "pre-frailty" => tags(i) = "NN" + // pro-inflammatory molecules case +// case pi if pi.toLowerCase == "molecules" +// && i >=2 && sentence.words(i-1).toLowerCase == "inflammatory" +// && sentence.words(i-2).toLowerCase == "pro" => +// tags(i-1) = "NN" +// tags(i-2) = "NN" + case _ => () } diff --git a/project/build.properties b/project/build.properties index 6db984250..e67343ae7 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.4.0 +sbt.version=1.5.0 diff --git a/retrieve_entity_descriptions.sh b/retrieve_entity_descriptions.sh new file mode 100644 index 000000000..2640424eb --- /dev/null +++ b/retrieve_entity_descriptions.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Prints the synonyms of each entity type located in the KB files + +IFS=':' + +for ARG in "$@" +do + read -ra ID <<< "$ARG" + grep -r "${ID[1]}" bioresources/src/main/resources/org/clulab/reach/kb/ | \ + awk -F: '{print $2}' | \ + awk -F'\t' '{printf("\"%s\",\n", tolower($1))}' | \ + uniq +done \ No newline at end of file diff --git a/src/main/scala/org/clulab/reach/CLI.scala b/src/main/scala/org/clulab/reach/CLI.scala index 96b65464c..1596b1f28 100644 --- a/src/main/scala/org/clulab/reach/CLI.scala +++ b/src/main/scala/org/clulab/reach/CLI.scala @@ -82,7 +82,7 @@ abstract class CLI ( // Count the number of failed files, not failed formats. math.signum(processPaper(file, withAssembly)) } catch { - case e: Exception => + case e: Throwable => // The reading itself, rather than the format, could have failed. 
reportException(file, e) 1 diff --git a/src/main/scala/org/clulab/reach/ReachCLI.scala b/src/main/scala/org/clulab/reach/ReachCLI.scala index 162d3d750..1fe457e4e 100644 --- a/src/main/scala/org/clulab/reach/ReachCLI.scala +++ b/src/main/scala/org/clulab/reach/ReachCLI.scala @@ -11,6 +11,8 @@ import java.nio.charset.Charset import java.nio.charset.StandardCharsets.UTF_8 import ai.lum.common.FileUtils._ import ai.lum.common.ConfigUtils._ +import org.clulab.reach.RuleReader.readResource +import org.clulab.reach.`export`.VisualAnalyticsDataExporter //import jline.internal.InputStreamReader import org.clulab.odin._ import org.clulab.processors.Document @@ -19,6 +21,7 @@ import org.clulab.reach.`export`.cmu.CMUExporter import org.clulab.reach.assembly._ import org.clulab.reach.assembly.export.{AssemblyExporter, AssemblyRow, ExportFilters} import org.clulab.reach.export.OutputDegrader +import org.clulab.reach.export.TrainingDataExporter import org.clulab.reach.export.fries.FriesOutput import org.clulab.reach.export.indexcards.IndexCardOutput import org.clulab.reach.export.serial.SerialJsonOutput @@ -161,7 +164,7 @@ class ReachCLI ( outputDir: File, outputType: String, withAssembly: Boolean - ) = { + ): Unit = { val outFile = s"${outputDir.getAbsolutePath}${File.separator}$paperId" @@ -226,6 +229,29 @@ class ReachCLI ( val outFile = new File(outputDir, s"$paperId-cmu-out.tsv") outFile.writeString(output, java.nio.charset.StandardCharsets.UTF_8) + case ("training-data", _) => + val output = TrainingDataExporter.jsonOutput(mentions, + allowedLabels = Some(Set("Positive_activation", "Negative_activation", "Activation", + "Positive_regulation", "Negative_regulation", "Regulation")), + ) + val outFile = new File(outputDir, s"$paperId-classification-out.json") + outFile.writeString(output, java.nio.charset.StandardCharsets.UTF_8) + + case ("rule-learning", _) => + val rulesDictionary = PaperReader.reachSystem.rulePatternsMap + val output = 
TrainingDataExporter.jsonOutput(mentions, + allowedLabels = Some(Set("Positive_activation", "Negative_activation", "Activation")), + includeRule = true, + rulesDictionary = Some(rulesDictionary)) + // Only look at activations + val outFile = new File(outputDir, s"$paperId-rule_learning-out.json") + outFile.writeString(output, java.nio.charset.StandardCharsets.UTF_8) + + case ("visual-analytics", _) => + val output = VisualAnalyticsDataExporter.jsonOutput(mentions, Some(entry)) + val outFile = new File(outputDir, s"$paperId-va.json") + outFile.writeString(output, java.nio.charset.StandardCharsets.UTF_8) + case _ => throw new RuntimeException(s"Output format ${outputType.toLowerCase} not yet supported!") } }